def _web_validator(self, proxies):
    # Fire one request per proxy against a random target site and record the
    # outcome: every attempt bumps detect_times, successes are re-queued and
    # marked as 'normal' proxies with an extra alive_times.
    rs = (request(choice(WEBSITES), proxy=proxy, is_map=True) for proxy in proxies)
    resps = grequests.map(rs, gtimeout=TIMEOUT, exception_handler=eh)
    for resp, proxy in zip(resps, proxies):
        COLLECTION.update_one({'proxy': proxy}, {
            '$set': {'proxy': proxy},
            '$inc': {'detect_times': 1},
        }, upsert=True)
        if not resp or resp.status_code != 200:
            continue
        self.proxy_validated_q.put(proxy)
        self.count_website_ok += 1
        self.logger.info('websites available: %s %s %s',
                         self.count_website_ok, proxy, resp.url)
        COLLECTION.update_one({'proxy': proxy}, {
            '$addToSet': {'type': 'normal'},
            '$inc': {'alive_times': 1},
        })
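# A minimal sketch of the request() wrapper and eh() exception handler that
# the validators and collectors here assume; the project's real helpers may
# differ, and HEADERS is a hypothetical default. With is_map=True the call
# only builds an unsent grequests request so it can be batched through
# grequests.map(); otherwise it performs the request right away and returns
# None on any failure.
import grequests
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}


def request(url, proxy=None, is_map=False, timeout=10):
    if is_map:
        return grequests.get(url, proxies=proxy, headers=HEADERS, timeout=timeout)
    try:
        return requests.get(url, proxies=proxy, headers=HEADERS, timeout=timeout)
    except requests.RequestException:
        return None


def eh(req, exception):
    # Called by grequests.map() for requests that raise; returning None keeps
    # the failed slot in the results list as None, which the callers check.
    return None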
def real_time(self):
    # Continuously re-check proxies that already passed the high-anonymity
    # test; live ones go back on the queue and get their stats updated.
    while True:
        proxy, protocol = self.real_time_q.get()
        # if not proxy:
        #     time.sleep(60 * 2)
        self.logger.info('real time testing: %s', proxy)
        # TODO: websites http https
        resp = request('https://www.baidu.com/', proxy=proxy, timeout=5)
        if not resp:
            self.logger.info('dead: %s', proxy)
            continue
        self.logger.info('still alive: %s', proxy)
        # Re-queue the same (proxy, protocol) pair so the next get() can
        # unpack it again.
        self.real_time_q.put((proxy, protocol))
        HIGH_ANONYMOUS.update_one({'proxy': proxy}, {
            '$set': {'proxy': proxy, 'protocol': protocol},
            '$inc': {'alive_times': 1},
            '$push': {'alive_time_base': datetime.now()},
        }, upsert=True)
def collect(self, concurrent_num=3):
    # Drain the URL queue in batches of concurrent_num; each element is a
    # (url, parser) pair. Failed fetches are recycled back onto the queue.
    while not self.q.empty():
        elements = cut_queue(self.q, concurrent_num)
        reqs = (request(ele[0], is_map=True) for ele in elements)
        resps = grequests.map(reqs, gtimeout=10, exception_handler=eh)
        for resp, element in zip(resps, elements):
            if not resp:
                self.q.put(element)
                self.logger.error('recycle %s', element[0])
                continue
            self.logger.info('fetched %s', resp.url)
            element[1](resp)
def collect(self, concurrent_num=10):
    # Fetch pages through already-validated proxies. A response that fails or
    # contains fewer than 5 IP-like matches is recycled; proxies that worked
    # are put back on the proxy queue for reuse.
    while not self.q.empty():
        if self.proxy_q.qsize() < concurrent_num:
            self.get_proxies()
        proxies = cut_queue(self.proxy_q, concurrent_num)
        _urls = cut_queue(self.q, concurrent_num)
        reqs = (request(u, proxy=p, is_map=True) for u, p in zip(_urls, proxies))
        resps = grequests.map(reqs, gtimeout=10)
        for url, resp, proxy in zip(_urls, resps, proxies):
            if not resp or len(RE_HOST.findall(resp.text)) < 5:
                self.logger.error('recycle: %s', url)
                self.q.put(url)
                # requests.post(api_detect_url, data=json.dumps(proxy))
                continue
            self.logger.info('fetched: %s', url)
            self.proxy_q.put(proxy)
            # requests.post(api_alive_url, data=json.dumps(proxy))
            self.parse_regex(resp)
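# A minimal sketch of the cut_queue() helper both collect() variants rely on
# (an assumption, not necessarily the project's actual implementation): drain
# at most n items from a queue.Queue into a list without blocking.
import queue


def cut_queue(q, n):
    items = []
    for _ in range(n):
        try:
            items.append(q.get_nowait())
        except queue.Empty:
            break
    return items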
def _test_validator(self, proxy):
    # Test the proxy for high anonymity: fetch a "what is my IP" style test
    # site over HTTP and HTTPS and require that our own IP never shows up in
    # the response. Successes are scored and queued for real-time re-checks.
    COLLECTION.update_one({'proxy': proxy}, {'$inc': {'detect_times': 1}})
    detect_times_label = 0
    http, https = {'http': proxy.get('http')}, {'https': proxy.get('https')}
    reqs = (request(choice(TESTSITES.get('http')), proxy=http, is_map=True),
            request(choice(TESTSITES.get('https')), proxy=https, is_map=True))
    resps = grequests.map(reqs, gtimeout=10, exception_handler=eh)
    for index, resp in enumerate(resps):
        if not resp:
            continue
        matches = RE_HOST.findall(resp.text)
        # Only anonymous if the test site never saw our real IP.
        if not matches or self.my_ip in matches:
            continue
        p = (https, 'https') if index > 0 else (http, 'http')
        r = request(p[0], timeout=10)
        if r:
            continue
        self.real_time_q.put(p)
        self.logger.info('%s, %s', matches, proxy)
        detect_times_label += 1
        # When both protocols pass, count a second detection on top of the
        # single $inc made at the start of this method.
        if detect_times_label == 2:
            COLLECTION.update_one({'proxy': proxy}, {'$inc': {'detect_times': 1}})
        self.count_testsite_ok += 1
        cursor = COLLECTION.find_one({'proxy': proxy})
        this_id = cursor.get('_id')
        alive_times = cursor.get('alive_times') + 1
        detect_times = cursor.get('detect_times')
        # Score stays in [0, 1] as long as alive_times <= detect_times.
        score = alive_times * 2 / (alive_times + detect_times)
        self.logger.info('high anonymity: %s score: %s %s %s',
                         self.count_testsite_ok, score, p[0], resp.url)
        COLLECTION.update_one({'_id': ObjectId(this_id)}, {
            '$set': {'score': score},
            '$push': {'alive_time_base': datetime.now()},
            '$addToSet': {'type': 'high', 'protocol': p[1]},
            '$inc': {'alive_times': 1},
        })
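# Hypothetical shapes of the module-level constants _test_validator depends
# on; these values are assumptions for illustration only. TESTSITES maps each
# scheme to "what is my IP" style pages, and RE_HOST pulls IPv4 addresses out
# of the response body so the IP the test site saw can be compared against
# self.my_ip (a match means the proxy leaks the real address).
import re

TESTSITES = {
    'http': ['http://httpbin.org/ip'],
    'https': ['https://httpbin.org/ip'],
}
RE_HOST = re.compile(r'(?:\d{1,3}\.){3}\d{1,3}')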
def get_my_ip(self):
    # Ask httpbin for our public IP so the validators can detect proxies
    # that leak it.
    resp = request('http://httpbin.org/ip')
    if not resp:
        raise ValueError('cannot get my ip!')
    my_ip = resp.json().get('origin')
    self.logger.info('my ip: %s', my_ip)
    return my_ip
def _get_speciality(self):
    # Worker loop: pull speciality page URLs, fetch and parse them, and mark
    # each queue task done so _get_homepage's join() can unblock.
    while True:
        url = self.speciality_q.get()
        resp = utils.request(url)
        self._parse_speciality(resp, url)
        self.speciality_q.task_done()
def _get_homepage(self):
    # Fetch each homepage, let the parser enqueue speciality URLs, then wait
    # for the speciality workers to finish processing them.
    while not self.homepage_q.empty():
        url = self.homepage_q.get()
        resp = utils.request(url)
        self._parse_homepage(resp, url)
    self.speciality_q.join()