def verify(self):
    """Verify one queued useful-proxy record and record the outcome.

    Takes a single item from ``self.queue``, checks its ``"proxy"`` value
    with the custom verifier when the ``custom_verify_url`` setting is
    truthy (the default verifier otherwise), then updates the proxy
    manager's counters and ``self.stat``.
    """
    record = self.queue.get()
    proxy = record.get("proxy")

    # A configured custom URL takes precedence over the built-in check.
    if ConfigManager.setting_config.setting.get("custom_verify_url"):
        checker = self.customVerifyProxy
    else:
        checker = self.defaultVerifyProxy
    passed = checker(proxy)

    manager = ProxyManager.proxy_manager
    if not passed:
        manager.tickUsefulProxyVaildFail(proxy)
        self.stat["fail"] += 1
        log.debug("useful_proxy:{proxy} verify fail".format(proxy=proxy))
    else:
        # Only rewrite the stored record when checkProxyInfo asks for it.
        if self.checkProxyInfo(record):
            self.updateUsefulProxy(record)
        manager.tickUsefulProxyVaildSucc(proxy)
        self.stat["succ"] += 1
        log.debug("useful_proxy:{proxy} verify succ".format(proxy=proxy))

    # Bookkeeping that happens regardless of the verification result.
    self.queue.task_done()
    manager.tickUsefulProxyVaildTotal(proxy)
    manager.updateUsefulProxyNextVerifyTime(proxy, self.start_time)
    self.stat["total"] += 1
def defaultVerifyProxy(self, proxy):
    """Check a proxy by requesting ``http://httpbin.org/ip`` through it.

    :param proxy: proxy address as ``str`` or utf8-encoded ``bytes``.
    :return: ``True`` when the response has status 200 and its JSON body
        contains an ``"origin"`` key; ``False`` when the request or JSON
        parsing raised; ``None`` when the request completed but one of
        the two checks did not hold.
    """
    if isinstance(proxy, bytes):
        proxy = proxy.decode('utf8')

    outcome = None
    try:
        response = requests.get(
            "http://httpbin.org/ip",
            proxies={"http": proxy},
            timeout=10,
            verify=False,
        )
        payload = response.json()
        # Both checks are evaluated before being combined, so an error in
        # either one is reported as a failure by the handler below.
        ok_status = response.status_code == 200
        has_origin = "origin" in payload
        if ok_status and has_origin:
            outcome = True
    except Exception as e:
        log.debug(
            "proxy:{proxy} http verify proxy fail, error:{error}".format(
                proxy=proxy, error=e))
        outcome = False
    return outcome
def getQualityUsefulProxy(self, **kwargs):
    """Run an aggregation returning proxies ordered by success rate.

    Matches documents whose ``total`` is non-zero and whose
    ``last_status`` equals ``ProxyManager.PROXY_LAST_STATUS["SUCC"]``,
    projects ``succ_rate = succ / total`` and sorts by ``succ_rate`` then
    ``total``, both descending.

    :param kwargs: optional ``https``, ``type`` and ``region`` filters; a
        filter is applied only when its value is truthy.
    :return: whatever ``self.mc.aggregate`` returns for the pipeline.
    """
    https = kwargs.get("https", None)
    region = kwargs.get("region", None)
    type_ = kwargs.get("type", None)

    match_stage = {
        # total != 0 also keeps the $divide below from dividing by zero.
        "total": {"$ne": 0},
        "last_status": {"$eq": ProxyManager.PROXY_LAST_STATUS["SUCC"]},
    }
    if https:
        match_stage["https"] = {"$eq": https}
    if type_:
        match_stage["type"] = {"$eq": type_}
    if region:
        match_stage["region_list"] = parse_regin_to_mongo(region)

    operation_list = [
        {"$match": match_stage},
        {
            "$project": {
                "proxy": 1,
                "total": 1,
                "succ_rate": {"$divide": ["$succ", "$total"]},
            },
        },
        {"$sort": {"succ_rate": -1, "total": -1}},
    ]

    # The label previously read "getSampleUsefulProxy" (copy-paste error),
    # which attributed this pipeline to the wrong method in the logs.
    log.debug("getQualityUsefulProxy, operation_list:{operation_list}, ".format(
        operation_list=str(operation_list)))

    return self.mc.aggregate(operation_list)
def customVerifyProxy(self, proxy):
    """Check a proxy against the configured ``custom_verify_url``.

    The proxy is used for both http and https traffic. When the
    ``custom_verify_content`` setting is truthy it is treated as a regular
    expression that must be found in the utf-8 decoded response body.

    :param proxy: proxy address as ``str`` or utf8-encoded ``bytes``.
    :return: ``True`` when the response has status 200 and the optional
        content pattern matched; ``False`` when anything in the request,
        decoding or matching raised; ``None`` when the request completed
        but a check did not hold.
    """
    result = None

    if isinstance(proxy, bytes):
        proxy = proxy.decode('utf8')

    proxies = {
        "http": proxy,
        "https": proxy,
    }
    verify_url = ConfigManager.setting_config.setting.get("custom_verify_url")

    try:
        # Without a pattern, only the status code decides the outcome.
        content_result = True
        r = requests.get(verify_url, proxies=proxies, timeout=10, verify=False)

        pattern = ConfigManager.setting_config.setting.get("custom_verify_content")
        if pattern:
            content = r.content.decode('utf-8')
            # Identity test is the correct way to compare against None.
            content_result = re.search(pattern, content) is not None

        status_result = r.status_code == 200
        if status_result and content_result:
            result = True
    except Exception as e:
        log.debug("proxy:{proxy} http verify proxy fail, error:{error}".format(proxy=proxy, error=e))
        result = False

    return result
def getSampleUsefulProxy(self, **kwargs):
    """Return one randomly sampled proxy document, or ``None``.

    Matches documents whose ``total`` is non-zero and whose
    ``last_status`` equals ``ProxyManager.PROXY_LAST_STATUS["SUCC"]``,
    then applies ``$sample`` with size 1.

    :param kwargs: optional ``https``, ``type`` and ``region`` filters; a
        filter is applied only when its value is truthy.
    :return: the first element of the aggregation result when that result
        is truthy, otherwise ``None``.
    """
    criteria = {
        "total": {"$ne": 0},
        "last_status": {"$eq": ProxyManager.PROXY_LAST_STATUS["SUCC"]},
    }

    https = kwargs.get("https", None)
    if https:
        criteria["https"] = {"$eq": https}

    type_ = kwargs.get("type", None)
    if type_:
        criteria["type"] = {"$eq": type_}

    region = kwargs.get("region", None)
    if region:
        criteria["region_list"] = parse_regin_to_mongo(region)

    operation_list = [
        {"$match": criteria},
        {"$sample": {"size": 1}},
    ]
    log.debug("getSampleUsefulProxy, operation_list:{operation_list}, ".format(
        operation_list=str(operation_list)))

    # NOTE(review): assumes self.mc.aggregate returns an indexable
    # sequence (e.g. a list) -- confirm against the client wrapper.
    data = self.mc.aggregate(operation_list)
    return data[0] if data else None
def getQualityUsefulProxy(self, **kwargs):
    """Run an aggregation returning proxies ordered by stored quality.

    Matches documents whose ``total`` is non-zero and sorts them by the
    ``quality`` field, then ``total``, both descending.

    :param kwargs: optional ``https``, ``type`` and ``region`` filters; a
        filter is applied only when its value is truthy.
    :return: whatever ``self.mc.aggregate`` returns for the pipeline.
    """
    https = kwargs.get("https", None)
    region = kwargs.get("region", None)
    type_ = kwargs.get("type", None)

    match_stage = {"total": {"$ne": 0}}
    if https:
        match_stage["https"] = {"$eq": https}
    if type_:
        match_stage["type"] = {"$eq": type_}
    if region:
        match_stage["region_list"] = parse_regin_to_mongo(region)

    operation_list = [
        {"$match": match_stage},
        {"$sort": {"quality": -1, "total": -1}},
    ]

    # The label previously read "getSampleUsefulProxy" (copy-paste error),
    # which attributed this pipeline to the wrong method in the logs.
    log.debug("getQualityUsefulProxy, operation_list:{operation_list}, ".format(
        operation_list=str(operation_list)))

    return self.mc.aggregate(operation_list)
def getAllValidUsefulProxy(self, **kwargs):
    """Return every proxy document whose ``total`` counter is non-zero.

    :param kwargs: optional ``https``, ``type`` and ``region`` filters; a
        filter is applied only when its value is truthy.
    :return: whatever ``self.mc.aggregate`` returns for the single
        ``$match`` stage.
    """
    criteria = {"total": {"$ne": 0}}

    https = kwargs.get("https", None)
    if https:
        criteria["https"] = {"$eq": https}

    type_ = kwargs.get("type", None)
    if type_:
        criteria["type"] = {"$eq": type_}

    region = kwargs.get("region", None)
    if region:
        criteria["region_list"] = parse_regin_to_mongo(region)

    operation_list = [{"$match": criteria}]
    log.debug("getAllValidUsefulProxy, operation_list:{operation_list}, ".format(
        operation_list=str(operation_list)))
    return self.mc.aggregate(operation_list)
def start(self):
    """Verify everything in ``self.queue`` with a pool of greenlets.

    Resets ``self.stat``, spawns ``self.run`` in as many greenlets as the
    ``verify_useful_proxy_concurrency`` setting allows (never more than
    the current queue size), waits for all of them and logs a summary.
    """
    began = time.time()
    log.debug("useful_proxy proxy verify start")

    self.stat = {"total": 0, "succ": 0, "fail": 0}

    concurrency = ConfigManager.setting_config.setting.get(
        "verify_useful_proxy_concurrency")
    queue_size = self.queue.qsize()
    # No point in spawning more workers than there are queued items.
    spawn_num = queue_size if concurrency > queue_size else concurrency

    workers = [gevent.spawn(self.run) for _ in range(spawn_num)]
    gevent.joinall(workers)

    elapsed_time = int(time.time() - began)
    log.info(
        'useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
        .format(total=self.stat["total"],
                succ=self.stat["succ"],
                fail=self.stat["fail"],
                elapsed_time=elapsed_time))
def run(self):
    """Validate every proxy currently in ``self.queue``.

    Switches the db to the useful-proxy table, then repeatedly takes a
    proxy from the queue, validates it with ``validUsefulProxy`` and ticks
    the success/fail/total counters. Logs a per-thread summary when the
    queue reports empty.
    """
    self.db.changeTable(self.useful_proxy_queue)

    # current_thread() replaces the deprecated camelCase alias
    # currentThread().
    thread_id = threading.current_thread().ident
    log.info("thread_id:{thread_id} useful_proxy proxy check start".format(
        thread_id=thread_id))

    total = 0
    succ = 0
    fail = 0
    while self.queue.qsize():
        proxy = self.queue.get()

        # Only the http result decides pass/fail here.
        (http_result, _) = validUsefulProxy(proxy)
        if http_result:
            self.tickUsefulProxyVaildSucc(proxy)
            succ += 1
            log.debug(
                "ProxyCheck: {proxy} validation pass".format(proxy=proxy))
        else:
            self.tickUsefulProxyVaildFail(proxy)
            fail += 1
            log.debug(
                "ProxyCheck: {proxy} validation fail".format(proxy=proxy))

        self.queue.task_done()
        total += 1
        self.tickUsefulProxyVaildTotal(proxy)

    log.info(
        'thread_id:{thread_id} proxy check end, total:{total}, succ:{succ}, fail:{fail}'
        .format(thread_id=thread_id, total=total, succ=succ, fail=fail))
def getSampleProxy(self, **kwargs):
    """Return the ``"proxy"`` field of one sampled useful-proxy record.

    :param kwargs: forwarded unchanged to ``self.db.getSampleUsefulProxy``.
    :return: ``item["proxy"]`` when the db returned a truthy item,
        otherwise ``None``.
    """
    item = self.db.getSampleUsefulProxy(**kwargs)
    address = item["proxy"] if item else None
    log.debug("getSampleUsefulProxy, item:{item}".format(item=str(item)))
    return address
def getProxyInfo(self, proxy):
    """Probe a proxy and describe it as a dict.

    :param proxy: proxy address in ``"ip:port"`` form.
    :return: dict with keys ``ip``, ``port``, ``address``, ``https`` and
        ``type``. ``type`` is ``CLEAR`` when httpbin reports more than one
        origin address and ``ANONYMOUS`` when it reports one; it stays
        ``UNKNOWN`` when the http probe fails. ``https`` is ``ENABLE``
        when the https probe succeeds, ``DISABLE`` when it raises, and
        stays ``UNKNOWN`` otherwise (including when the http probe failed
        and https was never tried).
    """
    info = {}

    parts = proxy.split(':')
    info["ip"] = parts[0]
    info["port"] = parts[1]
    info["address"] = proxy

    proxies = {
        "http": proxy,
        "https": proxy,
    }
    http_url = "http://httpbin.org/ip"
    https_url = "https://httpbin.org/ip"

    result = False
    info["https"] = ProxyManager.PROXY_HTTPS["UNKNOWN"]
    info["type"] = ProxyManager.PROXY_TYPE["UNKNOWN"]

    # http verify
    try:
        r = requests.get(http_url, proxies=proxies, timeout=10, verify=False)
        data = r.json()
        # A missing "origin" key raises here and is handled as a failure.
        ip_string = data["origin"]
        ip_list = ip_string.split(", ")

        status_result = r.status_code == 200
        content_result = "origin" in data
        if status_result and content_result:
            result = True

            # Several origin addresses mean the client address was passed
            # along next to the proxy's own.
            if len(ip_list) > 1:
                info["type"] = ProxyManager.PROXY_TYPE["CLEAR"]
            else:
                info["type"] = ProxyManager.PROXY_TYPE["ANONYMOUS"]
    except Exception as e:
        log.debug("proxy:[{proxy}] http verify fail, error:{error}".format(proxy=proxy, error=e))
        result = False

    if result:
        # https verify
        try:
            r = requests.get(https_url, proxies=proxies, timeout=10, verify=False)
            # Parse the HTTPS response itself. The content check used to
            # look at the body left over from the HTTP probe above, so it
            # was always true and never validated the HTTPS reply.
            https_data = r.json()

            status_result = r.status_code == 200
            content_result = "origin" in https_data
            if status_result and content_result:
                info["https"] = ProxyManager.PROXY_HTTPS["ENABLE"]
        except Exception as e:
            log.debug("proxy [{proxy}] https verify fail, error:{error}".format(proxy=proxy, error=e))
            info["https"] = ProxyManager.PROXY_HTTPS["DISABLE"]

    return info
def check_fetch_new_proxy(self):
    """Decide whether new raw proxies should be fetched.

    :return: ``True`` when the stored raw-proxy count is below the
        ``hold_raw_proxy_number`` setting, or when that setting is ``-1``;
        ``False`` otherwise.
    """
    total_number = ProxyManager.proxy_manager.getRawProxyNumber()
    hold_number = ConfigManager.setting_config.setting.get("hold_raw_proxy_number")

    if total_number < hold_number or hold_number == -1:
        should_fetch = True
        template = "fetch new proxy start, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}"
    else:
        should_fetch = False
        template = "fetch new proxy skip, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}"

    log.debug(template.format(total_number=total_number, hold_number=hold_number))
    return should_fetch
def getSampleUsefulProxy(self, **kwargs):
    """Return the address of one sampled useful proxy.

    When a proxy was found and a truthy ``token`` keyword was supplied,
    the proxy/token pair is passed to ``self.db.addProxyUsedToken``.

    :param kwargs: forwarded unchanged to ``self.db.getSampleUsefulProxy``;
        ``token`` is additionally read here.
    :return: ``item["proxy"]`` when the db returned a truthy item,
        otherwise ``None``.
    """
    item = self.db.getSampleUsefulProxy(**kwargs)

    address = None
    if item:
        address = item["proxy"]
        used_token = kwargs.get("token", None)
        if used_token:
            self.db.addProxyUsedToken(address, used_token)

    log.debug("getSampleUsefulProxy, item:{item}".format(item=str(item)))
    return address
def get(self):
    """Return one sampled useful proxy wrapped as ``{"data": ...}``.

    Reads the ``https``, ``type`` and ``region`` request arguments from
    ``self.args`` and forwards them to
    ``proxy_manager.getSampleUsefulProxy``.

    :return: ``{"data": item}`` with the ``"_id"`` key removed when an
        item was found, otherwise ``{"data": {}}``.
    """
    result = {"data": {}}

    options = {
        "https": self.args.get('https'),
        "type": self.args.get('type'),
        "region": self.args.get('region'),
    }
    log.debug("receive params: {}".format(options))

    item = proxy_manager.getSampleUsefulProxy(**options)
    if item:
        # pop() with a default tolerates records that carry no "_id",
        # where `del` would raise KeyError.
        item.pop("_id", None)
        result["data"] = item

    return result
def get(self):
    """Pick a random key from the useful-proxy table.

    :return: a randomly chosen key of ``self.db.getAll()``, or ``None``
        when that call returns nothing usable.
    """
    self.db.changeTable(self.useful_proxy_queue)
    item_dict = self.db.getAll()

    candidates = []
    if item_dict:
        # On Python 3 the keys view must be materialised before choosing.
        candidates = list(item_dict.keys()) if EnvUtil.PY3 else item_dict.keys()

    chosen = random.choice(candidates) if candidates else None

    log.debug('Get Random Proxy {item} of {total}'.format(
        item=chosen, total=len(candidates)))
    return chosen
def validProxy(self):
    """Validate every raw proxy currently in ``self.queue``.

    A proxy not yet in ``self.remaining_proxies`` is validated with
    ``validUsefulProxy``. On success it is saved as useful, deleted from
    the raw store and remembered in ``self.remaining_proxies``; on failure
    its raw fail counter is ticked. A proxy already in
    ``self.remaining_proxies`` is deleted from the raw store as a
    duplicate. Logs a per-thread summary when the queue reports empty.
    """
    # current_thread() replaces the deprecated camelCase alias
    # currentThread().
    thread_id = threading.current_thread().ident
    log.info("thread_id:{thread_id}, Start ValidProxy `raw_proxy_queue`".format(thread_id=thread_id))

    total = 0
    succ = 0
    fail = 0
    while self.queue.qsize():
        proxy = self.queue.get()

        if proxy not in self.remaining_proxies:
            (http_result, https_result) = validUsefulProxy(proxy)
            if http_result:
                self.saveUsefulProxy(proxy, https_result)
                self.deleteRawProxy(proxy)
                self.remaining_proxies.append(proxy)
                succ += 1
                # The "validation pass" message used to sit after the
                # failure branch, so failed proxies were reported as
                # passing; it now belongs to the success path only.
                log.debug('ProxyRefreshSchedule: %s validation pass' % proxy)
            else:
                self.tickRawProxyVaildFail(proxy)
                fail += 1
                log.debug('ProxyRefreshSchedule: %s validation fail' % proxy)
        else:
            self.deleteRawProxy(proxy)
            log.debug('ProxyRefreshSchedule: %s repetition, skip!' % proxy)

        self.queue.task_done()
        self.tickRawProxyVaildTotal(proxy)
        total += 1

    log.info('thread_id:{thread_id}, ValidProxy Complete `raw_proxy_queue`, total:{total}, succ:{succ}, fail:{fail}'.format(thread_id=thread_id, total=total, succ=succ, fail=fail))
def verify(self):
    """Verify one queued raw-proxy record and record the outcome.

    A proxy already present in ``self.useful_proxies`` is deleted from the
    raw store and counted as skipped. Otherwise it is checked with the
    custom verifier when the ``custom_verify_url`` setting is truthy (the
    default verifier otherwise): on success it is saved as useful and
    removed from the raw store, on failure its raw fail counter is ticked.
    """
    entry = self.queue.get()
    raw_proxy = entry.get("proxy")
    if isinstance(raw_proxy, bytes):
        raw_proxy = raw_proxy.decode('utf8')

    manager = ProxyManager.proxy_manager
    if raw_proxy in self.useful_proxies:
        # Already known as useful: drop the duplicate raw record.
        manager.deleteRawProxy(raw_proxy)
        self.stat["skip"] += 1
        log.debug("raw_proxy:{raw_proxy} verify repetition".format(
            raw_proxy=raw_proxy))
    else:
        if ConfigManager.setting_config.setting.get("custom_verify_url"):
            passed = self.customVerifyProxy(raw_proxy)
        else:
            passed = self.defaultVerifyProxy(raw_proxy)

        if passed:
            manager.saveUsefulProxy(raw_proxy)
            manager.deleteRawProxy(raw_proxy)
            self.useful_proxies[raw_proxy] = True
            self.stat["succ"] += 1
            log.debug("raw_proxy:{raw_proxy} verify succ".format(
                raw_proxy=raw_proxy))
        else:
            manager.tickRawProxyVaildFail(raw_proxy)
            self.stat["fail"] += 1
            log.debug("raw_proxy:{raw_proxy} verify fail".format(
                raw_proxy=raw_proxy))

    self.queue.task_done()
    self.stat["total"] += 1
def refresh(self):
    """Run every configured proxy getter and store the new raw proxies.

    Getter names come from the ``ProxyGetter`` config section and are
    looked up as attributes of ``GetFreeProxy``. A yielded proxy is saved
    when it is non-empty, passes ``verifyProxyFormat`` and is not already
    in the raw store; everything else is counted as a failure. An
    exception inside one getter is logged and the next getter still runs.
    """
    for getter_name in config.cf.options("ProxyGetter"):
        try:
            log.info(
                "Fetch Proxy Start, func:{func}".format(func=getter_name))

            total = succ = fail = 0
            getter = getattr(GetFreeProxy, getter_name.strip())
            for candidate in getter():
                candidate = candidate.strip()
                is_new = (candidate
                          and verifyProxyFormat(candidate)
                          and not self.checkRawProxyExists(candidate))
                if is_new:
                    self.saveRawProxy(candidate)
                    succ += 1
                    log.debug('{func}: fetch proxy {proxy}'.format(
                        func=getter_name, proxy=candidate))
                else:
                    fail += 1
                    log.error('{func}: fetch proxy {proxy} error'.format(
                        func=getter_name, proxy=candidate))
                total += 1

            log.info(
                "fetch proxy end, func:{func}, total:{total}, succ:{succ} fail:{fail}"
                .format(func=getter_name, total=total, succ=succ, fail=fail))
        except Exception as e:
            # One broken getter must not stop the remaining ones.
            log.error(
                "func_name:{func_name} fetch proxy fail, error:{error}".
                format(func_name=getter_name, error=e))
def fetch(self):
    """Run one queued fetcher and persist its statistics.

    Takes a fetcher record from ``self.queue``, instantiates the class
    registered under its ``"name"`` and iterates ``run()``. A yielded
    proxy is saved as useful when it is non-empty, passes
    ``verifyProxyFormat`` and is not already stored; anything else is
    counted as skipped. An exception while fetching is logged and counted
    as one failure. Afterwards the fetcher's counters are incremented and
    its ``next_fetch_time`` is set via ``updateFetcher``.
    """
    start_time = time.time()
    total = 0
    succ = 0
    fail = 0
    skip = 0

    fetcher = self.queue.get()
    name = fetcher["name"]

    fetcher_class = FetcherManager.getFetcherClass(name)
    log.debug("fetch [{name}] proxy start".format(name=name))
    try:
        f = fetcher_class()
        for proxy in f.run():
            proxy = proxy.strip()
            if proxy and verifyProxyFormat(proxy) and \
                    not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):

                ProxyManager.proxy_manager.saveUsefulProxy(proxy)
                succ += 1
                log.debug("fetch [{name}] proxy {proxy} succ".format(
                    name=name, proxy=proxy))
            else:
                skip += 1
                log.debug("fetch [{name}] proxy {proxy} skip".format(
                    name=name, proxy=proxy))

            total += 1
    except Exception as e:
        log.error("fetch [{name}] proxy fail: {error}".format(name=name, error=e))
        fail += 1

    self.queue.task_done()

    now = int(time.time())
    elapsed_time = int(now - start_time)

    # presumably fetcher["interval"] is in minutes -- TODO confirm
    next_fetch_time = self.start_time + (fetcher["interval"] * 60)

    data = {
        "$inc": {
            "succ": succ,
            "fail": fail,
            "skip": skip,
            "total": total,
        },
        "$set": {
            "next_fetch_time": next_fetch_time,
        }
    }

    ProxyManager.proxy_manager.updateFetcher(name, data)

    # Adjacent literals are used instead of a backslash continuation
    # inside the string, which embedded the next source line's
    # indentation in the logged message.
    log.info(
        ("fetch [{name:^15}] proxy finish, "
         "total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, "
         "elapsed_time:{elapsed_time}s").format(
            name=name, total=total, succ=succ, fail=fail, skip=skip,
            elapsed_time=elapsed_time))
def fetch(self):
    """Run one queued fetcher and report its statistics.

    Takes a fetcher name from ``self.queue``, instantiates the class
    registered for it and iterates ``run()``. A yielded proxy is saved as
    raw when it is non-empty, passes ``verifyProxyFormat`` and is in
    neither the raw nor the useful store; anything else is counted as
    skipped. An exception while fetching is logged and counted as one
    failure. The counters are then handed to
    ``ConfigManager.fetcher_config.update_stat``.
    """
    began = time.time()
    counters = {"total": 0, "succ": 0, "fail": 0, "skip": 0}

    fetcher_name = self.queue.get()
    fetcher_class = FetcherManager.get_class(fetcher_name)
    log.debug("fetch [{fetcher_name}] proxy start".format(
        fetcher_name=fetcher_name))

    try:
        for candidate in fetcher_class().run():
            candidate = candidate.strip()
            is_new = (candidate
                      and verifyProxyFormat(candidate)
                      and not proxy_manager.checkRawProxyExists(candidate)
                      and not proxy_manager.checkUsefulProxyExists(candidate))
            if is_new:
                proxy_manager.saveRawProxy(candidate)
                counters["succ"] += 1
                log.debug(
                    "fetch [{fetcher_name}] proxy {proxy} succ".format(
                        fetcher_name=fetcher_name, proxy=candidate))
            else:
                counters["skip"] += 1
                log.debug(
                    "fetch [{fetcher_name}] proxy {proxy} skip".format(
                        fetcher_name=fetcher_name, proxy=candidate))
            counters["total"] += 1
    except Exception as e:
        log.error("fetch [{fetcher_name}] proxy fail: {error}".format(
            fetcher_name=fetcher_name, error=e))
        counters["fail"] += 1

    elapsed_time = int(time.time() - began)

    self.queue.task_done()

    # Hand over a fresh dict with the same four counters.
    stat = dict(
        total=counters["total"],
        succ=counters["succ"],
        fail=counters["fail"],
        skip=counters["skip"],
    )
    ConfigManager.fetcher_config.update_stat(fetcher_name, stat)

    log.info(
        "fetch [{fetcher_name}] proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
        .format(fetcher_name=fetcher_name,
                total=counters["total"],
                succ=counters["succ"],
                fail=counters["fail"],
                skip=counters["skip"],
                elapsed_time=elapsed_time))