class ProxyManager(object):
    """Thin facade over the proxy store: collects raw proxies, serves useful ones."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.useful_proxy_queue = 'useful_proxy_queue'

    def refresh(self):
        """Run every configured getter and store its proxies in the raw table.

        :return: None
        """
        for getter_name in self.config.proxy_getter_functions:
            fetch = getattr(GetFreeProxy, getter_name.strip())
            # Strip each entry, drop blanks, and de-duplicate through the set.
            cleaned = set()
            for entry in fetch():
                trimmed = entry.strip()
                if trimmed:
                    cleaned.add(trimmed)
            self.db.changeTable(self.raw_proxy_queue)
            for item in cleaned:
                self.db.put(item)

    def get(self):
        """Return one proxy from the useful table.

        :return: whatever ``self.db.get()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        chosen = self.db.get()
        return chosen

    def delete(self, proxy):
        """Remove ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return everything stored in the useful table.

        :return: whatever ``self.db.getAll()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        everything = self.db.getAll()
        return everything

    def get_status(self):
        """Report ``db.get_status()`` for the raw table and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy_queue``
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_status = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        useful_status = self.db.get_status()
        return dict(raw_proxy=raw_status, useful_proxy_queue=useful_status)
class LoginCore:
    """Log in to a site through a worker object and persist the resulting cookies."""

    def __init__(self):
        self.db = DbClient()

    def cookie_login(self, driver, worker, account_dict):
        """Log in to the site and return the cookies.

        :param driver: browser driver handed to the worker
        :param worker: object providing ``login(driver, account_dict)``
        :param account_dict: account information passed through to the worker
        :return: whatever ``worker.login`` returns
        """
        return worker.login(driver, account_dict)

    def check_cookies(self, driver, worker, cookies):
        """Ask the worker whether ``cookies`` are valid and return its answer."""
        return worker.check_cookies(driver, cookies)

    def save_cookies(self, cookies, account_dict, cookie_queue):
        """Save the cookies into the cookie pool, keyed by ``account_dict['account']``.

        :param cookies: cookies to store
        :param account_dict: mapping that must contain the key ``'account'``
        :param cookie_queue: accepted for interface compatibility; not used here
        :raises Exception: any error raised by ``self.db.put`` propagates unchanged
        """
        # The former ``try/except Exception as e: raise e`` wrapper was a no-op.
        self.db.put(account_dict['account'], data=cookies)

    def run(self, driver, worker, account_dict):
        """Log in, validate the cookies, save them when valid, and close the driver.

        The driver is closed in a ``finally`` block so that it is released even
        when login, validation or saving raises.
        """
        try:
            cookies = self.cookie_login(driver, worker, account_dict)
            time.sleep(2)
            if self.check_cookies(driver, worker, cookies):
                self.save_cookies(cookies, account_dict, worker.cookie_queue)
        finally:
            driver.quit()
class ProxyManager(object):
    """Collect raw proxies from the configured getters and serve useful ones."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.useful_proxy_queue = 'useful_proxy_queue'

    def refresh(self):
        """Fetch proxies with every configured getter and store them in the raw table.

        The previous version wrapped the fetch loop in ``if proxy_set:`` right
        after creating an empty set, so the getters were never called and
        nothing was ever stored.

        :return: None
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            # fetch raw proxy; the set removes duplicates from a single getter
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                if proxy:
                    proxy_set.add(proxy)
            # store raw proxy
            self.db.changeTable(self.raw_proxy_queue)
            for proxy in proxy_set:
                self.db.put(proxy)

    def get(self):
        """Pop one proxy from the useful table.

        :return: whatever ``self.db.pop()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.pop()

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return everything stored in the useful table.

        :return: whatever ``self.db.getAll()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()
class AsdlProxyManager():
    """Add, delete and list entries of the ``asdl_proxy`` table."""

    def __init__(self):
        self.asdl_proxy_queue = "asdl_proxy"
        self.db = DbClient()

    def add_asdl_proxy(self, proxy_str):
        """Wrap ``proxy_str`` in a Proxy stamped with the current time and store it."""
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        entry = Proxy(proxy_str, last_time=stamp)
        self.db.changeTable(self.asdl_proxy_queue)
        self.db.put(entry)

    def delete_asdl_proxy(self, proxy_str):
        """Delete ``proxy_str`` from the table."""
        self.db.changeTable(self.asdl_proxy_queue)
        self.db.delete(proxy_str)

    def get_all_proxy(self):
        """Return every stored item converted with ``Proxy.newProxyFromJson``."""
        self.db.changeTable(self.asdl_proxy_queue)
        converted = []
        for raw_item in self.db.getAll():
            converted.append(Proxy.newProxyFromJson(raw_item))
        return converted
class PayProxyTest(object):
    """Fetch a proxy from a paid proxy API and store proxies in ``useful_proxy``."""

    def __init__(self):
        self.db = DbClient()
        self.db.changeTable('useful_proxy')

    def GetProxy(self, timeout=10):
        """Request one HTTP proxy from the API, print it with timing, and return it.

        :param timeout: seconds to wait for the API before giving up; the
            request previously had no timeout and could block forever
        :return: ``"ip:port"`` on success, ``None`` when the request or the
            parsing fails (the error is printed, as before)
        """
        # NOTE(review): the API token is hard-coded in this URL; it should be
        # moved to configuration and rotated if this file is shared.
        url = 'https://api.2808proxy.com/proxy/unify/get?token=Y3AEO9WES4U3WKQAJXZO8DYM7LAZFOQN&amount=1&proxy_type=http&format=json&splitter=rn&expire=300'
        try:
            time1 = time.time()
            resp = requests.get(url, timeout=timeout)
            payload = resp.json()  # parse the body once instead of three times
            first = payload.get('data')[0]
            proxy = '%s:%s' % (first.get('ip'), first.get('http_port'))
            print(proxy)
            time2 = time.time()
            print(payload)
            print('总耗时:', time2 - time1)
            return proxy
        except Exception as e:
            # Best-effort probe: report the problem instead of raising.
            print(e)
            return None

    def InsertProxy(self, proxy):
        """Store ``proxy`` in the table selected in ``__init__``."""
        self.db.put(proxy)
class ProxyManager(object):
    """Fetch proxies with the configured getters and queue new ones as raw."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Fetch proxies and store those not already in the useful table as raw.

        A failing getter is logged and skipped so the remaining getters still
        run. The handler uses ``except ... as`` so the module also parses on
        Python 3 (the old ``except Exception, e`` / ``print e`` form does not).

        :return: None
        """
        for proxyGetter in self.config.proxy_getter_functions:
            try:
                proxy_set = set()
                # fetch raw proxy
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    if not proxy:
                        continue
                    proxy = proxy.strip()
                    if not proxy:
                        # whitespace-only entries used to be stored as ''
                        continue
                    self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
                    proxy_set.add(proxy)
                # store raw proxy
                for proxy in proxy_set:
                    # Skip proxies that are already known to be useful.
                    self.db.changeTable(self.useful_proxy_queue)
                    if self.db.exists(proxy):
                        continue
                    self.db.changeTable(self.raw_proxy_queue)
                    self.db.put(proxy)
            except Exception as e:
                self.log.error('{func}: fetch proxy fail: {error}'.format(func=proxyGetter, error=e))
                continue
class ProxyManager(object):
    """Manage the raw, useful and ADSL proxy tables behind ``DbClient``."""

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'
        self.adsl_queue = 'adsl'

    def refresh(self):
        """Fetch proxies via ProxyGetter/getFreeProxy.py into the raw table.

        A failing getter is logged (now including the error text) and skipped.

        :return: None
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in config.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # Store the proxy directly; per the original author the
                    # hash structure de-duplicates, so no de-dup is done here.
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                self.log.error(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                # Record why the getter failed instead of dropping the error.
                self.log.error(str(e))
                continue

    def get(self):
        """Return a random key of the useful table, or None when it is empty.

        :return: proxy key or None
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            # list(...) works for both the Python 2 list and the Python 3 view
            return random.choice(list(item_dict.keys()))
        return None

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return the keys of the useful table as a list.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else []

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }

    def initProxyPool(self):
        """Call this on first start: empty the useful table, list the ADSL values.

        :return: list of the values stored in the ``adsl`` table
        """
        self.deleteAll()
        self.db.changeTable(self.adsl_queue)
        item_dict = self.db.getAll()
        return list(item_dict.values()) if item_dict else []

    def deleteAll(self):
        """Empty the useful table by deleting every proxy it contains.

        :return: None
        """
        for proxy in self.getAll():
            self.delete(proxy)

    def refreshADSL(self, proxy):
        """Ask the host of ``proxy`` to re-dial through its refresh endpoint.

        :param proxy: ``"ip:port"`` as text or UTF-8 bytes
        :return: None; request errors are printed, not raised
        """
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf8')
        ip = proxy.split(':')[0]
        try:
            # call the re-dial endpoint on the proxy host
            refreshApi = "http://{ip}:8000/refresh".format(ip=ip)
            r = requests.get(refreshApi, timeout=5, verify=False)
            if r.status_code == 200:
                # The message used to be printed as a literal '{proxy} ...'
                # because .format() was never applied.
                print('{proxy} refresh done'.format(proxy=proxy))
        except Exception as e:
            print(str(e))
class ProxyManager(object):
    """Facade over the proxy store with logging of every fetched proxy."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Run every configured getter and store its proxies in the raw table.

        :return: None
        """
        for getter_name in self.config.proxy_getter_functions:
            fetch = getattr(GetFreeProxy, getter_name.strip())
            collected = set()
            for candidate in fetch():
                if not candidate:
                    continue
                message = '{func}: fetch proxy {proxy}'.format(
                    func=getter_name, proxy=candidate)
                self.log.info(message)
                collected.add(candidate.strip())
            # persist this getter's de-duplicated results
            self.db.changeTable(self.raw_proxy_queue)
            for item in collected:
                self.db.put(item)

    def get(self):
        """Return one proxy from the useful table.

        :return: whatever ``self.db.get()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        picked = self.db.get()
        return picked

    def delete(self, proxy):
        """Remove ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return everything stored in the useful table.

        :return: whatever ``self.db.getAll()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        stored = self.db.getAll()
        return stored

    def get_status(self):
        """Report ``db.get_status()`` for the raw and the useful table.

        The original carries a TODO to rename this method to get_count.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.get_status()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.get_status()
        return dict(raw_proxy=raw_total, useful_proxy=useful_total)
class ProxyManager(object):
    """Proxy manager whose getter class is chosen by ``config.proxy_getter_lib``."""

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    @staticmethod
    def __dynamic_import__(name):
        """Resolve a dotted path such as ``pkg.module.Class`` to the object it names.

        ``__import__(top)`` alone only loads the top-level package, so walking
        the remaining components with ``getattr`` failed for any submodule that
        had not been imported elsewhere. Each missing attribute is now retried
        as a submodule import.

        :param name: dotted path to a module, class or other attribute
        :return: the resolved object
        :raises ImportError: when a component is neither an attribute nor a module
        """
        import importlib

        components = name.split('.')
        path = components[0]
        mod = importlib.import_module(path)
        for comp in components[1:]:
            path = path + '.' + comp
            try:
                mod = getattr(mod, comp)
            except AttributeError:
                # Not loaded yet: import it as a submodule of what we have.
                mod = importlib.import_module(path)
        return mod

    def refresh(self):
        """Fetch proxies with the user-defined getter class into the raw table.

        :return: None
        :raises Exception: when ``config.proxy_getter_lib`` cannot be resolved
        """
        self.db.changeTable(self.raw_proxy_queue)
        try:
            proxy_getter_class = self.__dynamic_import__(
                config.proxy_getter_lib)
        except Exception as e:
            # Keep the original exception type and message for callers, but
            # log the underlying cause first so it is not lost.
            self.log.error(str(e))
            raise Exception('%s not found in ProxyGetter' %
                            config.proxy_getter_lib)
        for proxyGetter in config.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(proxy_getter_class, proxyGetter.strip())():
                    # Store the proxy directly; per the original author the
                    # hash structure de-duplicates, so no de-dup is done here.
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                self.log.error(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                self.log.error(str(e))
                continue

    def get(self):
        """Return a random key of the useful table, or None when it is empty.

        :return: proxy key or None
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            # list(...) works for both the Python 2 list and the Python 3 view
            return random.choice(list(item_dict.keys()))
        return None

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return the keys of the useful table as a list.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else []

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class ProxyManager(object):
    """Fetch proxies into the raw table and serve Proxy objects from the useful one."""

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def fetch(self):
        """Fetch proxies with every configured getter into the raw table.

        Entries that are blank, fail ``verifyProxyFormat`` or were already seen
        during this call are logged and skipped. A failing getter is logged and
        the next getter runs.

        :return: None
        """
        self.db.changeTable(self.raw_proxy_queue)
        proxy_set = set()  # proxies already stored during this call
        self.log.info("ProxyFetch : start")
        for proxyGetter in config.proxy_getter_functions:
            self.log.info(
                "ProxyFetch - {func}: start".format(func=proxyGetter))
            try:
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    proxy = proxy.strip()
                    if not proxy or not verifyProxyFormat(proxy):
                        self.log.error('ProxyFetch - {func}: '
                                       '{proxy} illegal'.format(
                                           func=proxyGetter,
                                           proxy=proxy.ljust(20)))
                        continue
                    if proxy in proxy_set:
                        self.log.info('ProxyFetch - {func}: '
                                      '{proxy} exist'.format(
                                          func=proxyGetter,
                                          proxy=proxy.ljust(20)))
                        continue
                    self.log.info('ProxyFetch - {func}: '
                                  '{proxy} success'.format(
                                      func=proxyGetter,
                                      proxy=proxy.ljust(20)))
                    self.db.put(Proxy(proxy, source=proxyGetter))
                    proxy_set.add(proxy)
            except Exception as e:
                self.log.error(
                    "ProxyFetch - {func}: error".format(func=proxyGetter))
                self.log.error(str(e))

    def get(self):
        """Return a random proxy from the useful table.

        :return: Proxy built by ``Proxy.newProxyFromJson``, or None when empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_list = self.db.getAll()
        if item_list:
            random_choice = random.choice(item_list)
            return Proxy.newProxyFromJson(random_choice)
        return None

    def _random_by_scheme(self, scheme):
        """Return a random useful proxy whose ``proxy`` field starts with ``scheme://``."""
        self.db.changeTable(self.useful_proxy_queue)
        item_list = self.db.getAll()
        if not item_list:
            return None
        matching = [
            item for item in item_list
            if json.loads(item)['proxy'].split("://")[0] == scheme
        ]
        if matching:
            return Proxy.newProxyFromJson(random.choice(matching))
        return None

    def get_http(self):
        """Return a random http proxy.

        The earlier loop drew random items with replacement, so it could miss
        an existing http proxy and return None; candidates are now filtered
        first and one of them is chosen.

        :return: Proxy or None when no http proxy is stored
        """
        return self._random_by_scheme('http')

    def get_socks(self):
        """Return a random socks proxy.

        Only the ``socks4`` scheme is matched, exactly as before.

        :return: Proxy or None when no socks4 proxy is stored
        """
        return self._random_by_scheme('socks4')

    def delete(self, proxy_str):
        """Delete ``proxy_str`` from the useful table.

        :param proxy_str: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy_str)

    def getAll(self):
        """Return every item of the useful table as a Proxy.

        :return: list built with ``Proxy.newProxyFromJson``
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_list = self.db.getAll()
        return [Proxy.newProxyFromJson(_) for _ in item_list]

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class ProxyManager(object):
    """Fetch proxies, queue unseen ones as raw, and serve keys of the useful table."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Run every configured getter and queue proxies not yet marked useful.

        :return: None
        """
        for getter_name in self.config.proxy_getter_functions:
            fetch = getattr(GetFreeProxy, getter_name.strip())
            collected = set()
            for candidate in fetch():
                if not candidate:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=getter_name, proxy=candidate))
                collected.add(candidate.strip())
            for item in collected:
                # A proxy already present in the useful table is not re-queued.
                self.db.changeTable(self.useful_proxy_queue)
                already_useful = self.db.exists(item)
                if not already_useful:
                    self.db.changeTable(self.raw_proxy_queue)
                    self.db.put(item)

    def get(self):
        """Return a random key of the useful table, or None when it is empty.

        :return: proxy key or None
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        candidates = item_dict.keys()
        if EnvUtil.PY3:
            # random.choice needs a sequence, not a keys view
            candidates = list(candidates)
        return random.choice(candidates)

    def delete(self, proxy):
        """Remove ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return the keys of the useful table as a list.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return list()
        keys = item_dict.keys()
        return list(keys) if EnvUtil.PY3 else keys

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.getNumber()
        return dict(raw_proxy=raw_total, useful_proxy=useful_total)
class ProxyManager(object):
    """Proxy manager bound to a ``mode`` that is passed on to ``DbClient``."""

    def __init__(self, mode):
        self.mode = mode
        self.db = DbClient(mode)
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Fetch proxies via ProxyGetter/getFreeProxy.py into the raw table.

        A failing getter is logged (now including the error text) and skipped.

        :return: None
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in config.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # Store the proxy directly; per the original author the
                    # hash structure de-duplicates, so no de-dup is done here.
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info(
                            'Mode:{mode} {func}: fetch proxy {proxy}'.format(
                                mode=self.mode, func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            'Mode:{mode} {func}: fetch proxy {proxy} error'.
                            format(mode=self.mode, func=proxyGetter,
                                   proxy=proxy))
            except Exception as e:
                self.log.error("Mode:{mode} {func}: fetch proxy fail".format(
                    mode=self.mode, func=proxyGetter))
                # Record why the getter failed instead of dropping the error.
                self.log.error(str(e))
                continue

    def _random_useful_proxy(self):
        """Return a random key of the useful table, or None when it is empty."""
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            # list(...) works for both the Python 2 list and the Python 3 view
            return random.choice(list(item_dict.keys()))
        return None

    def get_http(self):
        """Return a useful proxy (http).

        :return: proxy key or None when the useful table is empty
        """
        return self._random_useful_proxy()

    def get_https(self):
        """Return a useful proxy (https).

        Shares its implementation with ``get_http``; the two bodies were
        identical copies.

        :return: proxy key or None when the useful table is empty
        """
        return self._random_useful_proxy()

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return the keys of the useful table as a list.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else []

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class ProxyManager(object):
    """Proxy manager that stores proxy dicts annotated with a country lookup."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Run every configured getter and queue proxies not yet marked useful.

        Each fetched proxy is indexed with ``proxy['ip']`` and gets a
        ``'country'`` entry before it is stored in the raw table.

        :return: None
        """
        for getter_name in self.config.proxy_getter_functions:
            fetch = getattr(GetFreeProxy, getter_name.strip())
            for proxy in fetch():
                if not proxy:
                    continue
                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=getter_name, proxy=proxy['ip']))
                # Proxies whose ip is already in the useful table are skipped.
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(proxy['ip']):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                proxy['country'] = self.get_ip_country(proxy['ip'])
                self.db.put(proxy)

    def get_ip_country(self, ip):
        """Return ``geolite2.lookup(ip).country``, or None when the lookup is falsy."""
        found = geolite2.lookup(ip)
        if found:
            return found.country
        return None

    def get(self, filters):
        """Return one proxy from the useful table chosen by ``db.random_one``.

        :param filters: passed through to ``self.db.random_one``
        :return: whatever ``self.db.random_one(filters)`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        picked = self.db.random_one(filters)
        return picked

    def delete(self, proxy):
        """Remove ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return everything stored in the useful table.

        :return: whatever ``self.db.getAll()`` yields
        """
        self.db.changeTable(self.useful_proxy_queue)
        stored = self.db.getAll()
        return stored

    def clean(self):
        """Delegate to ``self.db.clean()``."""
        self.db.clean()

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.getNumber()
        return dict(raw_proxy=raw_total, useful_proxy=useful_total)
class ProxyManager(object):
    """Proxy manager that also tracks per-consumer failure lists."""

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def fetch(self):
        """Fetch proxies with every configured getter into the raw table.

        Entries that are blank, fail ``verifyProxyFormat`` or were already seen
        during this call are logged and skipped. A failing getter is logged and
        the next getter runs.

        :return: None
        """
        self.db.changeTable(self.raw_proxy_queue)
        proxy_set = set()  # proxies already stored during this call
        self.log.info("ProxyFetch : start")
        for proxyGetter in config.proxy_getter_functions:
            self.log.info(
                "ProxyFetch - {func}: start".format(func=proxyGetter))
            try:
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    proxy = proxy.strip()
                    if not proxy or not verifyProxyFormat(proxy):
                        self.log.error('ProxyFetch - {func}: '
                                       '{proxy} illegal'.format(
                                           func=proxyGetter,
                                           proxy=proxy.ljust(20)))
                        continue
                    if proxy in proxy_set:
                        self.log.info('ProxyFetch - {func}: '
                                      '{proxy} exist'.format(
                                          func=proxyGetter,
                                          proxy=proxy.ljust(20)))
                        continue
                    self.log.info('ProxyFetch - {func}: '
                                  '{proxy} success'.format(
                                      func=proxyGetter,
                                      proxy=proxy.ljust(20)))
                    self.db.put(Proxy(proxy, source=proxyGetter))
                    proxy_set.add(proxy)
            except Exception as e:
                self.log.error(
                    "ProxyFetch - {func}: error".format(func=proxyGetter))
                self.log.error(str(e))

    def get(self):
        """Return a random proxy from the useful table.

        :return: Proxy built by ``Proxy.newProxyFromJson``, or None when empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_list = self.db.getAll()
        if item_list:
            random_choice = random.choice(item_list)
            return Proxy.newProxyFromJson(random_choice)
        return None

    def delete(self, proxy_str):
        """Delete ``proxy_str`` from the useful table.

        :param proxy_str: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy_str)

    def getAll(self):
        """Return every item of the useful table as a Proxy.

        :return: list built with ``Proxy.newProxyFromJson``
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_list = self.db.getAll()
        return [Proxy.newProxyFromJson(_) for _ in item_list]

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }

    def getAllByName(self, name):
        """Return the useful proxies that did not fail for ``name`` in the last 24 hours.

        Failures are read from the table ``useful_proxy_queue + '_fail_' + name``.
        The set of recently failed proxies is built in one pass, replacing the
        nested scan that compared every proxy against every failure record.

        :param name: consumer name used as the failure-table suffix
        :return: list of Proxy objects, in the order returned by ``getAll``
        """
        all_proxies = self.getAll()
        self.db.changeTable(self.useful_proxy_queue + '_fail_' + name)
        fail_list = self.db.getAll()
        current = set(proxy.proxy for proxy in all_proxies)
        cutoff = datetime.now() - timedelta(hours=24)
        recently_failed = set()
        for raw in fail_list:
            failed = Proxy.newProxyFromJson(raw)
            # As before, only records matching a current proxy have their
            # timestamp parsed.
            if failed.proxy not in current:
                continue
            failed_date = datetime.strptime(failed.last_time,
                                            "%Y-%m-%d %H:%M:%S")
            if failed_date > cutoff:
                recently_failed.add(failed.proxy)
        return [
            proxy for proxy in all_proxies
            if proxy.proxy not in recently_failed
        ]

    def deleteByName(self, name, proxy):
        """Record ``proxy`` as failed for ``name`` with the current timestamp.

        The proxy stays in the useful table; only the failure table for
        ``name`` receives an entry.

        :param name: consumer name used as the failure-table suffix
        :param proxy: proxy string that failed
        :return: None
        """
        failed_proxy = Proxy(
            proxy=proxy,
            last_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        self.db.changeTable(self.useful_proxy_queue + '_fail_' + name)
        self.db.put(failed_proxy)

    def getByName(self, name):
        """Return a random proxy that is usable for ``name``, or None.

        :param name: consumer name used as the failure-table suffix
        :return: Proxy or None when every proxy recently failed for ``name``
        """
        proxies = self.getAllByName(name)
        if proxies:
            return random.choice(proxies)
        return None
class ProxyManager(object):
    """Proxy manager working on the ``*_test`` tables; stores JSON metadata per proxy."""

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy_test'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy_test'

    def refresh(self):
        """Scrape proxies from the known sites and store them in the raw table.

        Fetches through ProxyGetter/getFreeProxy.py. Each valid proxy is stored
        together with a JSON document holding its host, port and ``max_conn``.

        :return: None
        """
        max_conn = 100
        self.db.changeTable(self.raw_proxy_queue)
        for getter_name in config.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=getter_name))
                fetch = getattr(GetFreeProxy, getter_name.strip())
                for raw_proxy in fetch():
                    # Store directly; per the original author the hash
                    # structure de-duplicates, so no de-dup is done here.
                    proxy = raw_proxy.strip()
                    if not (proxy and verifyProxyFormat(proxy)):
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=getter_name, proxy=proxy))
                        continue
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=getter_name, proxy=proxy))
                    host, port = proxy.split(":")
                    meta = {"host": host, "port": port, "max_conn": max_conn}
                    self.db.put(proxy, json.dumps(meta))
            except Exception as e:
                self.log.error(e)
                self.log.error(
                    "{func}: fetch proxy fail".format(func=getter_name))
                continue

    def get(self):
        """Return a random key of the useful table, or None when it is empty.

        :return: proxy key or None
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        candidates = item_dict.keys()
        if EnvUtil.PY3:
            # random.choice needs a sequence, not a keys view
            candidates = list(candidates)
        return random.choice(candidates)

    def delete(self, proxy):
        """Remove ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return the keys of the useful table as a list.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return list()
        keys = item_dict.keys()
        return list(keys) if EnvUtil.PY3 else keys

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_total = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_total = self.db.getNumber()
        return dict(raw_proxy=raw_total, useful_proxy=useful_total)


def add_proxy(proxy, meta):
    """Add a proxy node to proxy-center (which also updates redis) and print the reply.

    :param proxy: ``"host:port"`` string
    :param meta: JSON-serialisable metadata sent as the request body
    """
    host, port = proxy.split(":")
    endpoint = f'http://10.143.55.90:9381/api/proxies/{host}%3A{port}/'
    body = json.dumps(meta)
    response = requests.post(endpoint, data=body)
    print(response.text)


def delete_proxy(proxy):
    """Delete a proxy node from proxy-center (which also updates redis).

    :param proxy: ``"host:port"`` string
    """
    host, port = proxy.split(":")
    endpoint = f'http://10.143.55.90:9381/api/proxies/{host}%3A{port}/'
    requests.delete(endpoint)
class ProxyManager(object):
    """Collects proxies from the configured getters and hands out useful ones."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Fetch with each configured getter and queue proxies not yet marked useful.

        :return: None
        """
        for source in self.config.proxy_getter_functions:
            getter = getattr(GetFreeProxy, source.strip())
            found = set()
            for entry in getter():
                if entry:
                    self.log.info('{func}: fetch proxy {proxy}'.format(func=source, proxy=entry))
                    found.add(entry.strip())
            self._store_unseen(found)

    def _store_unseen(self, proxies):
        """Put every proxy that is absent from the useful table into the raw table."""
        for candidate in proxies:
            self.db.changeTable(self.useful_proxy_queue)
            if self.db.exists(candidate):
                continue
            self.db.changeTable(self.raw_proxy_queue)
            self.db.put(candidate)

    def get(self):
        """Pick a random key from the useful table.

        :return: proxy key, or None when the table is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        pool = self.db.getAll()
        if pool:
            # On Python 3 the keys view must become a list before choosing.
            choices = list(pool.keys()) if EnvUtil.PY3 else pool.keys()
            return random.choice(choices)
        return None

    def delete(self, proxy):
        """Drop ``proxy`` from the useful table.

        :param proxy: proxy to delete
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """List the keys of the useful table.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        pool = self.db.getAll()
        if EnvUtil.PY3:
            if pool:
                return list(pool.keys())
            return list()
        if pool:
            return pool.keys()
        return list()

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_count = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        useful_count = self.db.getNumber()
        return {'raw_proxy': raw_count, 'useful_proxy': useful_count}
class ProxyManager(object):
    """Proxy manager exposing both the older accessors and ``*_new`` variants.

    NOTE(review): the methods below disagree about the shape of the db API
    (``getAll`` result used as a dict in ``get`` but as a sequence in
    ``get_new``; ``getNumber`` called with and without arguments) — confirm
    against the DbClient this fork ships with.
    """

    def __init__(self):
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        # NOTE(review): spelled ``Loghandler`` here; confirm this is the
        # intended class name in this project.
        self.log = Loghandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Fetch proxies with every configured getter into the raw table.

        Values that are exactly ``False`` are ignored silently; other values
        are stored when truthy and accepted by ``verifyProxyFormat``, otherwise
        an error is logged. A failing getter is logged and skipped.

        NOTE(review): the ``else`` is attached to the inner ``if`` here; the
        flattened source does not show which ``if`` it belonged to — confirm.
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in config.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    if proxy is not False:
                        if proxy and verifyProxyFormat(proxy):
                            self.log.info("{func}: fetch proxy {proxy}".format(
                                func=proxyGetter, proxy=proxy))
                            self.db.put(proxy)
                        else:
                            self.log.error(
                                "{func}: fetch proxy {proxy} error".format(
                                    func=proxyGetter, proxy=proxy))
            except Exception as s:
                # Log the error text first, then which getter failed.
                self.log.error("refresh: {}".format(s))
                self.log.error(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                continue

    def get(self):
        """Return a random key of the useful table, or None when it is empty.

        Treats the result of ``self.db.getAll()`` as a dict.
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            if EnvUtil.PY3:
                # random.choice needs a sequence, not a keys view
                return random.choice(list(item_dict.keys()))
            else:
                return random.choice(item_dict.keys())
        return None

    def get_new(self):
        """Return a random element of ``self.db.getAll()`` for the useful table.

        Returns None implicitly when the result is empty.

        NOTE(review): ``random.choice`` needs an indexable sequence, so this
        presumably expects ``getAll`` to return a list — unlike ``get`` above;
        verify against the db client.
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            return random.choice(item_dict)

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table."""
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return ``self.db.getAll()`` for the useful table, unconverted."""
        self.db.changeTable(self.useful_proxy_queue)
        return self.db.getAll()
        # Earlier key-list conversion, left disabled by the author:
        # if EnvUtil.PY3:
        #     return list(item_dict.keys()) if item_dict else list()
        # return item_dict.keys() if item_dict else list()

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_proxy = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_proxy
        }

    def getNumber_new(self):
        """Fetch both counts with one ``db.getNumber`` call, print and return them.

        NOTE(review): this passes two table names to ``getNumber`` while
        ``getNumber`` above passes none — confirm the db client supports both.

        :return: tuple ``(raw, useful)``
        """
        # self.db.changeTable(self.raw_proxy_queue)
        raw, useful = self.db.getNumber(self.raw_proxy_queue,
                                        self.useful_proxy_queue)
        # self.db.changeTable(self.useful_proxy_queue)
        # total_useful_proxy = self.db.getNumber(self.useful_proxy_queue)
        print('{}---,{}'.format(raw, useful))
        return raw, useful
class ProxyManager(object):
    """Fetch proxies, queue unseen ones as raw, and serve keys of the useful table."""

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Fetch proxy addresses and store them in the DB.

        A getter that raises is logged (now including the error text) and
        skipped. Valid proxies that are not already in the useful table are
        stored in the raw table.

        :return: None
        """
        for proxyGetter in self.config.proxy_getter_functions:
            proxy_set = set()
            try:
                self.log.info("{func}:fetch proxy start".format(func=proxyGetter))
                # Materialise inside the try so a generator failing midway is
                # treated as a failed getter.
                proxy_iter = list(getattr(GetFreeProxy, proxyGetter.strip())())
            except Exception as e:
                self.log.error("{func}:fetch proxy fail".format(func=proxyGetter))
                self.log.error(str(e))
                continue
            for proxy in proxy_iter:
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info("{func}:fetch proxy {proxy}".format(func=proxyGetter, proxy=proxy))
                    proxy_set.add(proxy)
                else:
                    self.log.info("{func}:fetch proxy {proxy} error".format(func=proxyGetter, proxy=proxy))
            # store into the DB
            for proxy in proxy_set:
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(proxy):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(proxy)

    def get(self):
        """Return a useful proxy.

        :return: random key of the useful table, or None when it is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            # list(...) works for both the Python 2 list and the Python 3 view
            return random.choice(list(item_dict.keys()))
        return None

    def delete(self, proxy):
        """Delete ``proxy`` from the useful table."""
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return the keys of the useful table as a list.

        The non-PY3 branch used to call ``items.key()``, which does not exist
        on dicts and raised AttributeError; a single ``list(items.keys())``
        now serves both interpreters.

        :return: list of proxy keys (empty when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        items = self.db.getAll()
        return list(items.keys()) if items else []

    def getNumber(self):
        """Return ``db.getNumber()`` for the raw and the useful table.

        :return: dict with keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_proxy = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_proxy
        }
class ProxyManager(object):
    """
    ProxyManager

    Fetches raw proxies into the DB, hands out entries of the useful table
    and can look for a replacement proxy in a given city.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Each configured getter is drained completely first. Well-formed,
        de-duplicated addresses are then written to the raw table unless
        they already exist in the useful table. A getter that raises is
        logged and skipped.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            # fetch
            proxy_set = set()
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                proxy_iter = list(
                    getattr(GetFreeProxy, proxyGetter.strip())())
            except Exception as e:
                # Keep the cause in the log instead of discarding it.
                self.log.error("refresh: {!r}".format(e))
                self.log.error(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                continue
            for proxy in proxy_iter:
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=proxyGetter, proxy=proxy))
                    proxy_set.add(proxy)
                else:
                    self.log.error('{func}: fetch proxy {proxy} error'.format(
                        func=proxyGetter, proxy=proxy))
            # store
            for proxy in proxy_set:
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(proxy):
                    continue
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(proxy)

    def get(self):
        """
        return a useful proxy
        :return: a random key of the useful table, or None when it is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # list(...) makes the keys indexable on Python 2 and Python 3 alike.
        return random.choice(list(item_dict.keys()))

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else list()

    def validateProxy(self, proxy):
        """Return the result of ``validUsefulProxy(proxy)``."""
        return validUsefulProxy(proxy)

    def referProxy(self, proxy, city):
        """
        Return ``proxy`` if it still validates, else look for a replacement.

        When ``proxy`` is given but fails validation, ``city`` is replaced
        by element 1 of that proxy's location info. Up to 14 random pool
        entries are then tried; the first one that validates and whose
        location element 1 equals ``city`` is returned.
        :param proxy: current proxy, or None
        :param city: city to match when no usable ``proxy`` is supplied
        :return: a proxy, or None when no attempt succeeded
        """
        print(proxy, city)
        if proxy is not None:
            if self.validateProxy(proxy):
                return proxy
            old_proxy_loc = GetProxyLocInfo(proxy).get_proxy_loc_info()
            city = old_proxy_loc[1]
        # Same number of attempts as the former ``count = 1 .. 14`` loop.
        for _ in range(14):
            tmp_proxy = self.get()
            print(tmp_proxy)
            if tmp_proxy is None:
                # Empty pool: nothing to validate on this attempt.
                continue
            if not self.validateProxy(tmp_proxy):
                continue
            try:
                tmp_proxy_loc = GetProxyLocInfo(
                    tmp_proxy).get_proxy_loc_info()
                if tmp_proxy_loc[1] == city:
                    return tmp_proxy
            except Exception as e:
                # Best effort: a failed location lookup only costs this
                # attempt, but the cause is now recorded.
                self.log.error("referProxy: {!r}".format(e))
        return None

    def getNumber(self):
        """Return the entry counts of both tables as a dict."""
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class ProxyManager(object):
    """
    ProxyManager

    Fetches raw proxies into the DB and hands out entries of the useful
    table. Every method first selects its table with ``changeTable``.
    """

    def __init__(self):
        self.db = DbClient()
        self.config = GetConfig()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter

        Well-formed proxies are written to the raw table as each getter
        yields them. A getter that raises is logged and skipped; whatever
        it yielded before the failure has already been stored.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in self.config.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                # Keep the cause in the log instead of discarding it.
                self.log.error("refresh: {!r}".format(e))
                self.log.error(
                    "{func}: fetch proxy fail".format(func=proxyGetter))

    def get(self):
        """
        return a useful proxy
        :return: a random key of the useful table, or None when it is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        # list(...) makes the keys indexable on Python 2 and Python 3 alike.
        return random.choice(list(item_dict.keys()))

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else list()

    def getNumber(self):
        """Return the entry counts of both tables as a dict."""
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class MongoSsdbUrlManager(object):
    """URL queue stored in the ``spider`` Mongo database.

    ``self.ssdb_client`` keeps a per-URL counter keyed by the URL's MD5 hex
    digest; the same digest is the ``_id`` of the Mongo documents.
    """

    def __init__(self, host="localhost", client=None):
        """
        :param host: Mongo host used when no ``client`` is supplied
        :param client: ready-made Mongo client; when given, ``host`` is unused
        """
        self.config = GetConfig()
        if client is None:
            # The default host yields the former hard-coded
            # 'mongodb://localhost:27017/' URL.
            client = MongoClient('mongodb://{host}:27017/'.format(host=host))
        self.client = client
        self.ssdb_client = DbClient()
        self.db = self.client.spider
        # Compare by value; identity of int objects is an implementation
        # detail.
        if self.db.lazada.count() == 0:
            self.db.lazada.create_index([("status", ASCENDING),
                                         ("pr", DESCENDING)])

    @staticmethod
    def _url_digest(url):
        """Return the MD5 hex digest of ``url``; text is encoded as UTF-8."""
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()

    def enqueue_url(self, url, status, depth):
        """Register ``url`` and save it to ``lazada`` if it has no counter yet.

        If the counter store already has a value for the URL, that value is
        incremented and nothing is saved. Otherwise the URL is put into the
        counter store and a document is saved. Counter-store errors are
        printed and retried once; the document is saved even when both
        attempts fail.
        """
        md5 = self._url_digest(url)
        i = 0
        while i < 2:
            try:
                num = self.ssdb_client.get(md5)
                if num is not None:
                    self.ssdb_client.update(key=md5, value=int(num) + 1)
                    return
                self.ssdb_client.put(md5)
                i = 2
            except Exception as error:
                # Call form prints the same text on Python 2 and Python 3.
                print(error)
                i += 1
        self.db.lazada.save({
            '_id': md5,
            'url': url,
            'status': status,
            'queue_time': datetime.utcnow(),
            'depth': depth,
            'pr': 0
        })

    def dequeue_url(self, depth):
        """Return one ``lazada`` record with status 'downloading' at ``depth``.

        :return: the record, or None when nothing matches
        """
        # NOTE(review): an earlier find_one_and_update variant (sorted by
        # 'pr' descending) was disabled here. The active query selects
        # records already marked 'downloading' and does not modify them;
        # confirm this is intended.
        record = self.db.lazada.find_one(
            {'status': 'downloading', 'depth': depth}
        )
        return record if record else None

    def finish_url(self, url):
        """Mark the ``lazada`` record of ``url`` as done, with a timestamp."""
        record = {'status': 'done', 'done_time': datetime.utcnow()}
        self.db.lazada.update({'_id': self._url_digest(url)},
                              {'$set': record}, upsert=False)

    def clear(self):
        """Clear the counter store."""
        self.ssdb_client.clear()

    def save_error(self, url, status, depth):
        """Save a document describing ``url`` to ``lazada_error``."""
        md5 = self._url_digest(url)
        self.db.lazada_error.save({
            '_id': md5,
            'url': url,
            'status': status,
            'queue_time': datetime.utcnow(),
            'depth': depth,
            'pr': 0
        })

    def update(self):
        """Reset every 'downloading' record in ``lazada`` to 'new'."""
        self.db.lazada.update({'status': 'downloading'},
                              {'$set': {'status': 'new'}}, multi=True)

    def save(self, record):
        """Save ``record`` to the ``record`` collection."""
        self.db.record.save(record)