def __init__(self, targets=targets):
    """Set up the worker with its target URL list.

    :param targets: list of target URLs to validate against; defaults to
        the module-level ``targets`` loaded from config (bound once, at
        function-definition time).
    """
    self.db = Database(_DB_SETTINGS)
    self.targets = targets
def __init__(self):
    """Open handles on the standby and stable collections and prepare
    the (initially empty) in-memory work queues for each."""
    self.standbyDB = Database(_DB_SETTINGS)
    self.standbyDB.table = _TABLE['standby']
    self.stableDB = Database(_DB_SETTINGS)
    self.stableDB.table = _TABLE['stable']
    self.standby_data = []
    self.stable_data = []
class Detector(object):
    """Local detector with three duties:

    1. Scan the local ``standby`` database for valid proxies that meet the
       high-score stability criteria and move them into the ``stable``
       database.
    2. While scanning ``standby``, if a qualifying proxy already exists in
       ``stable``, sync its latest ``standby`` data into ``stable``.
    3. Scan the ``stable`` database for proxies that no longer meet the
       high-score criteria and remove them from ``stable``.
    """

    def __init__(self):
        """Open handles on both collections and prepare empty work queues."""
        self.standbyDB = Database(_DB_SETTINGS)
        self.standbyDB.table = _TABLE['standby']
        self.stableDB = Database(_DB_SETTINGS)
        self.stableDB.table = _TABLE['stable']
        self.standby_data = []
        self.stable_data = []

    def begin(self):
        """Connect both database handles."""
        self.stableDB.connect()
        self.standbyDB.connect()

    def end(self):
        """Close both database handles."""
        self.standbyDB.close()
        self.stableDB.close()

    def run(self):
        """Drive the detection cycle (asyncio-based) until an error
        forces shutdown."""
        logger.info('Running Detector.')
        self.begin()
        loop = asyncio.get_event_loop()
        while True:
            try:
                self.detect_standby(loop)
                self.detect_stable(loop)
                time.sleep(DETECT_LOCAL)
            except Exception as e:
                logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
                self.end()
                loop.close()
                logger.info('Detector shuts down.')
                return

    def detect_standby(self, loop):
        """Check a batch from the standby queue, refilling it when empty.

        :param loop: asyncio event loop used to gather the coroutines
        """
        if not self.standby_data:
            self.standby_data = self.standbyDB.all()
            return
        total = len(self.standby_data)
        logger.info('Imported the "standby" database\' data,length: %d ' % total)
        batch_size = min(total, DETECT_AMOUNT)
        logger.info('Start to detect the local valid data,amount: %d ' % batch_size)
        batch = [self.standby_data.pop() for _ in range(batch_size)]
        loop.run_until_complete(
            asyncio.gather(*(self._detect_standby(doc) for doc in batch)))
        logger.info('Detection finished.Left standby data length:%d'
                    % len(self.standby_data))

    def detect_stable(self, loop):
        """Check a batch from the stable queue, refilling it when empty.

        :param loop: asyncio event loop used to gather the coroutines
        """
        if not self.stable_data:
            self.stable_data = self.stableDB.all()
            return
        total = len(self.stable_data)
        logger.info('Imported the "stable" database\' data,length: %d ' % total)
        batch_size = min(total, DETECT_HIGH_AMOUNT)
        logger.info('Start to detect the high scored data,amount: %d ' % batch_size)
        batch = [self.stable_data.pop() for _ in range(batch_size)]
        loop.run_until_complete(
            asyncio.gather(*(self._detect_stable(doc) for doc in batch)))
        logger.info('Detection finished.Left stable data length:%d'
                    % len(self.stable_data))

    async def _detect_standby(self, data):
        """Promote a qualifying standby record into the stable collection.

        A record is disqualified when its total test count is below
        STABLE_MIN_COUNT, its success rate is below STABLE_MIN_RATE, or its
        consecutive-failure count reaches DELETE_COMBO.

        :param data: one standby document (dict)
        """
        del data['_id']
        ip, port = data['ip'], data['port']
        proxy = '%s:%s' % (ip, port)
        success_rate = round(float(data['success_rate'].replace('%', '')) / 100, 4)
        disqualified = (data['test_count'] < STABLE_MIN_COUNT
                        or success_rate < STABLE_MIN_RATE
                        or data['combo_fail'] >= DELETE_COMBO)
        if disqualified:
            return
        condition = {'ip': ip, 'port': port}
        if self.stableDB.select(condition):
            # Already promoted: just sync the freshest standby data.
            self.stableDB.update(condition, data)
        else:
            self.stableDB.save(data)
            logger.info('Find a stable proxy: %s , put it into the stable database.'
                        % proxy)

    async def _detect_stable(self, data):
        """Drop or refresh one stable record based on its standby twin.

        Deleted when it vanished from standby, or when its success rate
        fell below STABLE_MIN_RATE / consecutive failures reached
        DELETE_COMBO; otherwise its stable copy is refreshed.

        :param data: one stable document (dict)
        """
        ip, port = data['ip'], data['port']
        proxy = '%s:%s' % (ip, port)
        condition = {'ip': ip, 'port': port}
        matches = self.standbyDB.select(condition)
        latest = matches[0] if matches else None
        if not latest:
            self.stableDB.delete(condition)
            logger.warning(
                "The high scored proxy: %s had been deleted from the standby database.It's unavailable." % proxy)
            return
        rate = round(float(latest['success_rate'].replace('%', '')) / 100, 4)
        if rate < STABLE_MIN_RATE or latest['combo_fail'] >= DELETE_COMBO:
            self.stableDB.delete(condition)
            logger.warning(
                "The high scored proxy: %s is not that stable now.It's Removed." % proxy)
        else:
            del latest['_id']
            self.stableDB.update(condition, latest)
class Tentacle(object):
    """Target-site scanning/validation worker — the "tentacle" of the
    workstation.

    Validates every harvested proxy IP against each configured target URL,
    periodically rescans the locally stored per-target collections, and
    removes proxies that no longer work for their target.
    """

    def __init__(self, targets=targets):
        """
        :param targets: target URL list; defaults to the config-level
            ``targets`` (bound at definition time).
        """
        self.targets = targets
        self.db = Database(_DB_SETTINGS)

    def begin(self):
        """Prepare for scanning: connect the database, drop expired target
        collections, then (re)register the configured targets."""
        self.db.connect()
        self.clean_expired_targets()
        self.save_targets()

    def end(self):
        """Close the database handle."""
        self.db.close()

    def load_target_db(self) -> dict:
        """Load every proxy document of every target collection.

        When AGO is truthy, targets registered in the DB by other runs are
        scanned too (skipping ones already configured locally).

        :return: mapping {target url -> list of proxy documents}
        """
        extra_targets = set()
        if AGO:
            for record in self.db.all(tname=TARGETS_DB):
                url = record['url']
                if url in self.targets:
                    continue
                elif url:
                    extra_targets.add(url)
        # FIX: plain loop/extend instead of a side-effect list comprehension.
        allowed_targets = list(self.targets) + list(extra_targets)
        _dict = {}
        for url in allowed_targets:
            _name = gen_target_db_name(url)
            _data = self.db.all(tname=_name)
            _dict[url] = _data
            logger.info('Loaded %d proxies from db: %s ' % (len(_data), _name))
        return _dict

    def save_targets(self):
        """Register the configured targets in the TARGETS_DB collection,
        refreshing ``validTime`` for ones already present.

        BUGFIXES vs. previous version:
        * ``ascii_letters[random.randint(0, 52)]`` could raise IndexError
          (ascii_letters has 52 chars, max index 51) — use random.choice.
        * a single ``data`` dict was reused across iterations — build a
          fresh dict per target.
        * the counter ``j`` was never incremented — use enumerate.
        * iterate ``self.targets`` (instance state) instead of the module
          global ``targets`` for consistency with __init__.
        """
        now = datetime.datetime.now()
        for j, url in enumerate(self.targets):
            inside_data = self.db.select({'url': url}, tname=TARGETS_DB)
            if inside_data:
                self.db.update({'url': url}, {'validTime': now.isoformat()},
                               tname=TARGETS_DB)
                continue
            data = {
                'url': url,
                'createdTime': now.isoformat(),
                'validTime': now.isoformat(),
                'db': gen_target_db_name(url),
                # Pseudo-unique id: counter+random, one random letter, ms timestamp.
                '_id': (str(j + random.randint(0, 100000))
                        + random.choice(ascii_letters)
                        + str(int(time.time() * 1000))),
            }
            self.db.save(data, tname=TARGETS_DB)

    def clean_expired_targets(self):
        """Drop per-target collections whose target has not been revalidated
        within TARGET_EXPIRE days, and unregister those targets."""
        if not self.db.connected:
            return
        now = datetime.datetime.now()
        expired_created_time = (now - datetime.timedelta(days=TARGET_EXPIRE)).isoformat()
        for tar in self.db.all(tname=TARGETS_DB):
            if tar['validTime'] < expired_created_time:
                db_name = gen_target_db_name(tar['url'])
                self.db.handler[db_name].drop()
                self.db.delete({'url': tar['url']}, tname=TARGETS_DB)
                logger.info('Deleted expired target website proxy collection:(%s)' % (db_name))

    def run(self):
        """Run the Tentacle.

        Logic:
        * build a single async session with a semaphore-bounded connector
        * verify internet connectivity each cycle
        * load the target collections and scan each one per cycle
        * cap each scan wave at MAX_V_COUNT proxies
        """
        logger.info('Running Tentacle.')
        self.begin()
        loop = asyncio.get_event_loop()
        sem = asyncio.Semaphore(MAX_V_COUNT)
        conn = aiohttp.TCPConnector(verify_ssl=False, limit=MAX_V_COUNT)
        session = aiohttp.ClientSession(loop=loop, connector=conn)
        while 1:
            if not internet_access():
                # BUGFIX: previously spun in a hot loop while offline.
                time.sleep(VALIDATE_LOCAL)
                continue
            try:
                _dict = self.load_target_db()
                for url in _dict:
                    logger.info('Start the validation of the target url:%s' % url)
                    data = _dict[url]
                    _len = len(data)
                    _count = min(MAX_V_COUNT, _len)
                    start = 0
                    while 1:
                        _data = data[start:start + _count]
                        if not _data:
                            logger.info('Target url:%s -> validation finished,total proxies:%d' % (url, _len))
                            break
                        tasks = []
                        for i in _data:
                            proxy = format_proxies(':'.join([i['ip'], i['port']]))
                            tasks.append(self.async_visit_target(
                                self.db, url, proxy, i, sem, session))
                        loop.run_until_complete(asyncio.gather(*tasks))
                        start += _count
                time.sleep(VALIDATE_LOCAL)
            except Exception as e:
                self.end()
                logger.error('%s,msg: %s ' % (e.__class__, e))
                logger.error('Shut down the Tentacle.')
                # BUGFIX: previously fell through and kept looping on a
                # closed database handle after announcing shutdown.
                return

    async def async_visit_target(self, db, url, proxy, bullet, sem, session, scan=True):
        """Probe *url* once through *proxy* and record the outcome.

        :param db: database object used for persistence
        :param url: target website url
        :param proxy: proxies dict (``{'http': ...}``) for the request
        :param bullet: the full document of the single proxy IP
        :param sem: concurrency semaphore
        :param session: shared aiohttp session
        :param scan: True when rescanning an existing target collection,
            False for first-time validation before insertion
        """
        data = {
            'ip': bullet['ip'],
            'port': bullet['port'],
            'anony_type': bullet['anony_type'],
            'address': bullet['address'],
            'createdTime': bullet['createdTime'],
            'score': bullet['score'],
            'test_count': int(bullet['test_count']) + 1,
            'url': url,
        }
        db_name = gen_target_db_name(url)
        async with sem:
            ret = await send_async_http(session, 'head', url,
                                        retries=RETRIES,
                                        headers=headers,
                                        proxy=proxy['http'],
                                        timeout=TIMEOUT)
            t, code = ret['cost'], ret['code']
            if code == 200:
                # Rolling mean: fold this round's speed score (15s ceiling)
                # into the historical average.
                data['score'] = round(
                    (bullet['score'] * bullet['test_count']
                     + round((1 - t / 15) * 100, 2)) / data['test_count'], 2)
                data['total'] = round(data['score'] * data['test_count'], 2)
                data['resp_time'] = str(t) + 's'
                data['valid_time'] = time_to_date(int(time.time()))
                if scan:
                    self.update(db, data, db_name)
                else:
                    self.success(db, data, db_name)
            elif scan:
                self.fail(db, data, db_name)

    async def specified_validate(self, db, bullet, session, sem):
        """First-time validation of one collected proxy against the first
        MAX_T_LEN configured targets (called from the Validator).

        :param db: database object used for persistence
        :param bullet: the full document of the single proxy IP
        :param session: shared aiohttp session
        :param sem: concurrency semaphore
        """
        proxy = format_proxies(':'.join([bullet['ip'], bullet['port']]))
        max_thread_count = min(MAX_T_LEN, len(self.targets))
        allowed_targets = self.targets[:max_thread_count]
        await asyncio.gather(*[
            self.async_visit_target(db, i, proxy, bullet, sem, session, scan=False)
            for i in allowed_targets
        ])

    def success(self, db, bullet, tname):
        """Insert (or refresh) a proxy document after a first successful hit.

        :param db: database object used for persistence
        :param bullet: the full document of the single proxy IP
        :param tname: collection name for the target url
        """
        ip = bullet['ip']
        port = bullet['port']
        _data = db.select({'ip': ip, 'port': port}, tname=tname)
        if bullet['address'] == 'unknown' or bullet['address'] == '':
            bullet['address'] = get_ip_addr(ip)
        if _data:
            bullet['_id'] = _data[0]['_id']
            self.update(db, bullet, tname)
            return
        bullet['createdTime'] = time_to_date(int(time.time()))
        try:
            db.save(bullet, tname=tname)
        except Exception as e:
            logger.error('%s,msg: %s ' % (e.__class__, e))
            return

    def update(self, db, bullet, tname):
        """Refresh an existing proxy document after a successful rescan.

        :param db: database object used for persistence
        :param bullet: the full document of the single proxy IP
        :param tname: collection name for the target url
        """
        ip = bullet['ip']
        port = bullet['port']
        if bullet['createdTime'] == '':
            bullet['createdTime'] = time_to_date(int(time.time()))
        if bullet['address'] == 'unknown' or bullet['address'] == '':
            bullet['address'] = get_ip_addr(ip)
        db.update({'ip': ip, 'port': port}, bullet, tname=tname)

    def fail(self, db, bullet, tname):
        """Delete a proxy that failed validation for this target.

        :param db: database object used for persistence
        :param bullet: the full document of the single proxy IP
        :param tname: collection name for the target url
        """
        try:
            proxy = ':'.join([bullet['ip'], bullet['port']])
            db.delete({'ip': bullet['ip'], 'port': bullet['port']}, tname=tname)
            logger.warning('Deleted inoperative proxy %s in %s' % (proxy, tname))
        except Exception as e:
            logger.error('%s,msg: %s ' % (e.__class__, e))
            return
def __init__(self):
    """Bind the standby collection and a Rator (scorer) that records
    validation outcomes into it."""
    self.db = Database(_DB_SETTINGS)
    self.db.table = _TABLE['standby']
    self.rator = Rator(self.db)
    self.standby_data = []
class Scaner(object):
    """Local scanner: periodically revalidates proxies held in the standby
    database, scoring each result through the embedded Rator."""

    def __init__(self):
        """Bind the standby collection and a Rator for recording results."""
        self.db = Database(_DB_SETTINGS)
        self.db.table = _TABLE['standby']
        self.rator = Rator(self.db)
        self.standby_data = []

    def run(self):
        """Cycle forever: pull a batch from the standby queue and validate
        it concurrently; refill the queue from the DB when empty."""
        logger.info('Running Scanner.')
        self.rator.begin()
        loop = asyncio.get_event_loop()
        while True:
            try:
                if not self.standby_data:
                    self.standby_data = self.db.all()
                    continue
                total = len(self.standby_data)
                logger.info(
                    'Start the validation of the local "standby" database,length : %d ' % total)
                batch_size = min(total, LOCAL_AMOUNT)
                batch = [self.standby_data.pop() for _ in range(batch_size)]
                semaphore = asyncio.Semaphore(COROUTINE_MAX)
                logger.info(
                    'Start to verify the standby proxy data,amount: %d ' % batch_size)
                jobs = [asyncio.ensure_future(self.validate(item, semaphore))
                        for item in batch if item]
                loop.run_until_complete(asyncio.gather(*jobs))
                logger.info(
                    'Local validation finished.Left standby proxies:%d'
                    % len(self.standby_data))
                time.sleep(VALIDATE_LOCAL)
            except Exception as e:
                logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
                self.rator.end()
                loop.close()
                logger.info('Scanner shuts down.')
                return

    async def validate(self, proxy, semaphore):
        """Check one proxy via the remote validation API and score it.

        :param proxy: one standby proxy document (dict)
        :param semaphore: caps concurrent coroutines
        """
        ip, port = proxy['ip'], proxy['port']
        # A response timeout for the API server could be made configurable.
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(proxy_validate_url.format(ip, port),
                                           headers=headers,
                                           timeout=15) as response:
                        payload = await response.text(encoding='utf-8')
                        payload = json.loads(payload)
                except Exception as e:
                    logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
                    return
                else:
                    res = payload['msg'][0]
                    if 'anony' in res and 'time' in res:
                        proxy['anony_type'] = res['anony']
                        proxy['resp_time'] = res['time']
                        self.rator.mark_update(proxy, collected=False)
                    else:
                        self.rator.mark_fail(proxy)
class Detector(object):
    """Local detector: promotes qualifying standby proxies into the stable
    collection and evicts stable proxies that no longer qualify."""

    def __init__(self):
        """Open handles on both collections and prepare empty work queues."""
        self.standbyDB = Database(_DB_SETTINGS)
        self.standbyDB.table = _TABLE['standby']
        self.stableDB = Database(_DB_SETTINGS)
        self.stableDB.table = _TABLE['stable']
        self.standby_data = []
        self.stable_data = []

    def begin(self):
        """Connect both database handles."""
        self.stableDB.connect()
        self.standbyDB.connect()

    def end(self):
        """Close both database handles."""
        self.standbyDB.close()
        self.stableDB.close()

    def run(self):
        """Drive the detection cycle until an error forces shutdown."""
        logger.info('Running Detector.')
        self.begin()
        loop = asyncio.get_event_loop()
        while True:
            try:
                self.detect_standby(loop)
                self.detect_stable(loop)
                time.sleep(DETECT_LOCAL)
            except Exception as e:
                logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
                self.end()
                loop.close()
                logger.info('Detector shuts down.')
                return

    def detect_standby(self, loop):
        """Check a batch from the standby queue, refilling it when empty."""
        if not self.standby_data:
            self.standby_data = self.standbyDB.all()
            return
        total = len(self.standby_data)
        logger.info('Imported the "standby" database\' data,length: %d ' % total)
        batch_size = min(total, DETECT_AMOUNT)
        logger.info('Start to detect the local valid data,amount: %d ' % batch_size)
        batch = [self.standby_data.pop() for _ in range(batch_size)]
        loop.run_until_complete(
            asyncio.gather(*(self._detect_standby(doc) for doc in batch)))
        logger.info('Detection finished.Left standby data length:%d'
                    % len(self.standby_data))

    def detect_stable(self, loop):
        """Check a batch from the stable queue, refilling it when empty."""
        if not self.stable_data:
            self.stable_data = self.stableDB.all()
            return
        total = len(self.stable_data)
        logger.info('Imported the "stable" database\' data,length: %d ' % total)
        batch_size = min(total, DETECT_HIGH_AMOUNT)
        logger.info('Start to detect the high scored data,amount: %d ' % batch_size)
        batch = [self.stable_data.pop() for _ in range(batch_size)]
        loop.run_until_complete(
            asyncio.gather(*(self._detect_stable(doc) for doc in batch)))
        logger.info('Detection finished.Left stable data length:%d'
                    % len(self.stable_data))

    async def _detect_standby(self, data):
        """Promote a qualifying standby record into the stable collection.

        A record qualifies once its test count reaches STABLE_MIN_COUNT
        and its success rate reaches STABLE_MIN_RATE.

        :param data: one standby document (dict)
        """
        del data['_id']
        ip, port = data['ip'], data['port']
        proxy = '%s:%s' % (ip, port)
        success_rate = round(float(data['success_rate'].replace('%', '')) / 100, 4)
        if data['test_count'] < STABLE_MIN_COUNT or success_rate < STABLE_MIN_RATE:
            return
        condition = {'ip': ip, 'port': port}
        if self.stableDB.select(condition):
            # Already promoted: just sync the freshest standby data.
            self.stableDB.update(condition, data)
        else:
            self.stableDB.save(data)
            logger.info('Find a stable proxy: %s , put it into the stable database.'
                        % proxy)

    async def _detect_stable(self, data):
        """Drop or refresh one stable record based on its standby twin.

        :param data: one stable document (dict)
        """
        del data['_id']
        ip, port = data['ip'], data['port']
        proxy = '%s:%s' % (ip, port)
        condition = {'ip': ip, 'port': port}
        matches = self.standbyDB.select(condition)
        latest = matches[0] if matches else None
        if not latest:
            self.stableDB.delete(condition)
            logger.warning(
                "The high scored proxy: %s had been deleted from the standby database.It's unavailable." % proxy)
            return
        rate = round(float(latest['success_rate'].replace('%', '')) / 100, 4)
        if rate < STABLE_MIN_RATE or latest['combo_fail'] >= DELETE_COMBO:
            self.stableDB.delete(condition)
            logger.warning(
                "The high scored proxy: %s is not that stable now.It's Removed." % proxy)
        else:
            del latest['_id']
            self.stableDB.update(condition, latest)
class Scaner(object):
    """Local scanner: periodically revalidates the proxies stored in the
    standby database, scoring each result through the embedded Rator.

    Proxies are validated in small groups (see :meth:`check_allot`) through
    a remote multi-validation API, rather than one request per proxy, to
    avoid getting this host's IP banned by the validation site.
    """

    def __init__(self):
        """Bind the standby collection and a Rator for recording results."""
        self.db = Database(_DB_SETTINGS)
        self.db.table = _TABLE['standby']
        self.rator = Rator(self.db)
        self.standby_data = []

    def check_allot(self, proxies):
        """Partition *proxies* into query groups of at most 20 entries.

        Each group becomes one coroutine/one API request, keyed by the
        URL-encoded ``ip_ports[]`` query-string fragment the validation
        API expects. Small groups keep the per-request load low so the
        validation site does not ban this host; with a young database a
        smaller group size (e.g. 2) and a smaller LOCAL_AMOUNT are safer,
        while an established pool can afford larger values.

        :param proxies: list of proxy dicts popped from the standby queue,
            format ``[{..}, {..}, ...]``
        :return: ``{query-string: [proxy, ...], ...}`` — one entry per group
        """
        group_size = 20

        def key_of(group):
            # e.g. 'ip_ports%5B%5D=1.2.3.4%3A80&ip_ports%5B%5D=...'
            return '&'.join('ip_ports%5B%5D={}%3A{}'.format(p['ip'], p['port'])
                            for p in group)

        if len(proxies) <= group_size:
            return {key_of(proxies): proxies}
        chunks = [proxies[n:n + group_size]
                  for n in range(0, len(proxies), group_size)]
        return {key_of(chunk): chunk for chunk in chunks}

    def run(self):
        """Cycle forever: pull a batch from the standby queue, split it into
        groups and validate each group concurrently; refill from the DB
        when the queue is empty."""
        logger.info('Running Scanner.')
        self.rator.begin()
        loop = asyncio.get_event_loop()
        while True:
            try:
                if not self.standby_data:
                    self.standby_data = self.db.all()
                    continue
                total = len(self.standby_data)
                logger.info(
                    'Start the validation of the local "standby" database,length : %d ' % total)
                batch_size = min(total, LOCAL_AMOUNT)
                batch = [self.standby_data.pop() for _ in range(batch_size)]
                groups = self.check_allot(batch)
                semaphore = asyncio.Semaphore(COROUTINE_MAX)
                logger.info(
                    'Start to verify the standby proxy data,amount: %d ' % batch_size)
                jobs = [asyncio.ensure_future(self.validate(qs, groups[qs], semaphore))
                        for qs in groups]
                loop.run_until_complete(asyncio.gather(*jobs))
                logger.info(
                    'Local validation finished.Left standby proxies:%d'
                    % len(self.standby_data))
                time.sleep(VALIDATE_LOCAL)
            except Exception as e:
                logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
                self.rator.end()
                loop.close()
                logger.info('Scanner shuts down.')
                return

    async def validate(self, url_str, proxies, semaphore):
        """Validate one group of standby proxies through the remote API.

        On request failure, retries through a freshly fetched outbound
        proxy until one succeeds or none remain.

        :param url_str: query-string fragment identifying this group
        :param proxies: the proxy dicts belonging to the group
        :param semaphore: caps concurrent coroutines
        """
        outbound = None
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                while True:
                    try:
                        async with session.get(mul_validate_url + url_str,
                                               headers=v_headers,
                                               proxy=outbound) as response:
                            payload = await response.text(encoding='utf-8')
                            payload = json.loads(payload)
                    except Exception:
                        outbound = get_proxy(format=False)
                        if not outbound:
                            logger.error(
                                'No available proxy to retry the request for validation.')
                            return
                        continue
                    else:
                        for res in payload['msg']:
                            proxy = find_proxy(res['ip'], res['port'], proxies)
                            try:
                                if 'anony' in res and 'time' in res:
                                    proxy['anony_type'] = res['anony']
                                    proxy['resp_time'] = res['time']
                                    self.rator.mark_update(proxy, collected=False)
                                else:
                                    self.rator.mark_fail(proxy)
                            except KeyError as e:
                                logger.error('Error class : %s , msg : %s '
                                             % (e.__class__, e))
                                continue
                        return
""" @author : linkin @email : [email protected] @date : 2018-10-04 """ import random import logging import json from flask import Flask from components.dbhelper import Database from config.DBsettings import _TABLE from config.DBsettings import _DB_SETTINGS logger = logging.getLogger('APIserver') app = Flask(__name__) stable_db = Database(_DB_SETTINGS) standby_db = Database(_DB_SETTINGS) common_db = Database(_DB_SETTINGS) standby_db.table = _TABLE['standby'] stable_db.table = _TABLE['stable'] standby_db.connect() stable_db.connect() common_db.connect() all_standby_proxy = standby_db.all() all_stable_proxy = stable_db.all() anony_standby = [ i for i in all_standby_proxy if i['anony_type'] == '高匿' and i['combo_fail'] == 0 ]