Example #1
0
 def __init__(self, targets=targets):
     """
     Initialise the instance.

     :param targets: list of target URLs; defaults to the ``targets`` list
                     loaded from config
     """
     # One database handle built from the shared settings; it is not
     # connected here.
     self.db = Database(_DB_SETTINGS)
     self.targets = targets
Example #2
0
 def __init__(self):
     """
     Create one database handle per collection and the two empty work queues.
     """
     # Documents waiting to be examined; both start out empty.
     self.standby_data, self.stable_data = [], []

     # Both handles use the same settings and differ only in the table
     # they are pointed at.
     standby = Database(_DB_SETTINGS)
     stable = Database(_DB_SETTINGS)
     standby.table = _TABLE['standby']
     stable.table = _TABLE['stable']
     self.standbyDB = standby
     self.stableDB = stable
Example #3
0
class Detector(object):
    """
    Local detector. It has three responsibilities:
    1. Check whether any valid proxy stored in the local standby database
       meets the high-score/stability conditions; if so, store it in the
       stable database.
    2. While checking standby, if a qualifying proxy is already in stable,
       overwrite the stable copy with the latest standby data.
    3. Check whether any proxy in the stable database no longer meets the
       conditions; if so, delete it from stable.
    """
    def __init__(self):
        """Create one database handle per collection and the two work queues."""
        self.standby_data, self.stable_data = [], []
        standby = Database(_DB_SETTINGS)
        stable = Database(_DB_SETTINGS)
        standby.table = _TABLE['standby']
        stable.table = _TABLE['stable']
        self.standbyDB = standby
        self.stableDB = stable

    def begin(self):
        """Connect both databases (stable first, then standby)."""
        for db in (self.stableDB, self.standbyDB):
            db.connect()

    def end(self):
        """Close both databases (standby first, then stable)."""
        for db in (self.standbyDB, self.stableDB):
            db.close()

    def run(self):
        """
        Run the local detector, using asyncio for the per-document work.

        Alternates one standby pass and one stable pass, sleeping
        DETECT_LOCAL seconds after each pair. The first exception raised by
        a pass is logged, both databases and the loop are closed, and the
        method returns.
        """
        logger.info('Running Detector.')
        self.begin()
        loop = asyncio.get_event_loop()
        # The handler always returns, so wrapping the whole loop is the same
        # as wrapping each iteration.
        try:
            while True:
                self.detect_standby(loop)
                self.detect_stable(loop)
                time.sleep(DETECT_LOCAL)
        except Exception as e:
            logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
            self.end()
            loop.close()
            logger.info('Detector shuts down.')
            return

    def detect_standby(self,loop):
        """
        Examine one batch of standby documents.

        When the queue is empty it is refilled from the standby database and
        nothing else happens in this call.

        :param loop: asyncio event loop used to run the batch
        """
        if not self.standby_data:
            self.standby_data = self.standbyDB.all()
            return
        queued = len(self.standby_data)
        logger.info('Imported the "standby" database\' data,length: %d ' % queued)
        batch_size = min(queued, DETECT_AMOUNT)
        logger.info('Start to detect the local valid data,amount: %d ' % batch_size)
        # Documents are taken from the end of the queue.
        batch = [self.standby_data.pop() for _ in range(batch_size)]
        jobs = [self._detect_standby(doc) for doc in batch]
        loop.run_until_complete(asyncio.gather(*jobs))
        logger.info('Detection finished.Left standby data length:%d' % len(self.standby_data))

    def detect_stable(self,loop):
        """
        Examine one batch of stable documents.

        When the queue is empty it is refilled from the stable database and
        nothing else happens in this call.

        :param loop: asyncio event loop used to run the batch
        """
        if not self.stable_data:
            self.stable_data = self.stableDB.all()
            return
        queued = len(self.stable_data)
        logger.info('Imported the "stable" database\' data,length: %d ' % queued)
        batch_size = min(queued, DETECT_HIGH_AMOUNT)
        logger.info('Start to detect the high scored data,amount: %d ' % batch_size)
        batch = [self.stable_data.pop() for _ in range(batch_size)]
        jobs = [self._detect_stable(doc) for doc in batch]
        loop.run_until_complete(asyncio.gather(*jobs))
        logger.info('Detection finished.Left stable data length:%d' % len(self.stable_data))

    @staticmethod
    def _success_ratio(doc):
        """
        Return doc['success_rate'] as a fraction rounded to 4 decimals.

        The stored value is a string; any '%' character is removed before it
        is parsed and divided by 100.
        """
        return round(float(doc['success_rate'].replace('%','')) / 100, 4)

    async def _detect_standby(self,data):
        """
        Coroutine that examines a single standby document.

        The document is ignored when any of these holds:
            * its test count is below STABLE_MIN_COUNT
            * its success rate is below STABLE_MIN_RATE
            * its consecutive-failure count is at or above DELETE_COMBO
        Otherwise it is written to stable: updated if an entry with the same
        ip/port exists there, saved as a new entry if not.

        :param data: one standby document (dict); its '_id' key is removed
        """
        del data['_id']
        ip, port = data['ip'], data['port']
        proxy = ':'.join([ip,port])
        if data['test_count'] < STABLE_MIN_COUNT:
            return
        if self._success_ratio(data) < STABLE_MIN_RATE:
            return
        if data['combo_fail'] >= DELETE_COMBO:
            return
        condition = {'ip':ip,'port':port}
        if self.stableDB.select(condition):
            self.stableDB.update(condition,data)
            return
        self.stableDB.save(data)
        logger.info('Find a stable proxy: %s , put it into the stable database.' % proxy)

    async def _detect_stable(self,data):
        """
        Coroutine that examines a single stable document.

        The matching standby document is looked up by ip/port. The stable
        entry is deleted when no standby document exists, when the standby
        success rate is below STABLE_MIN_RATE, or when its consecutive-failure
        count is at or above DELETE_COMBO. Otherwise the stable entry is
        refreshed from the standby document.

        :param data: one stable document (dict)
        """
        ip, port = data['ip'], data['port']
        proxy = ':'.join([ip,port])
        condition = {'ip':ip,'port':port}
        matches = self.standbyDB.select(condition)
        current = matches[0] if matches else None
        if not current:
            self.stableDB.delete(condition)
            logger.warning(
                'The high scored proxy: %s had been deleted from the standby database.It\'s unavailable.' % proxy)
            return
        degraded = (self._success_ratio(current) < STABLE_MIN_RATE
                    or current['combo_fail'] >= DELETE_COMBO)
        if degraded:
            self.stableDB.delete(condition)
            logger.warning(
                'The high scored proxy: %s is not that stable now.It\'s Removed.' % proxy)
            return
        del current['_id']
        self.stableDB.update(condition,current)
Example #4
0
class Tentacle(object):
    """
    Scanner/validator for the per-target proxy collections.

    It can be embedded in other components as the workstation's "tentacle":
    it validates every obtained proxy against each target URL one by one, and
    periodically re-scans the locally stored target collections, removing
    proxies that no longer work.
    """
    def __init__(self,targets=targets):
        """
        Initialise the instance.

        :param targets: list of target URLs; defaults to the ``targets`` list
                        loaded from config
        """
        self.targets = targets
        self.db = Database(_DB_SETTINGS)

    def begin(self):
        """
        Prepare for scanning:
        * connect to the database
        * remove expired target collections
        * save/refresh the records of the TARGETS_DB collection
        """
        self.db.connect()
        self.clean_expired_targets()
        self.save_targets()

    def end(self):
        """Close the database connection."""
        self.db.close()

    def load_target_db(self) -> dict:
        """
        Load all data of every target collection that is to be validated.

        The targets are ``self.targets`` plus, when AGO is truthy, every
        non-empty URL recorded in TARGETS_DB that is not already listed.

        :return: dict mapping each target URL to the documents loaded from
                 its collection
        """
        recorded_only = set()
        if AGO:
            for record in self.db.all(tname=TARGETS_DB):
                url = record['url']
                if url not in self.targets and url:
                    recorded_only.add(url)
        allowed_targets = []
        allowed_targets.extend(self.targets)
        allowed_targets.extend(recorded_only)
        loaded = {}
        for url in allowed_targets:
            _name = gen_target_db_name(url)
            _data = self.db.all(tname=_name)
            loaded[url] = _data
            logger.info('Loaded %d proxies from db: %s '%(len(_data),_name))
        return loaded

    def save_targets(self):
        """
        Save the targets currently set in config to the database.

        A URL that already has a record only gets its 'validTime' refreshed;
        a new URL gets a complete record with a generated '_id'.
        """
        now = datetime.datetime.now().isoformat()
        # NOTE(review): this iterates the module-level `targets`, not
        # `self.targets`, so targets passed to __init__ are not recorded here.
        # Left unchanged -- confirm which one is intended.
        for url in targets:
            if self.db.select({'url': url}, tname=TARGETS_DB):
                self.db.update({'url': url},{'validTime':now},tname=TARGETS_DB)
                continue
            # Build a fresh dict per record so nothing written for one target
            # can leak into the next one.
            record = {
                'url': url,
                'createdTime': now,
                'validTime': now,
                'db': gen_target_db_name(url),
                # random.choice stays within the string; indexing with
                # randint(0, 52) could pick position 52, which is past the
                # last of the 52 letters and raises IndexError.
                '_id': str(random.randint(0,100000))+
                       random.choice(ascii_letters)+
                       str(int(time.time()*1000)),
            }
            self.db.save(record, tname=TARGETS_DB)

    def clean_expired_targets(self):
        """
        Remove expired target collections.

        A target is expired when its 'validTime' is older than TARGET_EXPIRE
        days; its collection is dropped and its TARGETS_DB record deleted.
        Does nothing when the database is not connected.
        """
        if not self.db.connected:
            return
        now = datetime.datetime.now()
        cutoff = (now - datetime.timedelta(days=TARGET_EXPIRE)).isoformat()
        for tar in self.db.all(tname=TARGETS_DB):
            # Both values are ISO-format strings, so they are compared as text.
            if tar['validTime'] < cutoff:
                db_name = gen_target_db_name(tar['url'])
                self.db.handler[db_name].drop()
                self.db.delete({'url':tar['url']},tname=TARGETS_DB)
                logger.info('Deleted expired target website proxy collection:(%s)' % (db_name))

    def run(self):
        """
        Run the Tentacle.

        Logic:
        * create a single async session; a semaphore limits concurrency
        * check for internet access
        * when online, load the target collections that need validating
        * one sweep over every target collection is one cycle
        * each collection is processed in slices of at most MAX_V_COUNT

        The first exception raised during a cycle closes the database, is
        logged, and ends the method.
        """
        logger.info('Running Tentacle.')
        self.begin()
        loop = asyncio.get_event_loop()
        sem = asyncio.Semaphore(MAX_V_COUNT)
        conn = aiohttp.TCPConnector(verify_ssl=False, limit=MAX_V_COUNT)
        # NOTE(review): the session is never closed on shutdown; how to close
        # it depends on the installed aiohttp version, so it is left as is.
        session = aiohttp.ClientSession(loop=loop, connector=conn)
        while 1:
            # NOTE(review): retries immediately with no back-off while offline.
            if not internet_access():
                continue
            try:
                _dict = self.load_target_db()
                for url in _dict:
                    logger.info('Start the validation of the target url:%s'%url)
                    data = _dict[url]
                    _len = len(data)
                    _count = min(MAX_V_COUNT, _len)
                    start = 0
                    while 1:
                        _data = data[start:start+_count]
                        if not _data:
                            logger.info('Target url:%s -> validation finished,total proxies:%d'%(url,_len))
                            break
                        tasks = [
                            self.async_visit_target(
                                self.db, url,
                                format_proxies(':'.join([i['ip'], i['port']])),
                                i, sem, session)
                            for i in _data
                        ]
                        loop.run_until_complete(asyncio.gather(*tasks))
                        start += _count
                time.sleep(VALIDATE_LOCAL)
            except Exception as e:
                self.end()
                logger.error('%s,msg: %s ' % (e.__class__, e))
                logger.error('Shut down the Tentacle.')
                # Stop here: the database has just been closed, so looping
                # again would only fail on every following cycle.
                return

    async def async_visit_target(self,db,url,proxy,bullet,sem,session,scan=True):
        """
        Coroutine that validates a single proxy against one target URL.

        On HTTP 200 the running score is recomputed and the document is
        written (update when scanning, first-time save otherwise). On any
        other result the document is deleted when scanning and nothing is
        done otherwise.

        :param db: database object used for the writes
        :param url: target website URL
        :param proxy: the proxy to validate, a dict; its 'http' entry is used
        :param bullet: all stored data of the proxy
        :param sem: semaphore limiting concurrent coroutines
        :param session: async HTTP session
        :param scan: True when scanning an existing target collection, False
                     for the first validation before insertion
        """
        data = {
            'ip': bullet['ip'],
            'port': bullet['port'],
            'anony_type': bullet['anony_type'],
            'address': bullet['address'],
            'createdTime': bullet['createdTime'],
            'score':bullet['score'],
            'test_count': int(bullet['test_count']) + 1,
            'url': url,
        }
        db_name = gen_target_db_name(url)
        async with sem:
            ret = await send_async_http(session, 'head', url,
                                        retries=RETRIES,
                                        headers=headers,
                                        proxy=proxy['http'],
                                        timeout=TIMEOUT)
            t, code = ret['cost'], ret['code']
            if code != 200:
                if scan:
                    self.fail(db,data,db_name)
                return
            # Score of this test is (1 - t/15) * 100; presumably 15 is the
            # response-time ceiling in seconds -- TODO confirm. It is folded
            # into the running average over all tests.
            this_score = round((1 - t / 15) * 100, 2)
            data['score'] = round(
                (bullet['score'] * bullet['test_count'] + this_score) / data['test_count'], 2)
            data['total'] = round(data['score'] * data['test_count'], 2)
            data['resp_time'] = str(t) + 's'
            data['valid_time'] = time_to_date(int(time.time()))
            if scan:
                self.update(db,data,db_name)
            else:
                self.success(db,data,db_name)

    async def specified_validate(self,db,bullet,session,sem):
        """
        First-insertion validation coroutine. A Tentacle embedded in the
        Validator calls this to validate a freshly collected proxy before it
        enters the target collections.

        At most MAX_T_LEN targets (the first ones in ``self.targets``) are
        checked.

        :param db: database object used for the writes
        :param bullet: all stored data of the proxy
        :param session: async HTTP session
        :param sem: semaphore limiting concurrent coroutines
        """
        ip = bullet['ip']
        port = bullet['port']
        proxy = format_proxies(':'.join([ip, port]))
        max_thread_count = min(MAX_T_LEN, len(self.targets))
        allowed_targets = self.targets[:max_thread_count]
        tasks = [self.async_visit_target(db,i,proxy,bullet,sem,session,scan=False) for i in allowed_targets]
        await asyncio.gather(*tasks)

    @staticmethod
    def _resolve_address(bullet):
        """Return the stored address, looking it up by IP when it is 'unknown' or empty."""
        if bullet['address'] == 'unknown' or bullet['address'] == '':
            return get_ip_addr(bullet['ip'])
        return bullet['address']

    def success(self,db,bullet,tname):
        """
        Store a proxy after the Validator's tentacle validated it against a
        target URL for the first time.

        When the proxy already exists in the collection it is updated
        instead; otherwise it is saved with a fresh 'createdTime'. A failure
        while saving is logged and swallowed.

        :param db: database object used for the writes
        :param bullet: all stored data of the proxy
        :param tname: collection that belongs to the target URL
        """
        ip = bullet['ip']
        port = bullet['port']
        _data = db.select({'ip':ip,'port':port},tname=tname)
        bullet['address'] = self._resolve_address(bullet)
        if _data:
            bullet['_id'] = _data[0]['_id']
            self.update(db,bullet,tname)
            return
        bullet['createdTime'] = time_to_date(int(time.time()))
        try:
            db.save(bullet,tname=tname)
        except Exception as e:
            logger.error('%s,msg: %s ' % (e.__class__, e))
            return

    def update(self,db,bullet,tname):
        """
        Update a proxy that already exists in the target collection after a
        successful validation.

        :param db: database object used for the writes
        :param bullet: all stored data of the proxy
        :param tname: collection that belongs to the target URL
        """
        ip = bullet['ip']
        port = bullet['port']
        if bullet['createdTime']=='':
            bullet['createdTime']=time_to_date(int(time.time()))
        bullet['address'] = self._resolve_address(bullet)
        db.update({'ip':ip,'port':port},bullet,tname=tname)

    def fail(self,db,bullet,tname):
        """
        Handle a failed validation of a proxy stored in the target
        collection: the proxy is deleted. Any error is logged and swallowed.

        :param db: database object used for the writes
        :param bullet: all stored data of the proxy
        :param tname: collection that belongs to the target URL
        """
        try:
            ip = bullet['ip']
            port = bullet['port']
            proxy = ':'.join([ip,port])
            db.delete({'ip':ip,'port':port},tname=tname)
            logger.warning('Deleted inoperative proxy %s in %s'%(proxy,tname))
        except Exception as e:
            logger.error('%s,msg: %s ' % (e.__class__, e))
            return
Example #5
0
 def __init__(self):
     """
     Create the standby database handle, the rator that works on it, and
     the empty queue of standby documents.
     """
     self.standby_data = []
     standby = Database(_DB_SETTINGS)
     standby.table = _TABLE['standby']
     self.db = standby
     # The rator receives the very same handle object.
     self.rator = Rator(standby)
Example #6
0
class Scaner(object):
    """
    Scanner that re-validates the documents of the standby table in batches,
    one request per proxy, and reports each result to the rator.
    """
    def __init__(self):
        """Create the standby handle, the rator bound to it and the empty queue."""
        self.standby_data = []
        standby = Database(_DB_SETTINGS)
        standby.table = _TABLE['standby']
        self.db = standby
        self.rator = Rator(standby)

    def run(self):
        """
        Run the scanner.

        While the queue holds documents, up to LOCAL_AMOUNT of them are
        validated concurrently and the scanner then sleeps VALIDATE_LOCAL
        seconds; an empty queue is refilled from the database. The first
        exception is logged, the rator and the loop are shut down, and the
        method returns.
        """
        logger.info('Running Scanner.')
        self.rator.begin()
        loop = asyncio.get_event_loop()
        # The handler always returns, so one try around the loop is the same
        # as one try per iteration.
        try:
            while True:
                if not self.standby_data:
                    self.standby_data = self.db.all()
                    continue
                queued = len(self.standby_data)
                logger.info(
                    'Start the validation of the local "standby" database,length : %d '
                    % queued)
                batch_size = min(queued, LOCAL_AMOUNT)
                # Documents are taken from the end of the queue.
                batch = [self.standby_data.pop() for _ in range(batch_size)]
                semaphore = asyncio.Semaphore(COROUTINE_MAX)
                logger.info(
                    'Start to verify the standby proxy data,amount: %d ' %
                    batch_size)
                # Falsy entries are skipped.
                tasks = []
                for entry in batch:
                    if entry:
                        tasks.append(
                            asyncio.ensure_future(self.validate(entry, semaphore)))
                loop.run_until_complete(asyncio.gather(*tasks))
                logger.info(
                    'Local validation finished.Left standby proxies:%d' %
                    len(self.standby_data))
                time.sleep(VALIDATE_LOCAL)
        except Exception as e:
            logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
            self.rator.end()
            loop.close()
            logger.info('Scanner shuts down.')
            return

    async def validate(self, proxy, semaphore):
        """
        Validate one proxy through the validation URL and report the result.

        A request or JSON-decoding error is logged and the proxy is left
        untouched. Otherwise the first entry of the response's 'msg' list
        decides: with both 'anony' and 'time' present the proxy document is
        refreshed and passed to ``mark_update``; if either is missing it is
        passed to ``mark_fail``.

        :param proxy: standby document (dict) with at least 'ip' and 'port'
        :param semaphore: semaphore limiting concurrent validations
        """
        ip = proxy['ip']
        port = proxy['port']
        # A response timeout for the API-server request could be made
        # configurable; the original author noted this as not done.
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(proxy_validate_url.format(ip, port),
                                           headers=headers,
                                           timeout=15) as response:
                        body = await response.text(encoding='utf-8')
                        payload = json.loads(body)
                except Exception as e:
                    logger.error('Error class : %s , msg : %s ' %
                                 (e.__class__, e))
                    return
                verdict = payload['msg'][0]
                if not ('anony' in verdict and 'time' in verdict):
                    self.rator.mark_fail(proxy)
                    return
                proxy['anony_type'] = verdict['anony']
                proxy['resp_time'] = verdict['time']
                self.rator.mark_update(proxy, collected=False)
Example #7
0
class Detector(object):
    """
    Keeps the stable table in step with the standby table: qualifying standby
    documents are copied or refreshed into stable, and stable entries whose
    standby counterpart is missing or has degraded are deleted.
    """
    def __init__(self):
        """Create one database handle per table and the two empty work queues."""
        self.standby_data, self.stable_data = [], []
        standby = Database(_DB_SETTINGS)
        stable = Database(_DB_SETTINGS)
        standby.table = _TABLE['standby']
        stable.table = _TABLE['stable']
        self.standbyDB = standby
        self.stableDB = stable

    def begin(self):
        """Connect both databases (stable first, then standby)."""
        for db in (self.stableDB, self.standbyDB):
            db.connect()

    def end(self):
        """Close both databases (standby first, then stable)."""
        for db in (self.standbyDB, self.stableDB):
            db.close()

    def run(self):
        """
        Alternate one standby pass and one stable pass, sleeping DETECT_LOCAL
        seconds after each pair. The first exception is logged, both
        databases and the loop are closed, and the method returns.
        """
        logger.info('Running Detector.')
        self.begin()
        loop = asyncio.get_event_loop()
        # The handler always returns, so one try around the loop is the same
        # as one try per iteration.
        try:
            while True:
                self.detect_standby(loop)
                self.detect_stable(loop)
                time.sleep(DETECT_LOCAL)
        except Exception as e:
            logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
            self.end()
            loop.close()
            logger.info('Detector shuts down.')
            return

    def detect_standby(self,loop):
        """
        Examine up to DETECT_AMOUNT queued standby documents; an empty queue
        is refilled from the standby database instead.

        :param loop: asyncio event loop used to run the batch
        """
        if not self.standby_data:
            self.standby_data = self.standbyDB.all()
            return
        queued = len(self.standby_data)
        logger.info('Imported the "standby" database\' data,length: %d ' % queued)
        batch_size = min(queued, DETECT_AMOUNT)
        logger.info('Start to detect the local valid data,amount: %d ' % batch_size)
        # Documents are taken from the end of the queue.
        batch = [self.standby_data.pop() for _ in range(batch_size)]
        jobs = [self._detect_standby(doc) for doc in batch]
        loop.run_until_complete(asyncio.gather(*jobs))
        logger.info('Detection finished.Left standby data length:%d' % len(self.standby_data))

    def detect_stable(self,loop):
        """
        Examine up to DETECT_HIGH_AMOUNT queued stable documents; an empty
        queue is refilled from the stable database instead.

        :param loop: asyncio event loop used to run the batch
        """
        if not self.stable_data:
            self.stable_data = self.stableDB.all()
            return
        queued = len(self.stable_data)
        logger.info('Imported the "stable" database\' data,length: %d ' % queued)
        batch_size = min(queued, DETECT_HIGH_AMOUNT)
        logger.info('Start to detect the high scored data,amount: %d ' % batch_size)
        batch = [self.stable_data.pop() for _ in range(batch_size)]
        jobs = [self._detect_stable(doc) for doc in batch]
        loop.run_until_complete(asyncio.gather(*jobs))
        logger.info('Detection finished.Left stable data length:%d' % len(self.stable_data))

    @staticmethod
    def _success_ratio(doc):
        """
        Return doc['success_rate'] as a fraction rounded to 4 decimals.

        The stored value is a string; any '%' character is removed before it
        is parsed and divided by 100.
        """
        return round(float(doc['success_rate'].replace('%','')) / 100, 4)

    async def _detect_standby(self,data):
        """
        Examine one standby document.

        It is ignored when its test count is below STABLE_MIN_COUNT or its
        success rate is below STABLE_MIN_RATE. Otherwise it is written to
        stable: updated if an entry with the same ip/port exists, saved as a
        new entry if not.

        :param data: one standby document (dict); its '_id' key is removed
        """
        del data['_id']
        ip, port = data['ip'], data['port']
        proxy = ':'.join([ip,port])
        if data['test_count'] < STABLE_MIN_COUNT:
            return
        if self._success_ratio(data) < STABLE_MIN_RATE:
            return
        condition = {'ip':ip,'port':port}
        if self.stableDB.select(condition):
            self.stableDB.update(condition,data)
            return
        self.stableDB.save(data)
        logger.info('Find a stable proxy: %s , put it into the stable database.' % proxy)

    async def _detect_stable(self,data):
        """
        Examine one stable document.

        The stable entry is deleted when standby has no document with the
        same ip/port, when the standby success rate is below STABLE_MIN_RATE,
        or when its 'combo_fail' is at or above DELETE_COMBO. Otherwise the
        stable entry is refreshed from the standby document.

        :param data: one stable document (dict); its '_id' key is removed
        """
        del data['_id']
        ip, port = data['ip'], data['port']
        proxy = ':'.join([ip,port])
        condition = {'ip':ip,'port':port}
        matches = self.standbyDB.select(condition)
        current = matches[0] if matches else None
        if not current:
            self.stableDB.delete(condition)
            logger.warning(
                'The high scored proxy: %s had been deleted from the standby database.It\'s unavailable.' % proxy)
            return
        degraded = (self._success_ratio(current) < STABLE_MIN_RATE
                    or current['combo_fail'] >= DELETE_COMBO)
        if degraded:
            self.stableDB.delete(condition)
            logger.warning(
                'The high scored proxy: %s is not that stable now.It\'s Removed.' % proxy)
            return
        del current['_id']
        self.stableDB.update(condition,current)
Example #8
0
class Scaner(object):
    """
    Local scanner. It periodically re-validates the proxies in the local
    standby database so the data stays fresh for later use; results are
    scored and stored through the built-in rator.
    """
    def __init__(self):
        """Create the standby handle, the rator bound to it and the empty queue."""
        self.db = Database(_DB_SETTINGS)
        self.db.table = _TABLE['standby']
        self.rator = Rator(self.db)
        self.standby_data = []

    def check_allot(self, proxies, offset=20):
        """
        Split one batch of standby proxies into groups, one request per group.

        The number of groups is the number of coroutines used for validation,
        and the group size is the number of proxies one coroutine validates
        in a single request. Do not validate too many proxies at once, or the
        validation website may ban the local IP. Once a fair number of
        proxies is stored in standby/stable the value can be raised, but it
        should not exceed 100. When the FooProxy database is just being
        built, 2 is recommended; raise it slowly once the pool is stable.
        The default of 20 was chosen by the original author for an already
        large local database.

        Q: Why does this function exist?
        A: Early on each proxy had its own coroutine and 500 proxies were
        validated at a time, which often got the IP banned or the connection
        dropped by the validation website (http://www.moguproxy.com).
        Grouping reduces the number of coroutines hitting the site at once.
        Too large an offset, however, increases the validation load on the
        site and the chance of a ban, so pick a sensible value.

        Q: So how large should offset be?
        A: When starting a new pool use 2, the minimum; it puts no extra
        load on the site and attracts no attention. Also tune the amount
        taken out per round, LOCAL_AMOUNT in config (default 500; 100 or
        less is fine). The larger offset and LOCAL_AMOUNT are, the higher
        the chance of a ban.

        Q: Why not validate the proxies myself?
        A: That works too. The original author used the website's API for
        convenience. One can instead query a server such as
        http://httpbin.org/get?show_env=1 and derive anonymity and response
        time from the response headers, but then ``validate`` must be
        rewritten.

        :param proxies: proxies taken from standby for this round, a list of
                        dicts with 'ip' and 'port'
        :param offset: maximum number of proxies per group (default 20)
        :return: dict mapping each group's query string to its list of
                 proxies, e.g. {'ip_ports%5B%5D=1.2.3.4%3A80&...': [{...}, ...]};
                 an empty dict when ``proxies`` is empty
        :raises ValueError: if ``offset`` is smaller than 1
        """
        if offset < 1:
            raise ValueError('offset must be at least 1')
        params_dict = {}
        # Stepping through the list covers the small-batch case as well, and
        # an empty batch yields no group at all instead of an empty query.
        for start in range(0, len(proxies), offset):
            group = proxies[start:start + offset]
            url_str = '&'.join(
                'ip_ports%5B%5D={}%3A{}'.format(i['ip'], i['port'])
                for i in group)
            params_dict[url_str] = group
        return params_dict

    def run(self):
        """
        Run the local scanner.

        While the queue holds documents, up to LOCAL_AMOUNT of them are
        grouped and validated concurrently, then the scanner sleeps
        VALIDATE_LOCAL seconds; an empty queue is refilled from the database.
        The first exception is logged, the rator and the loop are shut down,
        and the method returns.
        """
        logger.info('Running Scanner.')
        self.rator.begin()
        loop = asyncio.get_event_loop()
        while 1:
            try:
                if self.standby_data:
                    pen = len(self.standby_data)
                    logger.info(
                        'Start the validation of the local "standby" database,length : %d '
                        % pen)
                    pop_len = min(pen, LOCAL_AMOUNT)
                    # Documents are taken from the end of the queue.
                    standby_proxies = [
                        self.standby_data.pop() for _ in range(pop_len)
                    ]
                    params_dict = self.check_allot(standby_proxies)
                    semaphore = asyncio.Semaphore(COROUTINE_MAX)
                    logger.info(
                        'Start to verify the standby proxy data,amount: %d ' %
                        pop_len)
                    tasks = [
                        asyncio.ensure_future(
                            self.validate(url_str, group, semaphore))
                        for url_str, group in params_dict.items()
                    ]
                    loop.run_until_complete(asyncio.gather(*tasks))
                    logger.info(
                        'Local validation finished.Left standby proxies:%d' %
                        len(self.standby_data))
                    time.sleep(VALIDATE_LOCAL)
                else:
                    self.standby_data = self.db.all()
            except Exception as e:
                logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
                self.rator.end()
                loop.close()
                logger.info('Scanner shuts down.')
                return

    async def validate(self, url_str, proxies, semaphore):
        """
        Coroutine that validates one group of standby proxies in a single
        request.

        The first attempt is made without a proxy. After any failure a proxy
        obtained from ``get_proxy`` is used for the retry, and the request is
        repeated until it succeeds or no proxy is available.

        :param url_str: query string of the group
        :param proxies: the proxies that belong to the query string
        :param semaphore: semaphore limiting concurrent coroutines
        """
        _proxy = None
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                while 1:
                    try:
                        async with session.get(mul_validate_url + url_str,
                                               headers=v_headers,
                                               proxy=_proxy) as response:
                            data = await response.text(encoding='utf-8')
                            data = json.loads(data)
                    except Exception:
                        _proxy = get_proxy(format=False)
                        if not _proxy:
                            logger.error(
                                'No available proxy to retry the request for validation.'
                            )
                            return
                        continue
                    for res in data['msg']:
                        proxy = find_proxy(res['ip'], res['port'], proxies)
                        # find_proxy is defined elsewhere; guard in case it
                        # returns None for a result matching nothing in this
                        # group -- TODO confirm its contract.
                        if proxy is None:
                            logger.warning(
                                'Validation result %s:%s matches no proxy of the group.'
                                % (res['ip'], res['port']))
                            continue
                        try:
                            if 'anony' in res and 'time' in res:
                                proxy['anony_type'] = res['anony']
                                proxy['resp_time'] = res['time']
                                self.rator.mark_update(proxy,
                                                       collected=False)
                            else:
                                self.rator.mark_fail(proxy)
                        except KeyError as e:
                            logger.error('Error class : %s , msg : %s ' %
                                         (e.__class__, e))
                            continue
                    return
Example #9
0
"""
    @author  : linkin
    @email   : [email protected]
    @date    : 2018-10-04
"""
import random
import logging
import json
from flask import Flask
from components.dbhelper import Database
from config.DBsettings import _TABLE
from config.DBsettings import _DB_SETTINGS

# Logger and Flask application of the API server.
logger = logging.getLogger('APIserver')
app = Flask(__name__)
# Three independent database handles built from the same settings.
stable_db = Database(_DB_SETTINGS)
standby_db = Database(_DB_SETTINGS)
common_db = Database(_DB_SETTINGS)

# Point the two dedicated handles at their tables (common_db gets no table
# here), then connect all three. This runs at import time.
standby_db.table = _TABLE['standby']
stable_db.table = _TABLE['stable']
standby_db.connect()
stable_db.connect()
common_db.connect()

# Contents of both tables, read once when the module is imported.
all_standby_proxy = standby_db.all()
all_stable_proxy = stable_db.all()
# Standby proxies whose anony_type is '高匿' (high anonymity) and whose
# combo_fail counter is 0.
anony_standby = [
    i for i in all_standby_proxy
    if i['anony_type'] == '高匿' and i['combo_fail'] == 0
]