Exemple #1
0
 async def push_to_checked_pool(cls, ips) -> int:
     """Add one or more IPs to the checked-pool Redis set.

     Args:
         ips: A single value or a list of values. Anything that is not a
             list is wrapped in a one-element list.

     Returns:
         The number of IPs handed to ``sadd`` (0 when ``ips`` is empty).
     """
     if not isinstance(ips, list):
         ips = [ips]
     # ``sadd`` needs at least one member; with an empty list there is
     # nothing to store, so skip the Redis round trip entirely.
     if not ips:
         return 0
     with await Redis.share() as redis:
         await redis.sadd(Config.REDIS_KEY_CHECKED_POOL, *ips)
         Logger.info('[check] send %d ip to checked pools' % len(ips))
     return len(ips)
Exemple #2
0
 async def show_result(self, session, site, result, resp: SiteResponse):
     """Log the items produced by a parser for one page.

     ``SiteRequestData`` items are crawled via ``crawl_single_page``;
     ``SiteResponseData`` items are written to the log. Everything else is
     ignored.
     """
     Logger.info('[get] Url: %s' % resp.url)
     for entry in result:
         if isinstance(entry, SiteRequestData):
             # The parser asked for another page to be fetched.
             await self.crawl_single_page(session, site, entry)
         if isinstance(entry, SiteResponseData):
             Logger.info('[get] Get ip: %s' % entry.to_str())
Exemple #3
0
 async def remove_legacy_ip(self):
     """Remove old members from the legacy IP sorted set.

     Members scored between 0 and ``time_int()`` minus
     ``Config.DEFAULT_LEGACY_IP_RETAINED_TIME`` are deleted.

     Returns:
         Whatever ``zremrangebyscore`` reports (logged as the removed count).
     """
     with await Redis.share() as redis:
         cutoff = time_int() - Config.DEFAULT_LEGACY_IP_RETAINED_TIME
         removed = await redis.zremrangebyscore(
             Config.REDIS_KEY_IP_LEGACY_POOL, 0, cutoff)
         if removed:
             Logger.info('[check] remove legacy ip count %d' % removed)
         return removed
Exemple #4
0
    async def check_task(self):
        """Run ``start_check`` in an endless loop.

        An exception raised by a round is passed to
        ``handle_task_exception``. When ``Config.APP_ENV`` is the TEST
        environment the loop stops after a single round.
        """
        keep_running = True
        while keep_running:
            Logger.debug('[check] check task loop')
            try:
                await self.start_check()
            except Exception as exc:
                await self.handle_task_exception(exc)

            # Evaluated every round, exactly like the loop's exit condition.
            keep_running = not (Config.APP_ENV == Config.AppEnvType.TEST)
Exemple #5
0
 async def get_random_ip(cls,
                         https: bool = False,
                         rule: str = None) -> IPData:
     """Pick one entry at random from ``cls.get_ips``.

     Args:
         https: Forwarded to ``get_ips``.
         rule: Forwarded to ``get_ips``.

     Returns:
         The chosen ``IPData``, or ``None`` when ``get_ips`` returns nothing.
     """
     candidates = await cls.get_ips(https=https, rule=rule)
     if candidates:
         chosen = random.choice(candidates)
         assert isinstance(chosen, IPData), 'Error format'
         Logger.info('[factory] get ip %s', chosen.to_str())
         return chosen
     return None
Exemple #6
0
async def load_file(f_path):
    """Read proxy addresses from a text file and push them to the pool.

    Every line is stripped. A line is kept only when it is non-empty,
    contains a ``:`` and does not contain ``#``.

    Args:
        f_path: Path of the file to read.
    """
    with open(f_path) as f:
        # ``str.find`` returns -1 (truthy) on a miss and 0 (falsy) for a hit
        # at the start, so it cannot be used as a boolean; test membership
        # with ``in`` instead. Stripping first drops blank lines.
        ips = [
            line.strip() for line in f
            if line.strip() and ':' in line and '#' not in line
        ]
    # The file is closed before awaiting the pool update.
    if ips:
        Logger.info('Find ip count %d' % len(ips))
        await IPGet.push_to_pool(ips)
Exemple #7
0
    async def check_stats_task(self):
        """Call ``running_stats`` periodically.

        Exceptions are passed to ``handle_task_exception``. Between rounds
        the task sleeps ``Config.DEFAULT_STATS_CHECK_INTERVAL``; in the TEST
        environment it stops after one round without sleeping.
        """
        keep_running = True
        while keep_running:
            Logger.debug('[get] check stats task loop')
            try:
                await self.running_stats()
            except Exception as exc:
                await self.handle_task_exception(exc)

            keep_running = not (Config.APP_ENV == Config.AppEnvType.TEST)
            if keep_running:
                await asyncio.sleep(Config.DEFAULT_STATS_CHECK_INTERVAL)
Exemple #8
0
    async def check_legacy_task(self):
        """Call ``remove_legacy_ip`` periodically.

        Exceptions are passed to ``handle_task_exception``. Between rounds
        the task sleeps ``Config.DEFAULT_LEGACY_IP_CHECK_INTERVAL``; in the
        TEST environment it returns after the first round.
        """
        while True:
            Logger.debug('[get] check legacy task loop')
            try:
                await self.remove_legacy_ip()
            except Exception as exc:
                await self.handle_task_exception(exc)

            if Config.APP_ENV == Config.AppEnvType.TEST:
                # Nothing follows the loop, so returning equals breaking out.
                return
            await asyncio.sleep(Config.DEFAULT_LEGACY_IP_CHECK_INTERVAL)
Exemple #9
0
    async def crawl_task(self):
        """Call ``start_crawl`` periodically.

        Exceptions are passed to ``handle_task_exception``. Between rounds
        the task sleeps ``Config.DEFAULT_LOOP_INTERVAL``; in the TEST
        environment only one round is executed.
        """
        while True:
            Logger.debug('[get] crawl task loop')
            try:
                await self.start_crawl()
            except Exception as exc:
                await self.handle_task_exception(exc)

            in_test_env = Config.APP_ENV == Config.AppEnvType.TEST
            if in_test_env:
                break
            await asyncio.sleep(Config.DEFAULT_LOOP_INTERVAL)
Exemple #10
0
 async def save_parse_result(self, session, site: SiteData, result):
     """Store the items produced by a parser for one page.

     ``SiteRequestData`` items are crawled via ``crawl_single_page``;
     ``SiteResponseData`` items are converted with ``to_str`` and, if any
     were found, pushed in one batch with ``push_to_pool``.
     """
     collected = []
     for entry in result:
         if isinstance(entry, SiteRequestData):
             # The parser asked for another page to be fetched.
             await self.crawl_single_page(session, site, entry)
         if isinstance(entry, SiteResponseData):
             collected.append(entry.to_str())
     if not collected:
         return
     Logger.info('[get] Get %d new ip' % len(collected))
     await self.push_to_pool(collected)
Exemple #11
0
 async def remove_low_score_ip(self):
     """Remove pool members whose score lies between -100 and 0.

     The members are read from ``Config.REDIS_KEY_IP_POOL``, decoded, and
     passed to ``IPSaver.remove_ip``. Nothing happens when the range is
     empty.
     """
     saver = IPSaver()
     with await Redis.share() as redis:
         raw_members = await redis.zrangebyscore(
             Config.REDIS_KEY_IP_POOL, -100, 0)
     low_score_ips = [member.decode() for member in raw_members]
     if not low_score_ips:
         return
     await saver.remove_ip(low_score_ips)
     Logger.info('[check] remove ip %s', ','.join(low_score_ips))
Exemple #12
0
    async def check_low_score_task(self):
        """Call ``remove_low_score_ip`` periodically.

        Exceptions are passed to ``handle_task_exception``. Between rounds
        the task sleeps ``Config.DEFAULT_CHECK_CLEAN_IP_INTERVAL``; in the
        TEST environment it stops after one round without sleeping.
        """
        keep_running = True
        while keep_running:
            Logger.debug('[check] check low score task loop')
            try:
                await self.remove_low_score_ip()
            except Exception as exc:
                await self.handle_task_exception(exc)

            keep_running = not (Config.APP_ENV == Config.AppEnvType.TEST)
            if keep_running:
                await asyncio.sleep(Config.DEFAULT_CHECK_CLEAN_IP_INTERVAL)
Exemple #13
0
 async def parse_site(self, session, site: SiteData, resp: SiteResponse):
     """Run the parser registered for ``site.key`` on a response.

     The parser output goes to ``show_result`` when ``self._test_model`` is
     truthy and to ``save_parse_result`` otherwise. Any exception raised
     while parsing or handling the output is logged and swallowed. Does
     nothing when no parser is registered for the site.
     """
     parser = self._parsers.get(site.key)
     if not parser:
         return
     try:
         parsed = parser(resp)
         if self._test_model:
             await self.show_result(session, site, parsed, resp=resp)
         else:
             await self.save_parse_result(session, site, parsed)
     except Exception as exc:
         Logger.error('[get] Parse error, message: %s' % str(exc))
Exemple #14
0
async def load_from_url(url: str):
    """Download a page, extract every ``ip:port`` token and push them to the pool.

    Args:
        url: Address of the page to fetch.
    """
    import re
    headers = {'User-Agent': get_user_agent()}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            text = await resp.text()
    # The HTTP session is released before the matches are processed.
    # The pattern only matches ``a.b.c.d:port`` tokens, so every match
    # already contains a colon and can never contain '#' or surrounding
    # whitespace; the former per-match filter/strip was dead code.
    ips = re.findall(r'(?:\d{1,3}\.){3}\d{1,3}:\d+', text)
    if ips:
        Logger.info('Find ip count %d' % len(ips))
        await IPGet.push_to_pool(ips)
Exemple #15
0
 async def dump_to_file(self):
     """Dump pool members within the configured score range to a text file.

     Members of ``Config.REDIS_KEY_IP_POOL`` scored between
     ``Config.DEFAULT_MINI_SCORE`` and ``Config.DEFAULT_MAX_SCORE +
     Config.DEFAULT_INC_SCORE`` are written, one per line, to a timestamped
     ``ip_pool_*.ip.txt`` file inside ``Config.DUMPED_DIR``. No file is
     created when the range is empty.

     Returns:
         Always ``True``.
     """
     from datetime import datetime
     from os import makedirs, path
     # Creates missing parent directories too and tolerates an existing one.
     makedirs(Config.DUMPED_DIR, exist_ok=True)
     with await Redis.share() as redis:
         members = await redis.zrangebyscore(
             Config.REDIS_KEY_IP_POOL, Config.DEFAULT_MINI_SCORE,
             Config.DEFAULT_MAX_SCORE + Config.DEFAULT_INC_SCORE)
     # The file is written after the Redis connection has been released.
     if members:
         members = [m.decode() for m in members]
         file_name = 'ip_pool_%s.ip.txt' % datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         # join() supplies the separator when DUMPED_DIR lacks a trailing
         # slash; plain concatenation would write next to the directory.
         with open(path.join(Config.DUMPED_DIR, file_name), 'w') as f:
             f.write('\n'.join(members))
         Logger.info('Dump %d ip to file %s' % (len(members), file_name))
     return True
Exemple #16
0
    async def recheck_ip_task(self):
        """Periodically trigger ``resend_check_ip``.

        Each round asks ``Redis.last_time_check`` about the ``recheck_ip``
        key with ``Config.DEFAULT_CHECK_INTERVAL``. When the answer is falsy
        the time is recorded via ``Redis.save_last_time`` and
        ``resend_check_ip`` is awaited. Exceptions are passed to
        ``handle_task_exception``. In the TEST environment the task returns
        after one round; otherwise it sleeps for the same interval.
        """
        task_key = 'recheck_ip'
        while True:
            Logger.debug('[check] recheck ip task loop')
            try:
                checked_recently = await Redis.last_time_check(
                    task_key, Config.DEFAULT_CHECK_INTERVAL)
                if not checked_recently:
                    await Redis.save_last_time(task_key)
                    await self.resend_check_ip()
            except Exception as exc:
                await self.handle_task_exception(exc)

            if Config.APP_ENV == Config.AppEnvType.TEST:
                return
            await asyncio.sleep(Config.DEFAULT_CHECK_INTERVAL)
Exemple #17
0
async def main():
    """Load IPs from a URL or from ``*.ip.txt`` files in the current directory.

    The optional first command-line argument selects the source: a value
    containing ``://`` is fetched with ``load_from_url``; any other value
    must name one of the matching files in the current directory. Without
    an argument every matching file is loaded.
    """
    target = sys.argv[1] if len(sys.argv) > 1 else None
    if target and target.find('://') > 0:
        return await load_from_url(target)
    candidates = [
        entry for entry in os.listdir('.') if entry.find('.ip.txt') > 0
    ]
    if target:
        if target not in candidates:
            Logger.error('file %s doesn\'t exists' % target)
            return
        candidates = [target]
    for file_name in candidates:
        await load_file(file_name)
Exemple #18
0
    async def check_dump_task(self):
        """Periodically dump the pool to a file through ``IPSaver``.

        Each round asks ``Redis.last_time_check`` about the ``dump_to_file``
        key with ``Config.DEFAULT_DUMP_IP_INTERVAL``. When the answer is
        falsy the time is recorded via ``Redis.save_last_time`` and
        ``IPSaver().dump_to_file()`` is awaited. Exceptions are passed to
        ``handle_task_exception``. In the TEST environment the task returns
        after one round; otherwise it sleeps for the same interval.
        """
        from src.app.ip_saver import IPSaver
        task_key = 'dump_to_file'
        while True:
            Logger.debug('[get] dump task loop')
            try:
                dumped_recently = await Redis.last_time_check(
                    task_key, Config.DEFAULT_DUMP_IP_INTERVAL)
                if not dumped_recently:
                    await Redis.save_last_time(task_key)
                    await IPSaver().dump_to_file()
            except Exception as exc:
                await self.handle_task_exception(exc)

            if Config.APP_ENV == Config.AppEnvType.TEST:
                return
            await asyncio.sleep(Config.DEFAULT_DUMP_IP_INTERVAL)
Exemple #19
0
 async def crawl_site(self, site: SiteData, page_limit: int = 0):
     """Crawl the pages of one site sequentially over a shared HTTP session.

     Args:
         site: Site description; its ``headers``, ``pages``,
             ``page_interval`` and ``to_request`` are used.
         page_limit: Maximum number of pages to crawl; ``0`` means all.

     A page that raises ``MaxRetryException`` is logged and skipped. After
     every page, successful or not, the task sleeps ``site.page_interval``
     when it is set.
     """
     headers = {'User-Agent': self.get_user_agent()}
     headers.update(site.headers)
     timeout = aiohttp.ClientTimeout(Config.DEFAULT_REQUEST_TIME_OUT)
     async with aiohttp.ClientSession(timeout=timeout,
                                      headers=headers) as session:
         pages = site.pages if page_limit == 0 else site.pages[:page_limit]
         for page in pages:
             try:
                 await self.crawl_single_page(session, site,
                                              site.to_request(page))
             except MaxRetryException as e:
                 # ``warn`` is a deprecated alias of ``warning``.
                 Logger.warning('[get] Max retry skip, message: %s' % str(e))
             finally:
                 # Throttle between pages even when the page failed.
                 if site.page_interval:
                     await asyncio.sleep(site.page_interval)
Exemple #20
0
 async def start_check(self):
     """Take one IP from the check queue, run all checks and save the result.

     Blocks on ``blpop`` for ``Config.REDIS_KEY_CHECK_POOL``, then runs
     ``http_check``, ``https_check`` and ``rules_check`` in that order over
     one HTTP session and hands the resulting object to ``IPSaver.save_ip``.
     """
     with await Redis.share() as redis:
         popped = await redis.blpop(Config.REDIS_KEY_CHECK_POOL)
     # Index 1 of the blpop reply is decoded and used as the IP string.
     raw_ip = popped[1].decode()
     Logger.info('[check] got ip %s' % raw_ip)
     Prometheus.IP_CHECK_TOTAL.inc(1)
     ip = IPData.with_str(raw_ip)
     timeout = aiohttp.ClientTimeout(Config.DEFAULT_REQUEST_CHECK_TIME_OUT)
     async with aiohttp.ClientSession(timeout=timeout) as session:
         # Each check receives the object returned by the previous one.
         for check in (self.http_check, self.https_check, self.rules_check):
             ip = await check(ip, session)
         Logger.info(
             '[check] Check result %s http %s https %s %s', ip.to_str(),
             ip.http, ip.https,
             " ".join("%s %s" % (name, outcome)
                      for name, outcome in ip.rules.items()))
     await IPSaver().save_ip(ip)
Exemple #21
0
        async def wrapper(*args, **kwargs):
            """Await ``func`` and retry it whenever it raises ``RetryException``.

            The retry budget starts at ``num`` unless the caller supplies
            ``kwargs[retry_num_key]``; that key is removed before ``func``
            is called and re-inserted for the recursive retry.

            Raises:
                MaxRetryException: when the budget is used up.
            """
            # A caller-supplied budget takes precedence over the default.
            remaining = kwargs.pop(retry_num_key, num)
            try:
                return await func(*args, **kwargs)
            except RetryException as err:
                remaining -= 1
                from src.app.main import Logger
                Logger.warning('Retry %s, remaining times %d' % (func.__name__, remaining))
                if remaining <= 0:
                    raise MaxRetryException() from err
                kwargs[retry_num_key] = remaining
                return await wrapper(*args, **kwargs)
Exemple #22
0
 async def crawl_single_page(self, session, site, request: SiteRequestData):
     """Fetch one page and pass its text to ``parse_site``.

     When ``request.use_proxy`` is ``True`` a random proxy is requested from
     ``IPFactory`` (asking for HTTPS when the URL starts with ``https``).

     Raises:
         RetryException: for any failure while fetching or parsing,
             including an empty response body.
     """
     proxy = None
     if request.use_proxy is True:
         wants_https = request.url.startswith('https')
         picked = await IPFactory.get_random_ip(wants_https)
         proxy = picked.to_http() if picked else None
     try:
         async with session.get(request.url, proxy=proxy) as resp:
             body = await resp.text()
             if not body:
                 raise EmptyResponseException('empty text')
             site_resp = SiteResponse(body, url=request.url, site=site)
         await self.parse_site(session, site, site_resp)
     except Exception as exc:
         Logger.error('[get] Get page %s error, message: %s' %
                      (request.url, str(exc)))
         raise RetryException() from exc
Exemple #23
0
 async def push_to_pool(cls, ips):
     """Add unseen IPs to the IP pool and queue them for checking.

     An IP is skipped when it already has a score in
     ``Config.REDIS_KEY_IP_POOL`` or ``Config.REDIS_KEY_IP_LEGACY_POOL``.
     New IPs are added with ``Config.DEFAULT_SCORE`` and forwarded in one
     batch to ``IPChecker.push_to_pool``.

     Args:
         ips: A single value or a list of values.

     Returns:
         The number of IPs received, not the number actually added.
     """
     from src.app.ip_checker import IPChecker
     if not isinstance(ips, list):
         ips = [ips]
     with await Redis.share() as redis:
         fresh = []
         for candidate in ips:
             # The legacy pool is only consulted when the live pool misses.
             known = await redis.zscore(Config.REDIS_KEY_IP_POOL, candidate)
             if known is None:
                 known = await redis.zscore(
                     Config.REDIS_KEY_IP_LEGACY_POOL, candidate)
             if known is not None:
                 continue
             await redis.zadd(Config.REDIS_KEY_IP_POOL,
                              Config.DEFAULT_SCORE, candidate)
             fresh.append(candidate)
         if fresh:
             await IPChecker.push_to_pool(fresh)
         Logger.info('[get] send %d ip to ip pools' % len(fresh))
     return len(ips)
Exemple #24
0
 async def handle_task_exception(self, e):
     """Log an exception raised by a task, then pause for five seconds."""
     message = '[error] ' + str(e)
     Logger.error(message)
     await asyncio.sleep(5)