@classmethod
async def push_to_checked_pool(cls, ips) -> int:
    if not isinstance(ips, list):
        ips = [ips]
    with await Redis.share() as redis:
        await redis.sadd(Config.REDIS_KEY_CHECKED_POOL, *ips)
    Logger.info('[check] send %d ip to checked pools' % len(ips))
    return len(ips)

async def show_result(self, session, site, result, resp: SiteResponse):
    Logger.info('[get] Url: %s' % resp.url)
    for item in result:
        if isinstance(item, SiteRequestData):
            await self.crawl_single_page(session, site, item)
        if not isinstance(item, SiteResponseData):
            continue
        Logger.info('[get] Get ip: %s' % item.to_str())

async def remove_legacy_ip(self):
    with await Redis.share() as redis:
        count = await redis.zremrangebyscore(
            Config.REDIS_KEY_IP_LEGACY_POOL, 0,
            time_int() - Config.DEFAULT_LEGACY_IP_RETAINED_TIME)
    if count:
        Logger.info('[check] remove legacy ip count %d' % count)
    return count

async def check_task(self):
    while True:
        Logger.debug('[check] check task loop')
        try:
            await self.start_check()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break

@classmethod
async def get_random_ip(cls, https: bool = False, rule: str = None) -> IPData:
    ips = await cls.get_ips(https=https, rule=rule)
    if not ips:
        # No ip available for the requested protocol/rule.
        return None
    ip = random.choice(ips)
    assert isinstance(ip, IPData), 'Error format'
    Logger.info('[factory] get ip %s', ip.to_str())
    return ip

async def load_file(f_path):
    with open(f_path) as f:
        ips = []
        for ip in f.readlines():
            # Keep only non-comment "host:port" lines. Note a bare
            # ip.find(':') is truthy even when ':' is absent (-1), so the
            # containment test must compare explicitly.
            if ip and ip.find(':') > 0 and ip.find('#') < 0:
                ip = ip.strip()
                ips.append(ip)
    if ips:
        Logger.info('Find ip count %d' % len(ips))
        await IPGet.push_to_pool(ips)

async def check_stats_task(self):
    while True:
        Logger.debug('[get] check stats task loop')
        try:
            await self.running_stats()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break
        await asyncio.sleep(Config.DEFAULT_STATS_CHECK_INTERVAL)

async def check_legacy_task(self):
    while True:
        Logger.debug('[get] check legacy task loop')
        try:
            await self.remove_legacy_ip()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break
        await asyncio.sleep(Config.DEFAULT_LEGACY_IP_CHECK_INTERVAL)

async def crawl_task(self):
    while True:
        Logger.debug('[get] crawl task loop')
        try:
            await self.start_crawl()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break
        await asyncio.sleep(Config.DEFAULT_LOOP_INTERVAL)

async def save_parse_result(self, session, site: SiteData, result):
    ips = []
    for item in result:
        if isinstance(item, SiteRequestData):
            await self.crawl_single_page(session, site, item)
        if not isinstance(item, SiteResponseData):
            continue
        ips.append(item.to_str())
    if ips:
        Logger.info('[get] Get %d new ip' % len(ips))
        await self.push_to_pool(ips)

async def remove_low_score_ip(self):
    saver = IPSaver()
    with await Redis.share() as redis:
        ips = await redis.zrangebyscore(Config.REDIS_KEY_IP_POOL, -100, 0)
    ips = [ip_str.decode() for ip_str in ips]
    if ips:
        await saver.remove_ip(ips)
        Logger.info('[check] remove ip %s', ','.join(ips))

async def check_low_score_task(self):
    while True:
        Logger.debug('[check] check low score task loop')
        try:
            await self.remove_low_score_ip()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break
        await asyncio.sleep(Config.DEFAULT_CHECK_CLEAN_IP_INTERVAL)

async def parse_site(self, session, site: SiteData, resp: SiteResponse):
    parser = self._parsers.get(site.key)
    if not parser:
        return
    try:
        result = parser(resp)
        if not self._test_model:
            await self.save_parse_result(session, site, result)
        else:
            await self.show_result(session, site, result, resp=resp)
    except Exception as e:
        Logger.error('[get] Parse error, message: %s' % str(e))

async def load_from_url(url: str):
    import re
    headers = {'User-Agent': get_user_agent()}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            text = await resp.text()
    matched = re.findall(r'(?:\d{1,3}\.){3}\d{1,3}:\d+', text)
    ips = []
    for ip in matched:
        # Same filter as load_file: the bare find(':') form was truthy
        # even for -1 (not found), so compare explicitly.
        if ip and ip.find(':') > 0 and ip.find('#') < 0:
            ip = ip.strip()
            ips.append(ip)
    if ips:
        Logger.info('Find ip count %d' % len(ips))
        await IPGet.push_to_pool(ips)

async def dump_to_file(self):
    from os import path, mkdir
    if not path.isdir(Config.DUMPED_DIR):
        mkdir(Config.DUMPED_DIR)
    with await Redis.share() as redis:
        members = await redis.zrangebyscore(
            Config.REDIS_KEY_IP_POOL, Config.DEFAULT_MINI_SCORE,
            Config.DEFAULT_MAX_SCORE + Config.DEFAULT_INC_SCORE)
    if members:
        members = [m.decode() for m in members]
        from datetime import datetime
        file_name = 'ip_pool_%s.ip.txt' % datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        # path.join avoids a missing separator when DUMPED_DIR has no
        # trailing slash (plain concatenation could write outside the dir).
        with open(path.join(Config.DUMPED_DIR, file_name), 'w') as f:
            f.write('\n'.join(members))
        Logger.info('Dump %d ip to file %s' % (len(members), file_name))
    return True

async def recheck_ip_task(self):
    key = 'recheck_ip'
    while True:
        Logger.debug('[check] recheck ip task loop')
        try:
            if not await Redis.last_time_check(
                    key, Config.DEFAULT_CHECK_INTERVAL):
                await Redis.save_last_time(key)
                await self.resend_check_ip()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break
        await asyncio.sleep(Config.DEFAULT_CHECK_INTERVAL)

async def main():
    argv = None
    if len(sys.argv) > 1:
        argv = sys.argv[1]
    if argv and argv.find('://') > 0:
        return await load_from_url(argv)
    res = os.listdir('.')
    ip_file_lists = [name for name in res if name.find('.ip.txt') > 0]
    if argv:
        if argv not in ip_file_lists:
            Logger.error('file %s doesn\'t exist' % argv)
            return
        else:
            ip_file_lists = [argv]
    for fn in ip_file_lists:
        await load_file(fn)

async def check_dump_task(self):
    from src.app.ip_saver import IPSaver
    key = 'dump_to_file'
    while True:
        Logger.debug('[get] dump task loop')
        try:
            if not await Redis.last_time_check(
                    key, Config.DEFAULT_DUMP_IP_INTERVAL):
                await Redis.save_last_time(key)
                await IPSaver().dump_to_file()
        except Exception as e:
            await self.handle_task_exception(e)
        if Config.APP_ENV == Config.AppEnvType.TEST:
            break
        await asyncio.sleep(Config.DEFAULT_DUMP_IP_INTERVAL)

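# --- Hedged sketch, not part of the original source -----------------------
# recheck_ip_task and check_dump_task above both gate their work on the
# Redis.last_time_check / Redis.save_last_time pair. One plausible
# implementation is sketched below; the 'last_time:' key prefix, the use of
# time_int(), and the standalone function form are all assumptions.
async def last_time_check_sketch(key: str, interval: int) -> bool:
    # True if the action tagged `key` already ran within `interval` seconds,
    # i.e. the caller should skip this round.
    with await Redis.share() as redis:
        last = await redis.get('last_time:%s' % key)
    return last is not None and time_int() - int(last) < interval

async def save_last_time_sketch(key: str):
    # Record "now" as the last run time for `key`.
    with await Redis.share() as redis:
        await redis.set('last_time:%s' % key, time_int())
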
async def crawl_site(self, site: SiteData, page_limit: int = 0):
    headers = {'User-Agent': self.get_user_agent()}
    headers.update(site.headers)
    async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=Config.DEFAULT_REQUEST_TIME_OUT),
            headers=headers) as session:
        pages = site.pages if page_limit == 0 else site.pages[0:page_limit]
        for page in pages:
            try:
                await self.crawl_single_page(session, site,
                                             site.to_request(page))
            except MaxRetryException as e:
                Logger.warning('[get] Max retry skip, message: %s' % str(e))
                continue
            finally:
                if site.page_interval:
                    await asyncio.sleep(site.page_interval)

async def start_check(self):
    with await Redis.share() as redis:
        # blpop blocks until an ip is queued, so the calling loop needs
        # no sleep of its own.
        ip_str = await redis.blpop(Config.REDIS_KEY_CHECK_POOL)
        ip_str = ip_str[1].decode()
    Logger.info('[check] got ip %s' % ip_str)
    Prometheus.IP_CHECK_TOTAL.inc(1)
    ip = IPData.with_str(ip_str)
    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
            total=Config.DEFAULT_REQUEST_CHECK_TIME_OUT)) as session:
        ip = await self.http_check(ip, session)
        ip = await self.https_check(ip, session)
        ip = await self.rules_check(ip, session)
    Logger.info(
        '[check] Check result %s http %s https %s %s', ip.to_str(), ip.http,
        ip.https, " ".join(["%s %s" % (k, r) for k, r in ip.rules.items()]))
    await IPSaver().save_ip(ip)

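# --- Hedged sketch, not part of the original source -----------------------
# A plausible shape for the IPData record passed through the checks above,
# inferred from usage (with_str/to_str/to_http and the http/https/rules
# attributes). The class body and field defaults are assumptions.
class IPDataSketch:
    def __init__(self, ip: str, port: int):
        self.ip = ip
        self.port = port
        self.http = False       # set by http_check
        self.https = False      # set by https_check
        self.rules = {}         # rule name -> pass/fail, set by rules_check

    @classmethod
    def with_str(cls, ip_str: str):
        # Parse the "host:port" form stored in the Redis pools.
        host, port = ip_str.rsplit(':', 1)
        return cls(host, int(port))

    def to_str(self) -> str:
        return '%s:%d' % (self.ip, self.port)

    def to_http(self) -> str:
        # Proxy URL form accepted by aiohttp's `proxy=` argument.
        return 'http://%s:%d' % (self.ip, self.port)
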
async def wrapper(*args, **kwargs):
    retry_num = num
    if retry_num_key in kwargs:
        retry_num = kwargs.get(retry_num_key)
        kwargs.pop(retry_num_key)
    try:
        res = await func(*args, **kwargs)
    except RetryException as err:
        retry_num -= 1
        from src.app.main import Logger
        Logger.warning('Retry %s, remaining times %d' %
                       (func.__name__, retry_num))
        if retry_num > 0:
            kwargs[retry_num_key] = retry_num
            return await wrapper(*args, **kwargs)
        raise MaxRetryException() from err
    return res

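# --- Hedged sketch, not part of the original source -----------------------
# `wrapper` above closes over `num`, `retry_num_key`, and `func`, so it is
# the inner function of a decorator factory. The scaffold below shows one
# plausible enclosing shape; the name `retry` and its defaults are
# assumptions.
def retry(num: int = 3, retry_num_key: str = 'retry_num'):
    def decorator(func):
        async def wrapper(*args, **kwargs):
            ...  # body exactly as `wrapper` defined above
        return wrapper
    return decorator
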
async def crawl_single_page(self, session, site, request: SiteRequestData):
    proxy = None
    if request.use_proxy is True:
        # Request an https-capable proxy when the target url is https.
        random_proxy = await IPFactory.get_random_ip(
            request.url.find('https') == 0)
        if random_proxy:
            proxy = random_proxy.to_http()
    try:
        async with session.get(request.url, proxy=proxy) as resp:
            text = await resp.text()
            if not text:
                raise EmptyResponseException('empty text')
            site_resp = SiteResponse(text, url=request.url, site=site)
            await self.parse_site(session, site, site_resp)
    except Exception as e:
        Logger.error('[get] Get page %s error, message: %s' %
                     (request.url, str(e)))
        raise RetryException() from e

@classmethod
async def push_to_pool(cls, ips):
    from src.app.ip_checker import IPChecker
    if not isinstance(ips, list):
        ips = [ips]
    with await Redis.share() as redis:
        needs_ip = []
        for ip in ips:
            # Skip ips already tracked in the live pool or the legacy pool.
            exists = await redis.zscore(Config.REDIS_KEY_IP_POOL, ip)
            if exists is not None:
                continue
            exists = await redis.zscore(Config.REDIS_KEY_IP_LEGACY_POOL, ip)
            if exists is not None:
                continue
            await redis.zadd(Config.REDIS_KEY_IP_POOL, Config.DEFAULT_SCORE, ip)
            needs_ip.append(ip)
    if needs_ip:
        await IPChecker.push_to_pool(needs_ip)
        Logger.info('[get] send %d ip to ip pools' % len(needs_ip))
    return len(ips)

async def handle_task_exception(self, e):
    Logger.error('[error] ' + str(e))
    await asyncio.sleep(5)