async def push_to_checked_pool(cls, ips) -> int:
    """Add one or more ips to the checked-pool Redis set.

    A non-list argument is treated as a single ip. An empty list is a
    no-op.

    Returns:
        The number of ips submitted (not the number newly added to the set).
    """
    if not isinstance(ips, list):
        ips = [ips]
    # SADD requires at least one member; unpacking an empty list would issue
    # a malformed command, so bail out before touching Redis.
    if not ips:
        return 0
    with await Redis.share() as redis:
        await redis.sadd(Config.REDIS_KEY_CHECKED_POOL, *ips)
        Logger.info('[check] send %d ip to checked pools' % len(ips))
    return len(ips)
async def show_result(self, session, site, result, resp: SiteResponse):
    """Log the parsed items of one response instead of storing them.

    Items that are ``SiteRequestData`` are crawled via
    ``crawl_single_page``; items that are ``SiteResponseData`` are logged.
    Anything else is ignored.
    """
    Logger.info('[get] Url: %s' % resp.url)
    for entry in result:
        # Follow-up request: crawl it before looking at the item any further.
        if isinstance(entry, SiteRequestData):
            await self.crawl_single_page(session, site, entry)
        if isinstance(entry, SiteResponseData):
            Logger.info('[get] Get ip: %s' % entry.to_str())
async def remove_legacy_ip(self):
    """Drop entries from the legacy pool whose score is at or below the cutoff.

    The cutoff is ``time_int() - Config.DEFAULT_LEGACY_IP_RETAINED_TIME``;
    members scored in ``[0, cutoff]`` are removed.

    Returns:
        Whatever ``zremrangebyscore`` reports (the removed count).
    """
    with await Redis.share() as redis:
        cutoff = time_int() - Config.DEFAULT_LEGACY_IP_RETAINED_TIME
        removed = await redis.zremrangebyscore(
            Config.REDIS_KEY_IP_LEGACY_POOL, 0, cutoff)
    # Only log when something was actually removed.
    if removed:
        Logger.info('[check] remove legacy ip count %d' % removed)
    return removed
async def load_file(f_path):
    """Read ip entries from a text file and push them to the ip pool.

    Each line is stripped of surrounding whitespace. A line is kept only
    when it contains a ':' preceded by at least one character and contains
    no '#'. Blank lines and lines without a ':' are skipped.

    Args:
        f_path: Path of the text file to read, one entry per line.
    """
    ips = []
    with open(f_path) as f:
        for line in f:
            ip = line.strip()
            # str.find returns -1 (truthy) when ':' is missing, so compare
            # the index explicitly instead of relying on truthiness.
            if ip.find(':') > 0 and '#' not in ip:
                ips.append(ip)
    if ips:
        Logger.info('Find ip count %d' % len(ips))
        await IPGet.push_to_pool(ips)
async def get_random_ip(cls, https: bool = False, rule: str = None) -> IPData:
    """Pick one ip at random from ``cls.get_ips(https=..., rule=...)``.

    Returns:
        The chosen ``IPData``, or ``None`` when no ip is available.
    """
    candidates = await cls.get_ips(https=https, rule=rule)
    if candidates:
        chosen = random.choice(candidates)
        assert isinstance(chosen, IPData), 'Error format'
        Logger.info('[factory] get ip %s', chosen.to_str())
        return chosen
    return None
async def remove_low_score_ip(self):
    """Remove every ip in the pool whose score lies in ``[-100, 0]``.

    The members are read from the ip pool sorted set, decoded, and handed
    to ``IPSaver.remove_ip``; nothing happens when the range is empty.
    """
    saver = IPSaver()
    with await Redis.share() as redis:
        raw_members = await redis.zrangebyscore(Config.REDIS_KEY_IP_POOL, -100, 0)
    if len(raw_members) == 0:
        return
    low_scored = [member.decode() for member in raw_members]
    await saver.remove_ip(low_scored)
    Logger.info('[check] remove ip %s', ','.join(low_scored))
async def save_parse_result(self, session, site: SiteData, result):
    """Collect the ips found in a parse result and push them to the pool.

    ``SiteRequestData`` items are crawled via ``crawl_single_page``;
    ``SiteResponseData`` items contribute their ``to_str()`` value. The
    collected ips are pushed with ``push_to_pool`` when there are any.
    """
    found = []
    for entry in result:
        # Follow-up request: crawl it before looking at the item any further.
        if isinstance(entry, SiteRequestData):
            await self.crawl_single_page(session, site, entry)
        if isinstance(entry, SiteResponseData):
            found.append(entry.to_str())
    if not found:
        return
    Logger.info('[get] Get %d new ip' % len(found))
    await self.push_to_pool(found)
async def load_from_url(url: str):
    """Download ``url``, extract ``a.b.c.d:port`` strings, and push them to the pool.

    The page is fetched with a ``User-Agent`` from ``get_user_agent()``.
    Matches are found with a regular expression over the response text.
    """
    import re
    headers = {'User-Agent': get_user_agent()}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            body = await resp.text()
    candidates = re.findall(r'(?:\d{1,3}\.){3}\d{1,3}:\d+', body)
    # Same predicate as before: non-empty, ':' not at index 0, no '#'.
    # NOTE(review): every regex match already satisfies this, so the filter
    # currently passes all matches through.
    ips = [
        candidate.strip()
        for candidate in candidates
        if candidate and candidate.find(':') != 0 and '#' not in candidate
    ]
    if ips:
        Logger.info('Find ip count %d' % len(ips))
        await IPGet.push_to_pool(ips)
async def dump_to_file(self):
    """Write the ip pool members within the configured score range to a file.

    Members scored between ``Config.DEFAULT_MINI_SCORE`` and
    ``Config.DEFAULT_MAX_SCORE + Config.DEFAULT_INC_SCORE`` are read from
    the ip pool and written, one per line, to a timestamped file inside
    ``Config.DUMPED_DIR``. No file is written when the range is empty.

    Returns:
        True once the dump step has run.
    """
    from datetime import datetime
    from os import makedirs, path

    # exist_ok avoids the check-then-create race and also creates any
    # missing parent directories.
    makedirs(Config.DUMPED_DIR, exist_ok=True)
    with await Redis.share() as redis:
        members = await redis.zrangebyscore(
            Config.REDIS_KEY_IP_POOL,
            Config.DEFAULT_MINI_SCORE,
            Config.DEFAULT_MAX_SCORE + Config.DEFAULT_INC_SCORE)
    if members:
        members = [m.decode() for m in members]
        file_name = 'ip_pool_%s.ip.txt' % datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Join instead of concatenating so the file lands inside DUMPED_DIR
        # even when the configured path has no trailing separator.
        with open(path.join(Config.DUMPED_DIR, file_name), 'w') as f:
            f.write('\n'.join(members))
        Logger.info('Dump %d ip to file %s' % (len(members), file_name))
    return True
async def start_check(self):
    """Pop one ip from the check queue, run all checks on it, and save it.

    Blocks on ``blpop`` until an ip string is available, then runs the
    http, https and rules checks in that order over one shared aiohttp
    session and passes the resulting ``IPData`` to ``IPSaver.save_ip``.
    """
    with await Redis.share() as redis:
        popped = await redis.blpop(Config.REDIS_KEY_CHECK_POOL)
    # Element 1 of the reply carries the ip string as bytes.
    ip_str = popped[1].decode()
    Logger.info('[check] got ip %s' % ip_str)
    Prometheus.IP_CHECK_TOTAL.inc(1)

    ip = IPData.with_str(ip_str)
    timeout = aiohttp.ClientTimeout(Config.DEFAULT_REQUEST_CHECK_TIME_OUT)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # Each check receives the IPData returned by the previous one.
        for check in (self.http_check, self.https_check, self.rules_check):
            ip = await check(ip, session)

    rule_summary = " ".join(
        "%s %s" % (name, outcome) for name, outcome in ip.rules.items())
    Logger.info(
        '[check] Check result %s http %s https %s %s',
        ip.to_str(), ip.http, ip.https, rule_summary)
    await IPSaver().save_ip(ip)
async def push_to_pool(cls, ips):
    """Add previously unseen ips to the ip pool and queue them for checking.

    A non-list argument is treated as a single ip. An ip is skipped when it
    already has a score in the ip pool or in the legacy pool; otherwise it
    is added with ``Config.DEFAULT_SCORE`` and forwarded to
    ``IPChecker.push_to_pool``.

    Returns:
        The number of ips received (``len(ips)``), not the number added.
    """
    from src.app.ip_checker import IPChecker
    if not isinstance(ips, list):
        ips = [ips]
    fresh = []
    with await Redis.share() as redis:
        for candidate in ips:
            in_pool = await redis.zscore(Config.REDIS_KEY_IP_POOL, candidate)
            if in_pool is None:
                # Only consult the legacy pool when the live pool misses.
                in_legacy = await redis.zscore(
                    Config.REDIS_KEY_IP_LEGACY_POOL, candidate)
                if in_legacy is None:
                    await redis.zadd(
                        Config.REDIS_KEY_IP_POOL, Config.DEFAULT_SCORE, candidate)
                    fresh.append(candidate)
    if fresh:
        await IPChecker.push_to_pool(fresh)
        Logger.info('[get] send %d ip to ip pools' % len(fresh))
    return len(ips)