class Getter(object):
    """
    getter of the proxy pool
    """

    def __init__(self):
        """
        init db and crawlers
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """
        check whether the proxy pool is already full
        :return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        run crawlers to fetch proxies
        :return:
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            logger.info(f'crawler {crawler} to get proxy')
            for proxy in crawler.crawl():
                self.redis.add(proxy)
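The class above does not show how it is driven. A minimal usage sketch, assuming the Getter class is importable as defined here and Redis is reachable, is simply to call run() in a fixed-interval loop (the interval is a placeholder):

import time

# Minimal driver sketch (assumption: Getter is defined/importable as above and
# Redis is reachable). run() is a no-op once the pool holds PROXY_NUMBER_MAX proxies.
if __name__ == '__main__':
    getter = Getter()
    while True:
        getter.run()
        time.sleep(60)  # placeholder interval between crawl rounds

The modified getter below additionally loads static proxies from a local file before running the regular crawlers.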
# Modified getter: run() first loads static proxies from staticproxy.txt, then
# falls back to the regular crawlers. Besides the imports used above, this
# version also needs `re` and the project's Proxy schema class.
class Getter(object):
    """
    getter of the proxy pool
    """

    def __init__(self):
        """
        init db and crawlers
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """
        check whether the proxy pool is already full
        :return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        load static proxies from file, then run crawlers to fetch proxies
        :return:
        """
        if self.is_full():
            return
        # read static proxies, one per line; blank lines and '#' comments are skipped
        proxyfile = "staticproxy.txt"
        with open(proxyfile, 'r') as fh:
            proxylines = fh.readlines()
        logger.info(f'read {proxyfile}')
        pattern = re.compile(
            r'((?P<username>\S*?)\:(?P<password>\S*?)@)?'
            r'(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
        )
        for line in proxylines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            match = pattern.search(line)
            if not match:
                continue
            proxy = Proxy(
                host=match.group('ip'),
                port=match.group('port'),
                username=match.group('username'),
                password=match.group('password'),
            )
            logger.info('get proxy ' + proxy.string())
            self.redis.add(proxy)
        # then run the regular crawlers
        for crawler in self.crawlers:
            logger.info(f'crawler {crawler} to get proxy')
            for proxy in crawler.crawl():
                print(proxy.string())
                self.redis.add(proxy)
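For reference, the pattern above accepts both bare ip:port lines and credentialed username:password@ip:port lines. A small self-contained sketch, with placeholder addresses, showing the expected staticproxy.txt format and what the regex extracts:

import re

# Example staticproxy.txt contents (placeholder addresses):
#
#   # one proxy per line; '#' lines and blank lines are ignored
#   203.0.113.10:8080
#   alice:secret@198.51.100.7:3128

PATTERN = re.compile(
    r'((?P<username>\S*?)\:(?P<password>\S*?)@)?'
    r'(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
)

for sample in ('203.0.113.10:8080', 'alice:secret@198.51.100.7:3128'):
    print(PATTERN.search(sample).groupdict())
# {'username': None, 'password': None, 'ip': '203.0.113.10', 'port': '8080'}
# {'username': 'alice', 'password': 'secret', 'ip': '198.51.100.7', 'port': '3128'}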