class Getter():
    def __init__(self):
        self.redis = FileClient()
        self.crawler = Crawler()

    def is_over_flow(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_FLOW:
            return True
        return False

    def run(self):
        print('Getter started')
        if not self.is_over_flow():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
        else:
            print('Proxy pool already has enough proxies:', str(self.redis.count()))

# if __name__ == '__main__':
#     getter = Getter()
#     getter.run()
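# Nearly every variant in this section indexes self.crawler.__CrawlFunc__ and
# range(self.crawler.__CrawlFuncCount__) without showing where those class
# attributes come from. A minimal sketch of the metaclass pattern these
# snippets assume follows; the names ProxyMetaclass and the crawl_ prefix are
# the usual convention for this proxy-pool design, and crawl_example is a
# hypothetical source, not something confirmed by the snippets themselves.
class ProxyMetaclass(type):
    """Collect every method whose name starts with 'crawl_' when the class
    is created, exposing the name list and its length as class attributes."""

    def __new__(mcs, name, bases, attrs):
        crawl_funcs = [k for k in attrs if k.startswith('crawl_')]
        attrs['__CrawlFunc__'] = crawl_funcs            # method names, in definition order
        attrs['__CrawlFuncCount__'] = len(crawl_funcs)  # how many crawlers exist
        return type.__new__(mcs, name, bases, attrs)

class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # callback is a method *name*; look it up and drain its generator
        return list(getattr(self, callback)())

    def crawl_example(self):
        # Hypothetical source; a real crawler parses a free-proxy site
        yield '127.0.0.1:8888'
# With this in place, Getter.run() can enumerate crawlers by index, exactly as
# the variants in this section do.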
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the amount of proxies is over the threshold."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started.')
        if PRIVATE_PROXY_ENABLE:
            proxies = PrivateProxy().get_proxies()
            for proxy in proxies:
                print('Add private proxy {}'.format(proxy))
                self.redis.add(proxy)
        else:
            if not self.is_over_threshold():
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    for proxy in proxies:
                        self.redis.add(proxy)
class Getter(object): def __init__(self): """ 初始化数据库和创建爬虫 """ self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """ 判断代理池是否达到上限 """ if self.redis.count() >= POOL_UPPER_THRESHOLD: return True else: return False def run(self): print('或取器开始运作...') # 判断代理池是否达到上限 if not self.is_over_threshold(): # 遍历所有的代理网站生成的各自的解析函数 for crawler_index in range(self.crawler.__CrawlerCount__): # 获取对应索引的回调函数 callback = self.crawler.__CrawlerFunc__[crawler_index] proxies = self.crawler.get_proxies(callback) sys.stdout.flush() for proxy in proxies: # print(proxy) self.redis.add(proxy)
class Getter(object): """ 代理IP获取器 """ def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_limit(self): """ 检测是否超过代理的最大限制 :return: """ if self.redis.count() >= POOL_UPPER_THRESHOLD: return True else: return False def run(self): """ 通过python定义的元类可以顺序执行以crawl_开头的函数 :return: """ print("获取器开始运行,爬取免费代理") if not self.is_over_limit(): for callback_label in range(self.crawler.__CrawlFuncCount__): # 执行获取代理的函数 callback = self.crawler.__CrawlFunc__[callback_label] proxies = self.crawler.get_proxies(callback) for proxy in proxies: self.redis.add(proxy=proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # __CrawlFuncCount__ is the number of crawl_-prefixed functions
            # collected by the Crawler metaclass
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class GetterProxy(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit.
        :return: bool
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # __CrawlFuncCount__ lives on the class, but reading it through the
            # instance keeps the lookup consistent with __CrawlFunc__ below
            for callback_index in range(self.crawler.__CrawlFuncCount__):
                # Look up the crawl method
                callback = self.crawler.__CrawlFunc__[callback_index]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # Add the proxies to the pool
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # print() writes to a buffer rather than the terminal directly;
                # flush() forces buffered stdout output to appear immediately,
                # which matters when you want to watch the log in real time
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit.
        :return: bool
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                print(proxies)
                for proxy in proxies:
                    print('__________________________')
                    self.redis.add(proxy)
class Getter(object):
    def __init__(self):
        self.sqlite3 = sqlitedb()
        self.crawler = Crawler()

    def run(self):
        cprint('Getter started')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[callback_label]
            # Fetch proxies
            proxies = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            cprint('Inserting rows into the sqlite3 proxy table')
            for proxy in proxies:
                self.sqlite3.add(list(proxy))
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            # Iterating the function list directly avoids the index bookkeeping
            # of range(self.crawler.__CrawlFuncCount__)
            for crawl_func in self.crawler.__CrawlFunc__:
                proxies = self.crawler.start_crawl_func(crawl_func)
                print(crawl_func, 'is crawling proxies')
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
            proxy_sum = self.redis.count()
            print('Current number of proxies:', proxy_sum)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def log(self):
        """Build a file logger for the getter."""
        if not os.path.exists('log2'):
            os.mkdir('log2')
        log_file_name = 'log2/' + LOG_PATH
        log_file_1 = logging.FileHandler(log_file_name, 'a', encoding='utf-8')
        fmt = logging.Formatter(
            fmt='%(asctime)s - %(name)s - %(levelname)s - %(module)s: %(message)s')
        log_file_1.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.DEBUG)
        logger1.addHandler(log_file_1)
        return logger1

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """Crawl proxies, give each an initial score, and store them in Redis."""
        print('Getter started')
        if not self.is_over_threshold():
            try:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if not proxies:
                        self.log().error('Proxy crawl failed, crawl function: %s' % callback)
                        continue
                    for proxy in proxies:
                        self.redis.add(proxy)
            except Exception as e:
                self.log().exception(e)
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_under_threshold(self):
        # Renamed from is_over_threshold: the check returns True while the
        # pool still has room, which is exactly when run() should keep crawling
        if self.redis.get_count() < MAX_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if self.is_under_threshold():
            for i in range(self.crawler.__CrawlCount__):
                proxies = self.crawler.get_proxies(self.crawler.__CrawlFunc__[i])
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('The getter program started!')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # Check whether the proxy pool has reached its limit
    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter:
    def __init__(self):
        self._conn = RedisClient()
        self._crawler = Crawler()

    def is_over_threshold(self):
        if self._conn.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_index in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_index]
                proxies = self._crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self._conn.add(proxy)
class Getter(object): """docstring for Getter""" def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): if self.redis.count() >= POOL_UPPER_THRESHOLD: return True else: return False def run(self): print('获取器开始执行') if not self.is_over_threshold(): for callback_label in range(self.crawler.__CrawlFuncCount__): callback = self.crawler.__CrawlFunc__[callback_label] proxies = self.crawler.get_proxies(callback) for proxy in proxies: self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # Loop over the crawl-function counter
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # Index into the function list; each crawler returns a generator
                callback = self.crawler.__CrawlFunc__[callback_label]
                try:
                    # get_proxies iterates the crawler's results and returns
                    # them as a list of proxies
                    proxies = self.crawler.get_proxies(callback)
                except Exception:
                    print('\033[1;31;40mSomething went wrong here...\033[0m')
                    # callback is a method name (a string), so print it directly
                    print(f'Crawler {callback} raised an error and needs debugging')
                    # Skip this crawler; otherwise proxies would be unbound below
                    continue
                # Flush the buffer so output appears continuously
                sys.stdout.flush()
                # Add each crawled proxy to the database
                for proxy in proxies:
                    try:
                        self.redis.add(proxy)
                    except OSError as e:
                        print(f'\033[1;31;40mError occurred... {e}\033[0m')
class Getter():
    def __init__(self):
        self.mongo = MonClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        data = list(
            self.mongo.db.aggregate([
                {'$match': {'pid': {'$eq': 0}}},
                {'$group': {'_id': None, 'count': {'$sum': 1}}},
            ]))
        if len(data) == 1 and data[0]['count'] >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('\033[1;30;44m Getter started \033[0m')
        if not self.is_over_threshold():
            print('\033[1;30;44m Upper limit not reached! \033[0m')
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.mongo.add(proxy)
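# The $match + $group pipeline above only counts documents with pid == 0. If
# self.mongo.db is a pymongo collection (as the aggregate call suggests), the
# same check collapses to a single count_documents call; a minimal equivalent
# sketch of the method, under that assumption:
    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        # count_documents replaces the two-stage aggregation with one call
        return self.mongo.db.count_documents({'pid': 0}) >= POOL_UPPER_THRESHOLD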
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    if self.first_test(proxy):
                        # The screening test passed
                        print('Adding proxy:', proxy)
                        self.redis.add(proxy)  # add it to the pool

    def first_test(self, proxy):
        """Screen a crawled proxy with one quick request before storing it."""
        print('Screening proxy:', proxy)
        proxies = {
            'http': 'http://{}'.format(proxy),
        }
        try:
            r = requests.get(TEST_URL, proxies=proxies, timeout=4)
            if r.status_code == 200:
                return True
        except requests.RequestException:
            print('Test failed, dropping proxy', proxy)
        return False
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_LIMIT:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                # flush() does not print anything itself; it just forces
                # buffered stdout output to be written out immediately
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Judge whether the threshold has been reached."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Starting execution')
        # is_over_threshold() returns a bool, so comparing it with None was
        # always true; crawl only while the pool is below the threshold
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter(object): def __init__(self): """ 初始化数据库与爬虫 """ self.redis = RedisClient() self.crawler = Crawler() def is_full(self): """ 判断代理数目是否达上限 """ return self.redis.count() >= PROXY_NUMBER_MAX @logger.catch def run(self): logger.info('获取器开始执行......') if not self.is_full(): for callback_label in range(self.crawler.__CrawlFuncCount__): callback = self.crawler.__CrawlFunc__[callback_label] proxies = self.crawler.get_proxies(callback) for proxy in proxies: self.redis.add(proxy)
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class GetProxy(object):
    def __init__(self):
        self.redis = SaveData()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        log.logger.info('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter():
    crawler_list = [
        'crawl_ip3366', 'crawl_kuaidaili', 'crawl_ip3366_new',
        'crawl_iphai', 'crawl_data5u'
    ]

    def __init__(self):
        self.spider_log = logging.getLogger(GETTERLOGGER)
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self, mode=None):
        """Check whether the given pool has reached its limit."""
        if mode is None:
            rediskey = REDIS_KEY
        else:
            rediskey = mode
        if self.redis.count(rediskey) >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        self.spider_log.info('Scheduled getter run starting')
        httpflag = 0
        httpsflag = 0
        if not self.is_over_threshold(REDIS_HTTP):
            httpflag = 1
        if not self.is_over_threshold(REDIS_HTTPS):
            httpsflag = 1
        try:
            if httpflag == 1 or httpsflag == 1:
                self.spider_log.info('Getter running, http: ' +
                                     str(self.redis.count(REDIS_HTTP)) +
                                     '; https: ' +
                                     str(self.redis.count(REDIS_HTTPS)))
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # Only run the whitelisted crawlers
                    if callback not in Getter.crawler_list:
                        continue
                    self.spider_log.info('Fetching from: ' + callback)
                    # Fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if httpflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTP)
                    if httpsflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTPS)
            else:
                self.spider_log.info('Getter has nothing to do, http: ' +
                                     str(self.redis.count(REDIS_HTTP)) +
                                     '; https: ' +
                                     str(self.redis.count(REDIS_HTTPS)))
        except Exception as e:
            self.spider_log.error('Getter error: ' + str(e.args))
            self.spider_log.error('traceback: ' + traceback.format_exc())
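# Most variants above only define the class; the commented-out __main__ block
# in the first snippet hints at how it is driven. A minimal usage sketch
# follows: the 300-second interval and the bare loop are assumptions, not
# taken from the snippets, and real deployments typically run this inside a
# scheduler process alongside the tester and the API server.
import time

def run_getter(cycle=300):
    # Replenish the pool, then sleep. run() short-circuits when the pool is
    # at its upper limit, so calling it on a fixed interval is safe.
    getter = Getter()
    while True:
        getter.run()
        time.sleep(cycle)

if __name__ == '__main__':
    run_getter()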