Esempio n. 1
0
 def run(self):
     """
     Collect proxies from every registered crawl function and store them.

     Nothing is fetched when ``self.is_over_threshold()`` reports true.
     Otherwise each name listed in ``self.crawler.__CrawlFunc__`` is passed
     to ``self.crawler.get_proxies`` and every proxy that comes back is
     handed to ``self.client.add``.
     """
     logger.info("代理池获取器开始运行")
     if self.is_over_threshold():
         return
     for crawl_name in self.crawler.__CrawlFunc__:
         for candidate in self.crawler.get_proxies(crawl_name):
             self.client.add(candidate)
Esempio n. 2
0
 def max(self, proxy):
     """
     Set a proxy's score to ``MAX_SCORE``.

     :param proxy:   proxy whose score is overwritten
     :return:        whatever ``self.db.zadd`` returns for the update
     """
     logger.info('代理 {} 设置为 {}'.format(proxy, MAX_SCORE))
     # Hand the store's reply back to the caller: the result used to be
     # dropped even though this method is documented as returning it.
     # NOTE(review): the positional (key, score, member) argument order is
     # left as-is; if self.db is a redis-py client, newer releases expect a
     # mapping instead -- confirm against the installed client version.
     return self.db.zadd(REDIS_KEY, MAX_SCORE, proxy)
Esempio n. 3
0
 def run(self):
     """
     Start the enabled pool components, each in a separate ``Process``.

     ``ENABLE_TESTER``, ``ENABLE_GETTER`` and ``ENABLE_API`` are checked in
     that order; for every flag that is truthy a process targeting the
     matching ``schedule_*`` method is created and started. The processes
     are not joined here.
     """
     logger.info("代理池开始运行")
     if ENABLE_TESTER:
         Process(target=self.schedule_tester).start()
     if ENABLE_GETTER:
         Process(target=self.schedule_getter).start()
     if ENABLE_API:
         Process(target=self.schedule_api).start()
Esempio n. 4
0
 def get_proxies(self, callback):
     """
     Call the crawl method named by ``callback`` and collect its proxies.

     :param callback: name of a method on this object; it is called with no
                      arguments and must return an iterable of proxies
     :return:         list of the proxies produced, in iteration order
     """
     # Plain attribute lookup replaces the former eval() of a formatted
     # string: for a method name the call is the same, but a malformed or
     # hostile name can no longer be executed as arbitrary code.
     crawl = getattr(self, callback)
     proxies = []
     for proxy in crawl():
         logger.info("成功获取到代理 {}".format(proxy))
         proxies.append(proxy)
     return proxies
Esempio n. 5
0
    def decrease(self, proxy):
        """
        Lower a proxy's score by one, or drop the proxy altogether.

        The current score is read with ``self.db.zscore``. A proxy whose
        score is missing, falsy, or not greater than ``MIN_SCORE`` is
        removed with ``zrem``; any other proxy is decremented with
        ``zincrby``.

        :param proxy:   proxy to penalise
        :return:        the reply of ``zincrby`` when the score was lowered,
                        otherwise the reply of ``zrem``
        """
        current = self.db.zscore(REDIS_KEY, proxy)
        keep = bool(current) and current > MIN_SCORE
        if not keep:
            logger.info('代理 {} 移除'.format(proxy))
            return self.db.zrem(REDIS_KEY, proxy)
        logger.info('代理 {}, 当前分数 {}, 减一'.format(proxy, current))
        return self.db.zincrby(REDIS_KEY, proxy, -1)
Esempio n. 6
0
def get_page(url, options=None):
    """
    Download a proxy listing page and return its body text.

    :param url: URL of the page to fetch
    :param options: optional mapping of extra request headers; its entries
                    override those in ``base_headers``
    :return: the response text when the status code is 200; ``None`` when
             the connection fails or any other status code comes back
    """
    headers = dict(base_headers, **options) if options is not None else base_headers
    logger.info("正在抓取 {}".format(url))
    try:
        response = requests.get(url=url, headers=headers)
    # The bare name ConnectionError is the builtin unless the module shadows
    # it with an import, and the builtin does not match the error requests
    # raises on a failed connection -- so name the requests class explicitly
    # while still catching whatever the bare name is bound to.
    except (ConnectionError, requests.exceptions.ConnectionError):
        logger.error("抓取连接 {} 失败".format(url))
        return None
    if response.status_code == 200:
        return response.text
    # Non-200 replies yield None explicitly rather than by falling off the end.
    return None
Esempio n. 7
0
 async def test_single_proxy(self, proxy):
     """
     Prepare a single proxy for testing and penalise it on failure.

     Inside an aiohttp client session the proxy is decoded from UTF-8 when
     it arrives as bytes, prefixed with ``http://`` and logged. Any
     exception raised along the way lowers the proxy's score through
     ``self.client.decrease``.

     NOTE(review): no request is sent through the session in the code
     visible here, and nothing is returned -- confirm whether the actual
     check was lost from this snippet.

     :param proxy:   proxy address as ``str`` or bytes, without a scheme
     :return:        None
     """
     connector = aiohttp.TCPConnector(verify_ssl=False)
     async with aiohttp.ClientSession(connector=connector) as session:
         try:
             proxy = proxy.decode('utf-8') if isinstance(proxy, bytes) else proxy
             target = 'http://' + proxy
             logger.info('正在测试代理: {}'.format(target))
         except Exception:
             self.client.decrease(proxy)
             logger.info('代理[{}]请求失败'.format(proxy))
Esempio n. 8
0
 def run(self):
     """
     Test every stored proxy, one batch at a time.

     The proxy count is read once from ``self.client.count()``; the range
     ``[0, count)`` is then walked in steps of ``BATCH_TEST_SIZE``. Each
     batch is fetched with ``self.client.batch(start, end)``, tested
     concurrently with ``test_single_proxy`` on the event loop, and
     followed by a five second pause. Any exception ends the run and is
     logged rather than propagated.
     """
     logger.info("开始运行测试器")
     try:
         count = self.client.count()
         logger.info("当前剩余 {} 个代理".format(count))
         for start in range(0, count, BATCH_TEST_SIZE):
             end = min(start + BATCH_TEST_SIZE, count)
             logger.info("正在测试第 {}-{} 个代理".format(start + 1, end))
             test_proxies = self.client.batch(start, end)
             loop = asyncio.get_event_loop()
             # asyncio.wait() no longer accepts bare coroutines (deprecated
             # in Python 3.8, rejected from 3.11), so wrap each one in a
             # Task bound to the loop that is about to run them.
             tasks = [
                 loop.create_task(self.test_single_proxy(proxy))
                 for proxy in test_proxies
             ]
             # asyncio.wait() raises ValueError on an empty collection; a
             # batch that came back empty is skipped instead of aborting
             # the remaining batches.
             if tasks:
                 loop.run_until_complete(asyncio.wait(tasks))
             time.sleep(5)
     except Exception as e:
         logger.error("测试器发生错误 {}".format(e.args))