Beispiel #1
0
 def deduction(self, proxy):
     """
     Lower the score of a proxy by one, or drop the proxy altogether.

     The current score is read with ``self.con.zscore`` under ``REDIS_KEY``.
     A truthy score strictly greater than ``MIN_SCORE`` is decremented by
     one; in every other case (no score, a zero score, or a score that is
     not above ``MIN_SCORE``) the proxy is removed from the key.

     :param proxy: proxy object exposing ``string()``
     :return: the result of ``zincrby`` when the score was lowered,
         otherwise the result of ``zrem``
     """
     current = self.con.zscore(REDIS_KEY, proxy.string())

     # Guard clause: anything that cannot be decremented any further is removed.
     if not (current and current > MIN_SCORE):
         logger.info(f"{proxy.string()} with score {current}, remove")
         return self.con.zrem(REDIS_KEY, proxy.string())

     logger.info(f"{proxy.string()} with score {current}, deduct")
     return self.con.zincrby(REDIS_KEY, -1, proxy.string())
Beispiel #2
0
 async def test(self, proxy: Proxy):
     """
     Test a single proxy and update its score according to the outcome.

     One GET request is sent to ``TEST_URL`` through the proxy, with
     redirects not followed and ``ssl=False`` on the connector. If the
     response status is in ``TEST_VALID_STATUS`` the proxy is promoted with
     ``self.redis.max``; for any other status, or when one of ``EXCEPTIONS``
     is raised, ``self.redis.deduction`` is called instead.

     :param proxy: Proxy object
     :return: None
     """
     address = proxy.string()
     connector = aiohttp.TCPConnector(ssl=False)
     # ``async with`` closes the session on exit (also on errors), so no
     # explicit ``session.close()`` is required.
     async with aiohttp.ClientSession(connector=connector) as session:
         try:
             logger.info(f'testing {address}')
             async with session.get(TEST_URL,
                                    proxy=f'http://{address}',
                                    timeout=TEST_TIMEOUT,
                                    allow_redirects=False) as response:
                 if response.status in TEST_VALID_STATUS:
                     self.redis.max(proxy)
                     logger.info(
                         f'proxy {address} is valid, set max score')
                 else:
                     self.redis.deduction(proxy)
                     logger.info(
                         f'proxy {address} is invalid, decrease score')
         except EXCEPTIONS:
             self.redis.deduction(proxy)
             # ``warning`` is the supported spelling; ``warn`` is a
             # deprecated alias in the standard logging module.
             logger.warning(
                 f'proxy {address} is invalid, decrease score')
Beispiel #3
0
    def run(self):
        """
        Collect proxies from every crawler and store them.

        Returns immediately when ``self.full()`` is truthy. Otherwise each
        crawler in ``self.crawlers`` is asked for proxies through ``crawl()``
        and every proxy is handed to ``self.con.add``; a log line is written
        for each proxy for which ``add`` returns a truthy value.

        :return: None
        """
        if self.full():
            return
        for crawler in self.crawlers:
            for proxy in crawler.crawl():
                if self.con.add(proxy):
                    logger.info(f"proxy {proxy.string()} been added")
Beispiel #4
0
 def run(self):
     """
     Test every stored proxy, one batch at a time.

     The number of proxies is read once from ``self.redis.count()``. The
     index range ``[0, count)`` is then walked in steps of ``TEST_BATCH``;
     for each step the proxies returned by ``self.redis.batch(start, end)``
     are tested concurrently with ``self.test`` on ``self.loop``, and the
     next batch starts only after all of them have finished.

     :return: None
     """
     # event loop of aiohttp
     logger.info('stating tester...')
     count = self.redis.count()
     logger.info(f'{count} proxies to test')
     for i in range(0, count, TEST_BATCH):
         # start and end offset
         start, end = i, min(i + TEST_BATCH, count)
         logger.info(f'testing proxies from {start} to {end} indices')
         proxies = self.redis.batch(start, end)
         # asyncio.wait() no longer accepts bare coroutine objects
         # (TypeError on Python 3.11+), so wrap each one in a task that is
         # bound to the loop which will run it.
         tasks = [self.loop.create_task(self.test(proxy))
                  for proxy in proxies]
         if not tasks:
             # asyncio.wait() raises ValueError for an empty collection
             continue
         # run tasks using event loop
         self.loop.run_until_complete(asyncio.wait(tasks))
Beispiel #5
0
    def run(self):
        """
        Start the tester, getter and server processes and wait for them.

        Each of ``self.run_tester``, ``self.run_getter`` and
        ``self.run_server`` is launched in its own ``multiprocessing.Process``
        (stored in the module-level globals ``tester_process``,
        ``getter_process`` and ``server_process``), then all three are joined.
        On ``KeyboardInterrupt`` every process that was started is
        terminated. In all cases the started processes are joined again and
        their final alive/dead state is logged.

        :return: None
        """
        global tester_process, getter_process, server_process
        # Reset up front so the handlers below never hit an unbound or stale
        # name when start-up is interrupted part-way through.
        tester_process = getter_process = server_process = None
        try:
            logger.info('starting ProxyPool...')

            # The pid is only assigned by start(), so log it afterwards.
            tester_process = multiprocessing.Process(target=self.run_tester)
            tester_process.start()
            logger.info(f'starting tester, pid {tester_process.pid}...')

            getter_process = multiprocessing.Process(target=self.run_getter)
            getter_process.start()
            logger.info(f'starting getter, pid{getter_process.pid}...')

            server_process = multiprocessing.Process(target=self.run_server)
            server_process.start()
            logger.info(f'starting server, pid{server_process.pid}...')

            tester_process.join()
            getter_process.join()
            server_process.join()
        except KeyboardInterrupt:
            logger.info('received keyboard interrupt signal')
            for process in (tester_process, getter_process, server_process):
                # pid stays None until the child has actually been spawned;
                # terminate() on an unstarted process would fail.
                if process is not None and process.pid is not None:
                    process.terminate()
        finally:
            processes = (tester_process, getter_process, server_process)
            # must call join method before calling is_alive
            for process in processes:
                # join() asserts that the process has been started
                if process is not None and process.pid is not None:
                    process.join()
            for name, process in zip(('tester', 'getter', 'server'),
                                     processes):
                alive = process is not None and process.is_alive()
                logger.info(f'{name} is {"alive" if alive else "dead"}')
            logger.info('proxy terminated')