Exemple #1
0
class Getter(object):
    """Collect proxies from every registered crawler callback into Redis."""

    def __init__(self):
        """Create the Redis client and the crawler this getter works with."""
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Tell whether the pool has reached ``POOL_UPPER_THRESHOLD``.

        Returns:
            True when ``self.redis.count()`` is greater than or equal to
            ``POOL_UPPER_THRESHOLD``, otherwise False.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Run every crawler callback once and add the results to Redis.

        Nothing is crawled when the pool is already at its upper limit. The
        limit is only checked once, before the first callback runs.
        """
        print('或取器开始运作...')
        # Guard clause: a full pool means there is nothing to fetch.
        if self.is_over_threshold():
            return
        # Walk the callbacks by index, as published by the crawler.
        for position in range(self.crawler.__CrawlerCount__):
            callback = self.crawler.__CrawlerFunc__[position]
            fetched = self.crawler.get_proxies(callback)
            # Push any pending console output before writing to Redis.
            sys.stdout.flush()
            for proxy in fetched:
                self.redis.add(proxy)
Exemple #2
0
class Spider_getter():
    """Collect proxies from every registered spider callback into Redis."""

    def __init__(self):
        """Create the Redis client and the spider this getter works with."""
        self.redis = RedisClient()
        self.crawler = Spider()

    def is_over_threshold(self):
        """Tell whether the pool has reached ``POOL_UPPER_THRESHOLD``.

        Returns:
            True when ``self.redis.count()`` is greater than or equal to
            ``POOL_UPPER_THRESHOLD``, otherwise False.
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def begin(self):
        """Call each crawl function of the spider and store what it returns.

        Nothing is crawled when the pool is already at its upper limit. The
        limit is only checked once, before the first callback runs.
        """
        print('获取器开始执行')
        # Guard clause: a full pool means there is nothing to fetch.
        if self.is_over_threshold():
            return
        for label in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[label]
            # Fetch the proxies produced by this callback.
            fetched = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            for proxy in fetched:
                self.redis.add(proxy)
Exemple #3
0
class SpiderTester(object):
    """Test the proxies stored in Redis in batches and record the results."""

    def __init__(self):
        """Create the Redis client the tester reads proxies from."""
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Request ``TEST_URL`` through one proxy and record the outcome.

        Args:
            proxy: The proxy as ``host:port``; ``bytes`` values are decoded
                as UTF-8 first.

        A response whose status is in ``VALID_STATUS_CODES`` leads to
        ``self.redis.max(proxy)``; any other status, or one of the handled
        request errors, leads to ``self.redis.decrease(proxy)``.
        """
        # NOTE(review): ``verify_ssl`` is deprecated in aiohttp 3.x in favour
        # of ``ssl=False`` -- confirm against the installed aiohttp version.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        # Usable proxy (per the original note this sets the
                        # proxy's score to the maximum).
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        # Unusable proxy (per the original note this lowers
                        # the proxy's score).
                        self.redis.decrease(proxy)
                        print('请求响应码不合法 ', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    async def _test_batch(self, proxies):
        """Test all ``proxies`` concurrently; report unexpected failures."""
        # return_exceptions=True keeps one failing check from cancelling the
        # rest of the batch, matching the old asyncio.wait() behaviour.
        results = await asyncio.gather(
            *(self.test_single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for proxy, result in zip(proxies, results):
            if isinstance(result, BaseException):
                print('测试器发生错误', proxy, result.args)

    @staticmethod
    def _get_event_loop():
        """Return the current event loop, creating one if none is set."""
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            # Raised in non-main threads and, on newer Python versions, in
            # any thread that has no current loop.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    def run(self, batch_interval=5):
        """Walk through the proxies in Redis and test them batch by batch.

        Args:
            batch_interval: Seconds to pause after each batch. Defaults to
                5, the pause that used to be hard-coded.

        Any exception is caught and printed, so this method never raises
        ``Exception`` subclasses to the caller.
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = list(self.redis.batch(start, stop))
                # Run the checks asynchronously. The coroutines are wrapped
                # in one awaitable because asyncio.wait() rejects bare
                # coroutines on Python 3.11+ and rejects an empty collection
                # on every version.
                if test_proxies:
                    loop = self._get_event_loop()
                    loop.run_until_complete(self._test_batch(test_proxies))
                sys.stdout.flush()
                time.sleep(batch_interval)
        except Exception as e:
            print('测试器发生错误', e.args)
Exemple #4
0
def get_conn():
    """Return the Redis client cached on ``g``, creating it on first use.

    The client is stored as ``g.redis`` so later calls reuse the same object.
    """
    if hasattr(g, 'redis'):
        return g.redis
    g.redis = RedisClient()
    return g.redis
Exemple #5
0
 def __init__(self):
     """Create a ``RedisClient`` and keep it on ``self.redis``."""
     self.redis = RedisClient()
Exemple #6
0
class Tester(object):
    """Test the proxies stored in Redis in batches and record the results."""

    def __init__(self):
        """Create the Redis client the tester reads proxies from."""
        self.redis = RedisClient()

    async def test__single_proxy(self, proxy):
        """Request ``TEST_URL`` through one proxy and record the outcome.

        Args:
            proxy: The proxy as ``host:port``; ``bytes`` values are decoded
                as UTF-8 first.

        A 200 response leads to ``self.redis.max(proxy)``; any other status,
        or one of the handled request errors, leads to
        ``self.redis.decrease(proxy)``.
        """
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(TEST_URL,
                                           proxy=real_proxy,
                                           timeout=TIME_OUT) as response:
                        if response.status == 200:
                            self.redis.max(proxy)
                            print('代理可用:', proxy)
                        else:
                            self.redis.decrease(proxy)
                            print('代理响应玛不为200: {} , 响应玛:{}'.format(
                                proxy, response.status))
                # NOTE(review): before Python 3.11 ``asyncio.TimeoutError`` is
                # not the builtin ``TimeoutError`` -- confirm which one this
                # module imports, otherwise request timeouts escape here.
                except (ProxyConnectionError, TimeoutError, ValueError) as e:
                    self.redis.decrease(proxy)
                    print('代理请求失败: ', proxy)
                    print('error is ', e)
        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError) as s:
            self.redis.decrease(proxy)
            print(s)

    async def _test_batch(self, proxies):
        """Test all ``proxies`` concurrently; report unexpected failures."""
        # return_exceptions=True keeps one failing check from cancelling the
        # rest of the batch, matching the old asyncio.wait() behaviour.
        results = await asyncio.gather(
            *(self.test__single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for proxy, result in zip(proxies, results):
            if isinstance(result, BaseException):
                print('测试器发生错误', proxy, result.args)

    @staticmethod
    def _get_event_loop():
        """Return the current event loop, creating one if none is set."""
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            # Raised in non-main threads and, on newer Python versions, in
            # any thread that has no current loop.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    def run(self, batch_interval=5):
        """Walk through the proxies in Redis and test them batch by batch.

        Args:
            batch_interval: Seconds to pause after each batch. Defaults to
                5, the pause that used to be hard-coded.

        Any exception is caught and printed, so this method never raises
        ``Exception`` subclasses to the caller.
        """
        print('Tester开始运作...')
        try:
            count = self.redis.count()
            print('当前剩余: {} 个代理'.format(count))
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第 {} - {} 个代理'.format(start + 1, stop))
                test_proxies = list(self.redis.batch(start, stop))
                # The coroutines are wrapped in one awaitable because
                # asyncio.wait() rejects bare coroutines on Python 3.11+ and
                # rejects an empty collection on every version.
                if test_proxies:
                    loop = self._get_event_loop()
                    loop.run_until_complete(self._test_batch(test_proxies))
                sys.stdout.flush()
                time.sleep(batch_interval)
        except Exception as e:
            print('测试器发生错误', e.args)
Exemple #7
0
 def __init__(self):
     """
     Initialise the database client and create the crawler.

     Stores a ``RedisClient`` on ``self.redis`` and a ``Crawler`` on
     ``self.crawler``.
     """
     self.redis = RedisClient()
     self.crawler = Crawler()
Exemple #8
0
 def __init__(self):
     """Store a ``RedisClient`` on ``self.redis`` and a ``Spider`` on ``self.crawler``."""
     self.redis = RedisClient()
     self.crawler = Spider()