Example #1
class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        test one proxy, if valid, put them to usable_proxies.
        """
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(
                            self.test_api,
                            proxy=real_proxy,
                            timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    print('Invalid proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError) as s:
            print(s)

    def test(self):
        """
        aio test all proxies.
        """
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [
                self.test_single_proxy(proxy) for proxy in self._raw_proxies
            ]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            # asyncio.wait() raises ValueError when the task list is empty
            print('Async Error')
Example #2
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        测试单个代理
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)  # newer aiohttp versions prefer ssl=False
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        print('Proxy is valid', proxy)
                    # NOTE: aiohttp responses have no .elapsed attribute (that
                    # belongs to requests), so this branch raises AttributeError,
                    # which is caught by the handler below
                    elif response.elapsed.total_seconds() > 0.5:
                        self.redis.zrem(proxy)
                        print('Response too slow', response.elapsed.total_seconds(), 'IP',
                              proxy)
                    else:
                        self.redis.zrem(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.zrem(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        测试主函数
        :return:
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
Example #3
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """Get half of the proxies in Redis"""
     conn = RedisClient()
     tester = ValidityTester()
     while True:
         print('Refreshing ip')
         count = int(0.5 * conn.queue_len)
         if count == 0:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #4
class Tester(object):
    def __init__(self, redis_key):
        self.redis = RedisClient(redis_key)

    async def test_single_proxy(self, proxy):
        """
        测试单个代理
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en;q=0.9,ja;q=0.8,fr;q=0.7',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
                    # 'Upgrade-Insecure-Requests': 1,
                    'Connection': 'close',
                }

                async with session.get(TEST_URL, headers=headers, proxy=real_proxy, timeout=TIMEOUT, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError, ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        测试主函数
        :return:
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
Example #5
 def __init__(self, threshold):
     # threshold value
     self._threshold = threshold
     self._conn = RedisClient()
     self._tester = ValidityTester()
     # crawler for free proxies
     self._crawler = FreeProxyGetter()
Example #6
def get_conn():
    """
    Get the Redis object
    """
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
Example #7
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """从redis里面获取一半的代理
     """
     conn = RedisClient()
     tester = VaildityTester()
     while True:
         Logger.log_high('Refreshing ip')
         count = int(0.5 * conn.queue_len)
         if count == 0:
             Logger.log_normal('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #8
class PoolAdder(object):
    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        if self._conn.queue_len() >= self._threshold:
            return True
        else:
            return False

    def add_to_queue(self):
        print('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)
                # test crawled proxies
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    print('IP is enough, waiting to be used')
                    break
            if proxy_count == 0:
                raise ResourceDepletionError
Example #9
class Tester:
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        测试单个代理
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):  # decode bytes into str
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:  # is the status 200 or 302?
                        self.redis.max(proxy)  # usable proxy: reset its score to 100
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)  # penalize the proxy
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)

    def run(self):
        """
        检测主函数
        :return:
        """
        print('检测器开始运行')
        try:
            count = self.redis.count()  # 获取proxies数量
            print('当前剩余', count, '个代理')
            for i in range(0, count,
                           BATCH_TEST_SIZE):  # 最大批测试量BATCH_TEST_SIZE = 10
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)  # fetch one batch
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
Example #10
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test single proxy
        :param proxy: Single proxy
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print("Proxy is OK", proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("Response code is wrong", response.status, 'IP',
                              proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print("Fail to get proxy", proxy)

    def run(self):
        """
        Main function
        :return: None
        """
        print("Tester starts running")
        try:
            count = self.redis.count()
            print("Current surplus", count, "proxies")
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print("Current testing the", start + 1, '-', stop, 'th proxy')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print("Error!", e.args)
Example #11
class Scheduler():
    def schedule_tester(self, cycle=TESTER_CYCLE):
        """
        定时测试代理
        """
        tester = Tester()
        while True:
            print('Tester is running')
            tester.run()
            time.sleep(cycle)

    def schedule_getter(self, cycle=GETTER_CYCLE):
        """
        定时获取代理
        """
        getter = Getter()
        while True:
            print('Crawling proxies')
            getter.run()
            time.sleep(cycle)

    def schedule_api(self):
        """
        开启API
        """
        app.run(API_HOST, API_PORT)  #端口如果是5555,会报400的错误

    def schedule_redis(self):
        """
        开启Redis
        """
        os.system("redis-server")

    def run(self):
        print('Proxy pool is running')

        # start the redis process
        while True:
            self.redis = RedisClient()
            print('Checking redis')
            if self.redis.check():
                break
            else:
                redis_process = Process(target=self.schedule_redis)
                redis_process.start()
            time.sleep(0.5)

        if TESTER_ENABLED:
            tester_process = Process(target=self.schedule_tester)
            tester_process.start()

        if GETTER_ENABLED:
            getter_process = Process(target=self.schedule_getter)
            getter_process.start()

        if API_ENABLED:
            api_process = Process(target=self.schedule_api)
            api_process.start()
Example #12
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            for crawl_func in self.crawler.__CrawlFunc__:
                proxies = self.crawler.start_crawl_func(crawl_func)
                print(crawl_func, 'is crawling proxies')
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
        proxy_sum = self.redis.count()
        print('Current proxy count:', proxy_sum)
Example #13
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """
     获取队列中一半的代理进行可用性测试!
     """
     conn = RedisClient()
     tester = ValidityTester()
     while True:
         print('Testing the validity of proxies in the redis queue:')
         count = int(0.5 * conn.queue_len)
         if count == 0:
             print('Proxy pool is empty or fully tested')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #14
 def timingCheck(cycle=TIMING_CHECK):
     conn = RedisClient()
     valiClass = ValidityTester()
     while True:
         if conn.queue_len > 0:
             valiClass.set_timing_params()
             valiClass.TimingCheck()
         time.sleep(cycle)
Example #15
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        测试单个代理
        :param proxy:
        :return:
        """
        headers = {
            "Connection": "keep-alive",
            "Host": "www.sogou.com",
            "Pragma": "no-cache",
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        }
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing ' + proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False,
                                       headers=headers) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is valid ' + proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status ' + str(response.status) + ' IP ' + proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed {}'.format(proxy))

    def run(self):
        """
        测试主函数
        :return:
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余{}个代理'.format(count))
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies {}-{}'.format(start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error {}'.format(e.args))
Example #16
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """
     Get half of proxies which in redis
     """
     conn = RedisClient()
     tester = ValidityTester()
     while True:
         print('Refreshing ip')
         count = int(0.5 * conn.queue_len)
         if count == 0:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #17
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """
     Get half of proxies which in redis
     """
     conn = RedisClient()  #Redis连接对象
     tester = ValidityTester()
     while True:
         print('刷新代理池中...')
         count = int(0.3 * conn.queue_len)  #从左侧拿出一半的代理,只剩一个时,看做0个
         if count == 0:  #如果队列长度不够了
             print('等待添加代理中...')
             time.sleep(cycle)  #设置暂时睡眠,等待添加
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)  #调用函数添加,raw_proxies设置为类变量
         tester.test()  #检测代理是否可用
         time.sleep(cycle)
Example #18
def get_conn():
    """
    Opens a new redis connection if there is none yet for the
    current application context.
    """
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client
Example #19
def get_conn():
    """
    Open a Redis connection, or return the existing one
    :return: a global RedisClient connection object
    """
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client
Example #20
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        测试单个代理
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                logger.debug(f'Testing {proxy}')
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        logger.debug(f'Proxy is valid {proxy}')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(f'Invalid response status {response.status}, IP {proxy}')
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                logger.debug(f'Proxy request failed {proxy}')

    def run(self):
        """
        测试主函数
        :return:
        """
        logger.debug('测试器开始运行')
        try:
            count = self.redis.count()
            logger.debug(f'当前剩余 {count} 个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                logger.debug(f'Testing proxies {start + 1}-{stop}')
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [
                    self.test_single_proxy(proxy) for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            logger.debug(f'Tester error {e.args}')
Example #21
 def valid_proxy(cycle=VALID_CHECK_CYCLE):  # periodic checker
     """
     Get half of the proxies in Redis
     """
     conn = RedisClient()  # Redis connection object
     tester = ValidityTester()  # checks whether proxies are usable
     while True:
         print('Refreshing ip')
         count = int(0.5 * conn.queue_len)  # take the first half of the proxies
         if count == 0:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #22
class ProxyMiddleware(object):
    def __init__(self):
        # connect to redis and call random() to fetch a random proxy
        self.db = RedisClient()
        self.proxy = self.db.random()

    def process_request(self, request, spider):
        request.meta["proxy"] = self.proxy
Example #23
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """
     Get half of proxies which in redis
     """
     conn = RedisClient()  # redis连接对象
     tester = ValidityTester()  # 代理检测对象
     while True:
         print('Refreshing ip')
         count = int(0.5 * conn.queue_len)  # 需要从redis中取出一半的代理地址
         if count == 0:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)  # returns a list with half of the stored proxies
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #24
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def log(self):
        if not os.path.exists('log2'):
            os.mkdir('log2')
        log_file_name = 'log2/' + LOG_PATH
        log_file_1 = logging.FileHandler(log_file_name, 'a', encoding='utf-8')
        fmt = logging.Formatter(
            fmt="%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s")
        log_file_1.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.DEBUG)
        logger1.addHandler(log_file_1)

        return logger1

    def is_over_threshold(self):
        """
        Check whether the pool has reached its upper limit
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """爬取到代理设置初始分数,直接存入redis"""
        print('Getter is running')
        if not self.is_over_threshold():
            try:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if not proxies:
                        self.log().error('Proxy crawl failed, crawl function: %s' % callback)
                        continue
                    for proxy in proxies:
                        self.redis.add(proxy)
            except Exception as e:
                self.log().exception(e)
Example #25
 def test_proxies(cycle=VALID_CHECK_CYCLE):
     """
     Check the validity of the left (older) half of the proxy queue; drop
     invalid proxies and push valid ones back onto the right side.
     :param cycle: check interval
     """
     conn = RedisClient()
     tester = ValidityTester()
     while True:
         print('testing & refreshing ips...')
         count = int(0.5 * conn.list_len)
         if count == 0:
             print('0 ip, waiting for adding...')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get_for_test(count)  # fetch ips from the database for testing
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #26
 def valid_proxy(cycle=VALID_CHECK_CYCLE):
     """
     Get half of the proxies in Redis.
     Pull proxies from redis for asynchronous checking and push usable ones
     back onto the right side of the redis list, keeping the pool fresh.
     """
     conn = RedisClient()
     tester = ValidityTester()
     while True:
         print('Refreshing ip')
         count = int(0.5 * conn.queue_len)
         if count == 0:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #27
 def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
                upper_threshold=POOL_UPPER_THRESHOLD,
                cycle=POOL_LEN_CHECK_CYCLE):
     conn = RedisClient()
     adder = PoolAdder(upper_threshold)
     while True:
         if conn.queue_len < lower_threshold:
             adder.add_to_queue()
         time.sleep(cycle)
Example #28
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        # NOTE: despite its name, this returns True while the pool is still
        # below MAX_THRESHOLD, i.e. when more proxies are needed
        if self.redis.get_count() < MAX_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if self.is_over_threshold():
            for i in range(self.crawler.__CrawlCount__):
                proxies = self.crawler.get_proxies(
                    self.crawler.__CrawlFunc__[i])
                for proxy in proxies:
                    self.redis.add(proxy)
Example #29
class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test one proxy; if valid, put it into usable_proxies.
        """
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(self.test_api, proxy=real_proxy, timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    print('Invalid proxy', proxy)
        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError) as s:
            print(s)

    def test(self):
        """
        aio test all proxies.
        """
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            # asyncio.wait() raises ValueError when the task list is empty
            print('Async Error')
Example #30
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        异步测试单个代理
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                """
                在Python3以后,字符串和bytes类型彻底分开了。字符串是以字符为单位进行处理的,bytes类型是以字节为单位处理的。
                直接以默认的utf-8编码解码bytes成string
                """
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print("正在测试", proxy)
                async with session.get(TEST_URL, allow_redirects=False, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        # give the proxy the maximum score
                        self.redis.max(proxy)
                        print("Proxy", proxy, 'is valid, score set to 100')
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                print("代理验证失败", proxy)
                self.redis.decrease(proxy)

    def run(self):
        """
        测试函数
        :return:
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                # fetch the proxies to test
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print("测试器发生错误", e.args)
Example #31
 def valid_proxy(
         cycle=VALID_CHECK_CYCLE):  # VALID_CHECK_CYCLE is set in the settings module
     """
     Get half of the proxies in Redis.
     Periodic checker: verifies that the ips in the redis database are still valid.
     """
     conn = RedisClient()  # connect to the redis database
     tester = ValidityTester()  # class that checks whether proxies are usable
     while True:
         print('Refreshing ip')
         count = int(0.5 * conn.queue_len)  # take half of the queue
         if count == 0:
             print('Waiting for adding')
             time.sleep(cycle)
             continue
         raw_proxies = conn.get(count)
         tester.set_raw_proxies(raw_proxies)
         tester.test()
         time.sleep(cycle)
Example #32
class ValidityTester(object):
    # Check whether proxies are usable and persist the good ones.
    def __init__(self):
        self._raw_proxies = None

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._redis = RedisClient()

    async def check_single_proxy(self, proxy):
        # check a single proxy
        if isinstance(proxy, bytes):
            proxy = proxy.decode('utf-8')
        real_proxy = 'http://' + proxy
        try:
            async with aiohttp.ClientSession() as session:

                try:
                    async with session.get(TEST_API, proxy=real_proxy, timeout=PROXY_TIMEOUT) as response:
                        print('Check proxy', proxy)
                        if response.status == 200:
                            self._redis.add(proxy)
                            print('Add to redis', proxy)
                except (ProxyConnectionError, TimeoutError):
                    print("Don't add proxy", proxy)
                    await session.close()

        except (ServerDisconnectedError, ClientResponseError,
                ClientConnectorError, Exception) as s:
            # Exception already subsumes the classes listed before it
            print(s)
            await session.close()

    def check_some_proxies(self):
        '''
        Run the event loop and check every proxy in _raw_proxies.
        Return early when _raw_proxies is empty or None.
        '''
        if not self._raw_proxies:
            return
        try:
            print('Checking proxies')
            loop = asyncio.get_event_loop()
            tasks = [self.check_single_proxy(task) for task in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except Exception:
            print('Check_some_proxies Error')
Example #33
class Tester(object):
    def __init__(self):
        self.redis = RedisClient()
    
    async def test_single_proxy(self, proxy):
        """
        测试单个代理
        :param proxy:
        :return:
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is valid', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status', response.status, 'IP', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed', proxy)
    
    def run(self):
        """
        测试主函数
        :return:
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for i in range(0, count, BATCH_TEST_SIZE):
                start = i
                stop = min(i + BATCH_TEST_SIZE, count)
                print('Testing proxies', start + 1, '-', stop)
                test_proxies = self.redis.batch(start, stop)
                loop = asyncio.get_event_loop()
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
Example #34
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
    
    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False
    
    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #35
class Tester(object):

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, session, proxy):
        """测试单个代理"""
        try:
            # proxies are stored as dict literals like "{'https': 'http://ip:port'}"
            real_proxy = eval(proxy)['https']
            print('Testing', proxy)
            async with session.get(TEST_URL, proxy=real_proxy, timeout=20, allow_redirects=False) as response:
                if response.status in VALID_STATUS_CODES:
                    rst = await response.text()
                    if rst:
                        # compare the ip echoed back by the test endpoint with
                        # the proxy's own ip to confirm the proxy works anonymously
                        resp_ip = '//' + eval(rst).get('headers').get('X-Forwarded-For')
                        proxy_ip = real_proxy.split(':')
                        if resp_ip == proxy_ip[1]:
                            self.redis.max(proxy)
                            print('Proxy is valid', proxy)
                else:
                    self.redis.decrease(proxy)
                    print('Invalid response status', response.status, 'IP', proxy)
        except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
            self.redis.decrease(proxy)
            print('Proxy request failed', proxy)

    async def set_test_tasks(self, loop):
        """设置测试任务"""
        count = self.redis.count
        print('Proxies remaining:', count)
        for start in range(0, count, BATCH_TEST_SIZE):  # build tasks batch by batch; one Session per batch limits memory use
            stop = min(start + BATCH_TEST_SIZE, count)
            print('Testing proxies', start + 1, '-', stop)
            test_proxies = self.redis.batch(start, stop)
            conn = aiohttp.TCPConnector()
            async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
                tasks = [self.test_single_proxy(session, proxy) for proxy in test_proxies]
                await asyncio.wait(tasks)

    def run(self):
        """测试主函数"""
        print('Tester is running')
        try:
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self.set_test_tasks(loop))
            sys.stdout.flush()  # flush prints immediately instead of waiting for the loop to end
            time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
Example #36
class Crawler(object, metaclass=ProxyMetaclass):  # ProxyMetaclass injects the __CrawlFunc__ and __CrawlFuncCount__ attributes
    def __init__(self):
        self.redis = RedisClient()

    def save_to_db(self, proxies):
        for proxy in proxies:
            sys.stdout.flush()
            print('Got proxy', proxy)
            self.redis.add(proxy)

    async def crawl_daili66(self):
        """获取代理66, 外国ip多"""
        urls = ['http://www.66ip.cn/{}.html'.format(page) for page in range(1, 5)]
        print('Crawling')
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    ip_port = ':'.join([ip, port])
                    proxy = {'https':'http://'+ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366(self):
        """云代理index"""
        urls = ['http://www.ip3366.net/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(page)
                for s in range(1, len(trs)):
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address+':'+port
                        ip_port = address_port.replace(' ','')
                        proxy = {'https':'http://'+ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip3366_(self):
        """云代理free"""
        urls = ['http://www.ip3366.net/free/?stype=1&page={}'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address, port in re_ip_address:
                    result = address+':'+ port
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_kuaidaili(self):
        """快代理(都是http的)"""
        urls = ['http://www.kuaidaili.com/free/inha/{}/'.format(page) for page in range(1, 5)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(page)
                for address,port in zip(re_ip_address, re_port):
                    address_port = address+':'+port
                    ip_port = address_port.replace(' ','')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xicidaili(self):
        """西刺代理"""
        urls = ['http://www.xicidaili.com/nn/{}'.format(page) for page in range(1, 4)]
        headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie':'_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host':'www.xicidaili.com',
            'Referer':'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests':'1',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            for page in html:
                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(page)
                for tr in trs:
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address,port in zip(re_ip_address, re_port):
                        address_port = address+':'+port
                        ip_port = address_port.replace(' ', '')
                        proxy = {'https':'http://'+ip_port}
                        proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_iphai(self):
        """ip海"""
        urls = ['http://www.iphai.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html[0])
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address,port in zip(re_ip_address, re_port):
                    address_port = address+':'+port
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_data5u(self):
        """data5u"""
        urls = ['http://www.data5u.com/free/gngn/index.shtml']
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = await get_page(urls, options=headers)
        if html:
            proxies = []
            ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html[0])
            for address, port in re_ip_address:
                result = address + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

# recently modified

    async def crawl_goubanjia(self):
        """全网ip"""
        urls = ['http://www.goubanjia.com']
        html = await get_page(urls)
        if html:
            proxies = []
            doc = pq(html[0])
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                ip_port = td.text().replace('\n', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_89ip(self):
        """89ip"""
        urls = ['http://www.89ip.cn/index_{}.html'.format(page) for page in range(1, 4)]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                doc = pq(page)
                ips = doc('tr td:nth-child(1)').items()
                ports = doc('tr td:nth-child(2)').items()
                for ip, port in zip(ips, ports):
                    result = ip.text() + ':' + port.text()
                    ip_port = result.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_ip181(self):
        """讯代理api接口"""
        urls = ['http://www.ip181.com/']
        html = await get_page(urls)
        if html:
            proxies = []
            json_ = eval(html[0])  # NOTE: json.loads would be safer than eval on a remote body
            RESULT = json_.get('RESULT')
            for i in RESULT:
                ip = i.get('ip')
                port = i.get('port')
                result = ip + ':' + port
                ip_port = result.replace(' ', '')
                proxy = {'https': 'http://' + ip_port}
                proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_premproxy(self):
        """premproxy"""
        urls = ['https://premproxy.com/proxy-by-country/{}.htm'.format(country) for country in ('China-01','China-02','China-03','China-04','Taiwan-01')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
                re_ip_address = ip_address.findall(page)
                for address_port in re_ip_address:
                    ip_port = address_port.replace(' ', '')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)

    async def crawl_xroxy(self):
        """xroxy 换了网址不挂代理, 访问很慢"""
        urls = ['https://www.xroxy.com/proxy-country-{}'.format(country) for country in ('cn','tw')]
        html = await get_page(urls)
        if html:
            proxies = []
            for page in html:
                ip_address1 = re.compile('<td class="sorting_1">(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address1 = ip_address1.findall(page)
                print(re_ip_address1)
                ip_address2 = re.compile("<td>\d[3-5]</td>")  # NOTE: no capture group, so findall returns whole <td> tags here
                re_ip_address2 = ip_address2.findall(page)
                print(re_ip_address2)
                for address,port in zip(re_ip_address1,re_ip_address2):
                    address_port = address+':'+port
                    ip_port = address_port.replace(' ','')
                    proxy = {'https': 'http://' + ip_port}
                    proxies.append(proxy)
            self.save_to_db(proxies)
Example #37
 def set_raw_proxies(self, proxies):
     self._raw_proxies = proxies
     self._conn = RedisClient()
Example #38
 def __init__(self):
     self.redis = RedisClient()
     self.crawler = Crawler()
Example #39
 def __init__(self):
     self.redis = RedisClient()