Beispiel #1
0
 def __init__(self, crawler):
     """Announce construction on stdout and set up the initial state.

     :param crawler: accepted for the caller's benefit; not used here.
     """
     stamp = time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time()))
     print('[{}]'.format(stamp), "【init】")
     # No proxy is selected by default, i.e. the real IP is used.
     self.proxy = None
     self.db = RedisClient()
Beispiel #2
0
class Getter():
    """Feeds proxies obtained through the crawler into the Redis store."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Tell whether the proxy pool has reached its size limit.
        :return: True when the stored count is >= POOL_UPPER_THRESHOLD
        """
        return bool(self.redis.count() >= POOL_UPPER_THRESHOLD)

    def run(self):
        """
        Run every crawl callback once and store what it yields.

        Does nothing beyond the start-up message when the pool is already
        at its limit.
        """
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        source = self.crawler
        for index in range(source.__CrawlFuncCount__):
            fetched = source.get_proxies(source.__CrawlFunc__[index])
            sys.stdout.flush()
            for item in fetched:
                # Echo each proxy together with its type before storing it.
                print(item, type(item))
                self.redis.add(item)
Beispiel #3
0
 def control_add_proxy():
     """Top the pool up forever, checking once per CYCLE_TIME seconds.

     Whenever the stored proxy count is below PROXY_MAX, PoolAdder is asked
     to add more. This function never returns.
     """
     conn = RedisClient()
     adder = PoolAdder()
     cycle = CYCLE_TIME
     while True:
         if conn.count() < PROXY_MAX:
             adder.add_to_redis()
         time.sleep(cycle)
Beispiel #4
0
 def valid_proxies(cycle=VALID_CHECK_CYCLE):
     """Periodically hand half of the stored proxies to the validity tester.

     :param cycle: seconds to sleep between rounds.

     Runs forever. A round with nothing to check only prints a message.
     """
     client = RedisClient()
     checker = ValidityTester()
     while True:
         print("check valid proxies")
         half = int(client.zset_len / 2)
         if half == 0:
             print("add new valid proxies")
         else:
             checker.set_raw_proxies(client.get(half))
             checker.test()
         time.sleep(cycle)
Beispiel #5
0
 def control_test_proxy():
     """Periodically re-test half of the stored proxies, forever.

     Each round sleeps CYCLE_TIME seconds afterwards. A round with nothing
     to test only prints a message.
     """
     client = RedisClient()
     pause = CYCLE_TIME
     checker = ValidityTester()
     while True:
         print('Refreshing ip')
         half = int(0.5 * client.count())
         if half == 0:
             print('Waiting for adding')
         else:
             checker.set_test_procies(client.get(half))
             checker.test()
         time.sleep(pause)
Beispiel #6
0
class Tester(object):
    """Re-checks stored proxies against TEST_URL and updates them in Redis."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES and ``self.redis.decrease(proxy)`` otherwise or
        when the request fails.
        :param proxy: proxy address without scheme, as str or UTF-8 bytes
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = "http://" + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        # Report an invalid status only when it is invalid;
                        # this used to be printed for valid proxies as well.
                        print('请求响应码不合法 ', response.status, 'IP',
                              proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    async def _test_batch(self, proxies):
        """Test every proxy of one batch concurrently."""
        # gather() accepts coroutines directly, whereas asyncio.wait()
        # rejects them on Python 3.11+. return_exceptions keeps one failing
        # check from cancelling the rest of the batch.
        outcomes = await asyncio.gather(
            *(self.test_single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print('测试器发生错误', outcome.args)

    def run(self):
        """
        Tester entry point: test all stored proxies in batches.
        :return: None
        """
        print('测试器开始运行')
        try:
            count = self.redis.count()
            print('当前剩余', count, '个代理')
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                print('正在测试第', start + 1, '-', stop, '个代理')
                test_proxies = self.redis.batch(start, stop)

                loop = asyncio.get_event_loop()
                loop.run_until_complete(self._test_batch(test_proxies))
                sys.stdout.flush()
                # Pause between batches.
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Beispiel #7
0
 def run(self):
     """Flush the Redis store, then start the two scheduler processes.

     One process runs Schedule.valid_proxies, the other Schedule.check_pool.
     They are started in that order and not joined here.
     """
     print('Ip processing running')
     RedisClient().flush()
     workers = [
         Process(target=Schedule.valid_proxies),
         Process(target=Schedule.check_pool),
     ]
     for worker in workers:
         worker.start()
Beispiel #8
0
class PoolAdder(object):
    """Fetches raw proxies, has them tested, and repeats until the pool is full."""

    def __init__(self):
        self.proxy_max = PROXY_MAX
        self.tester = ValidityTester()
        self.getter = FreeProxyGetter()
        self.conn = RedisClient()

    def is_picture(self):
        """Return True while the pool holds fewer than ``proxy_max`` proxies."""
        return self.conn.count() < self.proxy_max

    def add_to_redis(self):
        """Run the crawl callbacks until the pool is full or a pass is empty.

        Each pass runs every crawl callback and hands the results to the
        tester. The loop ends when the pool is full, or when a whole pass
        fetched no proxies at all.
        """
        while self.is_picture():
            # Counted per pass: a cumulative counter would disable the
            # "nothing fetched" exit after the first successful pass and
            # could keep this loop spinning forever.
            fetched_this_pass = 0
            for callback_label in range(self.getter.__CrawlFuncCount__):
                callback = self.getter.__CrawlFunc__[callback_label]
                raw_proxies = self.getter.get_raw_proxies(callback)
                if not raw_proxies:
                    # Nothing to test from this source (None or empty).
                    continue
                self.tester.set_test_procies(raw_proxies)
                self.tester.test()
                fetched_this_pass += len(raw_proxies)
                if not self.is_picture():
                    print('代理ip队列已满')
                    break
            if fetched_this_pass == 0:
                print('请求不到ip')
                break
Beispiel #9
0
 def check_pool(cycle=POOL_LEN_CHECK_CYCLE,
                min=POOL_LOWER_THRESHOLD,
                max=POOL_UPPER_THRESHOLD):
     """Refill the pool whenever its size is at or below ``min``, forever.

     :param cycle: seconds to sleep between size checks.
     :param min: size at or below which a refill is triggered.
     :param max: value passed to ProxyPoolAdder on construction.
     """
     store = RedisClient()
     refiller = ProxyPoolAdder(max)
     while True:
         needs_refill = store.zset_len <= min
         if needs_refill:
             refiller.add_to_zset()
         time.sleep(cycle)
Beispiel #10
0
class Getter():
    """Feeds proxies obtained through the crawler into the Redis store."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the stored count is >= POOL_UPPER_THRESHOLD."""
        return bool(self.redis.count() >= POOL_UPPER_THRESHOLD)

    def run(self):
        """Run every crawl callback once and store each proxy it yields.

        Does nothing beyond the start-up message when the pool is already
        at its limit.
        """
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        source = self.crawler
        for index in range(source.__CrawlFuncCount__):
            # NOTE(review): `__crawlFunc__` (lowercase c) does not match the
            # capitalisation of `__CrawlFuncCount__` -- confirm against the
            # Crawler class that this attribute really exists.
            callback = source.__crawlFunc__[index]
            for item in source.get_proxies(callback):
                self.redis.add(item)
Beispiel #11
0
class ValidityTester():
    """Checks raw proxies against TEST_API and stores the working ones."""

    test_api = TEST_API
    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5702.400 QQBrowser/10.2.1893.400'
    }
    # Seconds to wait for the test request; without a timeout one dead proxy
    # can block a worker thread indefinitely.
    timeout = 10
    # Upper bound on how often test() retries after an unexpected failure.
    max_test_attempts = 3

    def __init__(self):
        self.test_proxies = []

    def set_test_procies(self, proxies):
        """Store the proxies to test and open the Redis client used by test()."""
        self.test_proxies = proxies
        self.conn = RedisClient()

    def test_single_proxy(self, proxy):
        """Request TEST_API through ``proxy`` and store it on HTTP 200.

        :param proxy: proxy address without scheme, as str or UTF-8 bytes.
        """
        try:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            # requests selects a proxy by URL scheme ('http' / 'https'); a
            # key of 'http://' matches nothing, so the request would go out
            # directly and every proxy would appear to work.
            r = requests.get(url=self.test_api,
                             proxies={'http': real_proxy,
                                      'https': real_proxy},
                             headers=self.header,
                             timeout=self.timeout)
            if r.status_code == 200:
                self.conn.put(proxy)
                print('验证成功:', proxy)
            else:
                print('验证失败:', proxy)
        except Exception:
            # Best effort: any failure just marks this proxy as unusable.
            print('请求失败:', proxy)

    def test(self):
        """Test all pending proxies on a pool of five worker threads.

        On an unexpected error the whole run is retried, at most
        ``max_test_attempts`` times in total, instead of recursing without
        bound. The pending list is cleared after a successful run.
        """
        for attempt in range(1, self.max_test_attempts + 1):
            try:
                test_proxy_pool = threadpool.ThreadPool(5)
                test_proxy_requests = threadpool.makeRequests(
                    self.test_single_proxy, self.test_proxies)
                for tpr in test_proxy_requests:
                    test_proxy_pool.putRequest(tpr)
                test_proxy_pool.wait()
            except Exception as exc:
                print('测试失败:', attempt, exc)
            else:
                self.test_proxies = []
                return
Beispiel #12
0
class QichachaProxyMiddleware(object):
    """Downloader middleware that puts retried requests behind a proxy."""

    def __init__(self):
        self.proxyData = RedisClient()
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        """Attach a random proxy to requests that are being retried.

        Requests without a truthy ``retry_times`` meta value are left
        untouched, as are retries for which no proxy is available.

        :param request: request whose ``meta`` dict is updated in place.
        :param spider: unused.
        :return: None
        """
        if not request.meta.get('retry_times'):
            return
        proxy = self.proxyData.random_proxy()
        if not proxy:
            return
        if isinstance(proxy, bytes):
            # Guard against a client that returns raw bytes.
            proxy = proxy.decode('utf-8')
        self.logger.debug('使用代理' + proxy)
        request.meta['proxy'] = 'https://{}'.format(proxy)
        # Scrapy reads the key 'download_timeout'; the former spelling
        # 'downlaod_timeout' was silently ignored.
        request.meta['download_timeout'] = 20
Beispiel #13
0
class ValidityTester(object):
    """Checks raw proxies against TEST_API with aiohttp and stores the good ones."""

    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._available_proxies = []
        self._conn = RedisClient()
        self.logger = logging.getLogger(__name__)

    def set_raw_proxies(self, proxies):
        """Store the proxies that the next call to test() will check."""
        self._raw_proxies = proxies

    async def test_one_proxy(self, proxy):
        """Request TEST_API through ``proxy`` and add it to Redis on HTTP 200.

        :param proxy: proxy address without scheme, as str or UTF-8 bytes.
        """
        async with aiohttp.ClientSession() as session:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = "http://" + proxy
            try:
                async with session.get(ValidityTester.test_api,
                                       proxy=real_proxy,
                                       timeout=10) as rep:
                    if rep.status == 200:
                        self._conn.add(proxy)
                        self.logger.info("valid proxy:" + proxy)
            except Exception:
                # Best effort: any failure marks the proxy as invalid.
                self.logger.error("invalid proxy:" + proxy)

    async def _test_all(self, proxies):
        """Check all given proxies concurrently."""
        # gather() accepts coroutines directly, whereas asyncio.wait()
        # rejects them on Python 3.11+.
        outcomes = await asyncio.gather(
            *(self.test_one_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                self.logger.error('Async Error: %r', outcome)

    def test(self):
        """
        aio test all proxies.
        """
        print('ValidityTester is working')
        proxies = self._raw_proxies
        if not proxies:
            # Nothing to test (None or empty). The empty case used to surface
            # as a ValueError from asyncio.wait() and was logged like this.
            self.logger.error('Async Error')
            return
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self._test_all(proxies))
Beispiel #14
0
class Tester(object):
    """Re-checks stored proxies against TEST_URL and updates them in Redis."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES and ``self.redis.decrease(proxy)`` otherwise or
        when the request fails with one of the expected errors.
        :param proxy: a single proxy, as str or UTF-8 bytes
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://" + proxy
                print("正在测试", proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=10) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("请求响应码不和法", proxy)
            except (
                    ClientError,
                    ProxyConnectionError,
                    ServerConnectionError,
                    TimeoutError,
                    # Before Python 3.11 asyncio's timeout is a distinct
                    # class, so it must be listed to count as a failure.
                    asyncio.TimeoutError,
                    AttributeError,
            ):
                self.redis.decrease(proxy)
                print("代理请求失败", proxy)
            except Exception as e:
                print(e)

    async def _test_batch(self, proxies):
        """Test every proxy of one batch concurrently."""
        # gather() accepts coroutines directly, whereas asyncio.wait()
        # rejects them on Python 3.11+.
        outcomes = await asyncio.gather(
            *(self.test_single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print("测试器发生错误", outcome.args)

    def run(self):
        """
        Tester entry point: test all stored proxies in batches.
        :return: None
        """
        print("测试器开始运行")
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # Test in batches
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                # Only the current slice is tested; building the tasks from
                # the full list retested the whole pool on every batch.
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                loop.run_until_complete(self._test_batch(test_proxies))
                time.sleep(5)
        except Exception as e:
            print("测试器发生错误", e.args)
Beispiel #15
0
class Getter():
    """Feeds proxies obtained through the crawler into the Redis store."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Compare the stored proxy count with the pool capacity.

        :return: True when the count is >= POOL_UPPER_THRESHOLD
        """
        return bool(self.redis.count() >= POOL_UPPER_THRESHOLD)

    def run(self):
        """Run every crawl callback once and store each proxy it yields.

        Does nothing beyond the debug message when the pool is already at
        its limit.
        """
        logger.debug('Getter is running.')
        if self.is_over_threshold():
            return
        source = self.crawler
        for index in range(source.__CrawlFuncCount__):
            # Fetch the proxies produced by this callback.
            fetched = source.get_proxies(source.__CrawlFunc__[index])
            sys.stdout.flush()
            for item in fetched:
                self.redis.add(item)
Beispiel #16
0
class Tester(object):
    """Re-checks stored proxies against TEST_URL and updates them in Redis."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test a single proxy.

        Calls ``self.redis.max(proxy)`` when the response status is in
        VALID_STATUS_CODES and ``self.redis.decrease(proxy)`` otherwise or
        when the request fails.

        :param proxy: proxy address without scheme, as str or UTF-8 bytes.
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', proxy)
            # Before Python 3.11 asyncio.TimeoutError is not the builtin
            # TimeoutError, so it is listed explicitly; otherwise a timed-out
            # proxy would never be decreased.
            except (ClientError, ClientConnectionError, TimeoutError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    async def _test_batch(self, proxies):
        """Test every proxy of one batch concurrently."""
        # gather() accepts coroutines directly, whereas asyncio.wait()
        # rejects them on Python 3.11+.
        outcomes = await asyncio.gather(
            *(self.test_single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print('测试器发生错误', outcome.args)

    def run(self):
        """Test all stored proxies in batches of BATCH_TEST_SIZE.

        Sleeps five seconds after each batch. Any error is printed and ends
        the run.
        """
        print('测试器开始运行')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                loop.run_until_complete(self._test_batch(test_proxies))
                time.sleep(5)
        except Exception as e:
            print('测试器发生错误', e.args)
Beispiel #17
0
 def __init__(self):
     """Create the Redis client and keep it as ``self.redis``."""
     client = RedisClient()
     self.redis = client
Beispiel #18
0
 def __init__(self):
     """Create the Redis client and the crawler this object works with."""
     # Constructed left to right: the Redis client first, then the crawler.
     self.redis, self.crawler = RedisClient(), Crawler()
Beispiel #19
0
 def set_test_procies(self, proxies):
     """Remember the proxies to test and open a new Redis client.

     :param proxies: stored unchanged as ``self.test_proxies``.
     """
     self.test_proxies = proxies
     client = RedisClient()
     self.conn = client
Beispiel #20
0
 def __init__(self):
     """Record the pool limit and create the tester, getter and Redis client."""
     self.proxy_max = PROXY_MAX
     # Collaborators are built in this order: tester, getter, connection.
     tester = ValidityTester()
     getter = FreeProxyGetter()
     conn = RedisClient()
     self.tester = tester
     self.getter = getter
     self.conn = conn
Beispiel #21
0
class SuperspiderDownloaderMiddleware(object):
    """Downloader middleware that sends requests through proxies from Redis.

    Non-200 responses, pages containing the security-check marker and
    download exceptions are all answered with a copy of the request that is
    marked for retry, which makes the next ``process_request`` pick a new
    proxy.
    """

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self, crawler):
        """
        :param crawler: passed in by ``from_crawler``; not used here.
        """
        print('[' + self._timestamp() + ']', "【init】")
        # No proxy is selected by default, i.e. the real IP is used.
        self.proxy = None
        self.db = RedisClient()

    @staticmethod
    def _timestamp():
        """Return the current local time as ``YYYY/MM/DD HH:MM:SS``."""
        return time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(time.time()))

    def infoprint(self, message):
        """Print ``message`` to stdout with a timestamp and an Info tag."""
        print('[' + self._timestamp() + ']【Info】' + message)

    def get_proxy(self):
        """Fetch a random proxy from Redis.

        :return: the proxy as an ``http://`` URL, or None when none could
            be obtained.
        """
        try:
            proxy = self.db.get_random().decode('utf-8')
            return "http://{}".format(proxy)
        except Exception:
            # Best effort: any failure here simply means "no proxy".
            print('[' + self._timestamp() + ']', '【获取代理失败!】')
            return None

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls(crawler)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Assign a proxy to the outgoing request.

        A new proxy is fetched when none is selected yet, when the request
        carries no proxy, or when the request still carries the current one.
        If no proxy can be obtained, the previous selection (possibly None)
        stays in effect.

        :return: None, so Scrapy continues processing the request.
        """
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        self.infoprint('【是否重试】:' + ('是' if request.meta.get('retry') else '否'))
        old_proxy = request.meta.get('proxy')
        if self.proxy is None or old_proxy is None or self.proxy == old_proxy:
            # The request is being (re)issued: switch to another proxy.
            proxy = self.get_proxy()
            self.infoprint('更换代理为:{}'.format(proxy))
            if proxy:
                self.proxy = proxy
        request.meta['proxy'] = self.proxy
        # format() instead of '+' so a missing proxy (None) cannot raise.
        message = '【request】{} URL:{}'.format(self.proxy, request.url)
        # Keep using a spider-provided info() if there is one; a stock
        # Scrapy spider only offers spider.logger.
        report = getattr(spider, 'info', None)
        if not callable(report):
            report = spider.logger.info
        report(message)

    def process_response(self, request, response, spider):
        """Return the response, or a retry request when it looks blocked.

        :return: ``response`` for a 200 page without the security-check
            marker, otherwise the result of ``get_retry_request``.
        """
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        # Report the proxy this request went out with. self.proxy may have
        # been reset to None by a retry in the meantime, so it must not be
        # concatenated directly.
        used_proxy = str(request.meta.get('proxy') or self.proxy)
        if response.status != 200:
            if response.status == 302:
                message = '【被拦截】:' + used_proxy
            elif response.status == 404:
                message = ('【无法找到文件】:' + used_proxy + ' URL:' +
                           request.url)
            else:
                message = ('【未知】' + used_proxy + ' ' +
                           str(response.status) + ' URL:' + request.url)
            self.infoprint(message)
            spider.logger.warning(message)
            return self.get_retry_request(request)

        if '用户访问安全认证' in response.text:
            message = '【出现安全认证】' + response.url
            self.infoprint(message)
            spider.logger.warning(message)
            return self.get_retry_request(request)

        return response

    def process_exception(self, request, exception, spider):
        """Log the download exception and schedule a retry.

        :return: the result of ``get_retry_request``.
        """
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        try:
            os_error = getattr(exception, 'osError', None)
            if os_error is None:
                detail = str(exception)
            elif str(os_error) in ("10060", "10061"):
                # NOTE(review): presumably the WinSock timeout / refused
                # codes -- confirm.
                detail = str(exception.args)
            else:
                detail = str(os_error)
            message = '【exception】' + request.url + ' ' + detail
            self.infoprint(message)
            spider.logger.error(message)
        except Exception:
            # Reporting is best effort and must never prevent the retry.
            pass

        self.infoprint('【请求错误】重试')
        spider.logger.info('【请求错误】重试')

        # Retry
        return self.get_retry_request(request)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def get_retry_request(self, request):
        """Build a copy of ``request`` that is marked for retry.

        Also resets the current proxy so the next request selects a new one.

        :return: the retry request, or None when copying failed.
        """
        try:
            self.proxy = None  # reset the proxy
            retry_request = request.copy()
            retry_request.dont_filter = True  # bypass the duplicate filter
            retry_request.meta['retry'] = time.time()
            return retry_request
        except Exception as e:
            self.infoprint('【get_retry_request】【获取要重试的请求出错】' + str(e))
            return None
Beispiel #22
0
def get_conn():
    """Return the Redis client cached on ``g``, creating it on first use."""
    try:
        return g.redis
    except AttributeError:
        # First access: no client has been attached to ``g`` yet.
        g.redis = RedisClient()
        return g.redis
Beispiel #23
0
from ProxyPool.db import RedisClient

# Module-level Redis client used by set() below; created when this module
# is imported.
conn = RedisClient()


def set(proxy):
    """Add ``proxy`` through the module-level client and print the outcome.

    :param proxy: value passed unchanged to ``conn.add``.
    """
    added = conn.add(proxy)
    print(proxy)
    if added:
        print('录入成功')
    else:
        print('录入失败')


def scan():
    """Read proxies from standard input and store each one via set().

    Reading stops at the first line that is exactly ``exit``.
    """
    print('请输入代理, 输入exit退出读入')
    # iter() with a sentinel keeps calling input() until it returns 'exit'.
    for entry in iter(input, 'exit'):
        set(entry)


# Start the interactive prompt only when this file is run as a script.
if __name__ == '__main__':
    scan()
Beispiel #24
0
 def __init__(self, threshold):
     """Create the collaborators and remember the pool threshold.

     :param threshold: stored unchanged as ``self._threshold``.
     """
     # Built in this order: connection, crawler, tester.
     conn = RedisClient()
     crawl = FreeProxyGetter()
     tester = ValidityTester()
     self._conn = conn
     self._crawl = crawl
     self._tester = tester
     self._threshold = threshold
     self.logger = logging.getLogger(__name__)
Beispiel #25
0
class Tester:
    """Re-checks stored proxies against TEST_URL and updates their scores."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test the availability of a proxy.
        And maximize its score if available else decrease its score.
        :param proxy: proxy address without scheme, as str or UTF-8 bytes
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                # The URL must carry a scheme; plain HTTP is used here.
                real_proxy = 'http://' + proxy
                logger.debug('Testing {}'.format(proxy))
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODES:
                        logger.debug('Proxy {} is OK'.format(proxy))
                        self.redis.maximize(proxy)
                    else:
                        logger.warning(
                            'Failed to use proxy {} because the response code was {}'
                            .format(proxy, response.status))
                        self.redis.decrease(proxy)
            except (ClientError,
                    aiohttp.client_exceptions.ClientConnectorError,
                    asyncio.TimeoutError, AttributeError) as e:
                self.redis.decrease(proxy)
                logger.warning('Failed to use proxy {} because of {}'.format(
                    proxy, repr(e)))

    async def _test_batch(self, proxies):
        """Test every proxy of one batch concurrently."""
        # gather() accepts coroutines directly, whereas asyncio.wait()
        # rejects them on Python 3.11+. return_exceptions keeps one failing
        # check from cancelling the rest of the batch.
        outcomes = await asyncio.gather(
            *(self.test_single_proxy(proxy) for proxy in proxies),
            return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                logger.warning('Tester error {}'.format(outcome.args))

    def run(self):
        """
        Run tester: test all stored proxies in batches of BATCH_TEST_SIZE.
        :return: None
        """
        logger.debug('Tester is running.')
        try:
            count = self.redis.count()
            logger.info(
                'There are {} proxy (proxies) in proxy pool now.'.format(
                    count))
            for start in range(0, count, BATCH_TEST_SIZE):
                stop = min(start + BATCH_TEST_SIZE, count)
                # Test in batches to keep memory usage bounded.
                logger.debug(
                    'Testing proxies with index between {} and {}.'.format(
                        start + 1, stop))
                test_proxies = self.redis.batch(start, stop)
                # Test the batch asynchronously to speed things up.
                loop = asyncio.get_event_loop()
                loop.run_until_complete(self._test_batch(test_proxies))
                sys.stdout.flush()
                time.sleep(5)
            logger.info('Testing finished')

        except Exception as e:
            logger.warning('Tester error {}'.format(e.args))
Beispiel #26
0
 def __init__(self):
     """Create the Redis client and a logger named after this module."""
     store = RedisClient()
     self.proxyData = store
     self.logger = logging.getLogger(__name__)
Beispiel #27
0
 def __init__(self):
     """Start with no raw proxies, an empty result list and a Redis client."""
     self._raw_proxies = None
     self._available_proxies = list()
     client = RedisClient()
     self._conn = client
     self.logger = logging.getLogger(__name__)
Beispiel #28
0
def get_proxy():
    """Return whatever ``random()`` yields on a newly created RedisClient."""
    return RedisClient().random()