コード例 #1
0
ファイル: getter.py プロジェクト: lixinjiang/ProxyPool
class Getter(object):
    """
    Collects proxies from every configured crawler and stores them in Redis.
    """

    def __init__(self):
        """
        Create the Redis client and instantiate one crawler per registered
        crawler class.
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        # One live crawler instance per registered class.
        self.crawlers = [cls() for cls in self.crawlers_cls]

    def is_full(self):
        """
        Check whether the pool already holds enough proxies.

        :return: bool — True once the stored count reaches PROXY_NUMBER_MAX.
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        Run every crawler and push each proxy it yields into Redis.

        Does nothing when the pool is already full.
        :return: None
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            for fetched in crawler.crawl():
                self.redis.add(fetched)
コード例 #2
0
class Getter(object):
    """
    Fills the proxy pool by running every registered crawler.
    """

    def __init__(self):
        """
        Set up the Redis client and build one instance of each crawler class.
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        # Instantiate each registered crawler class exactly once.
        self.crawlers = [cls() for cls in self.crawlers_cls]

    def is_full(self):
        """
        Check whether the proxy pool has reached its capacity.

        :return: bool — True when the count is at least PROXY_NUMBER_MAX.
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        Run all crawlers, adding every proxy they yield to Redis.

        Skips the crawl entirely when the pool is already full.
        :return: None
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            logger.info(f'爬取 {crawler} to get proxy')
            for fetched in crawler.crawl():
                self.redis.add(fetched)
コード例 #3
0
ファイル: getter.py プロジェクト: lixinjiang/ProxyPool
 def __init__(self):
     """
     Set up the Redis client and build one instance of each crawler class.
     """
     self.redis = RedisClient()
     self.crawlers_cls = crawlers_cls
     # One instance per registered crawler class.
     self.crawlers = [cls() for cls in self.crawlers_cls]
コード例 #4
0
 def __init__(self):
     """
     Initialize the Redis client and the crawler instances.

     ``crawlers_cls`` is a module-level sequence of crawler classes
     (defined elsewhere in the project); one instance of each is created.
     """
     self.redis = RedisClient()
     self.crawlers_cls = crawlers_cls
     self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
コード例 #5
0
ファイル: getter.py プロジェクト: so86/ProxyPool
class Getter(object):
    """
    Getter of proxypool: seeds the pool from a static proxy file, then
    runs every registered crawler to fetch more proxies.
    """

    def __init__(self):
        """
        Create the Redis client and instantiate every crawler class.
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """
        Check whether the proxy pool is already full.

        :return: bool — True when the count is at least PROXY_NUMBER_MAX.
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        Seed the pool from ``staticproxy.txt`` (lines of optional
        ``user:pass@ip:port``; blank lines and ``#`` comments are skipped),
        then run every crawler and store the proxies it yields.

        :return: None
        """
        if self.is_full():
            return
        proxyfile = "staticproxy.txt"
        with open(proxyfile, 'r') as fh:
            proxylines = fh.readlines()
        logger.info(f'read {proxyfile}')
        # Fix: compile the pattern once instead of recompiling it for every
        # line of the file.
        pattern = re.compile(
            r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
        )
        for line in proxylines:
            if line.strip() != "" and not line.startswith("#"):
                line = line.replace("\r\n", "").replace("\n", "")
                match = pattern.search(line)
                if match:
                    # Fix: read groupdict() once instead of four times.
                    fields = match.groupdict()
                    proxy = Proxy(host=fields['ip'],
                                  port=fields['port'],
                                  username=fields['username'],
                                  password=fields['password'])
                    logger.info("getproxy " + proxy.string())
                    self.redis.add(proxy)

        for crawler in self.crawlers:
            logger.info(f'crawler {crawler} to get proxy')
            for proxy in crawler.crawl():
                print(proxy.string())
                self.redis.add(proxy)
コード例 #6
0
class Tester(object):
    """
    Tester for the proxies stored in the queue: validates each proxy
    against TEST_URL and raises or lowers its score in Redis.
    """

    def __init__(self):
        """
        Initialize the Redis client and the asyncio event loop.
        """
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def test(self, proxy: Proxy):
        """
        Test a single proxy against TEST_URL and adjust its score.

        :param proxy: Proxy object
        :return: None
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
                ssl=False)) as session:
            try:
                logger.debug(f'testing {proxy.string()}')
                async with session.get(TEST_URL,
                                       proxy=f'http://{proxy.string()}',
                                       timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is valid, set max score')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is invalid, decrease score'
                        )
            except EXCEPTIONS:
                # Network/timeout errors count as an invalid proxy.
                self.redis.decrease(proxy)
                logger.debug(
                    f'proxy {proxy.string()} is invalid, decrease score')

    @logger.catch
    def run(self):
        """
        Main test loop: fetch stored proxies in batches of TEST_BATCH and
        test each batch concurrently on the event loop.

        :return: None
        """
        # Fix: log message typo 'stating' -> 'starting'.
        logger.info('starting tester...')
        count = self.redis.count()
        logger.debug(f'{count} proxies to test')
        for i in range(0, count, TEST_BATCH):
            # start and end offsets of the current batch
            start, end = i, min(i + TEST_BATCH, count)
            logger.debug(f'testing proxies from {start} to {end} indices')
            proxies = self.redis.batch(start, end)
            # Fix: asyncio.wait() no longer accepts bare coroutines
            # (deprecated in 3.8, removed in 3.11) — wrap them in futures.
            tasks = [asyncio.ensure_future(self.test(proxy))
                     for proxy in proxies]
            self.loop.run_until_complete(asyncio.wait(tasks))
コード例 #7
0
def get_conn():
    """
    Return the Redis client cached on flask's ``g``, creating it on
    first access.

    :return: the shared RedisClient instance
    """
    try:
        return g.redis
    except AttributeError:
        g.redis = RedisClient()
        return g.redis
コード例 #8
0
ファイル: checker.py プロジェクト: XieFengCheng/xfc
 def __init__(self):
     """
     Create the Redis client and the default request headers.
     """
     self.redis = RedisClient()
     # Desktop-Chrome User-Agent used for all check requests.
     user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36')
     self.headers = {'User-Agent': user_agent}
コード例 #9
0
ファイル: check_proxy.py プロジェクト: XieFengCheng/xfc
def checking(proxy):
    """
    Check a single proxy against TEST_URL and return it if it works.

    :param proxy: proxy address string, e.g. ``"1.2.3.4:8080"``
    :return: the proxy string on an HTTP 200 response, otherwise None
             (the proxy's score is decreased in Redis on failure)
    """
    ocj = RedisClient()
    proxies = {
        'http': 'http://{}'.format(proxy),
        'https': 'https://{}'.format(proxy)
    }
    try:
        res_proxy = requests.get(TEST_URL,
                                 headers=TEST_HEADERS,
                                 proxies=proxies,
                                 verify=False,
                                 timeout=100)
        if res_proxy.status_code == 200:
            return proxy
        else:
            ocj.decrease(proxy)
            return None
    # Fix: `except A and B and C` evaluates the `and` chain to its last
    # operand, so only HTTPError was actually caught; a tuple catches all
    # three exception types.
    except (ConnectionError, Timeout, HTTPError):
        ocj.decrease(proxy)
        return None
コード例 #10
0
 def __init__(self):
     """
     Initialize the Redis client and the asyncio event loop used to run
     the proxy tests.
     """
     self.redis = RedisClient()
     self.loop = asyncio.get_event_loop()
コード例 #11
0
class Tester(object):
    """
    Tester for the proxies stored in the queue: optionally verifies
    anonymity, then validates each proxy against TEST_URL and adjusts
    its score in Redis.
    """

    def __init__(self):
        """
        Initialize the Redis client and the asyncio event loop.
        """
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def test(self, proxy: Proxy):
        """
        Test a single proxy and adjust its score.

        When TEST_ANONYMOUS is enabled the proxy must also hide the real
        IP (checked via httpbin); a transparent proxy fails the assert and
        is scored down through the EXCEPTIONS handler.

        :param proxy: Proxy object
        :return: None
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
                ssl=False)) as session:
            try:
                logger.debug(f'testing {proxy.string()}')
                # If TEST_ANONYMOUS is True, make sure the proxy actually
                # hides the real IP before scoring it.
                if TEST_ANONYMOUS:
                    url = 'https://httpbin.org/ip'
                    async with session.get(url,
                                           timeout=TEST_TIMEOUT) as response:
                        resp_json = await response.json()
                        origin_ip = resp_json['origin']
                    async with session.get(url,
                                           proxy=f'http://{proxy.string()}',
                                           timeout=TEST_TIMEOUT) as response:
                        resp_json = await response.json()
                        anonymous_ip = resp_json['origin']
                    assert origin_ip != anonymous_ip
                    assert proxy.host == anonymous_ip
                async with session.get(TEST_URL,
                                       proxy=f'http://{proxy.string()}',
                                       timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is valid, set max score')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(
                            f'proxy {proxy.string()} is invalid, decrease score'
                        )
            except EXCEPTIONS:
                self.redis.decrease(proxy)
                logger.debug(
                    f'proxy {proxy.string()} is invalid, decrease score')

    @logger.catch
    def run(self):
        """
        Main test loop: scan the pool with the Redis cursor in batches of
        TEST_BATCH and test each batch concurrently.

        :return: None
        """
        # Fix: log message typo 'stating' -> 'starting'.
        logger.info('starting tester...')
        count = self.redis.count()
        logger.debug(f'{count} proxies to test')
        cursor = 0
        while True:
            logger.debug(
                f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
            cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
            if proxies:
                # Fix: asyncio.wait() no longer accepts bare coroutines
                # (deprecated in 3.8, removed in 3.11) — wrap them in futures.
                tasks = [asyncio.ensure_future(self.test(proxy))
                         for proxy in proxies]
                self.loop.run_until_complete(asyncio.wait(tasks))
            # Redis SCAN returns cursor 0 when the iteration is complete.
            if not cursor:
                break
コード例 #12
0
def clien():
    """
    Pick one random proxy from the pool.

    :return: whatever ``RedisClient.random()`` yields
    """
    return RedisClient().random()
コード例 #13
0
class Tester(object):
    """
    Tester for the proxies stored in the queue, with two optional
    anonymity checks (public httpbin and a self-hosted endpoint).
    """

    def __init__(self):
        """
        Initialize the Redis client and the asyncio event loop.
        """
        self.redis = RedisClient()
        self.loop = asyncio.get_event_loop()

    async def test_anonymous1(self, proxy: Proxy, session):
        """
        Verify via httpbin that the proxy hides the real IP.

        Raises AssertionError when the origin IP leaks through the proxy
        or when the IP reported through the proxy differs from the proxy
        host; the caller's EXCEPTIONS handler scores the proxy down.
        """
        url = 'https://httpbin.org/ip'
        async with session.get(url, timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            origin_ip = resp_json['origin']
        async with session.get(url,
                               proxy=f'http://{proxy.string()}',
                               timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            anonymous_ip = resp_json['origin']

        logger.debug(
            f'只测试匿名代理: {origin_ip != anonymous_ip} -- 结果匿名ip:{anonymous_ip},代理ip:{proxy.string()}'
        )
        assert origin_ip != anonymous_ip
        assert proxy.host == anonymous_ip

    async def test_anonymous2(self, proxy: Proxy, session):
        """
        Same anonymity check as test_anonymous1 but against a self-hosted
        endpoint, used when TEST_ANONYMOUS_MYSELF is enabled.
        """
        url = 'http://km.chik.cn/ip'
        async with session.get(url, timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            origin_ip = resp_json['origin']
        async with session.get(url,
                               proxy=f'http://{proxy.string()}',
                               timeout=TEST_TIMEOUT) as response:
            resp_json = await response.json()
            anonymous_ip = resp_json['origin']

        logger.debug(
            f'只测试匿名代理2: {origin_ip != anonymous_ip} -- 结果匿名ip:{anonymous_ip},代理ip:{proxy.string()}'
        )
        assert origin_ip != anonymous_ip
        assert proxy.host == anonymous_ip

    async def test(self, proxy: Proxy):
        """
        Test a single proxy and adjust its score in Redis.

        :param proxy: Proxy object
        :return: None
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
                ssl=False)) as session:
            try:
                # When enabled, make sure the proxy actually hides the
                # real IP before scoring it.
                if TEST_ANONYMOUS:
                    await self.test_anonymous1(proxy, session)
                if TEST_ANONYMOUS_MYSELF:
                    await self.test_anonymous2(proxy, session)

                async with session.get(TEST_URL,
                                       proxy=f'http://{proxy.string()}',
                                       timeout=TEST_TIMEOUT,
                                       allow_redirects=False) as response:
                    if response.status in TEST_VALID_STATUS:
                        self.redis.max(proxy)
                        logger.debug(f'代理 {proxy.string()} 是有效的, 设置最大分数')
                    else:
                        self.redis.decrease(proxy)
                        logger.debug(f'代理 {proxy.string()} 不可使用, 降低分数')

            except EXCEPTIONS:
                # Heavier penalty than the default decrease for hard failures.
                self.redis.decrease(proxy, score=-10)

    @logger.catch
    def run(self):
        """
        Main test loop: scan the pool with the Redis cursor in batches of
        TEST_BATCH and test each batch concurrently on the event loop.

        :return: None
        """
        logger.info('开始测试...')
        count = self.redis.count()
        logger.debug(f'共 {count} 个代理等待测试')
        cursor = 0
        while True:
            logger.debug(f'测试代理游标 {cursor}, count {TEST_BATCH}')
            cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
            if proxies:
                # Fix: asyncio.wait() no longer accepts bare coroutines
                # (deprecated in 3.8, removed in 3.11) — wrap them in futures.
                tasks = [asyncio.ensure_future(self.test(proxy))
                         for proxy in proxies]
                self.loop.run_until_complete(asyncio.wait(tasks))
            # Redis SCAN returns cursor 0 when the iteration is complete.
            if not cursor:
                break