Example 1
import random

from haipproxy.client.py_cli import ProxyFetcher

# `args` is the Redis connection dict (host/port/password/db), defined
# elsewhere in the project, e.g. dict(host='127.0.0.1', port=6379,
# password='', db=0).

class Proxy:
    def __init__(self):
        self.fetcher = ProxyFetcher('https',
                                    strategy='greedy',
                                    redis_args=args)
        self.pools = self.fetcher.pool
        self.used = 1

    def get_ip(self):
        self.used += 1
        # Refresh the pool once roughly a third of it has been consumed.
        if self.used > len(self.pools) / 3:
            self.pools = self.fetcher.get_available_proxies(self.fetcher.conn)
            self.used = 0
        return random.choice(self.pools)

    def remove(self, url):
        self.fetcher.delete_proxy(url)
        self.pools.remove(url)
Example 2
import logging
import os
import random

from haipproxy.client.py_cli import ProxyFetcher

log = logging.getLogger(__name__)  # assumes logging is configured elsewhere

# `args` is the Redis connection dict, defined elsewhere in the project.

class Proxy:
    def __init__(self):
        self.fetcher = ProxyFetcher('https',
                                    strategy='greedy',
                                    redis_args=args)
        self.pools = self.fetcher.get_proxies()
        self.used = 1

    def get_ip(self):
        self.used += 1
        log.info("pid: {}, used: {}, length: {}, ***** {} %".format(
            os.getpid(), self.used, len(self.pools),
            self.used / (len(self.pools) + 1) * 100))
        # Refresh the pool once a tenth of it has been consumed.
        if self.used > len(self.pools) / 10:
            self.pools = self.fetcher.get_proxies()
            self.used = 0
            log.info("pool refreshed: {} proxies available".format(len(self.pools)))
        return random.choice(self.pools)

    def remove(self, url):
        self.fetcher.delete_proxy(url)
        self.pools.remove(url)
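Compared with Example 1, this version reads the pool through get_proxies(), logs consumption, and refreshes after a tenth of the pool has been used rather than a third, trading more Redis round-trips for fresher proxies.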
Example 3
import logging
import traceback

from haipproxy.client.py_cli import ProxyFetcher

logger = logging.getLogger(__name__)

def get_proxy():
    try:
        # `settings` is the application's config object, defined elsewhere.
        args = settings['outgoing']['haipproxy_redis']
        fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
        proxy = fetcher.get_proxy()
        if proxy:
            return {'http': proxy}
        logger.warning('No available proxy fetched from the proxy pool.')
    except Exception:
        # format_exc() returns the traceback as a string; the original
        # print_exc() prints it and returns None, so the log line read "None".
        logger.warning('Exception in fetching proxy:\n%s', traceback.format_exc())
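The dict returned above matches the `proxies` mapping that the requests library expects, so it can be passed straight into a request. A minimal sketch (the target URL is illustrative):

import requests

proxies = get_proxy()  # e.g. {'http': 'http://1.2.3.4:8080'}
if proxies:
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
    print(resp.text)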
Example 4
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()  # fake_useragent.UserAgent (assumed)
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
        self.args = dict(host='127.0.0.1',
                         port=6379,
                         password=REDIS_PASSWORD,  # defined in project settings
                         db=0)
        self.fetcher = ProxyFetcher('jd',
                                    strategy='greedy',
                                    redis_args=self.args)
        self.fetch_count = 0
Example 5
    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        # Re-create the fetcher after the proxy pool has been used 1000 times
        self.fetch_count += 1
        if self.fetch_count > 1000:
            self.fetcher = ProxyFetcher('jd',
                                        strategy='greedy',
                                        redis_args=self.args)
            self.fetch_count = 0
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = self.fetcher.get_proxy()
Example 6
from fake_useragent import UserAgent  # assumed source of UserAgent
from haipproxy.client.py_cli import ProxyFetcher

class RandomUserAgentMiddleware(object):
    """
    Dynamic proxying: set a random User-Agent and a haipproxy proxy
    on every outgoing request.
    """
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
        self.args = dict(host='127.0.0.1',
                         port=6379,
                         password=REDIS_PASSWORD,
                         db=0)
        self.fetcher = ProxyFetcher('jd',
                                    strategy='greedy',
                                    redis_args=self.args)
        self.fetch_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        # Re-create the fetcher after the proxy pool has been used 1000 times
        self.fetch_count += 1
        if self.fetch_count > 1000:
            self.fetcher = ProxyFetcher('jd',
                                        strategy='greedy',
                                        redis_args=self.args)
            self.fetch_count = 0
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = self.fetcher.get_proxy()
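To activate a middleware like this one, Scrapy needs it registered in the project's settings.py. A minimal sketch, assuming the class lives in myproject/middlewares.py (module path and priority value are illustrative):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in User-Agent middleware so ours takes over.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.RandomUserAgentMiddleware': 543,  # illustrative
}
RANDOM_UA_TYPE = 'random'  # attribute looked up on fake_useragent.UserAgent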
Example 7
def get_proxy_fetcher():
    """
    需要维护自己的代理,你可以在这里封装自己的代理。
    """
    host = get_config_field('redis', 'IP')
    port = get_config_field('redis', 'PORT')
    db = get_config_field('redis', 'DB')
    args = dict(host=host, port=port, password='', db=db)
    return ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
Example 8
    def __init__(self):
        self.fetcher = ProxyFetcher('https',
                                    strategy='greedy',
                                    redis_args=args)
        self.pools = self.fetcher.pool
        self.used = 1
Example 9
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from haipproxy.client.py_cli import ProxyFetcher
from random import randrange

args = dict(host='127.0.0.1', port=6379, password='', db=0)
fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)

class ScrapywithproxySpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None
Example 10
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/23 18:42
# @Author  : youfeng
import time

from haipproxy.client.py_cli import ProxyFetcher

args = dict(host='192.168.1.90', port=6379, password='******', db=15)
# Passing `zhihu` here would fetch IPs from zhihu's validated-proxy queue;
# the same proxy IP performs differently against different target sites.
fetcher = ProxyFetcher('http', strategy='greedy', redis_args=args)
# Fetch a single usable proxy
start_time = time.time()
print(fetcher.get_proxy())
print("time to fetch a proxy: {} s".format(time.time() - start_time))
# Fetch the list of usable proxies
proxies_list = fetcher.get_proxies()
print(len(proxies_list))
print(fetcher.get_proxies())  # or print(fetcher.pool)
Example 11
    def proxy(self):
        fetcher = ProxyFetcher('http', strategy='greedy', redis_args=self.args)
        return fetcher.get_proxy()
Example 12
from haipproxy.client.py_cli import ProxyFetcher
args = dict(host='127.0.0.1', port=6379, password='******', db=0)
fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
print(fetcher.get_proxy())
print(fetcher.get_proxies())  # or print(fetcher.pool)
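Every example above passes strategy='greedy'; haipproxy's client also documents a round-robin strategy named 'robin'. A minimal sketch contrasting the two (connection details are illustrative):

from haipproxy.client.py_cli import ProxyFetcher

args = dict(host='127.0.0.1', port=6379, password='', db=0)

# greedy: keep returning the best-scoring proxy until it fails
greedy_fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
# robin: rotate through the usable proxies in turn
robin_fetcher = ProxyFetcher('zhihu', strategy='robin', redis_args=args)

print(greedy_fetcher.get_proxy())
print(robin_fetcher.get_proxy())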