class Proxy:
    """Keeps a local copy of the haipproxy pool and refreshes it once part of it has been handed out."""

    def __init__(self):
        self.fetcher = ProxyFetcher('https', strategy='greedy', redis_args=args)
        self.pools = self.fetcher.pool
        self.used = 1
        # self.usedConn = StrictRedis(**args)

    def get_ip(self):
        self.used += 1
        # log.info("pid: {}, used: {}, length: {}, ***** {} %".format(os.getpid(), self.used, len(self.pools),
        #                                                             self.used / (len(self.pools) + 1) * 100))
        # Refresh the local pool once roughly a third of it has been used
        if self.used > len(self.pools) / 3:
            # self.fetcher = ProxyFetcher('http', strategy='greedy', redis_args=args)
            self.pools = self.fetcher.get_available_proxies(self.fetcher.conn)
            self.used = 0
            # log.info("""
            # ******************************
            # ******  {} use  ********
            # ******************************
            # """.format(str(len(self.pools))))
        return random.choice(self.pools)
        # use = self.usedConn.srandmember('haipproxy:all').decode()
        # log.info(use)
        # return use

    def remove(self, url):
        # Delete a dead proxy from haipproxy and from the local pool
        self.fetcher.delete_proxy(url)
        self.pools.remove(url)
class Proxy:
    """Variant that refreshes the local pool via get_proxies() after a tenth of it has been used."""

    def __init__(self):
        self.fetcher = ProxyFetcher('https', strategy='greedy', redis_args=args)
        self.pools = self.fetcher.get_proxies()
        self.used = 1

    def get_ip(self):
        self.used += 1
        log.info("pid: {}, used: {}, length: {}, ***** {} %".format(
            os.getpid(), self.used, len(self.pools),
            self.used / (len(self.pools) + 1) * 100))
        if self.used > len(self.pools) / 10:
            # self.fetcher = ProxyFetcher('http', strategy='greedy', redis_args=args)
            self.pools = self.fetcher.get_proxies()
            self.used = 0
            log.info("""
            ******************************
            ******  {}  ********
            ******************************
            """.format(str(len(self.pools))))
        return random.choice(self.pools)

    def remove(self, url):
        self.fetcher.delete_proxy(url)
        self.pools.remove(url)
def get_proxy():
    try:
        args = settings['outgoing']['haipproxy_redis']
        fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
        proxy = fetcher.get_proxy()
        if proxy:
            # Return a requests-style proxies mapping
            return {'http': proxy}
        else:
            logger.warning('No available proxy fetched from the proxy pool.')
    except Exception:
        logger.warning('Exception in fetching proxy.')
        # format_exc() returns the traceback as a string; print_exc() would log None
        logger.warning(traceback.format_exc())
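Since get_proxy() above returns a requests-style proxies mapping, it can be handed straight to the requests library. A minimal sketch of that, assuming requests is available; the fetch_with_proxy name and the target URL are hypothetical:

import requests

def fetch_with_proxy(url):
    # get_proxy() is the helper above; it returns e.g. {'http': 'http://1.2.3.4:8080'} or None
    proxies = get_proxy()
    if not proxies:
        return None
    try:
        # A timeout keeps a dead proxy from blocking the caller indefinitely
        return requests.get(url, proxies=proxies, timeout=10)
    except requests.RequestException:
        return None

resp = fetch_with_proxy('http://httpbin.org/ip')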
def __init__(self, crawler):
    super(RandomUserAgentMiddleware, self).__init__()
    self.ua = UserAgent()
    self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
    self.args = dict(host='127.0.0.1', port=6379, password=REDIS_PASSWORD, db=0)
    self.fetcher = ProxyFetcher('jd', strategy='greedy', redis_args=self.args)
    self.fetch_count = 0
def process_request(self, request, spider):
    def get_ua():
        return getattr(self.ua, self.ua_type)

    # Rebuild the fetcher after the proxy pool has been used 1000 times
    self.fetch_count += 1
    if self.fetch_count > 1000:
        self.fetcher = ProxyFetcher('jd', strategy='greedy', redis_args=self.args)
        self.fetch_count = 0
    request.headers.setdefault('User-Agent', get_ua())
    request.meta['proxy'] = self.fetcher.get_proxy()
class RandomUserAgentMiddleware(object):
    """
    Dynamic proxies: set a random User-Agent and a fresh proxy on every request.
    """

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
        self.args = dict(host='127.0.0.1', port=6379, password=REDIS_PASSWORD, db=0)
        self.fetcher = ProxyFetcher('jd', strategy='greedy', redis_args=self.args)
        self.fetch_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        # Rebuild the fetcher after the proxy pool has been used 1000 times
        self.fetch_count += 1
        if self.fetch_count > 1000:
            self.fetcher = ProxyFetcher('jd', strategy='greedy', redis_args=self.args)
            self.fetch_count = 0
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = self.fetcher.get_proxy()
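For the middleware above to run, it has to be registered in the Scrapy project settings. A minimal sketch, assuming the class lives in a hypothetical myproject/middlewares.py; disabling Scrapy's built-in UserAgentMiddleware lets the random User-Agent take effect:

# settings.py (sketch; the module path 'myproject.middlewares' is hypothetical)
RANDOM_UA_TYPE = 'random'   # read by the middleware via crawler.settings.get('RANDOM_UA_TYPE', 'random')
REDIS_PASSWORD = '******'

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the default UA middleware
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}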
def get_proxy_fetcher():
    """
    If you need to maintain your own proxies, you can wrap your own proxy source here.
    """
    host = get_config_field('redis', 'IP')
    port = get_config_field('redis', 'PORT')
    db = get_config_field('redis', 'DB')
    args = dict(host=host, port=port, password='', db=db)
    return ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
def __init__(self):
    self.fetcher = ProxyFetcher('https', strategy='greedy', redis_args=args)
    self.pools = self.fetcher.pool
    self.used = 1
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from haipproxy.client.py_cli import ProxyFetcher
from random import randrange

args = dict(host='127.0.0.1', port=6379, password='', db=0)
fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)


class ScrapywithproxySpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/23 18:42
# @Author  : youfeng
import time

from haipproxy.client.py_cli import ProxyFetcher

args = dict(host='192.168.1.90', port=6379, password='******', db=15)
# Passing 'zhihu' here would mean proxies are taken from the validation queue tied to `zhihu`;
# the reason is that the same proxy IP performs differently against different target sites.
fetcher = ProxyFetcher('http', strategy='greedy', redis_args=args)

# Fetch one usable proxy
start_time = time.time()
print(fetcher.get_proxy())
print("Time spent fetching a proxy: {} s".format(time.time() - start_time))

# Fetch the list of usable proxies
proxies_list = fetcher.get_proxies()
print(len(proxies_list))
print(fetcher.get_proxies())  # or print(fetcher.pool)
def proxy(self):
    fetcher = ProxyFetcher('http', strategy='greedy', redis_args=self.args)
    return fetcher.get_proxy()
from haipproxy.client.py_cli import ProxyFetcher

args = dict(host='127.0.0.1', port=6379, password='******', db=0)
fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
print(fetcher.get_proxy())
print(fetcher.get_proxies())  # or print(fetcher.pool)
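A proxy drawn with get_proxy() can go stale, and the snippets above only drop dead proxies inside the Proxy.remove helper. A minimal retry sketch, assuming the requests library and using only ProxyFetcher methods that appear in these examples (get_proxy and delete_proxy); the function name and URL are hypothetical:

import requests

def get_with_retries(url, retries=3):
    for _ in range(retries):
        proxy = fetcher.get_proxy()  # e.g. 'http://1.2.3.4:8080'
        if not proxy:
            break
        try:
            return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)
        except requests.RequestException:
            # The proxy failed; remove it from the pool before trying the next one
            fetcher.delete_proxy(proxy)
    return None

resp = get_with_retries('http://httpbin.org/ip')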