def __init__(
    self,
    proxies=None,
    valid_timeout=20,
    check_interval=180,
    max_proxy_use_num=10000,
    delay=30,
    use_interval=None,
    logger=None,
    **kwargs,
):
    """
    One proxy entry with usage / validity bookkeeping.

    :param proxies: proxy mapping, e.g. {"http": ..., "https": ...}
    :param valid_timeout: validity-check timeout in seconds
        (NOTE(review): an old comment said the default became -1 / "no
        checking" as of 2018-10-08, but the signature default is 20 — confirm)
    :param check_interval: seconds between validity checks
    :param max_proxy_use_num: maximum number of times this proxy may be used
    :param delay: delayed-use time in seconds
    :param use_interval: minimum seconds between uses; None = unlimited
    :param logger: log handler, defaults to log.get_logger()
    :param kwargs: ignored extras
    """
    self.proxies = proxies                      # {"http": ..., "https": ...}
    self.valid_timeout = valid_timeout          # validity-check timeout (s)
    self.check_interval = check_interval        # validity-check interval (s)

    # State flag: 0 = normal, -1 = discarded, 1 = use again later, ...
    self.flag = 0
    self.flag_ts = 0        # timestamp of the last state change
    self.update_ts = 0      # timestamp of the last refresh / validation

    self.max_proxy_use_num = max_proxy_use_num  # max allowed uses
    self.use_num = 0                            # uses so far
    self.delay = delay                          # delayed-use time (s)
    self.use_interval = use_interval            # min seconds between uses
    self.use_ts = 0                             # timestamp of the last use

    # Derive ip/port and a stable identifier from the proxies mapping.
    self.proxy_args = self.parse_proxies(self.proxies)
    self.proxy_ip = self.proxy_args["ip"]
    self.proxy_port = self.proxy_args["port"]
    self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
    self.proxy_id = (
        "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
        if self.proxy_args["user"]
        else self.proxy_ip_port
    )

    # Log handler, defaults to log.get_logger()
    self.logger = logger or log.get_logger(__file__)
def __init__(
    self,
    key,
    timeout=300,
    wait_timeout=8 * 3600,
    break_wait=None,
    redis_uri=None,
    connection_pool=None,
    logger=None,
):
    """
    Redis lock with an expiry.

    :param key: keyword distinguishing projects from each other
    :param timeout: seconds after which the lock expires
    :param wait_timeout: max seconds to wait for the lock (default 8 hours);
        guards against a thread waiting forever under multi-thread
        contention; <= 0 means do not wait — locking fails immediately
    :param break_wait: optional callable giving fine control over
        wait_timeout: while waiting, the wait is aborted as soon as it
        returns True

    Usage:
        with RedisLock(key="test", timeout=10, wait_timeout=100, redis_uri="") as _lock:
            if _lock.locked:  # tells whether the lock was acquired
                # do somethings
    """
    self.redis_index = -1

    if not key:
        raise Exception("lock key is empty")

    # Reuse an externally supplied connection pool when given,
    # otherwise connect via the URI.
    self.redis_conn = (
        redis.StrictRedis(connection_pool=connection_pool)
        if connection_pool
        else self.get_redis_conn(redis_uri)
    )

    self.logger = logger or log.get_logger(__file__)
    self.lock_key = "redis_lock:{}".format(key)

    self.timeout = timeout            # lock expiry (s)
    self.wait_timeout = wait_timeout  # how long to wait for acquisition (s)

    # wait-abort predicate; defaults to "never abort" and must be callable
    self.break_wait = (lambda: False) if break_wait is None else break_wait
    if not callable(self.break_wait):
        raise TypeError(
            "break_wait must be function or None, but: {}".format(
                type(self.break_wait)))

    self.locked = False
# coding:utf8
from spider.spiders import SingleBatchSpider, Request, Response
from spider.utils import log
from dateutil import parser
from bs4 import BeautifulSoup
import spider.utils.tools as tools

logger = log.get_logger(__file__)

# Fall back to a top-level ``setting`` module when this file is not run as
# part of a package.
# FIX: catch ImportError only — the previous bare ``except:`` also hid real
# errors raised while importing the settings module itself (syntax errors,
# failing module-level code, even KeyboardInterrupt).
try:
    from . import setting
except ImportError:
    import setting


# Detail-page parsing
class ccpg_detail_Spider(SingleBatchSpider):
    """Spider that fetches ccgp detail pages for tasks queued by the list spider."""

    def __init__(self, **kwargs):
        super(ccpg_detail_Spider, self).__init__(**kwargs)
        self.task_key = "task:ccgp:detail"  # needs to be adapted per project
        self.task_table_name = "ccgp_list"      # table the tasks are read from
        self.task_data_table = "ccgp_detail"    # table the parsed data goes to
        self.task_field_list = ["id", "url", "title", "ctime"]
        self.batch_interval = 7
        self.task_tag_name = "ccgp_detail"
        self.message_recipients = ["WXT"]

        self.debug = False
        # Single worker and no proxy while debugging; full pool in production.
        self.pool_size = 1 if self.debug else 100
        self.downloader.proxy_enable = not self.debug

    def add_task(self):
        # Task seeding is handled elsewhere; nothing to add here.
        pass
def __init__(self, **kwargs):
    """
    Proxy pool.

    :param size: pool size, -1 = unlimited
    :param proxy_source_url: proxy file address (str or list),
        e.g. http://localhost/proxy.txt
    :param proxy_instance: instance that supplies proxies
    :param reset_interval: minimum interval (s) between pool resets
    :param reset_interval_max: maximum interval (s) before a forced reset
        (NOTE(review): old comment said "2 minutes" but the code default
        is 180 s — confirm which is intended)
    :param check_valid: whether to verify validity when a proxy is taken
    :param local_proxy_file_cache_timeout: timeout of the locally cached
        proxy file
    :param logger: log handler, defaults to log.get_logger()
    :param kwargs: other parameters
    """
    super(ProxyPool, self).__init__(**kwargs)

    # Maximum queue length (-1 = unlimited).
    self.max_queue_size = kwargs.get("size", -1)
    # Actual proxy count / maximum proxy use count.
    self.real_max_proxy_count = 1000

    # Proxy source address(es), e.g. http://localhost/proxy.txt
    # Normalize to a list, drop falsy entries, then de-duplicate.
    # FIX: dict.fromkeys() keeps the configured order, whereas the previous
    # list(set(...)) produced a nondeterministic ordering between runs.
    urls = kwargs.get("proxy_source_url", [])
    if not isinstance(urls, list):
        urls = [urls]
    self.proxy_source_url = list(dict.fromkeys(x for x in urls if x))
    kwargs.update({"proxy_source_url": self.proxy_source_url})

    # Log handling.
    self.logger = kwargs.get("logger") or log.get_logger(__file__)
    kwargs["logger"] = self.logger

    if not self.proxy_source_url:
        # NOTE(review): logger.warn is a deprecated alias of warning on
        # stdlib loggers — kept as-is since the logger type is project-defined.
        self.logger.warn("need set proxy_source_url or proxy_instance")

    # Minimum interval (s) between pool resets.
    self.reset_interval = kwargs.get("reset_interval", 5)
    # Force a reset after this long so new proxies rotate in and
    # long-banned ones rotate out.
    self.reset_interval_max = kwargs.get("reset_interval_max", 180)
    # Whether to check proxy validity on retrieval.
    self.check_valid = kwargs.get("check_valid", True)

    self.proxy_queue = None        # queue of available proxies
    self.proxy_dict = {}           # {proxy id: ProxyItem, ...}
    self.invalid_proxy_dict = {}   # proxies found to be invalid

    self.kwargs = kwargs

    self.reset_lock = None         # lock guarding pool resets
    self.last_reset_time = 0       # timestamp of the last reset
    self.reset_fast_count = 0      # count of "reset happened too fast" events
    self.no_valid_proxy_times = 0  # times 3 retries still got no valid proxy
    self.last_get_ts = time.time()  # timestamp of the last proxy retrieval
    # Remember each ProxyItem's update_ts so a too-fast reset does not
    # re-check validity repeatedly.
    self.proxy_item_update_ts_dict = {}
    self.warn_flag = False         # warning flag
def check_proxy(
    ip="",
    port="",
    proxies=None,
    type=0,
    timeout=5,
    logger=None,
    show_error_log=False,
    **kwargs,
):
    """
    Check whether a proxy is usable.

    :param ip: proxy ip
    :param port: proxy port
    :param proxies: ready-made requests-style proxies mapping; built from
        ip/port when omitted (used by the type=1 path only)
    :param type: 0 = raw socket connect, 1 = HTTP request through the proxy
    :param timeout: per-attempt timeout in seconds
    :param logger: log handler, defaults to log.get_logger()
    :param show_error_log: emit a debug log line on failure
    :return: 1 if the proxy looks usable, else 0
    """
    if not logger:
        logger = log.get_logger(__file__)

    ok = 0
    if type == 0 and ip and port:
        # A successful socket connect does not guarantee a working proxy
        # ("Connection closed by foreign host" can still occur), but it is
        # cheap and keeps the pool refreshing.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
            sk.settimeout(timeout)
            try:
                # Must actually probe, otherwise proxies are never refreshed.
                sk.connect((ip, int(port)))
                ok = 1
            except Exception as e:
                if show_error_log:
                    logger.debug("check proxy failed: {} {}:{}".format(
                        e, ip, port))
            # FIX: removed the redundant sk.close() — the ``with`` block
            # already closes the socket on exit.
    else:
        if not proxies:
            # NOTE(review): the https entry uses an "https://" scheme; most
            # proxy servers expect "http://" even for https traffic — kept
            # as-is to preserve behavior, but worth confirming.
            proxies = {
                "http": "http://{}:{}".format(ip, port),
                "https": "https://{}:{}".format(ip, port),
            }
        target_url = random.choice([
            "http://www.baidu.com",
            # "http://httpbin.org/ip",
        ])
        try:
            r = requests.get(
                target_url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
                },
                proxies=proxies,
                timeout=timeout,
                stream=True,
            )
            ok = 1
            r.close()
        except Exception as e:
            if show_error_log:
                logger.debug("check proxy failed: {} {}:{} {}".format(
                    e, ip, port, proxies))
    return ok