Example #1
    def __init__(
        self,
        proxies=None,
        valid_timeout=20,
        check_interval=180,
        max_proxy_use_num=10000,
        delay=30,
        use_interval=None,
        logger=None,
        **kwargs,
    ):
        """
        :param proxies:
        :param valid_timeout:  代理检测超时时间 默认-1    20181008  默认不再监测有效性
        :param check_interval:
        :param max_proxy_use_num:
        :param delay:
        :param use_interval: 使用间隔 单位秒 默认不限制
        :param logger: 日志处理器 默认 log.get_logger()
        :param kwargs:
        """
        # {"http": ..., "https": ...}
        self.proxies = proxies
        # validity-check timeout, in seconds
        self.valid_timeout = valid_timeout
        # validity-check interval, in seconds
        self.check_interval = check_interval

        # status flag: 0 = normal, -1 = discard, 1 = use again later ...
        self.flag = 0
        # timestamp of the last flag change
        self.flag_ts = 0
        # timestamp of the last update / validity check
        self.update_ts = 0
        # maximum number of times this proxy may be used
        self.max_proxy_use_num = max_proxy_use_num
        # number of times this proxy has been used
        self.use_num = 0
        # delay before the proxy may be used again
        self.delay = delay
        # minimum interval between uses, in seconds
        self.use_interval = use_interval
        # timestamp of the last use
        self.use_ts = 0

        self.proxy_args = self.parse_proxies(self.proxies)
        self.proxy_ip = self.proxy_args["ip"]
        self.proxy_port = self.proxy_args["port"]
        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
        if self.proxy_args["user"]:
            self.proxy_id = "{user}:{password}@{ip}:{port}".format(
                **self.proxy_args)
        else:
            self.proxy_id = self.proxy_ip_port

        # logger
        self.logger = logger or log.get_logger(__file__)
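
A minimal usage sketch for this constructor, assuming it belongs to the ProxyItem class referenced in Example #4 and that parse_proxies extracts user, password, ip and port from the proxy URL (both assumptions):

# Hypothetical usage; the address and credentials are placeholders.
proxies = {
    "http": "http://user:pass@1.2.3.4:8888",
    "https": "http://user:pass@1.2.3.4:8888",
}
item = ProxyItem(proxies=proxies, valid_timeout=20, use_interval=5)
print(item.proxy_ip_port)  # expected "1.2.3.4:8888"
print(item.proxy_id)       # expected "user:pass@1.2.3.4:8888"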
Example #2
    def __init__(
        self,
        key,
        timeout=300,
        wait_timeout=8 * 3600,
        break_wait=None,
        redis_uri=None,
        connection_pool=None,
        logger=None,
    ):
        """
        redis超时锁
        :param key: 关键字  不同项目区分
        :param timeout: 锁超时时间
        :param wait_timeout:  等待加锁超时时间 默认8小时  防止多线程竞争时可能出现的 某个线程无限等待
                            <=0 则不等待 直接加锁失败
        :param break_wait: 可自定义函数 灵活控制 wait_timeout 时间 当此函数返回True时 不再wait

        用法示例:
        with RedisLock(key="test", timeout=10, wait_timeout=100, redis_uri="") as _lock:
            if _lock.locked:
                # 用来判断是否加上了锁
                # do somethings
        """
        self.redis_index = -1
        if not key:
            raise Exception("lock key is empty")
        if connection_pool:
            self.redis_conn = redis.StrictRedis(
                connection_pool=connection_pool)
        else:
            self.redis_conn = self.get_redis_conn(redis_uri)

        self.logger = logger or log.get_logger(__file__)

        self.lock_key = "redis_lock:{}".format(key)
        # lock timeout, in seconds
        self.timeout = timeout
        # maximum time to wait for the lock, in seconds
        self.wait_timeout = wait_timeout
        # callable that can break out of the wait early
        self.break_wait = break_wait
        if self.break_wait is None:
            self.break_wait = lambda: False
        if not callable(self.break_wait):
            raise TypeError(
                "break_wait must be function or None, but: {}".format(
                    type(self.break_wait)))

        self.locked = False
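
A sketch of the break_wait hook described in the docstring, assuming a RedisLock class wrapping this constructor and a reachable Redis instance (the URI below is a placeholder):

import threading

# Hypothetical usage; stop_event would be set elsewhere to abort the wait early.
stop_event = threading.Event()

with RedisLock(
    key="nightly-job",
    timeout=60,
    wait_timeout=300,
    break_wait=stop_event.is_set,  # stop waiting for the lock once the event is set
    redis_uri="redis://127.0.0.1:6379/0",
) as _lock:
    if _lock.locked:
        pass  # critical section: only one holder runs this at a time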
Example #3
# coding:utf8
from spider.spiders import SingleBatchSpider, Request, Response
from spider.utils import log
from dateutil import parser
from bs4 import BeautifulSoup
import spider.utils.tools as tools
logger = log.get_logger(__file__)

try:
    from . import setting
except ImportError:
    import setting


# Detail-page parser
class ccpg_detail_Spider(SingleBatchSpider):
    def __init__(self, **kwargs):
        super(ccpg_detail_Spider, self).__init__(**kwargs)
        self.task_key = "task:ccgp:detail"  # needs to be modified per project
        self.task_table_name = "ccgp_list"
        self.task_data_table = "ccgp_detail"
        self.task_field_list = ["id", "url", "title", "ctime"]
        self.batch_interval = 7
        self.task_tag_name = "ccgp_detail"
        self.message_recipients = ["WXT"]
        self.debug = False
        self.pool_size = 1 if self.debug else 100
        self.downloader.proxy_enable = not self.debug

    def add_task(self):
        pass
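
The SingleBatchSpider interface is not shown here, so purely as an illustration, this is the kind of detail-page parsing the spider would do with the BeautifulSoup and dateutil imports above; the selectors and sample HTML are assumptions:

# Hypothetical parsing helper; reuses the BeautifulSoup / parser imports above.
def parse_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    title_tag = soup.select_one("h1")        # assumed selector for the notice title
    time_tag = soup.select_one("span.time")  # assumed selector for the publish time
    return {
        "title": title_tag.get_text(strip=True) if title_tag else "",
        "ctime": parser.parse(time_tag.get_text(strip=True)) if time_tag else None,
    }


print(parse_detail('<h1>Sample Notice</h1><span class="time">2020-01-02 10:00</span>'))
# {'title': 'Sample Notice', 'ctime': datetime.datetime(2020, 1, 2, 10, 0)}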
Example #4
    def __init__(self, **kwargs):
        """
        :param size: 代理池大小  -1 为不限制
        :param proxy_source_url: 代理文件地址 支持列表
        :param proxy_instance:  提供代理的实例
        :param reset_interval:  代理池重置间隔 最小间隔
        :param reset_interval_max:  代理池重置间隔 最大间隔 默认2分钟
        :param check_valid: 是否在获取代理时进行检测有效性
        :param local_proxy_file_cache_timeout: 本地缓存的代理文件超时时间
        :param logger: 日志处理器 默认 log.get_logger()
        :param kwargs: 其他的参数
        """
        super(ProxyPool, self).__init__(**kwargs)
        # maximum queue length
        self.max_queue_size = kwargs.get("size", -1)
        # actual number of proxies
        self.real_max_proxy_count = 1000
        # maximum number of times a proxy may be used
        # proxy source URL, e.g. http://localhost/proxy.txt
        self.proxy_source_url = kwargs.get("proxy_source_url", [])
        if not isinstance(self.proxy_source_url, list):
            self.proxy_source_url = [self.proxy_source_url]
        self.proxy_source_url = [x for x in self.proxy_source_url if x]
        self.proxy_source_url = list(set(self.proxy_source_url))
        kwargs.update({"proxy_source_url": self.proxy_source_url})
        # logger
        self.logger = kwargs.get("logger") or log.get_logger(__file__)
        kwargs["logger"] = self.logger
        if not self.proxy_source_url:
            self.logger.warn("need set proxy_source_url or proxy_instance")

        # minimum interval between proxy-pool resets
        self.reset_interval = kwargs.get("reset_interval", 5)
        # force a reset after this interval so new proxies come in, instead of reusing old, banned ones
        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
        # whether to check proxy validity
        self.check_valid = kwargs.get("check_valid", True)

        # proxy queue
        self.proxy_queue = None
        # {proxy id: ProxyItem, ...}
        self.proxy_dict = {}
        # invalid proxies
        self.invalid_proxy_dict = {}

        self.kwargs = kwargs

        # lock used when resetting the proxy pool
        self.reset_lock = None
        # time of the last reset
        self.last_reset_time = 0
        # counter for resets that happened too quickly
        self.reset_fast_count = 0
        # counter: times fetching a proxy still failed after 3 retries
        self.no_valid_proxy_times = 0

        # time a proxy was last fetched
        self.last_get_ts = time.time()

        # records each ProxyItem's update_ts to avoid re-checking validity when resets happen too quickly
        self.proxy_item_update_ts_dict = {}

        # warning flag
        self.warn_flag = False
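
A minimal construction sketch, assuming this constructor belongs to the ProxyPool class named in the super() call; the pool's fetch methods are not shown here, so only construction is illustrated and the URL is a placeholder:

# Hypothetical usage; proxy.txt is assumed to contain one proxy per line.
pool = ProxyPool(
    size=100,                                       # keep at most 100 proxies
    proxy_source_url="http://localhost/proxy.txt",  # a single URL or a list of URLs
    reset_interval=5,
    reset_interval_max=180,
    check_valid=True,
)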
Example #5
def check_proxy(
    ip="",
    port="",
    proxies=None,
    type=0,
    timeout=5,
    logger=None,
    show_error_log=False,
    **kwargs,
):
    """
    代理有效性检查
    :param ip:
    :param port:
    :param type: 0:socket  1:requests
    :param timeout:
    :param logger:
    :return:
    """
    if not logger:
        logger = log.get_logger(__file__)
    ok = 0
    if type == 0 and ip and port:
        # a successful socket connection does not guarantee the proxy is usable;
        # e.g. "Connection closed by foreign host." still fails in practice
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
            sk.settimeout(timeout)
            try:
                # the check is required, otherwise proxies are never refreshed
                sk.connect((ip, int(port)))
                ok = 1
            except Exception as e:
                if show_error_log:
                    logger.debug("check proxy failed: {} {}:{}".format(
                        e, ip, port))
    else:
        if not proxies:
            proxies = {
                "http": "http://{}:{}".format(ip, port),
                "https": "https://{}:{}".format(ip, port),
            }
        target_url = random.choice([
            "http://www.baidu.com",
            # "http://httpbin.org/ip",
        ])
        try:
            r = requests.get(
                target_url,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
                },
                proxies=proxies,
                timeout=timeout,
                stream=True,
            )
            ok = 1
            r.close()
        except Exception as e:
            if show_error_log:
                logger.debug("check proxy failed: {} {}:{} {}".format(
                    e, ip, port, proxies))
    return ok
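
A short usage sketch for check_proxy; the address below is a placeholder:

# Hypothetical usage; 127.0.0.1:8888 stands in for a real proxy.
if check_proxy(ip="127.0.0.1", port="8888", type=0, timeout=5, show_error_log=True):
    print("proxy reachable")

# type=1 sends a real HTTP request through the proxy instead of a bare socket connect
ok = check_proxy(ip="127.0.0.1", port="8888", type=1, timeout=5)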