Beispiel #1
0
    def get_user(self, block=True) -> Optional[GuestUser]:
        """
        Fetch one guest user from the Redis-backed user pool.

        Args:
            block: When no user id is available, whether to call ``self.run()``
                (under the pool lock) and retry instead of returning right away.

        Returns:
            A ``GuestUser`` built from the record stored in Redis, or ``None``
            when no user id is available and ``block`` is falsy.
        """
        while True:
            try:
                user_id = self._get_user_id()

                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    if user_str:
                        # NOTE(review): eval() executes whatever is stored in Redis;
                        # this is only safe if the pool is written by trusted code.
                        return GuestUser(**eval(user_str))

                    # The id is known locally but its record is gone, possibly
                    # deleted by another spider: refresh the local id cache and retry.
                    self._load_users_id()
                    continue

                if not block:
                    return None

                # No user id available and the caller wants to wait.
                # presumably _keep_alive = False makes run() return after one
                # pass -- confirm against run().
                self._keep_alive = False
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        self.run()
            except Exception as e:
                # Any failure is logged and the whole attempt is retried.
                log.exception(e)
                tools.delay_time(1)
Beispiel #2
0
    def check_filter_capacity(self):
        """
        Check the newest filter and append new filters while it is at capacity.

        The check is throttled: it only runs when no previous check time is
        recorded or more than 1800 seconds have passed since the last check.
        @return: None
        """
        last_check = self._check_capacity_time
        if last_check and time.time() - last_check <= 1800:
            return

        in_memory = self.bitarray_type == ScalableBloomFilter.BASE_MEMORY
        if not in_memory:
            # Global lock: only one process at a time really creates a new filter;
            # once it is done, other processes just append the filter it created.
            with RedisLock(
                key="ScalableBloomFilter",
                timeout=300,
                wait_timeout=300,
                redis_cli=RedisDB(url=self.redis_url).get_redis_obj(),
            ) as lock:
                if lock.locked:
                    while self.filters[-1].is_at_capacity:
                        self.filters.append(self.create_filter())

                    self._check_capacity_time = time.time()
            return

        # Memory-backed filters are guarded by the instance's thread lock.
        with self._thread_lock:
            while self.filters[-1].is_at_capacity:
                self.filters.append(self.create_filter())

            self._check_capacity_time = time.time()
Beispiel #3
0
    def check_filter_capacity(self):
        """
        Inspect the last filter and keep appending new ones while it is full.

        Runs at most once per 1800 seconds: the check is skipped when a previous
        check time exists and is not older than that.
        @return: None
        """
        checked_at = self._check_capacity_time
        if checked_at and time.time() - checked_at <= 1800:
            return

        if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
            with self._thread_lock:
                while self.filters[-1].is_at_capacity:
                    self.filters.append(self.create_filter())

                self._check_capacity_time = time.time()
            return

        # Global lock: only one process at a time really creates a new filter;
        # once it is done, other processes just append the filter it created.
        # The lock key is scoped by the filter name when one is set.
        lock_key = "ScalableBloomFilter"
        if self.name:
            lock_key = f"ScalableBloomFilter:{self.name}"

        with RedisLock(key=lock_key, redis_cli=self._redis_cli) as lock:
            if lock.locked:
                while self.filters[-1].is_at_capacity:
                    self.filters.append(self.create_filter())

                self._check_capacity_time = time.time()
Beispiel #4
0
    def run(self):
        """
        Create and save a cookie for every user returned by ``get_user_info()``.

        Work is done only when the Redis lock keyed by the cookie pool table is
        obtained. Each user is attempted up to ``self._login_retry_times`` times;
        an attempt that raises is passed to ``handel_exception`` and retried.
        A user whose attempts all raise is passed to ``handle_login_failed_user``,
        as is a user for whom ``create_cookie`` returns a falsy value.

        Raises:
            ValueError: if ``get_user_info()`` does not return an iterable.
        """
        with RedisLock(
            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
        ) as _lock:
            if not _lock.locked:
                return

            user_infos = self.get_user_info()
            if not isinstance(user_infos, Iterable):
                raise ValueError("get_user_info 返回值必须可迭代")

            if not user_infos:
                log.info("无可用用户")

            for username, password in user_infos:
                for _attempt in range(self._login_retry_times):
                    try:
                        cookie = self.create_cookie(username, password)
                        if not cookie:
                            self.handle_login_failed_user(username, password)
                        else:
                            self.save_cookie(username, cookie)
                    except Exception as e:
                        self.handel_exception(e)
                    else:
                        # The attempt completed without raising: stop retrying.
                        break
                else:
                    # Every attempt raised (or no attempt was allowed).
                    self.handle_login_failed_user(username, password)
Beispiel #5
0
    def run(self):
        """
        Log in the users yielded by ``self._load_user()`` and add them to the pool.

        Each pass tries to take the Redis lock keyed by the user pool table
        without waiting (``wait_timeout=0``); the pass does nothing when the lock
        is not obtained. For every user, ``login`` is attempted up to
        ``self._login_retry_times + 1`` times. A truthy login result is passed to
        ``add_user``; a falsy result, or running out of retries, passes the
        original user to ``handle_login_failed_user``.

        After each pass the method sleeps 10 via ``tools.delay_time`` and repeats
        while ``self._keep_alive`` is truthy, otherwise it returns.
        """
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                retry_times = 0
                                while retry_times <= self._login_retry_times:
                                    try:
                                        # Keep the login result in its own name: rebinding
                                        # `user` would hand a falsy result (e.g. None) to
                                        # the failure handler instead of the user that
                                        # actually failed to log in.
                                        logged_in_user = self.login(user)
                                        if logged_in_user:
                                            self.add_user(logged_in_user)
                                        else:
                                            self.handle_login_failed_user(user)
                                        break
                                    except NotImplementedError:
                                        log.error(
                                            f"{self.__class__.__name__} must be implementation login method!"
                                        )
                                        # A missing login implementation cannot be retried:
                                        # terminate the whole process immediately.
                                        os._exit(0)
                                    except Exception as e:
                                        self.handel_exception(e)
                                    # Only reached when the attempt raised.
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                                else:
                                    # Retries exhausted without a completed attempt.
                                    self.handle_login_failed_user(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("当前在线user数为 {}".format(now_user_count))

                except Exception as e:
                    # A failed pass is logged but must not stop the keep-alive loop.
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Beispiel #6
0
 def get_cookie(self, wait_when_null=True):
     """
     Take one cookie from the Redis cookie pool.

     Args:
         wait_when_null: When the pool yields nothing, whether to produce
             cookies via ``self.run()`` (under the pool lock) and retry.

     Returns:
         The evaluated cookie record, or ``{}`` when the pool yields nothing
         and ``wait_when_null`` is falsy.
     """
     while True:
         try:
             cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
             if cookie_info:
                 # NOTE(review): eval() executes whatever is stored in Redis;
                 # this is only safe if the pool is written by trusted code.
                 return eval(cookie_info)

             if not wait_when_null:
                 return {}

             log.info("暂无cookie 生产中...")
             # presumably these make run() produce a single cookie and then
             # return -- confirm against run().
             self._keep_alive = False
             self._min_cookies = 1
             with RedisLock(
                 key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
             ) as _lock:
                 if _lock.locked:
                     self.run()
         except Exception as e:
             # Any failure is logged and the whole attempt is retried.
             log.exception(e)
             tools.delay_time(1)
Beispiel #7
0
    def _start(self):
        """
        Start the spider's workers and then dispatch tasks.

        Starts the request buffer, item buffer and collector, then creates and
        starts ``self._thread_count`` parser controls, each loaded with every
        parser in ``self._parsers``. Task dispatch happens last.
        """
        # Background workers: request buffer, item buffer, collector.
        self._request_buffer.start()
        self._item_buffer.start()
        self._collector.start()

        # Parser controls, one per configured thread.
        for _ in range(self._thread_count):
            control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
            )
            for parser in self._parsers:
                control.add_parser(parser)

            control.start()
            self._parser_controls.append(control)

        # Task dispatch may take a long time, so it comes last.
        if setting.RETRY_FAILED_REQUESTS:
            # Re-queue failed requests; per the original author's note this
            # needs no lock because the operation is atomic.
            HandleFailedRequests(self._redis_key).reput_failed_requests_to_requests()

        # New tasks are only dispatched automatically when enabled.
        if not self._auto_start_requests:
            return

        if not self.wait_lock:
            self.__add_task()
            return

        # Lock around task creation so multiple processes do not add duplicates.
        with RedisLock(
            key=self._spider_name,
            timeout=3600,
            wait_timeout=60,
            redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            if lock.locked:
                self.__add_task()
Beispiel #8
0
    def task_is_done(self):
        """
        @summary: Check whether the current batch is finished and refresh the cached
                  batch date (per the original note: this must not crash, otherwise
                  the batch date stops being updated)
        ---------
        ---------
        @result: truthy / falsy (done / not done)
        """
        # Latest row of the batch record table.
        record_sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(record_sql)
        if batch_info is None:
            raise Exception("查询批次信息失败")

        is_done = False
        if batch_info:
            # Refresh self._batch_date_cache so it does not keep the old batch
            # date after a new batch has started.
            latest_record = batch_info[0]
            self._batch_date_cache, total_count, done_count, is_done = latest_record

            log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            ))
            # Publish the batch date through the environment (the original note
            # says it is read by BatchParser).
            os.environ["batch_date"] = self._batch_date_cache

        if not is_done:
            return is_done

        # The record says done: verify the task table has nothing left to do.
        # The query is slow, so a lock keeps several processes from running it
        # at the same time.
        with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=0,
                redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            if not lock.locked:
                log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")
                return False

            log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")
            pending_sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                self._task_table,
                self._task_state,
                self._task_state,
                self._task_condition_prefix_and,
            )
            tasks = self._mysqldb.find(pending_sql)  # [(1,)]  / []
            if not tasks:
                log.info("任务表中任务均已完成,爬虫结束")
                return is_done

            log.info("检测到任务表中有未完成任务,等待任务下发")
            # Reset is_done in the batch record table to reduce later
            # queries against the task table.
            reset_sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                batch_record_table=self._batch_record_table,
                batch_date=self._batch_date_cache,
            )
            self._mysqldb.update(reset_sql)
            return False
Beispiel #9
0
def test_lock():
    """Take the "test" Redis lock and, if it is held, print 1 and sleep 100 seconds."""
    redis_lock = RedisLock(
        key="test", redis_cli=RedisDB().get_redis_obj(), wait_timeout=10
    )
    with redis_lock as _lock:
        if not _lock.locked:
            return

        print(1)
        # Hold the lock for a while so a second process can be observed waiting.
        time.sleep(100)
Beispiel #10
0
    def run(self, username=None):
        """
        Log in the pool's users that have no cookies and record the outcome.

        Each pass tries to take the Redis lock keyed by the user pool table
        without waiting (``wait_timeout=0``); the pass does nothing when the lock
        is not obtained. Users that already have cookies are counted as online
        and skipped; users for which ``is_time_to_login()`` is falsy are skipped.

        After each pass the method sleeps 10 seconds and repeats while
        ``self._keep_alive`` is truthy, otherwise it returns.

        Args:
            username: When given, only the user with this username is processed.
        """
        while True:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        self.__sycn_users_info()
                        online_user = 0
                        for user in self.users:
                            if username and username != user.username:
                                continue

                            try:
                                if user.cookies:
                                    online_user += 1
                                    continue

                                # Pre-check: respect the minimum interval between logins.
                                if not user.is_time_to_login():
                                    log.info(
                                        "账号{}与上次登录时间间隔过短,暂不登录: 将在{}登录使用".format(
                                            user.username, user.next_login_time()
                                        )
                                    )
                                    continue

                                # Keep the login result in its own name: rebinding
                                # `user` would make the exception handler below
                                # dereference the login result, which fails again
                                # if login() returned None and aborts the whole pass.
                                logged_in_user = self.login(user)
                                if logged_in_user.cookies:
                                    # Persist the cookie.
                                    logged_in_user.set_login_time()
                                    self.add_user(logged_in_user)
                                    self.record_user_status(
                                        logged_in_user.user_id,
                                        GoldUserStatus.LOGIN_SUCCESS,
                                    )
                                    log.debug(
                                        "登录成功 {}".format(logged_in_user.username)
                                    )
                                    online_user += 1
                                else:
                                    log.info(
                                        "登录失败 {}".format(logged_in_user.username)
                                    )
                                    self.record_user_status(
                                        logged_in_user.user_id,
                                        GoldUserStatus.LOGIN_FALIED,
                                    )
                            except NotImplementedError:
                                log.error(
                                    f"{self.__class__.__name__} must be implementation login method!"
                                )
                                # A missing login implementation cannot be retried:
                                # terminate the whole process immediately.
                                os._exit(0)
                            except Exception as e:
                                log.exception(e)
                                msg = f"{user.username} 账号登陆失败 exception: {str(e)}"
                                log.info(msg)
                                self.record_user_status(
                                    user.user_id, GoldUserStatus.LOGIN_FALIED
                                )

                                send_msg(
                                    msg=msg,
                                    level="error",
                                    message_prefix=f"{user.username} 账号登陆失败",
                                )

                        log.info("当前在线user数为 {}".format(online_user))

                if self._keep_alive:
                    time.sleep(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                time.sleep(1)