Beispiel #1
0
    def related_spider_is_done(self):
        """
        Check whether the spiders this one is related to have finished running.

        @return: False when any related redis task table still exists, or when the
                 newest row of the related batch table has a falsy ``is_done``;
                 None when a related batch table is configured but the query
                 yields no row (or a NULL flag) - per the original note, the
                 caller can then judge by its own total_count and done_count;
                 True otherwise.
        """
        # A task table that still exists in redis means a related spider is unfinished.
        if any(
            self._redisdb.exists_key(task_table)
            for task_table in self._related_task_tables
        ):
            return False

        # No related batch table configured: nothing else to check.
        if not self._related_batch_record:
            return True

        latest_batch_sql = "select is_done from {} order by id desc limit 1".format(
            self._related_batch_record
        )
        rows = self._mysqldb.find(latest_batch_sql)

        latest_flag = None
        if rows:
            latest_flag = rows[0][0]

        if latest_flag is None:
            log.warning("相关联的批次表不存在或无批次信息")
            return None

        return bool(latest_flag)
Beispiel #2
0
 def db_tip(self):
     """
     Log which stores items are written to automatically.

     Emits an info line naming each enabled target (MySQL and/or redis, as
     switched on by ``setting.ADD_ITEM_TO_MYSQL`` / ``setting.ADD_ITEM_TO_REDIS``),
     or a warning asking the user to check item persistence when neither is on.
     """
     enabled_targets = []
     if setting.ADD_ITEM_TO_MYSQL:
         enabled_targets.append("item 自动入mysql ")
     if setting.ADD_ITEM_TO_REDIS:
         enabled_targets.append("item 自动入redis ")

     if enabled_targets:
         log.info("".join(enabled_targets))
         return

     log.warning("*** 请注意检查item是否入库 !!!")
Beispiel #3
0
    def is_at_work_time(self):
        """
        Tell whether the current time falls inside the account's working window.

        The window covers hours 7 through 22 inclusive, read from the naive
        clock returned by ``datetime.datetime.now()``.

        @return: True inside the window; False (after logging a warning that
                 names ``self.username``) outside it.
        """
        # Chained comparison: same result as membership in range(7, 23) for an
        # integer hour, without building a 16-element list on every call.
        if 7 <= datetime.datetime.now().hour < 23:
            return True

        log.warning("账号 {} 不再工作时间内".format(self.username))
        return False
Beispiel #4
0
    def get_user(
        self,
        username=None,
        used_for_spider_name=None,
        wait_when_null=True,
        not_limit_frequence=False,
    ) -> LimitTimesUser:
        """
        Pick the next usable user from ``self.limit_times_users``, cycling through
        the list and retrying (with sleeps) until one qualifies.

        @params username: only return the user with this username
        @params used_for_spider_name: exclusive use - the name of the spider that
                holds the user exclusively; other spiders cannot preempt it
        @params wait_when_null: whether to wait and retry when no user is available
        @params not_limit_frequence: do not limit the usage frequency
        @return: LimitTimesUser; None when ``wait_when_null`` is false and either
                 no user is loaded or the candidate is outside working hours
        """
        # NOTE(review): _is_show_warning is assigned here but never read in this
        # method, so warnings.warn is called on every invocation - confirm whether
        # a "warn only once" guard was intended.
        if not self.support_more_client:
            warnings.warn(
                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
                category=Warning,
            )
            self._is_show_warning = True

        # Keep trying until a user is returned or one of the give-up paths returns None.
        while True:
            # (Re)load users when the local list is empty or LOAD_USER_INTERVAL
            # seconds have passed since last_load_user_time.
            if (
                not self.limit_times_users
                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
            ):
                self.__load_users(username)
                if not self.limit_times_users:
                    log.warning("无可用的用户")
                    if wait_when_null:
                        time.sleep(1)
                        continue
                    else:
                        return None

            # Advance the cursor to the next user, wrapping around the list.
            self.current_user_index += 1
            self.current_user_index = self.current_user_index % len(
                self.limit_times_users
            )

            limit_times_user = self.limit_times_users[self.current_user_index]
            if self.support_more_client:  # sync the latest data first
                limit_times_user.sync_account_info_from_redis()

            # A specific user was requested and this is not it: wait, then try the next one.
            if username and limit_times_user.username != username:
                log.info(
                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
                )
                time.sleep(1)
                continue

            # Exclusive use: if the user is held by another spider, check whether the
            # time waited exceeds the exclusive period; once it does the user can be used.
            if (
                limit_times_user.used_for_spider_name
                and limit_times_user.used_for_spider_name != used_for_spider_name
            ):
                # presumably seconds since this user's last search - verify get_last_search_time()
                wait_time = time.time() - limit_times_user.get_last_search_time()
                if wait_time < limit_times_user.used_for_time_length:
                    log.info(
                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
                            limit_times_user.username,
                            limit_times_user.used_for_spider_name,
                            limit_times_user.used_for_time_length - wait_time,
                        )
                    )
                    time.sleep(1)
                    continue

            if (
                not limit_times_user.is_overwork()
                and limit_times_user.is_at_work_time()
            ):
                # A user without cookies is dropped from the local list. The cursor is
                # not stepped back here, unlike in the else branch below.
                if not limit_times_user.cookies:
                    self.limit_times_users.remove(limit_times_user)
                    continue

                if not_limit_frequence or limit_times_user.is_time_to_search():
                    # Record which spider now holds the user, then hand it out.
                    limit_times_user.used_for_spider_name = used_for_spider_name

                    limit_times_user.update_status()
                    log.info("使用用户 {}".format(limit_times_user.username))
                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
                    return limit_times_user
                else:
                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
                    time.sleep(1)
                    continue
            else:
                # Overworked or outside working hours: drop the user from the local list
                # and step the cursor back so the next increment lands on the element
                # that shifted into this slot.
                self.limit_times_users.remove(limit_times_user)
                self.current_user_index -= 1

                # is_at_work_time() is evaluated a second time here. When the user was
                # merely overworked, control falls through to the next loop iteration
                # without sleeping.
                if not limit_times_user.is_at_work_time():
                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
                    if wait_when_null:
                        time.sleep(30)
                        continue
                    else:
                        return None
Beispiel #5
0
    def is_overwork(self):
        """
        Report whether this account has gone over its request quota.

        @return: True (after logging a warning that names ``self.username``) when
                 ``search_times`` is strictly greater than ``max_search_times``;
                 False otherwise, including when the two are equal.
        """
        quota_exceeded = self.search_times > self.max_search_times
        if not quota_exceeded:
            return False

        log.warning("账号 {} 请求次数超限制".format(self.username))
        return True