def related_spider_is_done(self):
    """
    Check whether the related spiders have finished.

    A spider is considered unfinished while any of its redis task tables
    still exists, or while the latest row of the related batch record
    reports is_done == 0.

    @return: True when done / False when still running / None when there is
        no batch info to judge by (also logged as a warning)
    """
    # Any surviving task table means a related spider still has work queued.
    for task_table in self._related_task_tables:
        if self._redisdb.exists_key(task_table):
            return False

    if self._related_batch_record:
        sql = "select is_done from {} order by id desc limit 1".format(
            self._related_batch_record
        )
        rows = self._mysqldb.find(sql)
        latest_is_done = rows[0][0] if rows else None

        if latest_is_done is None:
            # Batch table missing or empty — cannot decide.
            log.warning("相关联的批次表不存在或无批次信息")
            return None

        if not latest_is_done:
            return False

    return True
def db_tip(self):
    """
    Log which storage backends items will automatically be written to,
    based on the ADD_ITEM_TO_MYSQL / ADD_ITEM_TO_REDIS settings.
    Warns when neither backend is enabled so the operator checks persistence.
    """
    tips = []
    if setting.ADD_ITEM_TO_MYSQL:
        tips.append("item 自动入mysql ")
    if setting.ADD_ITEM_TO_REDIS:
        tips.append("item 自动入redis ")

    if tips:
        log.info("".join(tips))
    else:
        log.warning("*** 请注意检查item是否入库 !!!")
def is_at_work_time(self):
    """
    Return True when the current local hour is within working hours
    (07:00 inclusive through 22:59, i.e. hour in [7, 23)).

    Logs a warning naming the account when outside working hours.
    @return: bool
    """
    # Range comparison instead of membership in a throwaway list(range(7, 23)).
    if 7 <= datetime.datetime.now().hour < 23:
        return True
    # Message fixed: "不再" (typo) -> "不在", matching the wording used elsewhere.
    log.warning("账号 {} 不在工作时间内".format(self.username))
    return False
def get_user(
    self,
    username=None,
    used_for_spider_name=None,
    wait_when_null=True,
    not_limit_frequence=False,
) -> LimitTimesUser:
    """
    Pick the next usable user from the pool (round-robin) and mark it as in use.

    @params username: only return this specific user
    @params used_for_spider_name: exclusive-use mode; name of the spider that
        owns the user — other spiders may not preempt it until the exclusive
        time window (used_for_time_length) has elapsed
    @params wait_when_null: whether to sleep and retry when no user is available
    @params not_limit_frequence: skip the per-user usage-frequency throttle
    @return: LimitTimesUser, or None when no user is available and
        wait_when_null is False
    """
    if not self.support_more_client:
        # Usage counters are cached locally in this mode, so running multiple
        # processes/threads would make the counts diverge — warn the operator.
        warnings.warn(
            "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
            category=Warning,
        )
        self._is_show_warning = True

    while True:
        # (Re)load the user list when it is empty or the cached copy expired.
        if (
            not self.limit_times_users
            or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
        ):
            self.__load_users(username)

        if not self.limit_times_users:
            log.warning("无可用的用户")
            if wait_when_null:
                time.sleep(1)
                continue
            else:
                return None

        # Round-robin: advance the cursor and wrap around the pool size.
        self.current_user_index += 1
        self.current_user_index = self.current_user_index % len(
            self.limit_times_users
        )
        limit_times_user = self.limit_times_users[self.current_user_index]
        if self.support_more_client:  # need to sync the latest account data first
            limit_times_user.sync_account_info_from_redis()

        if username and limit_times_user.username != username:
            # A specific user was requested; skip every other user.
            log.info(
                "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
            )
            time.sleep(1)
            continue

        # Exclusive use: a user owned by another spider may only be taken over
        # once the waiting time exceeds the exclusive time window.
        if (
            limit_times_user.used_for_spider_name
            and limit_times_user.used_for_spider_name != used_for_spider_name
        ):
            wait_time = time.time() - limit_times_user.get_last_search_time()
            if wait_time < limit_times_user.used_for_time_length:
                log.info(
                    "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
                        limit_times_user.username,
                        limit_times_user.used_for_spider_name,
                        limit_times_user.used_for_time_length - wait_time,
                    )
                )
                time.sleep(1)
                continue

        if (
            not limit_times_user.is_overwork()
            and limit_times_user.is_at_work_time()
        ):
            if not limit_times_user.cookies:
                # No cookies means the user cannot be used at all — drop it.
                # NOTE(review): unlike the overwork/off-hours branch below, the
                # cursor is not decremented here after remove(), so the element
                # that shifts into this slot is skipped this round — confirm
                # whether that is intended.
                self.limit_times_users.remove(limit_times_user)
                continue

            if not_limit_frequence or limit_times_user.is_time_to_search():
                limit_times_user.used_for_spider_name = used_for_spider_name
                limit_times_user.update_status()
                log.info("使用用户 {}".format(limit_times_user.username))
                limit_times_user.record_user_status(LimitTimesUserStatus.USED)
                return limit_times_user
            else:
                # Used too recently — try the next user.
                log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
                time.sleep(1)
                continue
        else:
            # Over the usage limit or outside working hours: remove from the
            # pool and step the cursor back so the next pick is not skipped.
            self.limit_times_users.remove(limit_times_user)
            self.current_user_index -= 1

            if not limit_times_user.is_at_work_time():
                log.warning("用户 {} 不在工作时间".format(limit_times_user.username))

            if wait_when_null:
                time.sleep(30)
                continue
            else:
                return None
def is_overwork(self):
    """
    Return True when the account has made more requests than its allowed
    maximum (strictly greater than max_search_times); a warning naming the
    account is logged in that case.
    """
    over_limit = self.search_times > self.max_search_times
    if over_limit:
        log.warning("账号 {} 请求次数超限制".format(self.username))
    return over_limit