import threading

# ES, RedisDB, tools, log and SYNC_STEP are provided elsewhere in this project.


class SyncArtice(threading.Thread):
    """Background thread that syncs crawled articles from Redis to Elasticsearch."""

    def __init__(self):
        super(SyncArtice, self).__init__()
        self._es = ES()
        self._redis = RedisDB()
        self._sync_count = 0

    def run(self):
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    print('No data, sleeping...')
                elif self.add_data_to_es(datas):
                    self._sync_count += len(datas)
                    tools.print_one_line('Synced %d records' % self._sync_count)

                tools.delay_time(1)
            except Exception as e:
                log.error(e)

    def get_data_from_redis(self, count):
        # Pop up to `count` serialized articles and keep only those with a full
        # 'YYYY-mm-dd HH:MM:SS' release_time (19 characters).
        datas = self._redis.zget('news:news_article', count=count)
        return_datas = []
        for data in datas:
            data = eval(data)  # each record is stored as a Python-literal string
            release_time = data.get('release_time')
            if release_time and len(release_time) == 19:
                return_datas.append(data)

        return return_datas

    def add_data_to_es(self, datas):
        return self._es.add_batch(datas, primary_key='uuid', table='news_article')
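
# A minimal sketch of how this sync thread might be launched (an assumption, not
# part of the original module; ES, RedisDB and SYNC_STEP must already be configured
# by the surrounding project):
if __name__ == '__main__':
    sync_thread = SyncArtice()
    sync_thread.start()  # loops forever: pop from Redis -> filter -> bulk-write to ES
    sync_thread.join()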

import random

# MysqlDB, RedisDB, tools, log and config are provided elsewhere in this project.


class TaskManager:
    IS_IN_TIME_RANGE = 1       # publish time is within the crawl time range
    NOT_REACH_TIME_RANGE = 2   # publish time is newer than the upper bound (range not reached yet)
    OVER_MIN_TIME_RANGE = 3    # publish time is older than the lower bound (past the range)

    def __init__(self):
        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        self._task_root_key = config.get('spider').get('redis_task_cache_root_key')

        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
        # crawl_time_range is configured as 'upper~lower', so index 0 is the newest allowed time
        self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~')

    def __get_task_from_redis(self, key):
        task = self._redis.zget(key, is_pop=True)
        if task:
            task = eval(task[0])  # tasks are stored as Python-literal strings
            return task

    def __random_int(self, min, max):
        return random.randint(min, max)

    def get_account_task(self):
        """
        Fetch an official-account task.
        :return: {'__biz': 'Mjc1NjM3MjY2MA==', 'last_publish_time': None} or None
        """
        task = self.__get_task_from_redis(self._account_task_key)
        if not task:
            # Optionally skip accounts whose articles were already crawled today
            publish_time_condition = (
                "AND last_publish_time < '{today}'".format(
                    today=tools.get_current_date(date_format='%Y-%m-%d' + ' 00:00:00'))
                if self._ignore_haved_crawl_today_article_account else ''
            )

            sql = '''
                SELECT __biz, last_publish_time
                FROM wechat_account_task
                WHERE `is_zombie` != 1
                  AND (
                        (
                            (UNIX_TIMESTAMP(CURRENT_TIMESTAMP) - UNIX_TIMESTAMP(last_spider_time)) > {monitor_interval}
                            {publish_time_condition}
                        )
                        OR (last_spider_time IS NULL)
                      )
            '''.format(monitor_interval=self._monitor_interval,
                       publish_time_condition=publish_time_condition)
            tasks = self._mysqldb.find(sql, to_json=True)
            if tasks:
                self._redis.zadd(self._account_task_key, tasks)
                task = self.__get_task_from_redis(self._account_task_key)

        return task

    def get_article_task(self):
        """
        Fetch an article task.
        :return: {'article_url': 'http://mp.weixin.qq.com/s?__biz=MzIxNzg1ODQ0MQ==&mid=2247485501&idx=1&sn=92721338ddbf7d907eaf03a70a0715bd&chksm=97f220dba085a9cd2b9a922fb174c767603203d6dbd2a7d3a6dc41b3400a0c477a8d62b96396&scene=27#wechat_redirect'} or None
        """
        task = self.__get_task_from_redis(self._article_task_key)
        if not task:
            sql = 'select id, article_url from wechat_article_task where state = 0 limit 5000'
            tasks = self._mysqldb.find(sql)
            if tasks:
                # Mark the fetched tasks as claimed (state = 2)
                task_ids = str(tuple([task[0] for task in tasks])).replace(',)', ')')
                sql = 'update wechat_article_task set state = 2 where id in %s' % task_ids
                self._mysqldb.update(sql)
            else:
                # No new tasks; retry tasks that were claimed but never finished
                sql = 'select id, article_url from wechat_article_task where state = 2 limit 5000'
                tasks = self._mysqldb.find(sql)

            if tasks:
                task_json = [
                    {'article_url': article_url}
                    for id, article_url in tasks
                ]
                self._redis.zadd(self._article_task_key, task_json)
                task = self.__get_task_from_redis(self._article_task_key)

        return task

    def update_article_task_state(self, sn, state=1):
        sql = 'update wechat_article_task set state = %s where sn = "%s"' % (state, sn)
        self._mysqldb.update(sql)

    def record_last_article_publish_time(self, __biz, last_publish_time):
        self._redis.hset(self._last_article_publish_time, __biz, last_publish_time or '')

    def is_reach_last_article_publish_time(self, __biz, publish_time):
        last_publish_time = self._redis.hget(self._last_article_publish_time, __biz)
        if not last_publish_time:
            # Check whether MySQL has a record for this account
            sql = "select last_publish_time from wechat_account_task where __biz = '%s'" % __biz
            data = self._mysqldb.find(sql)
            if data:  # [(None,)] / []
                last_publish_time = str(data[0][0] or '')
                self.record_last_article_publish_time(__biz, last_publish_time)

        if last_publish_time is None:
            return

        if publish_time < last_publish_time:
            return True

        return False

    def is_in_crawl_time_range(self, publish_time):
        """
        Check whether publish_time falls inside the configured crawl time range.
        :param publish_time:
        :return: one of IS_IN_TIME_RANGE / NOT_REACH_TIME_RANGE / OVER_MIN_TIME_RANGE
        """
        if not publish_time or (not self._crawl_time_range[0] and not self._crawl_time_range[1]):
            return TaskManager.IS_IN_TIME_RANGE

        if self._crawl_time_range[0]:  # upper bound of the time range
            if publish_time > self._crawl_time_range[0]:
                return TaskManager.NOT_REACH_TIME_RANGE

            if publish_time <= self._crawl_time_range[0] and publish_time >= self._crawl_time_range[1]:
                return TaskManager.IS_IN_TIME_RANGE

        if publish_time < self._crawl_time_range[1]:  # lower bound of the time range
            return TaskManager.OVER_MIN_TIME_RANGE

        return TaskManager.IS_IN_TIME_RANGE

    def record_new_last_article_publish_time(self, __biz, new_last_publish_time):
        self._redis.hset(self._new_last_article_publish_time, __biz, new_last_publish_time)

    def get_new_last_article_publish_time(self, __biz):
        return self._redis.hget(self._new_last_article_publish_time, __biz)

    def update_account_last_publish_time(self, __biz, last_publish_time):
        sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}" where __biz="{}"'.format(
            last_publish_time, tools.get_current_date(), __biz
        )
        self._mysqldb.update(sql)

    def is_zombie_account(self, last_publish_timestamp):
        # An account is a zombie if it has not published for the configured number of days
        if tools.get_current_timestamp() - last_publish_timestamp > self._zombie_account_not_publish_article_days * 86400:
            return True
        return False

    def sign_account_is_zombie(self, __biz, last_publish_time=None):
        if last_publish_time:
            sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                last_publish_time, tools.get_current_date(), __biz
            )
        else:
            sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                tools.get_current_date(), __biz
            )
        self._mysqldb.update(sql)

    def get_task(self, url=None, tip=''):
        """
        Get the next task.
        :param url: when given, wrap this url as the next task. Otherwise take an
                    account task first, then an article task; if neither is
                    available, sleep for a while before trying again.
        :return: an HTML snippet that redirects (or reloads) after sleep_time seconds
        """
        sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                __biz = account_task.get('__biz')
                last_publish_time = account_task.get('last_publish_time')
                self.record_last_article_publish_time(__biz, last_publish_time)
                tip = 'Crawling article list'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz)
            else:
                article_task = self.get_article_task()
                if article_task:
                    tip = 'Crawling article detail'
                    url = article_task.get('article_url')
                else:
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('No task available, sleeping {}s'.format(sleep_time))
                    tip = 'No task available '

        if url:
            next_page = "{tip} sleeping {sleep_time}s, next refresh at {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ',
                sleep_time=sleep_time,
                begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time),
                url=url,
                sleep_time_msec=sleep_time * 1000
            )
        else:
            next_page = "{tip} sleeping {sleep_time}s, next refresh at {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ',
                sleep_time=sleep_time,
                begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time),
                sleep_time_msec=sleep_time * 1000
            )

        return next_page

    def reset_task(self):
        # Clear the Redis task cache
        keys = self._task_root_key + "*"
        keys = self._redis.getkeys(keys)
        if keys:
            for key in keys:
                self._redis.clear(key)

        # Requeue article tasks that were claimed (state = 2) but not finished
        sql = "update wechat_article_task set state = 0 where state = 2"
        self._mysqldb.update(sql)
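
# A minimal sketch of the intended call pattern (an assumption based on the class
# above, not part of the original module; config, MysqlDB and RedisDB must already
# be configured by the project):
if __name__ == '__main__':
    task_manager = TaskManager()
    task_manager.reset_task()            # clear the Redis cache and requeue claimed article tasks
    next_page = task_manager.get_task()  # HTML snippet that redirects the crawler webview to the next url
    print(next_page)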

import collections
import threading
import time

# RedisDB, UrlManager, tools, log and LOCAL_HOST_IP are provided elsewhere in this project.


class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num=None):
        '''
        @summary: background thread that pulls url tasks from Redis into a local queue
        ---------
        @param tab_urls: name of the Redis sorted set holding the url tasks
        @param depth: crawl depth
        @param process_num: process number, used to build this worker's mark
        ---------
        @result:
        '''
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))
        self._url_manager = UrlManager(tab_urls)
        self._finished_callback = None
        self._is_show_wait = False
        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not fully processed, skip fetching; queued urls = %s' % len(self._urls))
            return

        # Report this worker's status: 0 = idle
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)

        url_count = self._url_count  # default
        # Dynamically allocate urls according to the number of idle workers
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min=0, priority_max=0)
        if worker_wait_count:
            # Total number of queued tasks
            task_count = self._db.zget_count(self._tab_urls)
            # Allocated count = task count / idle worker count, capped at url_count
            url_count = task_count // worker_wait_count
            url_count = url_count if url_count <= self._url_count else self._url_count

        urls_list = self._db.zget(self._tab_urls, count=url_count)
        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # Record url count, for testing
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record = url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # Report this worker's status: 1 = busy
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)

            # Cache the urls locally
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # No urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unprocessed urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)  # each url task is stored as a Python-literal string
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
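
# A worked example of the dynamic url allocation in __input_data above (a hypothetical
# helper for illustration only; the numbers are assumed): with 1000 queued tasks and
# 4 idle workers each worker is offered min(1000 // 4, url_count) urls.
def _allocate_url_count(task_count, worker_wait_count, max_url_count):
    if not worker_wait_count:
        return max_url_count
    return min(task_count // worker_wait_count, max_url_count)

assert _allocate_url_count(1000, 4, 100) == 100  # capped at the configured url_count
assert _allocate_url_count(100, 4, 100) == 25    # evenly split among idle workers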

# A simpler Collector variant (no per-process worker mark, no worker-status reporting
# and no dynamic url allocation); it relies on the same imports as the class above.
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))
        self._url_manager = UrlManager(tab_urls)
        self._finished_callback = None
        self._is_show_wait = False

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not fully processed, skip fetching; queued urls = %s' % len(self._urls))
            return

        urls_list = self._db.zget(self._tab_urls, count=self._url_count)
        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # Cache the urls locally
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # No urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unprocessed urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)  # each url task is stored as a Python-literal string
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
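
# A minimal usage sketch for either Collector variant (an assumption, not part of
# the original modules; 'news:news_urls' is a hypothetical Redis table name):
if __name__ == '__main__':
    collector = Collector(tab_urls='news:news_urls', depth=0)
    collector.start()               # background thread keeps pulling url tasks from Redis
    batch = collector.get_urls(10)  # hand up to 10 cached url dicts to a parser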