def load_task(self):
    '''
    @summary: Load the next page of site tasks from the database into the task ring buffer.
    Paging state is kept on TaskService class attributes: `_offset` is the rownum of
    the first row of the next page and is advanced by TASK_BUFFER_SIZE on every query.
    When a page comes back empty the round is considered finished: the round
    statistics are logged, `_offset` is reset to 1 and the first page of a new
    round is loaded right away.
    ---------
    @result: None
    '''
    # At most two passes: the first may hit the end of the table, the second
    # reloads from the first page. The previous implementation recursed here,
    # which never terminated (RecursionError) when the table held no matching row.
    for _ in range(2):
        is_new_round = TaskService._offset == 1
        if is_new_round:
            log.info('开始新的一轮抓取')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # Clear the url table and its dedup filter before the new round
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')

        # Page window is [offset, offset + TASK_BUFFER_SIZE) expressed through rownum.
        # Both values are integers taken from internal state, not external input.
        task_sql = ''' select * from (select t.id, t.name, t.position, t.url, t.depth, rownum r from TAB_IOPM_SITE t where classify = 1 and t.mointor_status = 701 and t.position != 35 and rownum < {page_size}) where r >= {offset} '''.format(
            page_size=TaskService._offset + TASK_BUFFER_SIZE,
            offset=TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)

        if tasks:
            TaskService._total_task_size += len(tasks)
            TaskService._task_ring_buff.put_data(tasks)
            return

        # Empty page: the current round is over, log its statistics and rewind
        TaskService._spider_end_timestamp = tools.get_current_timestamp()
        log.info('已做完一轮,共处理网站%s个 耗时%s' % (
            TaskService._total_task_size,
            tools.seconds_to_h_m_s(
                TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))
        TaskService._offset = 1

        if is_new_round:
            # Even the first page of a fresh round was empty, so there is
            # nothing to load; retrying immediately would only spin.
            return
def __open_next_page(self):
    '''
    @summary: Build the html snippet that makes the page jump to the next url after a pause
    ---------
    @result: a <script> string that sets window.location.href to the next url
             once the chosen pause has elapsed (the pause is handed to setTimeout
             and divided by 1000 before being formatted as seconds)
    '''
    round_finished = False  # whether one full round has been completed
    everything_finished = False  # whether everything is done (today's posts of all accounts collected)

    pending_urls = WechatAction._todo_urls
    if not pending_urls:
        # The current account is finished: refresh its article count
        WechatAction._wechat_service.update_account_article_num(
            WechatAction._current_account_biz)

        # Move on to the next account
        next_account = WechatAction._wechat_service.get_next_account()
        account_id, biz, round_finished, everything_finished = next_account
        WechatAction._account_info[biz] = account_id or ''

        url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % biz
        log.debug(''' 下一个公众号 : %s ''' % url)
    else:
        url = pending_urls.popleft()

    # Pick how long the injected script waits before jumping
    if everything_finished:
        # Today's articles are all crawled: wait until the next day
        pause = self.get_next_day_time_interval()
    elif round_finished:
        # One round done: take a rest
        pause = self.get_wait_time()
    else:
        # When only today's messages are wanted and the configured start time
        # has not been reached yet, hold off until then (per the original note,
        # accounts rarely publish in the small hours)
        too_early = ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date(
            "%Y-%m-%d") + ' ' + SPIDER_START_TIME
        if too_early:
            pause = self.get_spider_start_time_interval()
        else:
            # One article done: regular gap between pages
            pause = self.get_sleep_time()

    pause_text = tools.seconds_to_h_m_s(pause / 1000)
    resume_at = tools.timestamp_to_date(tools.get_current_timestamp() + pause / 1000)
    log.debug(''' next_page_url : %s is_done: %s is_all_done: %s sleep_time: %s next_start_time %s ''' % (
        url, round_finished, everything_finished, pause_text, resume_at))

    # Injected js performs the automatic jump
    return "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        url, pause)
def __open_next_page(self):
    '''
    @summary: Build the html snippet that makes the page jump to the next url after a pause
    ---------
    @result: a string made of a human-readable tip (pause length and next refresh
             time) followed by a <script> that sets window.location.href to the
             next url once the pause has elapsed
    '''
    url = None
    pending = WechatAction._todo_urls

    # Drain the queue until a url shows up; callables found on the way are
    # the "account finished" update callbacks and are run immediately
    while pending:
        entry = pending.popleft()
        if not callable(entry):
            url = entry
            break
        entry()

    round_finished = False  # whether one full round has been completed
    if not url:
        # Move on to the next account
        account = WechatAction._wechat_service.get_next_account()
        if not account:
            round_finished = True
        else:
            account_id, biz = account
            WechatAction._account_info[biz] = account_id or ''

            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % biz
            log.debug(''' 下一个公众号 : %s ''' % url)

    # Pick how long the injected script waits before jumping
    if round_finished:
        # One round done: take a rest
        pause = self.get_wait_time()
    else:
        # When only today's messages are wanted and the configured start time
        # has not been reached yet, hold off until then (per the original note,
        # accounts rarely publish in the small hours)
        too_early = ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date(
            "%Y-%m-%d") + ' ' + SPIDER_START_TIME
        if too_early:
            pause = self.get_spider_start_time_interval()
        else:
            # One article done: regular gap between pages
            pause = self.get_sleep_time()

    pause_text = tools.seconds_to_h_m_s(pause / 1000)
    resume_at = tools.timestamp_to_date(tools.get_current_timestamp() + pause / 1000)

    if not url:
        # Nothing to visit: land on the local wait-tip page instead
        url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(
            pause_text, resume_at)

    log.debug(''' next_page_url : %s is_done: %s sleep_time: %s next_start_time %s ''' % (
        url, round_finished, pause_text, resume_at))

    # Injected js performs the automatic jump
    return "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        pause_text, resume_at, url, pause)
def monitor_task():
    '''
    @summary: Watch the task pool forever and refill it when it has stayed empty too long.
    Every CHECK_HAVE_TASK_SLEEP_TIME the pool size is polled. Once the pool has
    been empty for more than MAX_NULL_TASK_TIME in a row, the finished round is
    summarised in the log (if a round was running), the pool is cleared and a
    fresh batch of tasks is pulled from oracle and pushed to redis.
    ---------
    @result: never returns
    '''
    task_manager = TaskManager()

    idle_time = 0  # accumulated time the pool has been continuously empty
    begin_time = None  # date string of the moment the current round was loaded
    round_task_count = 0  # number of tasks loaded at the start of the current round

    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            idle_time += CHECK_HAVE_TASK_SLEEP_TIME
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            idle_time = 0

        tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if idle_time <= MAX_NULL_TASK_TIME:
            continue

        is_show_start_tip = False
        is_show_have_task = False

        # A round has ended: collect some statistics
        if begin_time:
            # Elapsed time; the idle wait is subtracted from "now" to get the end time
            end_time = tools.timestamp_to_date(
                tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
            spend_time = tools.date_to_timestamp(
                end_time) - tools.date_to_timestamp(begin_time)
            spend_hours = tools.seconds_to_h_m_s(spend_time)

            # Url count statistics
            depth_count_info = task_manager.get_ever_depth_count(5)

            # Article count statistics
            article_count_msg = statistic_article_count.get_article_count_msg(
                begin_time, end_time)

            # The live pool size is necessarily empty on this path, so report
            # the number of tasks that were loaded when the round started.
            log.info(
                ''' ------- 已做完一轮 -------- \r开始时间:%s \r结束时间:%s \r耗时:%s \r网站数量:%s \rurl数量信息:%s \r文章数量信息:%s '''
                % (begin_time, end_time, spend_hours, round_task_count,
                   tools.dumps_json(depth_count_info), article_count_msg))

        # Remove the url fingerprints
        log.info('删除url指纹...')
        task_manager.clear_task()

        log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' %
                 (idle_time, MAX_NULL_TASK_TIME))

        # Fetch a new batch of tasks
        tasks = task_manager.get_task_from_oracle()
        if tasks:
            idle_time = 0
            task_manager.add_task_to_redis(tasks)
            task_count = task_manager.get_task_count()
            if task_count:
                begin_time = tools.get_current_date()
                round_task_count = task_count
                log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
        else:
            log.error('未从oracle中取到任务')