Esempio n. 1
0
    def load_task(self):
        '''
        @summary: Load the next page of site tasks from the database into the task ring buffer
        ---------
        Each call reads one page of TAB_IOPM_SITE rows, starting at the 1-based
        row number TaskService._offset, and advances the offset by
        TASK_BUFFER_SIZE. An offset of 1 marks the start of a new round: the
        round statistics are reset and the redis url tables are cleared.
        When a page comes back empty the round is over: the summary is logged,
        the offset is reset to 1 and the first page of the next round is loaded.
        If the very first page of a round is already empty, nothing is loaded
        and the method returns instead of restarting again.
        ---------
        @result: None
        '''
        is_round_start = TaskService._offset == 1
        if is_round_start:
            log.info('开始新的一轮抓取')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # Clear the url tables
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')

        # The inner query keeps rows 1 .. offset + TASK_BUFFER_SIZE - 1 and the
        # outer filter drops everything before offset, leaving a single page.
        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size = TaskService._offset + TASK_BUFFER_SIZE, offset = TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)

        if not tasks:
            # An empty page means the round is finished
            TaskService._spider_end_timestamp = tools.get_current_timestamp()
            log.info('已做完一轮,共处理网站%s个 耗时%s'%(TaskService._total_task_size, tools.seconds_to_h_m_s(TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))
            TaskService._offset = 1

            if is_round_start:
                # The first page of the round was already empty, so the next
                # round would be empty as well; restarting here would recurse
                # until RecursionError. Leave the retry to the next call.
                return

            # Start the next round; the nested call fills the ring buffer itself
            self.load_task()
            return

        TaskService._total_task_size += len(tasks)
        TaskService._task_ring_buff.put_data(tasks)
Esempio n. 2
0
    def __open_next_page(self):
        '''
        @summary: Build the js snippet that sends the browser to the next page to crawl
        ---------
        The next url is taken from WechatAction._todo_urls when one is queued.
        Otherwise the article count of the current account is updated and the
        home page of the next account becomes the target.
        ---------
        @result: a <script> string that sets window.location.href to the next
                 url once the chosen delay (passed to setTimeout) has elapsed
        '''
        round_finished = False  # one full round is finished
        everything_finished = False  # all accounts' posts of the current day are collected

        if not WechatAction._todo_urls:
            # The current account has no urls left: refresh its article count
            WechatAction._wechat_service.update_account_article_num(
                WechatAction._current_account_biz)

            # Switch to the next account
            next_account = WechatAction._wechat_service.get_next_account()
            account_id, biz, round_finished, everything_finished = next_account
            WechatAction._account_info[biz] = account_id or ''

            target_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % biz
            log.debug('''
                下一个公众号 : %s
                ''' % target_url)
        else:
            target_url = WechatAction._todo_urls.popleft()

        def before_spider_start_time():
            # True while the current time is earlier than today's SPIDER_START_TIME
            now = tools.get_current_date()
            start_of_crawl = tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME
            return now < start_of_crawl

        # Pick the delay before the injected script redirects
        if everything_finished:
            # Everything for today is crawled: sleep until the next day
            delay_ms = self.get_next_day_time_interval()
        elif round_finished:
            # A round is finished: take a break
            delay_ms = self.get_wait_time()
        elif ONLY_TODAY_MSG and before_spider_start_time():
            # Only today's posts are wanted and the start time has not come yet
            delay_ms = self.get_spider_start_time_interval()
        else:
            # Regular pause between two pages
            delay_ms = self.get_sleep_time()

        delay_text = tools.seconds_to_h_m_s(delay_ms / 1000)
        resume_at = tools.timestamp_to_date(
            tools.get_current_timestamp() + delay_ms / 1000)
        log.debug('''
            next_page_url : %s
            is_done:        %s
            is_all_done:    %s
            sleep_time:     %s
            next_start_time %s
            ''' % (target_url, round_finished, everything_finished,
                   delay_text, resume_at))

        return "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
            target_url, delay_ms)
Esempio n. 3
0
    def __open_next_page(self):
        '''
        @summary: Build the page fragment that sends the browser to the next page to crawl
        ---------
        Queued entries of WechatAction._todo_urls are consumed first; callable
        entries are executed as callbacks and skipped. When no url is left the
        next account is requested, and when there is no next account either,
        the round is finished and a local "wait" tip page becomes the target.
        ---------
        @result: a tip text followed by a <script> that sets
                 window.location.href to the next url after the chosen delay
        '''
        target_url = None
        round_finished = False  # a full round is finished

        # The queue attribute is re-read on every pass on purpose
        while WechatAction._todo_urls:
            entry = WechatAction._todo_urls.popleft()
            if not callable(entry):
                target_url = entry
                break
            entry()  # callback queued to mark an account as done

        if not target_url:
            # Switch to the next account
            next_account = WechatAction._wechat_service.get_next_account()
            if not next_account:
                round_finished = True
            else:
                account_id, biz = next_account
                WechatAction._account_info[biz] = account_id or ''

                target_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % biz
                log.debug('''
                    下一个公众号 : %s
                    ''' % target_url)

        def before_spider_start_time():
            # True while the current time is earlier than today's SPIDER_START_TIME
            now = tools.get_current_date()
            start_of_crawl = tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME
            return now < start_of_crawl

        # Pick the delay before the injected script redirects
        if round_finished:
            # A round is finished: take a break
            delay_ms = self.get_wait_time()
        elif ONLY_TODAY_MSG and before_spider_start_time():
            # Only today's posts are wanted and the start time has not come yet
            delay_ms = self.get_spider_start_time_interval()
        else:
            # Regular pause between two pages
            delay_ms = self.get_sleep_time()

        delay_text = tools.seconds_to_h_m_s(delay_ms / 1000)
        resume_at = tools.timestamp_to_date(tools.get_current_timestamp() + delay_ms / 1000)

        if not target_url:
            # Nothing to open: show the local waiting page until the next round
            target_url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(delay_text, resume_at)

        log.debug('''
            next_page_url : %s
            is_done:        %s
            sleep_time:     %s
            next_start_time %s
            ''' % (target_url, round_finished, delay_text, resume_at))

        return "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
            delay_text, resume_at, target_url, delay_ms)
def monitor_task():
    '''
    @summary: Watch the task pool forever and refill it once it has stayed empty for too long
    ---------
    The pool size is polled every CHECK_HAVE_TASK_SLEEP_TIME. As soon as the
    pool has been empty for more than MAX_NULL_TASK_TIME the round is treated
    as finished: the statistics of that round are logged (once per round),
    task_manager.clear_task() is called and a new batch of tasks is read from
    oracle and added to redis.
    ---------
    @result: does not return (endless loop)
    '''
    task_manager = TaskManager()
    total_time = 0  # how long the pool has been continuously empty

    begin_time = None  # start date of the round still waiting for its report
    round_task_count = 0  # tasks added at the start of that round

    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            total_time = 0
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # A round is finished: collect some statistics
            if begin_time:
                # Time spent; the trailing idle wait is not counted
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(
                    end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # Url count statistics
                depth_count_info = task_manager.get_ever_depth_count(5)

                # Article count statistics
                article_count_msg = statistic_article_count.get_article_count_msg(
                    begin_time, end_time)

                # task_count is always falsy at this point (the pool is empty),
                # so report the amount captured when the round was started.
                log.info(
                    '''
                    ------- 已做完一轮 --------
                    \r开始时间:%s
                    \r结束时间:%s
                    \r耗时:%s
                    \r网站数量:%s
                    \rurl数量信息:%s
                    \r文章数量信息:%s
                    ''' %
                    (begin_time, end_time, spend_hours, round_task_count,
                     tools.dumps_json(depth_count_info), article_count_msg))

                # Report every round only once, even when no new tasks can be
                # fetched and this branch is entered again on the next check.
                begin_time = None
                round_task_count = 0

            # Delete the url fingerprints
            log.info('删除url指纹...')
            task_manager.clear_task()

            log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' %
                     (total_time, MAX_NULL_TASK_TIME))
            # Fetch tasks
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    round_task_count = task_count
                    log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
            else:
                log.error('未从oracle中取到任务')