Ejemplo n.º 1
0
class Collector(threading.Thread):
    '''
    @summary: background thread that moves url tasks from redis into a local
              in-memory queue. Every cycle it reports this worker's status in
              the 'news:worker_status' table, takes a batch of urls from
              ``tab_urls`` and buffers them in a deque; consumers drain that
              deque through get_urls().
    '''

    def __init__(self, tab_urls, depth, process_num = None):
        '''
        @summary: read the collector settings from config.conf and set up state
        ---------
        @param tab_urls: name of the redis table that holds the pending urls
        @param depth: crawl depth; stored on the instance, not used by this class itself
        @param process_num: process number; when truthy it is appended to the
                            host ip to build this worker's mark (None or 0 adds no suffix)
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        # True once the "waiting for tasks" message has been logged, so it is
        # printed only once per idle period
        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s'%process_num if process_num else '')

    def run(self):
        '''
        @summary: thread body; fetch a batch every ``sleep_time`` seconds until
                  stop() is called. Errors of a single cycle are logged and
                  do not end the loop.
        '''
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        '''
        @summary: ask the thread to stop and call the finished callback (if
                  any) right away, in the caller's thread
        '''
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    def __input_data(self):
        '''
        @summary: take one batch of urls from redis, unless the local queue
                  still holds unprocessed urls
        '''
        if self._urls:
            log.debug('url 未处理完,不取url, url数量 = %s'%len(self._urls))
            return

        # Report node status: 0 = idle
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)

        url_count = self._url_count
        # Share the pending urls between the workers that are currently idle
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min = 0, priority_max = 0)
        if worker_wait_count:
            task_count = self._db.zget_count(self._tab_urls)
            # Batch size = pending tasks / idle workers. Ask for at least one
            # url: the floor division is 0 when idle workers outnumber the
            # pending tasks, and a zero-sized request must not be passed on.
            url_count = max(1, task_count // worker_wait_count)

        # Never exceed the configured batch size
        url_count = min(url_count, self._url_count)

        urls_list = self._db.zget(self._tab_urls, count = url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('等待任务...')
                self._is_show_wait = True
            return

        # Report node status: 1 = working
        self._db.zadd(self._tab_worker_status, self._worker_mark, 1)

        # Buffer the urls locally
        self.put_urls(urls_list)
        self._is_show_wait = False

        # NOTE: stopping automatically via is_all_have_done() is not wired in;
        # the thread only ends when stop() is called.

    def is_finished(self):
        '''
        @summary: whether stop() has been called
        '''
        return self._thread_stop

    def add_finished_callback(self, callback):
        '''
        @summary: register the zero-argument callable that stop() invokes
        '''
        self._finished_callback = callback

    def is_all_have_done(self):
        '''
        @summary: whether there is nothing left to do
        ---------
        @result: True when the local queue has been found empty at least
                 ``allowed_null_times`` times in a row and the url manager
                 holds no urls either. Each call with an empty queue counts
                 as one time; a non-empty queue resets the counter.
        '''
        if self._urls:
            self._null_times = 0
            return False

        self._null_times += 1
        return self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0

    def put_urls(self, urls_list):
        '''
        @summary: parse the serialized url infos and append them to the local queue
        ---------
        @param urls_list: iterable of python-literal strings, one per url info
        ---------
        @result: entries that fail to parse are logged and skipped; entries
                 that parse to a falsy value are skipped silently
        '''
        for raw_url_info in urls_list:
            try:
                # SECURITY NOTE(review): eval() executes whatever expression is
                # stored in redis. If every producer writes plain literals,
                # ast.literal_eval would be a safer drop-in - confirm first.
                url_info = eval(raw_url_info)
            except Exception as e:
                log.error('put_urls: skip unparsable url info %r: %s' % (raw_url_info, e))
                continue

            if url_info:
                self._urls.append(url_info)

    def get_urls(self, count):
        '''
        @summary: remove and return up to ``count`` urls from the local queue
        ---------
        @param count: maximum number of urls wanted; zero or a negative value yields []
        ---------
        @result: list of url infos in queue order, possibly shorter than ``count``
        '''
        urls = []
        for _ in range(min(count, len(self._urls))):
            try:
                urls.append(self._urls.popleft())
            except IndexError:
                # Another consumer emptied the queue after the length check
                break

        return urls
Ejemplo n.º 2
0
class TaskManager:
    '''
    @summary: loads the site tasks from oracle and manages the url queue and
              the url fingerprint (dupefilter) tables in redis
    '''

    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        # Table of urls waiting to be crawled
        self._news_url_table = 'news:news_urls'
        # Url fingerprint table; the per-depth statistics tables use this
        # name followed by the depth number
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of urls in redis that are still waiting to be done
        ---------
        ---------
        @result: whatever zget_count reports for the url table
        '''

        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: count the url fingerprints recorded for every depth
        ---------
        @param total_depth: number of depths to report (exclusive upper bound
                            of the 0-based depth). The keys of the result
                            number the layers from 1, as seen by the customer.
        ---------
        @result: dict with one entry per layer plus the overall total
        '''

        depth_count_info = {}
        for depth in range(total_depth):
            key = '第%s层url数' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))

        # Computed before the total itself is stored in the dict
        depth_count_info['总url数'] = sum(depth_count_info.values())
        return depth_count_info

    def get_task_from_oracle(self):
        '''
        @summary: read the sites to crawl from TAB_IOPM_SITE page by page and
                  turn every row into a depth-0 url task
        ---------
        ---------
        @result: list of task dicts (site_id, url, depth, remark, retry_times)
        '''
        tasks = []

        offset = 0
        while True:
            # Fetch one page: rows with offset <= rownum < offset + ONE_PAGE_SIZE
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results:
                break

            # Build one url task per row. Column 0 (t.id) is selected but not
            # used: site_id is the constant 1.
            # NOTE(review): confirm that is intended.
            for task in results:
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': tools.get_domain(website_url),
                    'spider_depth': spider_depth
                }

                tasks.append({
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                })

        return tasks

    def add_task_to_redis(self, tasks):
        '''
        @summary: queue every task whose url has not been seen before
        ---------
        @param tasks: iterable of task dicts; tasks without a url are ignored
        ---------
        @result:
        '''
        for task in tasks:
            url = task.get('url')
            if not url:
                continue

            url_id = tools.get_sha1(url)
            # A truthy sadd result is treated as "fingerprint was new"
            if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                # Depth-0 table, used for the per-depth url statistics
                self._redisdb.sadd(self._news_urls_dupefilter + '0', url_id)

    def clear_task(self, total_depth=5):
        '''
        @summary: delete the url fingerprint table and the per-depth statistics tables
        ---------
        @param total_depth: number of per-depth tables to delete (depths
                            0 .. total_depth - 1); the default matches
                            get_ever_depth_count
        ---------
        @result:
        '''
        # Url fingerprint table
        self._redisdb.sdelete(self._news_urls_dupefilter)
        # Tables used for the per-depth url statistics
        for depth in range(total_depth):
            self._redisdb.sdelete(self._news_urls_dupefilter + str(depth))