Example #1
import collections
import threading
import time

# RedisDB, OracleDB, UrlManager, tools, log, LOCAL_HOST_IP and ONE_PAGE_SIZE
# are provided elsewhere in the project and are assumed to be importable here.
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num=None):
        '''
        @summary: url collector thread; pulls batches of url tasks from Redis into a local queue
        ---------
        @param tab_urls: Redis key of the url task table
        @param depth: crawl depth
        @param process_num: process number (appended to the worker mark)
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s'%process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not finished yet, skip fetching; url count = %s' % len(self._urls))
            return

        # Report this node's status: score 0 means the worker is idle (waiting for work)
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)

        url_count = self._url_count  # default batch size
        # Allocate urls dynamically based on how many workers are currently idle
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min=0, priority_max=0)
        if worker_wait_count:
            # total number of queued tasks
            task_count = self._db.zget_count(self._tab_urls)
            # allocated count = task count // number of idle workers
            url_count = task_count // worker_wait_count

        # never take more than the configured batch size
        url_count = url_count if url_count <= self._url_count else self._url_count
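        # Worked example (hypothetical numbers, not from the original source): with
        # 1000 queued tasks and 4 idle workers, url_count becomes 1000 // 4 = 250,
        # which is then capped at self._url_count so one node never exceeds its limit.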

        urls_list = self._db.zget(self._tab_urls, count = url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # Record the url count (for testing only)
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record = url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # Report this node's status: score 1 means the worker is busy
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)

            # buffer the urls locally
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # True when there are no urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unfinished urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False


    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                # tasks are stored in Redis as the string repr of a dict; eval turns it back into a dict
                url_info = eval(url_info)
            except Exception as e:
                log.error(e)
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
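
# Illustrative consumer (not in the original source): a parser thread would drain
# the buffered url dicts roughly like this, where handle() is a hypothetical callback:
#
#     while not collector.is_finished():
#         for url_info in collector.get_urls(count=10):
#             handle(url_info['url'], url_info['depth'], url_info['remark'])
#         time.sleep(1)
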
class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of urls still pending in redis
        ---------
        ---------
        @result:
        '''

        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: count the urls collected at each depth level
        ---------
        @param total_depth: exclusive upper bound; depth counted from the client's point of view
        ---------
        @result:
        '''

        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = 'url count at depth %s' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]

        depth_count_info['total url count'] = total_count
        return depth_count_info
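    # Example return value (hypothetical counts) for total_depth=5:
    # {'url count at depth 1': 120, 'url count at depth 2': 86, ..., 'total url count': 347}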

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # fetch one page of tasks
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results: break

            # assemble each row into a JSON-style url dict
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                    # also record the url in the depth-0 set used for the per-level url counts
                    self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        # clear the url fingerprint (dedup) set
        self._redisdb.sdelete('news:news_urls_dupefilter')
        # clear the per-depth sets used for the url counts
        self._redisdb.sdelete('news:news_urls_dupefilter0')
        self._redisdb.sdelete('news:news_urls_dupefilter1')
        self._redisdb.sdelete('news:news_urls_dupefilter2')
        self._redisdb.sdelete('news:news_urls_dupefilter3')
        self._redisdb.sdelete('news:news_urls_dupefilter4')
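

# --- Minimal end-to-end usage sketch (illustrative only, not part of the original
# source). It assumes the project supplies RedisDB, OracleDB, UrlManager, tools,
# log, LOCAL_HOST_IP and ONE_PAGE_SIZE, and that config.conf is present. ---
if __name__ == '__main__':
    task_manager = TaskManager()
    task_manager.clear_task()                        # wipe the dedup sets before a fresh crawl
    tasks = task_manager.get_task_from_oracle()      # seed urls from TAB_IOPM_SITE
    task_manager.add_task_to_redis(tasks)            # dedup and push into news:news_urls
    log.info('pending urls in redis: %s' % task_manager.get_task_count())

    collector = Collector(tab_urls='news:news_urls', depth=0)
    collector.start()                                # begins polling Redis every sleep_time seconds
    urls = collector.get_urls(10)                    # a parser thread would consume these url dicts
    collector.stop()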