Example #1
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url = 'urls'):
        if not hasattr(self,'_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False

            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)

            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: delete the data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []

        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)

            max_depth = url.get('remark',{}).get('spider_depth', 0)
            # To get per-depth counts, the fingerprint urls temporarily use a zset, and the fingerprint url is checked before the last-depth url.
            # When per-depth counts are not needed, a plain set is recommended and the last-depth url should be checked first.
            if depth == max_depth - 1:  # last depth: store these urls separately so they do not need to be cleared later
                if self._db.zadd(self._table_url_dupefilter, url_id, depth) and self._db.sadd(self._table_url_end_depth_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)

            elif self._db.zadd(self._table_url_dupefilter, url_id, depth):
                url_list.append(url)
                prioritys.append(depth)

            if len(url_list) > 100:
                log.debug('adding urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('adding urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
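
# A minimal usage sketch for UrlManager (illustrative only; it assumes the
# project's RedisDB/tools/log helpers are importable and that url dicts carry
# 'url', 'depth' and 'remark' keys -- the demo function name and values are made up):
def demo_url_manager():
    url_manager = UrlManager(table_url='urls')
    url_manager.start()                  # run() flushes the deque to redis once per second
    url_manager.put_urls({'url': 'http://example.com', 'depth': 0,
                          'remark': {'spider_depth': 3}})
    print(url_manager.get_urls_count())  # urls still waiting in the in-memory deque
    url_manager.stop()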
class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of pending urls in redis
        ---------
        ---------
        @result:
        '''

        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: url count for each depth
        ---------
        @param total_depth: exclusive; depth counted from the client's perspective
        ---------
        @result:
        '''

        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = 'depth %s url count' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]

        depth_count_info['total url count'] = total_count
        return depth_count_info

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # fetch one page of tasks
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results: break

            # assemble each row into a json-style url dict
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                    # the table below is used to count urls per depth
                    self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        # clear the url fingerprint table
        self._redisdb.sdelete('news:news_urls_dupefilter')
        # the tables below are used to count urls per depth
        self._redisdb.sdelete('news:news_urls_dupefilter0')
        self._redisdb.sdelete('news:news_urls_dupefilter1')
        self._redisdb.sdelete('news:news_urls_dupefilter2')
        self._redisdb.sdelete('news:news_urls_dupefilter3')
        self._redisdb.sdelete('news:news_urls_dupefilter4')
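
# A hypothetical end-to-end sketch for this example: seed redis from Oracle when
# the task pool is empty, then report per-depth url counts (assumes OracleDB,
# RedisDB and ONE_PAGE_SIZE are available; the demo function name is made up):
def demo_task_manager():
    task_manager = TaskManager()
    if not task_manager.get_task_count():    # redis holds no pending urls
        task_manager.clear_task()            # reset the fingerprint tables first
        tasks = task_manager.get_task_from_oracle()
        task_manager.add_task_to_redis(tasks)
    print(task_manager.get_ever_depth_count(total_depth=5))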
Example #3
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article = 'articles'):
        if not hasattr(self,'_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
            self._table_article_bloomfilter = table_article + '_bloomfilter'

            self._bloomfilter = BloomFilter(redis_obj = self._db, key = self._table_article_bloomfilter)

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)

            log.debug('articles in cache: %s' % len(self._articles_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

        if self.get_articles_count() > MAX_ARTICLE_COUNT:  # cache exceeded the maximum, flush to the database proactively
            self.__add_article_to_db()

    def get_articles_count(self):
        return len(self._articles_deque)

    def clear_article(self):
        '''
        @summary: delete the data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            # skip articles that already exist; only new articles are queued for redis
            if not self._bloomfilter.is_contains(article.get('uuid')):
                self._bloomfilter.insert(article.get('uuid'))
                article_list.append(article)

            if len(article_list) > 100:
                log.debug('adding articles to the database')
                self._db.sadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('adding articles to the database')
            self._db.sadd(self._table_article, article_list)
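
# A minimal usage sketch for ArticleManager (illustrative only; assumes RedisDB,
# BloomFilter, tools, log and MAX_ARTICLE_COUNT come from the project, and that
# each article dict carries a 'uuid' key -- the demo function name is made up):
def demo_article_manager():
    article_manager = ArticleManager(table_article='articles')
    article_manager.start()              # run() flushes the deque to redis once per second
    article_manager.put_articles({'uuid': 'article-uuid-1', 'title': 'demo'})
    article_manager.stop()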
Example #4
class CheckNewArticle():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()

    def get_wait_check_account(self):
        '''
        @summary:
        ---------
        @param :
        ---------
        @result:
        '''
        # fetch accounts whose crawl has finished and whose latest release time was more than two hours ago, then check again whether new articles have been published
        before_two_hours = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id,
                   t.domain,
                   t.name,
                   to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                   t.biz
              from TAB_IOPM_SITE t
             where t.biz is not null
               and mointor_status = 701
               and t.spider_status = 603
               and (t.last_article_release_time is null or
                   t.last_article_release_time <=
                   to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(before_two_hours)

        accounts = self._oracledb.find(sql)

        # if no accounts have finished crawling and redis has no pending crawl tasks, non-603 tasks in the database may have been lost and need to be re-dispatched
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id,
                       t.domain,
                       t.name,
                       to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                       t.biz
                  from TAB_IOPM_SITE t
                 where t.biz is not null
                   and mointor_status = 701
                   and t.spider_status != 603
            '''

            accounts = self._oracledb.find(sql)

        return accounts

    def check_new_article(self, account):
        oracle_id, account_id, account_name, last_article_release_time, biz = account

        article_release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(article_release_time)
        if article_release_time:
            last_article_release_time = last_article_release_time or ''
            if article_release_time >= tools.get_current_date(
                    '%Y-%m-%d'
            ) and article_release_time > last_article_release_time:
                print('{} has new articles, waiting to be crawled. Release time: {}'.format(
                    account_name, article_release_time))

                sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                     t.last_article_release_time =
                           to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                     where id = {}
                '''.format(article_release_time, oracle_id)

                # multi-threaded: each thread must hold its own database connection
                oracledb = OracleDB()
                oracledb.update(sql)
                oracledb.close()

                # add to redis as the task pool for the wechat spider
                data = (oracle_id, account_id, account_name,
                        last_article_release_time, biz)
                self._redisdb.sadd('wechat:account', data)
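
# A minimal usage sketch for CheckNewArticle (illustrative only; assumes OracleDB,
# RedisDB, WechatSogou and tools are importable and the TAB_IOPM_SITE schema
# matches the queries above -- the demo function name is made up):
def demo_check_new_article():
    checker = CheckNewArticle()
    for account in checker.get_wait_check_account():
        checker.check_new_article(account)   # re-dispatches accounts with new articles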
Example #5
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url='urls'):
        if not hasattr(self, '_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False

            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)

            log.debug('urls in cache: %s' % len(self._urls_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

        if self.get_urls_count() > MAX_URL_COUNT:  # cache exceeded the maximum, flush to the database proactively
            self.__add_url_to_db()

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: delete the data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def print_url(self, i):
        while self._urls_deque:
            url = self._urls_deque.popleft()
            print(i, '-->', url)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []

        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)

            max_depth = url.get('remark', {}).get('spider_depth', 0)
            if depth == max_depth - 1:  # last depth: store these urls separately so they do not need to be cleared later
                if self._db.sadd(self._table_url_end_depth_dupefilter,
                                 url_id) and self._db.sadd(
                                     self._table_url_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)
                    # count urls per depth: add url_id to the per-depth table; comment this out if the count is not needed
                    self._db.sadd(self._table_url_dupefilter + str(depth),
                                  url_id)

            elif self._db.sadd(self._table_url_dupefilter, url_id):
                url_list.append(url)
                prioritys.append(depth)
                # count urls per depth: add url_id to the per-depth table; comment this out if the count is not needed
                self._db.sadd(self._table_url_dupefilter + str(depth), url_id)

            if len(url_list) > 100:
                log.debug('adding urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('adding urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
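
# Illustrative read-back of the per-depth counters this variant maintains; it
# assumes the same RedisDB wrapper, whose sget_count returns a set's cardinality
# (the demo function name and the depth limit of 5 are made up):
def demo_depth_counts():
    url_manager = UrlManager(table_url='urls')
    for depth in range(5):
        count = url_manager._db.sget_count(url_manager._table_url_dupefilter + str(depth))
        print('depth %s url count: %s' % (depth, count))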