import collections
import threading

# RedisDB, OracleDB, Singleton, BloomFilter, WechatSogou, tools, log and the
# MAX_* / ONE_PAGE_SIZE constants are assumed to come from the project's own modules.


class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url='urls'):
        if not hasattr(self, '_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False
            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: delete the url data in redis
        ---------
        ---------
        @result:
        '''
        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []
        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)
            max_depth = url.get('remark', {}).get('spider_depth', 0)

            # To get per-depth counts, the fingerprint urls temporarily use a zset, and the
            # fingerprint url is checked before the last-depth url. When per-depth counts are
            # not needed, a set is recommended, checking the last-depth url first.
            if depth == max_depth - 1:
                # Last depth: stored separately so it does not need to be cleared later
                if self._db.zadd(self._table_url_dupefilter, url_id, depth) and self._db.sadd(self._table_url_end_depth_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)
            elif self._db.zadd(self._table_url_dupefilter, url_id, depth):
                url_list.append(url)
                prioritys.append(depth)

            if len(url_list) > 100:
                log.debug('adding urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('adding urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
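Because the fingerprint zset stores each url_id with its depth as the score, per-depth counts can be read back with a ranged ZCOUNT. Below is a minimal sketch of that read path using redis-py directly, since the RedisDB wrapper's read API is not shown here; the connection parameters, the default 'urls' table name and the depth limit are assumptions.

import redis

# Assumed connection settings; the project's RedisDB wrapper may expose its own helper.
r = redis.StrictRedis(host='localhost', port=6379, db=0)

def depth_counts(table_url='urls', max_depth=5):
    dupefilter_key = table_url + '_dupefilter'
    counts = {}
    for depth in range(max_depth):
        # each member's score is its depth, so ZCOUNT over [depth, depth] counts that layer
        counts[depth] = r.zcount(dupefilter_key, depth, depth)
    return counts

print(depth_counts())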
class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: whether redis still holds urls waiting to be crawled
        ---------
        ---------
        @result:
        '''
        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary:
        ---------
        @param total_depth: exclusive; depth counted from the customer's point of view
        ---------
        @result:
        '''
        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = 'url count at depth %s' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]

        depth_count_info['total url count'] = total_count
        return depth_count_info

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # fetch one page of tasks
            task_sql = '''
                select * from (select t.id, t.name, t.position, t.url, t.depth, rownum r from TAB_IOPM_SITE t
                where classify = 1 and t.mointor_status = 701 and (t.position != 35 or t.position is null) and rownum < {page_size})
                where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results:
                break

            # assemble each row into a json-style url dict
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                    # table used to count urls per depth
                    self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        # clear the url fingerprint table
        self._redisdb.sdelete('news:news_urls_dupefilter')
        # tables used to count urls per depth
        self._redisdb.sdelete('news:news_urls_dupefilter0')
        self._redisdb.sdelete('news:news_urls_dupefilter1')
        self._redisdb.sdelete('news:news_urls_dupefilter2')
        self._redisdb.sdelete('news:news_urls_dupefilter3')
        self._redisdb.sdelete('news:news_urls_dupefilter4')
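A minimal sketch of how these methods would typically be chained for one dispatch round; the guard on get_task_count and the depth of 5 are assumptions, and the OracleDB/RedisDB connections plus ONE_PAGE_SIZE are taken to be configured elsewhere.

task_manager = TaskManager()

# only re-dispatch when redis has no pending urls left (assumed policy)
if task_manager.get_task_count() == 0:
    task_manager.clear_task()                     # drop old fingerprints and per-depth stats
    tasks = task_manager.get_task_from_oracle()   # page through TAB_IOPM_SITE
    task_manager.add_task_to_redis(tasks)         # seed depth-0 urls into news:news_urls

print(task_manager.get_ever_depth_count(total_depth=5))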
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False
            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
            self._table_article_bloomfilter = table_article + '_bloomfilter'

            self._bloomfilter = BloomFilter(redis_obj=self._db, key=self._table_article_bloomfilter)

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)
            log.debug('number of articles in cache %s' % len(self._articles_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

        if self.get_articles_count() > MAX_ARTICLE_COUNT:
            # cache exceeds the maximum size, flush to the database proactively
            self.__add_article_to_db()

    def get_articles_count(self):
        return len(self._articles_deque)

    def clear_article(self):
        '''
        @summary: delete the article data in redis
        ---------
        ---------
        @result:
        '''
        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            # check whether the article already exists; only articles whose uuid is not yet
            # in the bloomfilter are queued for storage
            if not self._bloomfilter.is_contains(article.get('uuid')):
                self._bloomfilter.insert(article.get('uuid'))
                article_list.append(article)

            if len(article_list) > 100:
                log.debug('adding articles to the database')
                self._db.sadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('adding articles to the database')
            self._db.sadd(self._table_article, article_list)
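A minimal sketch of feeding the manager from a parser thread; only the 'uuid' field is required by the bloomfilter dedup above, the remaining fields and values are illustrative assumptions.

article_manager = ArticleManager(table_article='articles')
article_manager.start()     # background thread flushes the deque to redis about once a second

# 'uuid' is the dedup key checked by the bloomfilter; the other fields are illustrative
article_manager.put_articles({'uuid': 'a1b2c3', 'title': 'demo', 'url': 'http://example.com/1'})

# on shutdown
article_manager.stop()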
class CheckNewArticle():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()

    def get_wait_check_account(self):
        '''
        @summary:
        ---------
        @param :
        ---------
        @result:
        '''
        # Fetch accounts that have finished crawling and whose latest release time is more than
        # two hours ago, then check again whether they have published new articles
        before_two_hours = tools.timestamp_to_date(tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
            from TAB_IOPM_SITE t
            where t.biz is not null and mointor_status = 701 and t.spider_status = 603
              and (t.last_article_release_time is null or t.last_article_release_time <= to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(before_two_hours)
        accounts = self._oracledb.find(sql)

        # If no accounts have finished crawling and redis holds no crawl tasks, the non-603 tasks
        # in the database may have been lost and need to be dispatched again
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
                from TAB_IOPM_SITE t
                where t.biz is not null and mointor_status = 701 and t.spider_status != 603
            '''
            accounts = self._oracledb.find(sql)

        return accounts

    def check_new_article(self, account):
        oracle_id, account_id, account_name, last_article_release_time, biz = account

        article_release_time = self._wechat_sogo.get_article_release_time(account_id=account_id, account=account_name)
        print(article_release_time)
        if article_release_time:
            last_article_release_time = last_article_release_time or ''
            if article_release_time >= tools.get_current_date('%Y-%m-%d') and article_release_time > last_article_release_time:
                print('{} has published a new article, waiting to be crawled. release time: {}'.format(account_name, article_release_time))

                sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                    t.last_article_release_time = to_date('{}', 'yyyy-mm-dd hh24:mi:ss') where id = {}
                '''.format(article_release_time, oracle_id)
                # multi-threaded: each thread must hold its own database connection
                oracledb = OracleDB()
                oracledb.update(sql)
                oracledb.close()

                # push to redis as the task pool of the wechat spider
                data = (oracle_id, account_id, account_name, last_article_release_time, biz)
                self._redisdb.sadd('wechat:account', data)
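A minimal sketch of a polling driver around these two methods; the ten-minute interval is an assumption, and tools.delay_time is the project helper already used above. Since check_new_article opens its own OracleDB connection, the per-account calls could also be fanned out across threads.

check = CheckNewArticle()

while True:
    accounts = check.get_wait_check_account()
    for account in accounts:
        # each call holds its own OracleDB connection, so this loop is thread-safe to parallelize
        check.check_new_article(account)
    tools.delay_time(60 * 10)   # poll every ten minutes (interval is an assumption)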
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url='urls'):
        if not hasattr(self, '_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False
            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)
            log.debug('number of urls in cache %s' % len(self._urls_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

        if self.get_urls_count() > MAX_URL_COUNT:
            # cache exceeds the maximum size, flush to the database proactively
            self.__add_url_to_db()

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: delete the url data in redis
        ---------
        ---------
        @result:
        '''
        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def print_url(self, i):
        while self._urls_deque:
            url = self._urls_deque.popleft()
            print(i, '-->', url)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []
        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)
            max_depth = url.get('remark', {}).get('spider_depth', 0)

            if depth == max_depth - 1:
                # Last depth: stored separately so it does not need to be cleared later
                if self._db.sadd(self._table_url_end_depth_dupefilter, url_id) and self._db.sadd(self._table_url_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)

                    # Count urls per depth by adding the url_id to a per-depth table;
                    # comment out when the statistics are not needed
                    self._db.sadd(self._table_url_dupefilter + str(depth), url_id)
            elif self._db.sadd(self._table_url_dupefilter, url_id):
                url_list.append(url)
                prioritys.append(depth)

                # Count urls per depth by adding the url_id to a per-depth table;
                # comment out when the statistics are not needed
                self._db.sadd(self._table_url_dupefilter + str(depth), url_id)

            if len(url_list) > 100:
                log.debug('adding urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('adding urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
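A minimal sketch of wiring this manager up; the 'news:news_urls' table name mirrors the keys TaskManager uses and, like the seed dict's values, is an assumption.

url_manager = UrlManager(table_url='news:news_urls')
url_manager.start()     # background thread dedups queued urls and pushes them into redis

# a url dict in the same shape TaskManager builds; values here are illustrative
seed = {
    'url': 'http://news.example.com/',
    'depth': 0,
    'remark': {'spider_depth': 3, 'website_name': 'example'},
    'retry_times': 0
}
url_manager.put_urls(seed)      # a single dict or a list of dicts both work

# on shutdown
url_manager.stop()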