def __init__(self, nickname):
    """
    :param nickname: official account name
    """
    self.nickname = nickname
    # Materialize the generator into a list assigned to self.posts
    self.posts = []
    self.total_num = 0
    self.crawled_num = 0
    self.uncrawled_num = 0
    for p in get_collection_article(nickname):
        # Skip articles whose read statistics have not been crawled yet
        if 'read_num' in p:
            self.posts.append(p)
            self.crawled_num += 1
        else:
            self.uncrawled_num += 1
        self.total_num += 1
    # if self.crawled_num != 0:
    self.gzh = GZH(self.posts)
    self.gzh.postsToDataframe()
    # option_data consumed by ECharts
    self.option_data = {}
    # official account name
    self.option_data['account_name'] = nickname
    # summary of how many posts have read data so far
    self.option_data['posts_info'] = ' %d valid articles, %d with read data, %d still without read data' % \
        (self.total_num, self.crawled_num, self.uncrawled_num)
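# For context, a minimal sketch of what get_collection_article might look
# like, assuming pymongo and a database where each official account has its
# own collection named after its nickname. The db/collection layout and the
# kwargs-to-filter forwarding are assumptions, not confirmed by this repo.
from pymongo import MongoClient

def get_collection_article(nickname, **query):
    """Yield article documents for one account, optionally filtered."""
    client = MongoClient('localhost', 27017)
    collection = client['wechat'][nickname]  # assumed db/collection layout
    # kwargs such as read_num={"$exists": False} become the find() filter
    for article in collection.find(query):
        yield article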
def get_all_articles_data(nickname):
    """
    :param nickname: official account nickname
    :return: all articles of one account, shaped as
             {'title': 'account name', 'articles': [{}, {}, {}]}
    """
    use_keys = [
        'article_id', 'p_date', 'read_num', 'like_num', 'reward_num',
        'comment_num', 'author', 'mov', 'title', 'content_url'
    ]
    data = {}
    data['title'] = nickname
    data['articles'] = []
    articles = get_collection_article(nickname)
    id_counter = 0
    for article in articles:
        if 'title' not in article:
            continue
        id_counter += 1
        use_data = {}
        # keep only the whitelisted keys; fall back to '-' for missing ones
        for k in use_keys:
            use_data[k] = article[k] if k in article else '-'
        # reformat the publish date; guard against the '-' placeholder,
        # which has no strftime method
        if 'p_date' in article:
            use_data['p_date'] = use_data['p_date'].strftime("%Y/%m/%d")
        use_data['article_id'] = id_counter
        data['articles'].append(use_data)
    return data
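# A possible usage sketch: serialize the result for a frontend as JSON.
# The nickname below is a hypothetical placeholder; ensure_ascii=False
# keeps Chinese account names readable in the output.
import json

data = get_all_articles_data('example-account-nickname')
print(json.dumps(data, ensure_ascii=False, indent=2))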
def __init__(self, *args, **kwargs):
    """
    :param args:
    :param kwargs: functions the crawler needs to call when instantiated
    """
    # generator over all articles of the current account that still
    # lack body text
    self.current_nickname = TidyReqData.get_nickname()
    self.articles_list = get_collection_article(self.current_nickname,
                                                article={"$exists": False},
                                                title={"$exists": True})
    self.crawler_begin_time = time()
    self.crawler_parse_counter = 0
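# The keyword arguments above amount to the following MongoDB filter,
# selecting articles that have a title but no fetched body text yet
# (a sketch; assumes get_collection_article forwards kwargs verbatim
# to collection.find):
filter_doc = {"article": {"$exists": False}, "title": {"$exists": True}}
# collection.find(filter_doc) would yield the same documents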
def __init__(self, *args, **kwargs):
    """
    :param args:
    :param kwargs: functions the crawler needs to call when instantiated
    """
    # all articles of the current account that have a comment_id but
    # no read statistics yet
    self.current_nickname = TidyReqData.get_nickname()
    print(self.current_nickname)
    articles_list = get_collection_article(self.current_nickname,
                                           read_num={"$exists": False},
                                           comment_id={"$exists": True})
    # materialize the generator so the task count is known up front
    self.articles_list = list(articles_list)
    self.task_num = len(self.articles_list)
    self.task_counter = 0
    self.begin_time = time()
    self.pre_time = time()
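# A sketch of how the counters above could drive progress reporting while
# crawling read statistics; report_progress is a hypothetical helper, not
# part of this repo.
from time import time

def report_progress(self):
    self.task_counter += 1
    now = time()
    # seconds spent on the last article and on the whole run so far
    per_article = now - self.pre_time
    elapsed = now - self.begin_time
    self.pre_time = now
    # naive ETA: remaining tasks times the average time per task
    remaining = (self.task_num - self.task_counter) * (elapsed / self.task_counter)
    print('%d/%d done, last took %.1fs, ~%.0fs left'
          % (self.task_counter, self.task_num, per_article, remaining))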
def index_db_docs(self, nickname):
    """
    :param nickname: official account nickname
    :return: fetch all of one account's articles from MongoDB and
             index them into Elasticsearch via a bulk operation
    """
    # create the index first
    index_name = self.create_index(nickname)
    # fetch every article of this account from the database
    articles = get_collection_article(nickname,
                                      article={"$exists": True},
                                      title={"$exists": True})
    articles_cache = []
    # the MongoDB cursor expires after 10 minutes, which may not be enough
    # to finish indexing, so cache all of the account's articles up front
    for article in articles:
        doc = dict((key, article[key]) for key in doc_schema)
        articles_cache.append(doc)
    # index the documents with a bulk operation
    result = self.index_bulk(index_name, articles_cache)
    return result
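# A minimal sketch of what index_bulk might look like on top of the official
# elasticsearch client; the method name comes from the code above, while the
# local host URL and letting Elasticsearch assign document ids are
# assumptions.
from elasticsearch import Elasticsearch, helpers

def index_bulk(self, index_name, docs):
    es = Elasticsearch('http://localhost:9200')  # assumed local node
    actions = ({'_index': index_name, '_source': doc} for doc in docs)
    # helpers.bulk returns (number of successes, list of errors)
    return helpers.bulk(es, actions)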