from flask import jsonify, request

def index():
    if not request.json or 'keywords' not in request.json:
        return jsonify({'error': 'keywords is required'}), 400
    keywords = request.json['keywords']
    response = get_articles(keywords)
    return jsonify(response), 201
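# A minimal usage sketch for the endpoint above, using Flask's built-in test
# client.  It assumes `app` is the Flask application and that the view is
# registered under POST /; neither appears in the original snippet.
with app.test_client() as client:
    ok = client.post('/', json={'keywords': 'shared mobility'})
    print(ok.status_code)    # 201, body is the article list as JSON
    bad = client.post('/', json={'q': 'no keywords key'})
    print(bad.status_code)   # 400, body is {'error': 'keywords is required'}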
from flask import render_template, request

def index():
    trending = get_google_trendings()  # utils
    topic = ''
    articles = []
    summary = []
    if request.args:
        topic = request.args.get('query', '')
        articles = articles_check(get_articles(topic))
        summary = multidoc_summary(articles[:3])
    return render_template('index.html', trends=trending, articles=articles,
                           summary=summary, topic=topic)
def calculate_tf_idf(self):
    keyword_result = dict()
    self.articles = get_articles('issue.xml')
    keyword_set, annotate_map, title_map = self.get_keywords(self.articles)
    doc_len = len(self.articles)
    for doc_num, article in enumerate(self.articles):
        article_list_len = len(annotate_map[doc_num])
        title_list_len = len(title_map[doc_num])
        sum_article_len = article_list_len + title_list_len
        for keyword in keyword_set:
            annotate_word_count = annotate_map[doc_num].count(keyword)
            title_word_count = title_map[doc_num].count(keyword)

            # term frequency, computed separately for the title, the
            # annotation, and both fields combined
            tf_idf_title = title_word_count / title_list_len
            tf_idf_annotate = annotate_word_count / article_list_len
            tf_idf = (title_word_count + annotate_word_count) / sum_article_len

            # multiply each tf by the matching idf; a keyword absent from a
            # field's document-frequency map scores 0 for that field
            if self.word_map_title.get(keyword) is not None:
                tf_idf_title *= math.log(doc_len / len(self.word_map_title[keyword]))
            else:
                tf_idf_title = 0
            if self.word_map_annotate.get(keyword) is not None:
                tf_idf_annotate *= math.log(
                    doc_len / len(self.word_map_annotate[keyword]))
            else:
                tf_idf_annotate = 0
            if self.word_map.get(keyword) is not None:
                tf_idf *= math.log(doc_len / len(self.word_map[keyword]))
                # weighted blend favouring the title score
                tf_idf_full = 0.4 * tf_idf_annotate + 0.6 * tf_idf_title
            else:
                tf_idf = 0

            if tf_idf > 0:
                if keyword_result.get(keyword) is None:
                    keyword_result[keyword] = {}
                keyword_result[keyword][doc_num] = {
                    'tf_idf': tf_idf,
                    'tf_idf_full': tf_idf_full,
                    'tf_idf_title': tf_idf_title,
                    'tf_idf_annotate': tf_idf_annotate
                }
    self.keyword_result = keyword_result
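# Hedged toy check (not from the original source): the blend above weights
# the title score over the annotation score, tf_idf_full =
# 0.4 * tf_idf_annotate + 0.6 * tf_idf_title, where each component is
# (count / field_length) * log(N / document_frequency).  All numbers below
# are made up for illustration.
import math

doc_len = 10                                   # N: total documents
title_word_count, title_list_len = 2, 8        # keyword twice in an 8-token title
annotate_word_count, article_list_len = 3, 50  # three times in a 50-token annotation
title_doc_freq, annotate_doc_freq = 4, 6       # docs whose title / annotation contain it

tf_idf_title = (title_word_count / title_list_len) * math.log(doc_len / title_doc_freq)
tf_idf_annotate = (annotate_word_count / article_list_len) * math.log(doc_len / annotate_doc_freq)
tf_idf_full = 0.4 * tf_idf_annotate + 0.6 * tf_idf_title
print(tf_idf_title, tf_idf_annotate, tf_idf_full)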
    'PonyCar': 'PonyCar马上用车',
    '小桔租车': '小桔租车官方微博'
}

primary_weixin_accounts = {
    'EVCARD': 'EVCARD服务号',
    'GoFun': 'GoFun出行',
    '盼达用车': '盼达用车',
    'car2go': '即行car2go',
    '途歌': 'TOGO途歌',
    '摩范出行': '摩范出行',
    'PonyCar': 'PONYCAR马上用车',
    '小桔租车': '小桔租车平台'
}

# Attach Weibo account metadata to each article
weibo_articles = utils.get_articles(weibo_filepath)
weibo_articles.sort(key=lambda article: article['id'])
iter_weibo_account = None
for article in weibo_articles:
    # When the id changes, refresh the account record used for matching
    if not (iter_weibo_account and iter_weibo_account['id'] == article['id']):
        iter_weibo_account = weibo_collection.find_one(
            {'id': article['id']},
            {'_id': 0, 'id': 1, 'brand': 1, 'is_primary': 1,
             'is_regional': 1, 'region': 1}
        )
    article['brand'] = brands.get(iter_weibo_account['brand'])
    # '是' / '否' = yes / no
    article['is_primary'] = '是' if iter_weibo_account['is_primary'] == 1 else '否'
    article['region'] = iter_weibo_account.get('region', '')
from openpyxl import Workbook

def match_brand_and_region(item):
    for a in regional_accounts:
        if item['id'] == a['id']:
            item['brand'] = a['brand']
            item['region'] = a['region']
            break
    return item

# ———————————— Processing ————————————
# Create the workbook
wb = Workbook()
ws = wb.active
ws.title = '微信主号'  # "WeChat primary accounts"

articles = utils.get_articles(article_filename)
last_articles = utils.get_articles(last_article_filename)

# Service-account data overview
# Columns: account name, post count, MoM post growth, read count, MoM read growth
ws.append(['公众号名称', '发文数', '发文数环比增长', '阅读数', '阅读数环比增长'])
for account in type2_accounts:
    publish_summary = summarize_publish_by_account(account['id'], articles)
    last_publish_summary = summarize_publish_by_account(account['id'], last_articles)
    article_count_relative = utils.calc_relative_ratio_1(
        last_publish_summary['article_count'], publish_summary['article_count'])
    read_sum_relative = utils.calc_relative_ratio_1(
        last_publish_summary['read_sum'], publish_summary['read_sum'])
    ws.append([account['name'], publish_summary['article_count'], article_count_relative,
               publish_summary['read_sum'], read_sum_relative])

# Subscription-account data overview
from openpyxl import Workbook

def cleanup_mention(mentions):
    mentions = list(filter(filter_offical_mention, mentions))
    mentions = list(filter(filter_irrelevant_mention, mentions))
    return mentions

# ———————————— Processing ————————————
# Create the workbook
wb = Workbook()
ws = wb.active
ws.title = '微博主号'  # "Weibo primary accounts"

# Read this month's and last month's Weibo scrape tables and keep only
# posts from the primary accounts
primary_articles = list(
    filter(primary_filter, utils.get_articles(article_filename)))
primary_summary = summarize_primary(primary_articles)
last_primary_articles = list(
    filter(primary_filter, utils.get_articles(last_article_filename)))
last_primary_summary = summarize_primary(last_primary_articles)

# Post and interaction counts for this month and last month, plus the
# month-over-month growth of each
ws.append([
    '', label + '发文数', label + '互动数', last_label + '发文数',
    last_label + '互动数', '发文数环比', '互动数环比'
])
for i, s in enumerate(primary_summary):
    last = last_primary_summary[i]
    ws.append([s['brand_name'], s['article_count'], s['interact_sum'],
               last['article_count'], last['interact_sum'],
               utils.calc_relative_ratio_1(last['article_count'], s['article_count']),
               utils.calc_relative_ratio_1(last['interact_sum'], s['interact_sum'])])
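# Hedged sketch (an assumption, not the project's utils module):
# calc_relative_ratio_1(previous, current) is called in both workbook scripts
# above with last month's figure first and this month's second, and its result
# is written out as a month-over-month growth column.  A minimal stand-in
# consistent with that call order might look like this; the real rounding,
# zero handling, and percent formatting are not shown in the source.
def calc_relative_ratio_1(previous, current):
    if not previous:
        return 'N/A'  # growth is undefined when last month's figure is zero
    return '{:.1%}'.format((current - previous) / previous)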
    # (tail of get_w2vfeature; the start of its definition is truncated above)
    print(vect_dist)
    return vect_dist


def get_w2vfeatures_list(article_list):
    w2vfeatures = []
    for i, article in enumerate(article_list):
        print(i)  # progress indicator
        w2vfeatures.append(get_w2vfeature(article))
    w2vfeatures = np.array(w2vfeatures).T
    return w2vfeatures


train_articles = utils.get_articles(utils.TRAIN)
train_labels = utils.get_labels(utils.TRAIN_LABELS)
train_w2vfeatures = get_w2vfeatures_list(train_articles)
np.savetxt("Features/google_word2vec_train.txt", train_w2vfeatures)
# train_w2vfeatures = np.loadtxt("Features/google_word2vec_train.txt")

# dev_articles = utils.get_articles(utils.DEV)
# dev_labels = utils.get_labels(utils.DEV_LABELS)
# dev_features = get_w2vfeatures_list(dev_articles)
# np.savetxt("Features/google_word2vec_dev.txt", dev_features)

# train_w2vfeatures = np.array(train_w2vfeatures).reshape(1, 1000)
plt.figure()
plt.scatter(range(len(train_w2vfeatures)), train_w2vfeatures)
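# Hedged reconstruction (an assumption, not the author's code): the fragment
# at the top of this snippet only shows get_w2vfeature printing and returning
# a scalar `vect_dist`.  One plausible shape, given the Google word2vec file
# names above, averages pretrained vectors with gensim; the model path and
# the centroid-norm feature are both hypothetical.
import numpy as np
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)  # path is hypothetical

def get_w2vfeature(article):
    vecs = [w2v[tok] for tok in article.split() if tok in w2v]
    if not vecs:
        return 0.0
    vect_dist = float(np.linalg.norm(np.mean(vecs, axis=0)))  # centroid norm
    print(vect_dist)
    return vect_dist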