Example 1
def index():
    if not request.json or 'keywords' not in request.json:
        return jsonify({'error': 'keywords is required'}), 400
    keywords = request.json['keywords']

    response = get_articles(keywords)
    return jsonify(response), 201
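
The view expects a JSON body with a keywords field and answers with the matching articles. A minimal client call, assuming the function is registered as a POST route on a locally running Flask app (the URL, port, and payload below are illustrative, not from the original project):

import requests

resp = requests.post('http://localhost:5000/',
                     json={'keywords': 'electric cars'})
print(resp.status_code, resp.json())
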
Example 2
def index():
    trending = get_google_trendings()  # utils
    topic = ''
    if request.args:
        topic = request.args.get('query', '')
        articles = get_articles(topic)
        articles = articles_check(articles)
        summary = multidoc_summary(articles[:3])
    else:
        articles = []
        summary = []
    return render_template('index.html',
                           trends=trending,
                           articles=articles,
                           summary=summary,
                           topic=topic)
Example 3
 def calculate_tf_idf(self):
     keyword_result = dict()
     self.articles = get_articles('issue.xml')
     keyword_set, annotate_map, title_map = self.get_keywords(self.articles)
     doc_len = len(self.articles)
     for doc_num, article in enumerate(self.articles):
         article_list_len = len(annotate_map[doc_num])
         title_list_len = len(title_map[doc_num])
         sum_article_len = article_list_len + title_list_len
         for keyword in keyword_set:
             annotate_word_count = annotate_map[doc_num].count(keyword)
             title_word_count = title_map[doc_num].count(keyword)
             # term frequency (tf) in the title, the annotation, and both combined
             tf_idf_title = title_word_count / title_list_len
             tf_idf_annotate = annotate_word_count / article_list_len
             tf_idf = (title_word_count +
                       annotate_word_count) / sum_article_len
             # scale each tf by idf = log(N / document frequency)
             if self.word_map_title.get(keyword, None) is not None:
                 tf_idf_title *= math.log(doc_len /
                                          len(self.word_map_title[keyword]))
             else:
                 tf_idf_title = 0
             if self.word_map_annotate.get(keyword, None) is not None:
                 tf_idf_annotate *= math.log(
                     doc_len / len(self.word_map_annotate[keyword]))
             else:
                 tf_idf_annotate = 0
             if self.word_map.get(keyword, None) is not None:
                 tf_idf *= math.log(doc_len / len(self.word_map[keyword]))
                 tf_idf_full = 0.4 * tf_idf_annotate + 0.6 * tf_idf_title
             else:
                 tf_idf = 0
             if tf_idf > 0:
                 if keyword_result.get(keyword, None) is None:
                     keyword_result[keyword] = {}
                 keyword_result[keyword][doc_num] = {
                     'tf_idf': tf_idf,
                     'tf_idf_full': tf_idf_full,
                     'tf_idf_title': tf_idf_title,
                     'tf_idf_annotate': tf_idf_annotate
                 }
     self.keyword_result = keyword_result
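
The score built above is standard TF-IDF: the keyword count in a field is divided by the field length (term frequency) and multiplied by log(N / df), where N is the number of articles and df is the number of articles whose title or annotation contains the keyword; tf_idf_full then blends the annotation and title scores with weights 0.4 and 0.6. A standalone sketch of the per-field score, using illustrative names rather than the original class attributes:

import math

def field_tf_idf(term_count, field_len, doc_count, docs_with_term):
    # term frequency: how often the keyword appears in this field
    tf = term_count / field_len
    # inverse document frequency: how rare the keyword is across all articles
    idf = math.log(doc_count / docs_with_term)
    return tf * idf

# blended score, weighted as in calculate_tf_idf above:
# tf_idf_full = 0.4 * annotation_score + 0.6 * title_score
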
Example 4
    'PonyCar': 'PonyCar马上用车',
    '小桔租车': '小桔租车官方微博'
}

primary_weixin_accounts = {
    'EVCARD': 'EVCARD服务号',
    'GoFun': 'GoFun出行',
    '盼达用车': '盼达用车',
    'car2go': '即行car2go',
    '途歌': 'TOGO途歌',
    '摩范出行': '摩范出行',
    'PonyCar': 'PONYCAR马上用车',
    '小桔租车': '小桔租车平台'
}
# Match Weibo articles to their account information
weibo_articles = utils.get_articles(weibo_filepath)
weibo_articles.sort(key=lambda article: article['id'])
iter_weibo_account = None
for article in weibo_articles:

    # When the id changes, refresh the account record used for matching
    if not (iter_weibo_account and iter_weibo_account['id'] == article['id']):
        iter_weibo_account = weibo_collection.find_one(
            {'id': article['id']},
            {'_id': 0, 'id': 1, 'brand': 1, 'is_primary': 1, 'is_regional': 1, 'region': 1}
        )

    article['brand'] = brands.get(iter_weibo_account['brand'])
    article['is_primary'] = '是' if iter_weibo_account['is_primary'] == 1 else '否'
    article['region'] = iter_weibo_account.get('region', '')
Example 5
def match_brand_and_region(item):
    for a in regional_accounts:
        if item['id'] == a['id']:
            item['brand'] = a['brand']
            item['region'] = a['region']
            break
    return item
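
match_brand_and_region scans regional_accounts linearly for every item; with a large account list, an index keyed by id avoids the repeated scans. A possible variant (regional_accounts and the field names come from the snippet above; the lookup dict is an assumption):

regional_index = {a['id']: a for a in regional_accounts}

def match_brand_and_region(item):
    account = regional_index.get(item['id'])
    if account is not None:
        item['brand'] = account['brand']
        item['region'] = account['region']
    return item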

# ———————————— Processing ————————————

# Create the workbook
wb = Workbook()
ws = wb.active
ws.title = '微信主号'

articles = utils.get_articles(article_filename)
last_articles = utils.get_articles(last_article_filename)


# Service account data overview
ws.append(['公众号名称', '发文数', '发文数环比增长', '阅读数', '阅读数环比增长'])
for account in type2_accounts:
    publish_summary = summarize_publish_by_account(account['id'], articles)
    last_publish_summary = summarize_publish_by_account(account['id'], last_articles)
    article_count_relative = utils.calc_relative_ratio_1(last_publish_summary['article_count'], publish_summary['article_count'])
    read_sum_relative = utils.calc_relative_ratio_1(last_publish_summary['read_sum'], publish_summary['read_sum'])
    ws.append([account['name'], publish_summary['article_count'], article_count_relative,
               publish_summary['read_sum'], read_sum_relative])
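
utils.calc_relative_ratio_1 is not shown in these snippets; judging from the "环比增长" (month-over-month growth) column headers, it takes last month's value and this month's value and returns the relative change. A hypothetical shape, purely for illustration:

def calc_relative_ratio_1(last_value, current_value):
    # Assumed behaviour: month-over-month change of current vs. last.
    # The real utils implementation may round or format differently.
    if not last_value:
        return None  # no baseline to compare against
    return (current_value - last_value) / last_value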


# Subscription account data overview
Example 6
def cleanup_mention(mentions):
    mentions = list(filter(filter_offical_mention, mentions))
    mentions = list(filter(filter_irrelevant_mention, mentions))
    return mentions


# ———————————— Processing ————————————

# Create the workbook
wb = Workbook()
ws = wb.active
ws.title = '微博主号'

# Read this month's and last month's Weibo data and keep only primary-account posts
primary_articles = list(
    filter(primary_filter, utils.get_articles(article_filename)))
primary_summary = summarize_primary(primary_articles)
last_primary_articles = list(
    filter(primary_filter, utils.get_articles(last_article_filename)))
last_primary_summary = summarize_primary(last_primary_articles)

# Post counts and interaction counts for this month and last month, plus month-over-month change
ws.append([
    '', label + '发文数', label + '互动数', last_label + '发文数', last_label + '互动数',
    '发文数环比', '互动数环比'
])
for i, s in enumerate(primary_summary):
    ws.append([
        s['brand_name'], s['article_count'], s['interact_sum'],
        last_primary_summary[i]['article_count'],
        last_primary_summary[i]['interact_sum'],
        utils.calc_relative_ratio_1(last_primary_summary[i]['article_count'],
                                    s['article_count']),
        utils.calc_relative_ratio_1(last_primary_summary[i]['interact_sum'],
                                    s['interact_sum'])
    ])
    print(vect_dist)
    return vect_dist


def get_w2vfeatures_list(article_list):
    w2vfeatures = []
    i = 0
    for article in article_list:
        print(i)
        i += 1
        w2vfeatures.append(get_w2vfeature(article))
    w2vfeatures = np.array(w2vfeatures).T
    return w2vfeatures


train_articles = utils.get_articles(utils.TRAIN)
train_labels = utils.get_labels(utils.TRAIN_LABELS)
train_w2vfeatures = get_w2vfeatures_list(train_articles)
np.savetxt("Features/google_word2vec_train.txt", train_features)
# train_w2vfeatures = np.loadtxt("Features/google_word2vec_train.txt")

# dev_articles = utils.get_articles(utils.DEV)
# dev_labels = utils.get_labels(utils.DEV_LABELS)
# dev_features = get_w2vfeatures_list(dev_articles)
# np.savetxt("Features/google_word2vec_dev.txt",dev_features)

# train_w2vfeatures = np.array(train_w2vfeatures).reshape(1,1000)

plt.figure()
plt.scatter(range(0, len(train_w2vfeatures)),
            train_w2vfeatures,