Code Example #1
def title_cluster(df, save_df=False):
    """按新闻标题聚类"""
    df_title = df.copy()
    df_title = title_preprocess(df_title)
    word_library_list = counter.get_word_library(df_title['title_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(df_title['title_cut'])
    # vocabulary size minus half the number of words that appear only once
    max_features = len(word_library_list) - len(single_frequency_words_list) // 2
    title_matrix = modeling.feature_extraction(df_title['title_'], vectorizer='CountVectorizer',
                                               vec_args={'max_df': 1.0, 'min_df': 1, 'max_features': max_features})
    title_dbscan = modeling.get_cluster(title_matrix, cluster='DBSCAN',
                                        cluster_args={'eps': 0.4, 'min_samples': 4, 'metric': 'cosine'})
    title_labels = modeling.get_labels(title_dbscan)
    df_title['title_label'] = title_labels
    df_non_outliers = modeling.get_non_outliers_data(df_title, label_column='title_label')
    title_label_num = counter.get_num_of_value_no_repeat(df_non_outliers['title_label'].tolist())
    print('Clustered by news title: %d clusters in total (excluding outliers)' % title_label_num)
    title_rank = modeling.label2rank(title_labels)
    df_title['title_rank'] = title_rank
    for i in range(1, title_label_num + 1):
        df_ = df_title[df_title['title_rank'] == i]
        title_top_list = counter.get_most_common_words(df_['title_cut'], top_n=10)
        print(title_top_list)
    if save_df:
        df_title.drop(['content', 'title_', 'title_label'], axis=1, inplace=True)
        news_crawler.save_news(df_title, os.path.join(results_path, 'df_title_rank.csv'))
    return df_title
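The `modeling.feature_extraction` and `modeling.get_cluster` helpers wrap the vectorization and clustering steps but are not shown in these examples. Below is a minimal sketch of what the two calls above map to, assuming `vec_args` and `cluster_args` are forwarded unchanged to scikit-learn's `CountVectorizer` and `DBSCAN` (the function names below are illustrative, not the project's actual code):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import DBSCAN

def feature_extraction_sketch(texts, max_features):
    # hypothetical stand-in for modeling.feature_extraction(..., vectorizer='CountVectorizer', ...)
    vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=max_features)
    return vectorizer.fit_transform(texts)  # sparse document-term count matrix

def get_cluster_sketch(matrix):
    # hypothetical stand-in for modeling.get_cluster(..., cluster='DBSCAN', ...)
    dbscan = DBSCAN(eps=0.4, min_samples=4, metric='cosine')
    dbscan.fit(matrix)
    return dbscan  # dbscan.labels_ holds cluster ids; -1 marks outliers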
Code Example #2
def content_cluster(df, df_save=False):
    """按新闻内容聚类"""
    df_content = df.copy()
    df_content = content_preprocess(df_content)
    word_library_list = counter.get_word_library(df_content['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(df_content['content_cut'])
    max_features = len(word_library_list) - len(single_frequency_words_list) // 2
    content_matrix = modeling.feature_extraction(df_content['content_'], vectorizer='CountVectorizer',
                                                 vec_args={'max_df': 0.95, 'min_df': 1, 'max_features': max_features})
    content_dbscan = modeling.get_cluster(content_matrix, cluster='DBSCAN',
                                          cluster_args={'eps': 0.35, 'min_samples': 4, 'metric': 'cosine'})
    content_labels = modeling.get_labels(content_dbscan)
    df_content['content_label'] = content_labels
    df_non_outliers = modeling.get_non_outliers_data(df_content, label_column='content_label')
    content_label_num = counter.get_num_of_value_no_repeat(df_non_outliers['content_label'].tolist())
    print('Clustered by news content: %d clusters in total (excluding outliers)' % content_label_num)
    content_rank = modeling.label2rank(content_labels)
    df_content['content_rank'] = content_rank
    for i in range(1, content_label_num + 1):
        df_ = df_content[df_content['content_rank'] == i]
        content_top_list = counter.get_most_common_words(df_['content_cut'], top_n=15, min_frequency=1)
        print(content_top_list)
    if df_save:
        df_content.drop(['content_', 'content_label'], axis=1, inplace=True)
        news_crawler.save_news(df_content, os.path.join(results_path, 'df_content_rank.csv'))
    return df_content
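Two other helpers used in both functions, `get_non_outliers_data` and `label2rank`, are also project-internal. A plausible sketch, assuming outliers are the rows DBSCAN labelled -1 and that ranks simply order clusters by descending size (both are assumptions, not confirmed from the project's source):

from collections import Counter

def get_non_outliers_data_sketch(df, label_column):
    # drop rows DBSCAN marked as noise (label == -1)
    return df[df[label_column] != -1]

def label2rank_sketch(labels):
    # map each cluster label to a 1-based rank by descending cluster size; keep -1 for outliers
    counts = Counter(label for label in labels if label != -1)
    rank_of = {label: rank for rank, (label, _) in enumerate(counts.most_common(), start=1)}
    return [rank_of.get(label, -1) for label in labels]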
Code Example #3
def cluster_word():
    n_clusters = Entry_N_Clusters.get()
    if n_clusters == '':
        messagebox.showinfo('Message', 'Please enter the number of clusters for word clustering!')
        return
    n_clusters = int(n_clusters)
    top_words_list = counter.get_most_common_words(df_rank_i['content_cut'], top_n=5000, min_frequency=1)
    model = news_pandas.load_element(os.path.join(models_path, 'word2vec_model.pkl'))
    word_list, wordvec_list = modeling.get_word_and_wordvec(model, top_words_list)
    kmeans = modeling.get_cluster(wordvec_list, cluster='KMeans',
                                  cluster_args={'n_clusters': n_clusters, 'random_state': 9})
    word_label = kmeans.labels_
    word_df = pd.DataFrame()
    word_df['word'] = word_list
    word_df['wordvec'] = wordvec_list
    word_df['word_label'] = word_label
    news_pandas.save_news(word_df, os.path.join(results_path, 'word_df.csv'))
    messagebox.showinfo('Message', 'Word clustering finished!')
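`modeling.get_word_and_wordvec` pairs each frequent word with its embedding from the saved word2vec model before KMeans is run on the vectors. A minimal sketch, assuming a gensim `Word2Vec` model and silently skipping words missing from its vocabulary:

def get_word_and_wordvec_sketch(model, words):
    # keep only words the model knows, and collect their vectors in the same order
    word_list = [w for w in words if w in model.wv]
    wordvec_list = [model.wv[w] for w in word_list]
    return word_list, wordvec_list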
Code Example #4
def show_hot_barh():
    try:
        df_non_outliers = news_pandas.load_news(
            os.path.join(results_path, 'news_non_outliers.csv'))
        df_non_outliers['content_cut'] = df_non_outliers['content_cut'].map(
            eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please cluster the news content text first!')
        return
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    value = [
        df_non_outliers[df_non_outliers['rank'] == i].shape[0]
        for i in range(1, rank_num + 1)
    ]
    yticks1 = [
        str(
            counter.get_most_common_words(
                df_non_outliers[df_non_outliers['rank'] == i]['content_cut'],
                top_n=10)) + str(i) for i in range(1, rank_num + 1)
    ]
    # yticks2 = [modeling.get_key_sentences('\n'.join(df_non_outliers[df_non_outliers['rank'] == i]['title_']),
    #                                       num=1) for i in range(1, rank_num + 1)]
    drawing.draw_clustering_analysis_barh(rank_num, value, yticks1,
                                          title='Hot news distribution bar chart')
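`drawing.draw_clustering_analysis_barh` belongs to the project's own drawing module. A bare-bones matplotlib sketch of a comparable horizontal bar chart with those arguments (the signature and axis labels are assumed for illustration):

import matplotlib.pyplot as plt

def draw_clustering_analysis_barh_sketch(rank_num, value, yticks, title):
    # one horizontal bar per cluster rank, labelled with that cluster's top words
    fig, ax = plt.subplots(figsize=(10, 0.5 * rank_num + 2))
    ax.barh(range(1, rank_num + 1), value)
    ax.set_yticks(range(1, rank_num + 1))
    ax.set_yticklabels(yticks)
    ax.set_xlabel('number of articles')
    ax.set_title(title)
    fig.tight_layout()
    plt.show()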
Code Example #5
File: main.py  Project: jingwangfei/HotNewsAnalysis
def get_key_words():
    df_title = news_crawler.load_news(
        os.path.join(results_path, 'df_title_rank.csv'))
    df_content = news_crawler.load_news(
        os.path.join(results_path, 'df_content_rank.csv'))
    df_title['title_cut'] = df_title['title_cut'].map(eval)
    df_content['content_cut'] = df_content['content_cut'].map(eval)
    df_title_content = df_title.copy()
    df_title_content['content_cut'] = df_content['content_cut']
    df_title_content['content_rank'] = df_content['content_rank']
    df_title_content = modeling.get_non_outliers_data(
        df_title_content, label_column='title_rank')
    title_rank_num = counter.get_num_of_value_no_repeat(
        (df_title_content['title_rank']))
    for i in range(1, title_rank_num + 1):
        df_i = df_title_content[df_title_content['title_rank'] == i]
        title = '\n'.join(df_i['title'].tolist())
        title = modeling.get_key_sentences(title, num=1)
        print('Hot topic:', title)
        content_rank = [k for k in df_i['content_rank']]
        content_rank = set(content_rank)
        for j in content_rank:
            df_j = df_i[df_i['content_rank'] == j]
            most_common_words = counter.get_most_common_words(
                df_j['content_cut'], top_n=20, min_frequency=5)
            if len(most_common_words) > 0:
                print('Related words:', most_common_words)
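`counter.get_most_common_words` appears throughout these examples with `top_n` and `min_frequency` parameters. A likely sketch built on `collections.Counter` (an assumption about the helper, not its actual code):

from collections import Counter
from itertools import chain

def get_most_common_words_sketch(cut_series, top_n=None, min_frequency=1):
    # flatten the per-article token lists, count words, keep the top_n at or above min_frequency
    counts = Counter(chain.from_iterable(cut_series))
    return [word for word, freq in counts.most_common(top_n) if freq >= min_frequency]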
Code Example #6
def show_hot_words_details():
    top_words_list = counter.get_most_common_words(df_rank_i['content_cut'],
                                                   top_n=5000,
                                                   min_frequency=1)
    top_words = '\n'.join(top_words_list)
    news_pandas.save_text(top_words, os.path.join(texts_path, 'top_words.txt'))
    os.system(editor + ' ' + os.path.join(texts_path, 'top_words.txt') + ' &')
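Building the shell command by string concatenation breaks on paths that contain spaces, and the trailing ' &' only works on Unix shells. A `subprocess`-based alternative (assuming `editor` holds the editor executable's name) sidesteps both issues and still returns without blocking the Tk main loop:

import subprocess

def open_in_editor(editor, path):
    # launch the external editor asynchronously; no shell parsing involved
    subprocess.Popen([editor, path])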
Code Example #7
def show_wordlib():
    try:
        document_segment = news_pandas.load_text(os.path.join(texts_path, 'document_segment.txt'))
    except FileNotFoundError:
        messagebox.showinfo('Message', 'No segmented text file found!')
        return
    words = document_segment.split()
    word_library = counter.get_most_common_words(words)
    word_library = [word for word in word_library if re.match(r'^[0-9A-Za-z\u4E00-\u9FFF]+$', word)]
    word_library = '\n'.join(word_library)
    news_pandas.save_text(word_library, os.path.join(texts_path, 'word_library.txt'))
    filename = os.path.join(texts_path, 'word_library.txt')
    editor(filename)
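The regular expression `^[0-9A-Za-z\u4E00-\u9FFF]+$` keeps only tokens made up entirely of digits, ASCII letters, and CJK ideographs, dropping punctuation and mixed symbols. A quick illustration (the token list is made up for the example):

import re

tokens = ['新闻', 'AI', '2024', '!!', '……', 'word_library']
kept = [w for w in tokens if re.match(r'^[0-9A-Za-z\u4E00-\u9FFF]+$', w)]
# kept == ['新闻', 'AI', '2024']  ('word_library' is rejected because of the underscore)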