Esempio n. 1
0
def show_word_cluster_result():
    """Draw the stored word-clustering result.

    Loads ``word_df.csv`` from ``results_path``, parses the serialized
    ``wordvec`` column back into Python objects, reduces the vectors with
    ``modeling.feature_reduction`` (``pca_n_components=3``,
    ``tsne_n_components=2``) and passes the reduced points together with
    the ``word_label`` column to ``drawing.draw_clustering_result``.
    """
    csv_path = os.path.join(results_path, 'word_df.csv')
    words = news_pandas.load_news(csv_path)

    # NOTE(review): eval() executes whatever text is stored in the CSV cell;
    # this is only safe if word_df.csv is always produced by this application.
    parsed_vectors = words['wordvec'].map(eval)
    words['wordvec'] = parsed_vectors

    vectors = words['wordvec'].tolist()
    labels = words['word_label'].tolist()
    reduced_points = modeling.feature_reduction(
        vectors,
        pca_n_components=3,
        tsne_n_components=2,
    )
    drawing.draw_clustering_result(reduced_points, labels)
def cluster_content():
    """Cluster the preprocessed news by content using DBSCAN.

    Reads the ``eps`` and ``min_samples`` parameters from the
    ``Entry_Eps`` / ``Entry_MinSamples`` widgets, loads ``news_cut.csv``
    from ``temp_news_path``, builds a TF-IDF matrix from the ``content_``
    column and clusters it with DBSCAN (cosine metric).

    Side effects:
        * writes ``news_label.csv`` (all rows with ``label`` and ``rank``)
          to ``results_path``;
        * writes ``news_non_outliers.csv`` (rows whose label is not -1,
          plus their reduced ``pca_tsne`` coordinates) to ``results_path``;
        * sets ``hot_num`` to the number of distinct ranks among the
          non-outlier rows;
        * reports progress and problems through ``messagebox.showinfo``.

    Returns early (after showing a message) when a parameter is missing
    or not numeric, when the preprocessed file does not exist, or when
    every row is labelled -1.
    """
    eps_text = Entry_Eps.get()
    min_samples_text = Entry_MinSamples.get()
    if eps_text == '' or min_samples_text == '':
        messagebox.showinfo('Message', '请输全聚类参数!')
        return

    # Non-numeric (or whitespace-only) input would otherwise raise an
    # uncaught ValueError inside the GUI callback; tell the user instead.
    try:
        eps_var = float(eps_text)
        min_samples_var = int(min_samples_text)
    except ValueError:
        messagebox.showinfo('Message', '聚类参数必须为数字,请重新输入!')
        return

    # Only the file load is expected to raise FileNotFoundError, so keep
    # the guarded region limited to it.
    try:
        df = news_pandas.load_news(os.path.join(temp_news_path,
                                                'news_cut.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行预处理!')
        return

    # NOTE(review): eval() executes whatever text is stored in the CSV cell;
    # this is only safe if news_cut.csv is always produced by this application.
    df['content_cut'] = df['content_cut'].map(eval)
    df['content_'] = df['content_'].map(str)

    word_library_list = counter.get_word_library(df['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(
        df['content_cut'])
    # NOTE(review): `//` binds tighter than `-`, so this is
    # len(library) - (len(single_frequency) // 2). Left unchanged; confirm
    # that halving only the single-frequency count is the intent.
    max_features = len(
        word_library_list) - len(single_frequency_words_list) // 2

    matrix = modeling.feature_extraction(df['content_'],
                                         vectorizer='TfidfVectorizer',
                                         vec_args={
                                             'max_df': 0.95,
                                             'min_df': 1,
                                             'max_features': max_features,
                                         })
    dbscan = modeling.get_cluster(matrix,
                                  cluster='DBSCAN',
                                  cluster_args={
                                      'eps': eps_var,
                                      'min_samples': min_samples_var,
                                      'metric': 'cosine',
                                  })

    labels = modeling.get_labels(dbscan)
    df['label'] = labels
    ranks = modeling.label2rank(labels)
    df['rank'] = ranks
    # Persist the labelled frame before the (large) dense matrix column is
    # attached, so that column never reaches news_label.csv.
    news_pandas.save_news(df, os.path.join(results_path, 'news_label.csv'))

    df['matrix'] = matrix.toarray().tolist()
    # Rows labelled -1 are treated as outliers (presumably DBSCAN noise
    # points) and excluded from the hot-spot results.
    df_non_outliers = df[df['label'] != -1].copy()
    if df_non_outliers.shape[0] == 0:
        messagebox.showinfo('Message', '不能聚类出任何热点,请重新设置聚类参数!')
        return

    data_pca_tsne = modeling.feature_reduction(
        df_non_outliers['matrix'].tolist(),
        pca_n_components=3,
        tsne_n_components=2)
    df_non_outliers['pca_tsne'] = data_pca_tsne.tolist()
    # The dense vectors were only needed for the reduction above.
    del df_non_outliers['matrix']
    news_pandas.save_news(df_non_outliers,
                          os.path.join(results_path, 'news_non_outliers.csv'))

    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    hot_num.set(rank_num)
    messagebox.showinfo('Message', '按照新闻内容聚类完成!')