def title_cluster(df, save_df=False):
    """Cluster news items by title and rank the resulting clusters.

    A copy of ``df`` is passed through ``title_preprocess``; its ``title_``
    column is vectorized with a ``CountVectorizer`` and the matrix is
    clustered with DBSCAN (cosine metric, eps=0.4, min_samples=4). Raw
    cluster labels are written to ``title_label`` and the ranks returned by
    ``modeling.label2rank`` to ``title_rank``. The number of clusters and the
    most common title words of each ranked cluster are printed.

    :param df: pd.DataFrame of news; the input itself is not modified.
        Presumably ``title_preprocess`` supplies the ``title_cut`` and
        ``title_`` columns read here -- confirm against that helper.
    :param save_df: when true, drop the ``content``, ``title_`` and
        ``title_label`` columns and save the frame as ``df_title_rank.csv``
        under ``results_path``
    :return: the processed copy (with the columns dropped when ``save_df``)
    """
    news = title_preprocess(df.copy())
    tokens = news['title_cut']

    # Cap the vocabulary: all words minus half of the single-frequency words.
    vocabulary = counter.get_word_library(tokens)
    rare_words = counter.get_single_frequency_words(tokens)
    feature_cap = len(vocabulary) - len(rare_words) // 2

    vectorizer_options = {'max_df': 1.0, 'min_df': 1, 'max_features': feature_cap}
    dbscan_options = {'eps': 0.4, 'min_samples': 4, 'metric': 'cosine'}
    matrix = modeling.feature_extraction(
        news['title_'],
        vectorizer='CountVectorizer',
        vec_args=vectorizer_options,
    )
    model = modeling.get_cluster(matrix, cluster='DBSCAN', cluster_args=dbscan_options)
    labels = modeling.get_labels(model)
    news['title_label'] = labels

    # Count clusters on the outlier-free view only.
    clustered = modeling.get_non_outliers_data(news, label_column='title_label')
    cluster_count = counter.get_num_of_value_no_repeat(clustered['title_label'].tolist())
    print('按新闻标题聚类,一共有%d个簇(不包括离群点)' % cluster_count)

    news['title_rank'] = modeling.label2rank(labels)
    for rank in range(1, cluster_count + 1):
        members = news[news['title_rank'] == rank]
        print(counter.get_most_common_words(members['title_cut'], top_n=10))

    if save_df:
        news.drop(columns=['content', 'title_', 'title_label'], inplace=True)
        news_crawler.save_news(news, os.path.join(results_path, 'df_title_rank.csv'))
    return news
def show_word_pie():
    """Draw a pie chart of the word clusters stored in ``word_df.csv``.

    Loads ``word_df.csv`` from ``results_path``, then for every label in
    ``range(n_clusters)`` collects the cluster size and its first five words,
    and hands both lists to ``drawing.draw_clustering_analysis_pie``.
    """
    table = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
    # NOTE(review): eval() executes whatever text is stored in the CSV; this
    # is only safe while the file is produced by this application itself.
    table['wordvec'] = table['wordvec'].map(eval)
    n_clusters = counter.get_num_of_value_no_repeat(table['word_label'])

    # Labels are looked up as 0 .. n_clusters - 1.
    sizes = []
    sample_words = []
    for label in range(n_clusters):
        members = table[table['word_label'] == label]
        sizes.append(members.shape[0])
        sample_words.append(members['word'][:5].tolist())

    drawing.draw_clustering_analysis_pie(n_clusters, sizes, sample_words)
def content_cluster(df, df_save=False):
    """Cluster news items by content and rank the resulting clusters.

    A copy of ``df`` is passed through ``content_preprocess``; its
    ``content_`` column is vectorized with a ``CountVectorizer``
    (max_df=0.95) and the matrix is clustered with DBSCAN (cosine metric,
    eps=0.35, min_samples=4). Raw cluster labels are written to
    ``content_label`` and the ranks returned by ``modeling.label2rank`` to
    ``content_rank``. The number of clusters and the most common content
    words of each ranked cluster are printed.

    :param df: pd.DataFrame of news; the input itself is not modified.
        Presumably ``content_preprocess`` supplies the ``content_cut`` and
        ``content_`` columns read here -- confirm against that helper.
    :param df_save: when true, drop the ``content_`` and ``content_label``
        columns and save the frame as ``df_content_rank.csv`` under
        ``results_path``
    :return: the processed copy (with the columns dropped when ``df_save``)
    """
    news = content_preprocess(df.copy())
    tokens = news['content_cut']

    # Cap the vocabulary: all words minus half of the single-frequency words.
    vocabulary = counter.get_word_library(tokens)
    rare_words = counter.get_single_frequency_words(tokens)
    feature_cap = len(vocabulary) - len(rare_words) // 2

    vectorizer_options = {'max_df': 0.95, 'min_df': 1, 'max_features': feature_cap}
    dbscan_options = {'eps': 0.35, 'min_samples': 4, 'metric': 'cosine'}
    matrix = modeling.feature_extraction(
        news['content_'],
        vectorizer='CountVectorizer',
        vec_args=vectorizer_options,
    )
    model = modeling.get_cluster(matrix, cluster='DBSCAN', cluster_args=dbscan_options)
    labels = modeling.get_labels(model)
    news['content_label'] = labels

    # Count clusters on the outlier-free view only.
    clustered = modeling.get_non_outliers_data(news, label_column='content_label')
    cluster_count = counter.get_num_of_value_no_repeat(clustered['content_label'].tolist())
    print('按新闻内容聚类,一共有%d个簇(不包括离群点)' % cluster_count)

    news['content_rank'] = modeling.label2rank(labels)
    for rank in range(1, cluster_count + 1):
        members = news[news['content_rank'] == rank]
        print(counter.get_most_common_words(members['content_cut'], top_n=15, min_frequency=1))

    if df_save:
        news.drop(columns=['content_', 'content_label'], inplace=True)
        news_crawler.save_news(news, os.path.join(results_path, 'df_content_rank.csv'))
    return news
def show_hot_barh():
    """Draw a horizontal bar chart of the clustered news, one bar per rank.

    Loads ``news_non_outliers.csv`` from ``results_path``; if the file is
    missing, an info dialog is shown and nothing is drawn. For every rank
    from 1 to the number of distinct ranks, the bar value is the number of
    rows with that rank and the tick label is the string form of the rank's
    ten most common content words followed by the rank number.
    """
    csv_path = os.path.join(results_path, 'news_non_outliers.csv')
    try:
        hot_news = news_pandas.load_news(csv_path)
        # NOTE(review): eval() executes whatever text is stored in the CSV;
        # only safe while the file is produced by this application itself.
        hot_news['content_cut'] = hot_news['content_cut'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
        return

    rank_num = counter.get_num_of_value_no_repeat(hot_news['rank'])

    sizes = []
    tick_labels = []
    for rank in range(1, rank_num + 1):
        members = hot_news[hot_news['rank'] == rank]
        sizes.append(members.shape[0])
        top_words = counter.get_most_common_words(members['content_cut'], top_n=10)
        tick_labels.append(str(top_words) + str(rank))

    # Disabled alternative labelling (one key sentence per rank):
    # modeling.get_key_sentences('\n'.join(members['title_']), num=1)
    drawing.draw_clustering_analysis_barh(rank_num, sizes, tick_labels, title='热点新闻分布饼图')
def get_key_words():
    """Print a headline and related words for every title cluster.

    Loads ``df_title_rank.csv`` and ``df_content_rank.csv`` from
    ``results_path``, attaches the content tokens and content ranks to the
    title frame, and removes title outliers. For each title rank the key
    sentence extracted from the joined titles is printed; then, for every
    content rank that occurs inside that title cluster, the most common
    content words (top 20, min_frequency=5) are printed when there are any.
    """
    titles = news_crawler.load_news(os.path.join(results_path, 'df_title_rank.csv'))
    contents = news_crawler.load_news(os.path.join(results_path, 'df_content_rank.csv'))
    # NOTE(review): eval() executes whatever text is stored in the CSVs;
    # only safe while the files are produced by this application itself.
    titles['title_cut'] = titles['title_cut'].map(eval)
    contents['content_cut'] = contents['content_cut'].map(eval)

    # Column assignment aligns the two frames on their index.
    merged = titles.copy()
    for column in ('content_cut', 'content_rank'):
        merged[column] = contents[column]
    merged = modeling.get_non_outliers_data(merged, label_column='title_rank')

    n_title_ranks = counter.get_num_of_value_no_repeat(merged['title_rank'])
    for title_rank in range(1, n_title_ranks + 1):
        group = merged[merged['title_rank'] == title_rank]
        joined_titles = '\n'.join(group['title'].tolist())
        headline = modeling.get_key_sentences(joined_titles, num=1)
        print('热点:', headline)

        for content_rank in set(group['content_rank']):
            subgroup = group[group['content_rank'] == content_rank]
            related_words = counter.get_most_common_words(
                subgroup['content_cut'], top_n=20, min_frequency=5)
            if len(related_words) > 0:
                print('相关词汇:', related_words)
def cluster_content():
    """Cluster the preprocessed news by content using the GUI parameters.

    Reads ``eps`` and ``min_samples`` from the ``Entry_Eps`` and
    ``Entry_MinSamples`` widgets, loads ``news_cut.csv`` from
    ``temp_news_path``, vectorizes the ``content_`` column with a
    ``TfidfVectorizer`` and clusters the matrix with DBSCAN (cosine metric).

    Side effects:
      * saves the labelled and ranked frame as ``news_label.csv`` under
        ``results_path``;
      * saves the non-outlier rows (label != -1), with a ``pca_tsne`` column
        from ``modeling.feature_reduction`` (pca_n_components=3,
        tsne_n_components=2), as ``news_non_outliers.csv``;
      * stores the number of distinct ranks in ``hot_num``;
      * reports progress and problems through info dialogs.

    The function returns early, after showing a dialog, when a parameter is
    empty or not numeric, when ``news_cut.csv`` does not exist, or when every
    row is an outlier.
    """
    eps_text = Entry_Eps.get()
    min_samples_text = Entry_MinSamples.get()
    if eps_text == '' or min_samples_text == '':
        messagebox.showinfo('Message', '请输全聚类参数!')
        return
    # Non-numeric input used to raise an uncaught ValueError inside the GUI
    # callback; report it to the user instead.
    try:
        eps_var = float(eps_text)
        min_samples_var = int(min_samples_text)
    except ValueError:
        messagebox.showinfo('Message', '聚类参数格式不正确,请重新输入!')
        return

    try:
        df = news_pandas.load_news(os.path.join(temp_news_path, 'news_cut.csv'))
        # NOTE(review): eval() executes whatever text is stored in the CSV;
        # only safe while the file is produced by this application itself.
        df['content_cut'] = df['content_cut'].map(eval)
        df['content_'] = df['content_'].map(str)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行预处理!')
        return

    # Cap the vocabulary: all words minus half of the single-frequency words.
    word_library_list = counter.get_word_library(df['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(df['content_cut'])
    max_features = len(word_library_list) - len(single_frequency_words_list) // 2

    matrix = modeling.feature_extraction(
        df['content_'],
        vectorizer='TfidfVectorizer',
        vec_args={'max_df': 0.95, 'min_df': 1, 'max_features': max_features},
    )
    dbscan = modeling.get_cluster(
        matrix,
        cluster='DBSCAN',
        cluster_args={'eps': eps_var, 'min_samples': min_samples_var, 'metric': 'cosine'},
    )
    labels = modeling.get_labels(dbscan)
    df['label'] = labels
    df['rank'] = modeling.label2rank(labels)
    news_pandas.save_news(df, os.path.join(results_path, 'news_label.csv'))

    # The dense vectors are attached only after news_label.csv is written,
    # so they never end up in that file.
    df['matrix'] = matrix.toarray().tolist()
    df_non_outliers = df[df['label'] != -1].copy()
    if df_non_outliers.shape[0] == 0:
        messagebox.showinfo('Message', '不能聚类出任何热点,请重新设置聚类参数!')
        return

    data_pca_tsne = modeling.feature_reduction(
        df_non_outliers['matrix'].tolist(),
        pca_n_components=3,
        tsne_n_components=2,
    )
    df_non_outliers['pca_tsne'] = data_pca_tsne.tolist()
    # The raw vectors are dropped before saving; only the reduced
    # coordinates are kept.
    del df_non_outliers['matrix']
    news_pandas.save_news(df_non_outliers, os.path.join(results_path, 'news_non_outliers.csv'))

    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    hot_num.set(rank_num)
    messagebox.showinfo('Message', '按照新闻内容聚类完成!')
def get_wordcloud(df, rank_column, text_list_column):
    """
    Generate one word-cloud image per cluster.

    Images are written to ``<results_path>/<rank_column>/<rank>.png`` for
    every rank from 1 to the number of distinct non-outlier ranks, using the
    ``yw.ttf`` font from ``fonts_path``. The output directory is created if
    it does not exist yet; nothing is created when there are no clusters.

    :param df: pd.DataFrame holding a rank column and a column of tokenized
        text lists
    :param rank_column: name of the rank column (also used as the name of
        the output sub-directory)
    :param text_list_column: name of the tokenized text list column
    """
    df_non_outliers = modeling.get_non_outliers_data(df, label_column=rank_column)
    label_num = counter.get_num_of_value_no_repeat(df_non_outliers[rank_column].tolist())
    if label_num < 1:
        return

    # Loop-invariant paths; make sure the target directory exists so that
    # writing the first image does not fail on a fresh results folder.
    output_dir = os.path.join(results_path, rank_column)
    os.makedirs(output_dir, exist_ok=True)
    font_path = os.path.join(fonts_path, 'yw.ttf')

    for i in range(1, label_num + 1):
        df_ = df[df[rank_column] == i]
        # Flatten the per-row token lists into one list for the whole cluster.
        list_ = counter.flat(df_[text_list_column].tolist())
        modeling.list2wordcloud(
            list_,
            save_path=os.path.join(output_dir, '%d.png' % i),
            font_path=font_path,
        )