def show_word_pie():
    """Show the word clustering result as a pie chart."""
    try:
        word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please cluster the words first!')
        return
    word_df['wordvec'] = word_df['wordvec'].map(eval)
    n_clusters = counter.get_num_of_value_no_repeat(word_df['word_label'])
    # Size of each word cluster, labelled by its first five words.
    word_label_value = [word_df[word_df['word_label'] == i].shape[0] for i in range(n_clusters)]
    word_label_yticks = [word_df[word_df['word_label'] == i]['word'].head(5).tolist() for i in range(n_clusters)]
    drawing.draw_clustering_analysis_pie(n_clusters, word_label_value, word_label_yticks)


def show_hot_barh():
    """Show the distribution of hot-news clusters as a horizontal bar chart."""
    try:
        df_non_outliers = news_pandas.load_news(os.path.join(results_path, 'news_non_outliers.csv'))
        df_non_outliers['content_cut'] = df_non_outliers['content_cut'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please cluster the news content first!')
        return
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    # Number of news items in each hot cluster.
    value = [df_non_outliers[df_non_outliers['rank'] == i].shape[0] for i in range(1, rank_num + 1)]
    # Label each bar with the ten most common words of the cluster plus its rank.
    yticks1 = [
        str(counter.get_most_common_words(df_non_outliers[df_non_outliers['rank'] == i]['content_cut'],
                                          top_n=10)) + str(i)
        for i in range(1, rank_num + 1)
    ]
    # yticks2 = [modeling.get_key_sentences('\n'.join(df_non_outliers[df_non_outliers['rank'] == i]['title_']),
    #                                       num=1) for i in range(1, rank_num + 1)]
    drawing.draw_clustering_analysis_barh(rank_num, value, yticks1, title='Hot news distribution bar chart')
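

# `counter.get_most_common_words` is a project helper; a minimal sketch of the
# presumed idea, using only the standard library (the function name and
# signature here are illustrative, not the project's actual API):
def _most_common_words_sketch(content_cut_series, top_n=10):
    """Return the top_n most frequent tokens across a series of token lists."""
    from collections import Counter
    freq = Counter()
    for tokens in content_cut_series:
        freq.update(tokens)
    return [word for word, _ in freq.most_common(top_n)]

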
def show_word_cluster_result():
    """Visualize the word clustering result in two dimensions."""
    try:
        word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please cluster the words first!')
        return
    word_df['wordvec'] = word_df['wordvec'].map(eval)
    wordvec_list = word_df['wordvec'].tolist()
    word_label = word_df['word_label'].tolist()
    # Reduce word vectors to 2-D (PCA to 3 components, then t-SNE to 2) for plotting.
    word_pca_tsne = modeling.feature_reduction(wordvec_list, pca_n_components=3, tsne_n_components=2)
    drawing.draw_clustering_result(word_pca_tsne, word_label)
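

# `modeling.feature_reduction` is a project wrapper; a minimal sketch of the
# PCA-then-t-SNE reduction it appears to perform, assuming scikit-learn is
# available (the function name is illustrative):
def _feature_reduction_sketch(vectors, pca_n_components=3, tsne_n_components=2):
    """Reduce high-dimensional vectors with PCA, then embed them with t-SNE."""
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    data = np.asarray(vectors)
    data_pca = PCA(n_components=pca_n_components).fit_transform(data)
    return TSNE(n_components=tsne_n_components).fit_transform(data_pca)

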
def show_cluster_result():
    """Visualize the news clustering result in two dimensions."""
    try:
        df_non_outliers = news_pandas.load_news(os.path.join(results_path, 'news_non_outliers.csv'))
        df_non_outliers['pca_tsne'] = df_non_outliers['pca_tsne'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please cluster the news content first!')
        return
    drawing.draw_clustering_result(df_non_outliers['pca_tsne'], df_non_outliers['label'])


def cluster_content():
    """Cluster the news content with TF-IDF features and DBSCAN."""
    eps_var = Entry_Eps.get()
    min_samples_var = Entry_MinSamples.get()
    if eps_var == '' or min_samples_var == '':
        messagebox.showinfo('Message', 'Please enter all clustering parameters!')
        return
    eps_var = float(eps_var)
    min_samples_var = int(min_samples_var)
    try:
        df = news_pandas.load_news(os.path.join(temp_news_path, 'news_cut.csv'))
        df['content_cut'] = df['content_cut'].map(eval)
        df['content_'] = df['content_'].map(str)
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please preprocess the news content first!')
        return
    word_library_list = counter.get_word_library(df['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(df['content_cut'])
    # Vocabulary size minus half of the words that occur only once
    # ('//' binds tighter than '-', so only the single-frequency count is halved).
    max_features = len(word_library_list) - len(single_frequency_words_list) // 2
    matrix = modeling.feature_extraction(df['content_'],
                                         vectorizer='TfidfVectorizer',
                                         vec_args={'max_df': 0.95, 'min_df': 1, 'max_features': max_features})
    dbscan = modeling.get_cluster(matrix,
                                  cluster='DBSCAN',
                                  cluster_args={'eps': eps_var, 'min_samples': min_samples_var, 'metric': 'cosine'})
    labels = modeling.get_labels(dbscan)
    df['label'] = labels
    ranks = modeling.label2rank(labels)
    df['rank'] = ranks
    news_pandas.save_news(df, os.path.join(results_path, 'news_label.csv'))
    df['matrix'] = matrix.toarray().tolist()
    # DBSCAN marks outliers with label -1; keep only the clustered news.
    df_non_outliers = df[df['label'] != -1].copy()
    if df_non_outliers.shape[0] == 0:
        messagebox.showinfo('Message', 'No hot topics were found; please adjust the clustering parameters!')
        return
    data_pca_tsne = modeling.feature_reduction(df_non_outliers['matrix'].tolist(),
                                               pca_n_components=3,
                                               tsne_n_components=2)
    df_non_outliers['pca_tsne'] = data_pca_tsne.tolist()
    del df_non_outliers['matrix']
    news_pandas.save_news(df_non_outliers, os.path.join(results_path, 'news_non_outliers.csv'))
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    hot_num.set(rank_num)
    messagebox.showinfo('Message', 'Clustering by news content finished!')
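

# `modeling.feature_extraction` and `modeling.get_cluster` are project wrappers;
# a minimal sketch of the TF-IDF + DBSCAN pipeline they appear to wrap, assuming
# scikit-learn (the function name is illustrative):
def _tfidf_dbscan_sketch(texts, eps=0.5, min_samples=5, max_features=None):
    """Vectorize texts with TF-IDF and cluster them with cosine-metric DBSCAN."""
    from sklearn.cluster import DBSCAN
    from sklearn.feature_extraction.text import TfidfVectorizer
    matrix = TfidfVectorizer(max_df=0.95, min_df=1, max_features=max_features).fit_transform(texts)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(matrix)
    return matrix, dbscan.labels_  # label -1 marks outliers

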
def cut_content():
    """Segment the news content into words."""
    try:
        news_df = news_pandas.load_news(os.path.join(news_path, 'news_df.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', 'No news content selected!')
        return
    document = '\n'.join([str(content) for content in news_df['content']])
    preprocessing.document2txt(document, userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt'),
                               text_path=os.path.join(texts_path, 'document_segment.txt'))
    messagebox.showinfo('Message', 'Word segmentation of the news content finished!')
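

# `preprocessing.document2txt` is a project helper; a minimal sketch of what it
# presumably does, assuming jieba is used for Chinese word segmentation (the
# function name is illustrative):
def _document2txt_sketch(document, userdict_path, text_path):
    """Segment a document with jieba and write space-joined tokens to a file."""
    import jieba
    jieba.load_userdict(userdict_path)  # custom user dictionary
    with open(text_path, 'w', encoding='utf-8') as f:
        for line in document.split('\n'):
            f.write(' '.join(jieba.cut(line)) + '\n')

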
def select_news():
    """Select a news CSV file and filter it for analysis."""
    filename = filedialog.askopenfilename(filetypes=[("csv file", "*.csv")])
    if not filename:
        return
    news_df = news_pandas.load_news(filename)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)


def load_data():
    """Load the saved news data."""
    # sina_news_df = news_crawler.load_news(os.path.join(news_path, 'sample_sina_latest_news.csv'))
    # sohu_news_df = news_crawler.load_news(os.path.join(news_path, 'sample_sohu_latest_news.csv'))
    # xinhuanet_news_df = news_crawler.load_news(os.path.join(news_path, 'sample_xinhuanet_latest_news.csv'))
    # sina_news_df = news_crawler.load_news(os.path.join(news_path, 'sina_latest_news.csv'))
    # sohu_news_df = news_crawler.load_news(os.path.join(news_path, 'sohu_latest_news.csv'))
    # xinhuanet_news_df = news_crawler.load_news(os.path.join(news_path, 'xinhuanet_latest_news.csv'))
    # news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
    save_file_path = os.path.join(news_path, 'news_df.csv')
    news_df = news_pandas.load_news(save_file_path)
    return news_df


def crawler():
    """Crawl the latest news from Sina, Sohu, and XinhuaNet."""
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    # Empty or non-positive inputs count as zero.
    sina_top_n = max(0, int(sina_top_n)) if sina_top_n else 0
    sohu_top_n = max(0, int(sohu_top_n)) if sohu_top_n else 0
    xinhuanet_top_n = max(0, int(xinhuanet_top_n)) if xinhuanet_top_n else 0
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        messagebox.showinfo('Message', 'The news counts cannot all be non-positive!')
        return
    news_crawler.threaded_crawler(sina_top_n, sohu_top_n, xinhuanet_top_n)
    sina_news_df = pd.DataFrame()
    sohu_news_df = pd.DataFrame()
    xinhuanet_news_df = pd.DataFrame()
    if sina_top_n > 0:
        sina_news_df = news_pandas.load_news(os.path.join(news_path, 'sina_latest_news.csv'))
    if sohu_top_n > 0:
        sohu_news_df = news_pandas.load_news(os.path.join(news_path, 'sohu_latest_news.csv'))
    if xinhuanet_top_n > 0:
        xinhuanet_news_df = news_pandas.load_news(os.path.join(news_path, 'xinhuanet_latest_news.csv'))
    news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df], ignore_index=True)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    messagebox.showinfo('Message', 'Crawling of the latest news finished! {} valid news items in total!'.format(news_num))
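

# `news_crawler.threaded_crawler` runs the three site crawlers concurrently; a
# minimal sketch of that pattern with the standard library (the job list is an
# illustrative parameter of (crawl_function, top_n) pairs, not the project's API):
def _threaded_crawler_sketch(crawl_jobs):
    """Run one crawler thread per (crawl_fn, top_n) job and wait for all of them."""
    import threading
    threads = [threading.Thread(target=fn, args=(n,)) for fn, n in crawl_jobs if n > 0]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

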
def show_details():
    """Show the details of one hot-news cluster in a new window."""
    top_num = Entry_TopHot.get()
    if top_num == '':
        messagebox.showinfo('Message', 'Please enter which hot cluster to view!')
        return
    top_num = int(top_num)
    try:
        df_non_outliers = news_pandas.load_news(os.path.join(results_path, 'news_non_outliers.csv'))
        df_non_outliers['content_cut'] = df_non_outliers['content_cut'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', 'Please cluster the news content first!')
        return
    global df_rank_i
    df_rank_i = df_non_outliers[df_non_outliers['rank'] == top_num]
    # Use the most representative title as the topic of this cluster.
    all_title = '\n'.join(df_rank_i['title_'].tolist())
    hot_title = modeling.get_key_sentences(all_title, num=1)

    detail_tk = tk.Tk()
    detail_tk.option_add("*Font", "helvetica 12 bold")
    detail_tk.geometry("720x540+323+114")
    detail_tk.title("Details of hot cluster {}".format(top_num))

    Label_Title = tk.Label(detail_tk, text='Topic:')
    Label_Title.place(relx=0.2, rely=0.1, height=26, width=66)
    # Label_HotTitle = tk.Label(detail_tk, text=hot_title, font=('SimHei', 12, 'bold'), fg='red')
    Label_HotTitle = tk.Label(detail_tk, text=hot_title, font=('helvetica', 12, 'bold'), fg='red')
    Label_HotTitle.place(relx=0.25, rely=0.15)
    Button_HotWords = tk.Button(detail_tk, text='Related words of this hot topic', command=show_hot_words_details)
    Button_HotWords.place(relx=0.25, rely=0.25, height=26, width=140)
    Button_HotTitles = tk.Button(detail_tk, text='Popular titles of this hot topic', command=show_hot_titles)
    Button_HotTitles.place(relx=0.55, rely=0.25, height=26, width=140)

    Label_L_6 = tk.Label(detail_tk, text='Cluster hot words into')
    Label_L_6.place(relx=0.25, rely=0.4, height=18, width=90)
    n_to_cluster = tk.StringVar()
    Entry_N_Clusters = tk.Entry(detail_tk, textvariable=n_to_cluster)
    # n_to_cluster.set('15')
    Entry_N_Clusters.place(relx=0.37, rely=0.4, height=20, relwidth=0.07)
    Label_R_6 = tk.Label(detail_tk, text='classes')
    Label_R_6.place(relx=0.44, rely=0.4, height=18, width=50)

    def cluster_word():
        """Cluster the top words of this hot cluster with KMeans on word2vec vectors."""
        n_clusters = Entry_N_Clusters.get()
        if n_clusters == '':
            messagebox.showinfo('Message', 'Please enter the number of word clusters!')
            return
        n_clusters = int(n_clusters)
        top_words_list = counter.get_most_common_words(df_rank_i['content_cut'], top_n=5000, min_frequency=1)
        model = news_pandas.load_element(os.path.join(models_path, 'word2vec_model.pkl'))
        word_list, wordvec_list = modeling.get_word_and_wordvec(model, top_words_list)
        kmeans = modeling.get_cluster(wordvec_list, cluster='KMeans',
                                      cluster_args={'n_clusters': n_clusters, 'random_state': 9})
        word_label = kmeans.labels_
        word_df = pd.DataFrame()
        word_df['word'] = word_list
        word_df['wordvec'] = wordvec_list
        word_df['word_label'] = word_label
        news_pandas.save_news(word_df, os.path.join(results_path, 'word_df.csv'))
        messagebox.showinfo('Message', 'Word clustering finished!')

    Button_WordsCluster = tk.Button(detail_tk, text='Cluster words', command=cluster_word)
    Button_WordsCluster.place(relx=0.55, rely=0.4, height=26, width=80)

    Button_Show_Word_Cluster_Result = tk.Button(detail_tk, text='View word clustering result', command=show_word_cluster_result)
    Button_Show_Word_Cluster_Result.place(relx=0.38, rely=0.51, height=26, width=140)
    Button_Word_Barh = tk.Button(detail_tk, text='View word clustering bar chart', command=show_word_barh)
    Button_Word_Barh.place(relx=0.38, rely=0.61, height=26, width=154)
    Button_Word_Pie = tk.Button(detail_tk, text='View word clustering pie chart', command=show_word_pie)
    Button_Word_Pie.place(relx=0.38, rely=0.71, height=26, width=140)

    Label_L_7 = tk.Label(detail_tk, text='Class')
    Label_L_7.place(relx=0.3, rely=0.84, height=18, width=16)
    cluster_n = tk.StringVar()
    Entry_Cluster_N = tk.Entry(detail_tk, textvariable=cluster_n)
    # cluster_n.set('1')
    Entry_Cluster_N.place(relx=0.34, rely=0.84, height=20, relwidth=0.07)
    Label_R_7 = tk.Label(detail_tk, text='words')
    Label_R_7.place(relx=0.42, rely=0.84, height=18, width=50)

    def show_cluster_n_words():
        """Show all words that belong to the given word class."""
        n = Entry_Cluster_N.get()
        if n == '':
            messagebox.showinfo('Message', 'Please enter which word class to view!')
            return
        n = int(n)
        try:
            word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
        except FileNotFoundError:
            messagebox.showinfo('Message', 'Please cluster the words first!')
            return
        word_df['wordvec'] = word_df['wordvec'].map(eval)
        # Classes are shown to the user 1-based but stored 0-based.
        words_i_df = word_df[word_df['word_label'] == n - 1].copy()
        cluster_i_words = '\n'.join(words_i_df['word'].tolist())
        filename = os.path.join(texts_path, 'cluster_i_words.txt')
        news_pandas.save_text(cluster_i_words, filename)
        editor(filename)

    Button_Show_Cluster_N_Word = tk.Button(detail_tk, text='Query', command=show_cluster_n_words)
    Button_Show_Cluster_N_Word.place(relx=0.55, rely=0.84, height=26, width=50)

    detail_tk.mainloop()
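

# `modeling.get_word_and_wordvec` and the KMeans wrapper used in cluster_word are
# project code; a minimal sketch of clustering word2vec vectors with scikit-learn's
# KMeans, assuming `model` is a trained gensim word2vec model (the function name
# is illustrative):
def _word_kmeans_sketch(model, words, n_clusters):
    """Look up word vectors in a word2vec model and cluster them with KMeans."""
    from sklearn.cluster import KMeans
    in_vocab = [w for w in words if w in model.wv]  # skip out-of-vocabulary words
    vectors = [model.wv[w] for w in in_vocab]
    kmeans = KMeans(n_clusters=n_clusters, random_state=9).fit(vectors)
    return in_vocab, kmeans.labels_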