def show_hot_titles():
    """Write the key title sentences of the selected cluster to a file and open it.

    Joins every ``title_`` value of the module-level ``df_rank_i`` with
    newlines, asks ``modeling.get_key_sentences`` for up to 200 key
    sentences, saves the result as ``hot_titles.txt`` under ``texts_path``
    and passes that path to ``editor``.
    """
    output_path = os.path.join(texts_path, 'hot_titles.txt')
    joined_titles = '\n'.join(df_rank_i['title_'].tolist())
    key_titles = modeling.get_key_sentences(joined_titles, num=200)
    news_pandas.save_text(key_titles, output_path)
    editor(output_path)
Ejemplo n.º 2
0
def get_key_words():
    """Print the key title and the frequent content words of each title cluster.

    Loads ``df_title_rank.csv`` and ``df_content_rank.csv`` from
    ``results_path``, copies the content columns onto the title frame,
    filters the result with ``modeling.get_non_outliers_data`` (label column
    ``title_rank``) and then, for every title rank ``1..N``:

    * prints the single key sentence extracted from the cluster's titles;
    * for each distinct ``content_rank`` inside that cluster, prints the
      most common words (top 20, minimum frequency 5) when there are any.

    The function returns nothing; its output goes to stdout.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import ast

    df_title = news_crawler.load_news(
        os.path.join(results_path, 'df_title_rank.csv'))
    df_content = news_crawler.load_news(
        os.path.join(results_path, 'df_content_rank.csv'))

    # The *_cut columns are read back from CSV as text. literal_eval only
    # accepts Python literals, so a tampered CSV cannot execute code the way
    # a plain eval() would.
    # NOTE(review): assumes the columns hold list literals - confirm writer.
    df_title['title_cut'] = df_title['title_cut'].map(ast.literal_eval)
    df_content['content_cut'] = df_content['content_cut'].map(ast.literal_eval)

    df_title_content = df_title.copy()
    df_title_content['content_cut'] = df_content['content_cut']
    df_title_content['content_rank'] = df_content['content_rank']
    df_title_content = modeling.get_non_outliers_data(
        df_title_content, label_column='title_rank')

    title_rank_num = counter.get_num_of_value_no_repeat(
        df_title_content['title_rank'])
    # NOTE(review): the loop presumes title ranks are numbered 1..N - confirm.
    for i in range(1, title_rank_num + 1):
        df_i = df_title_content[df_title_content['title_rank'] == i]
        title = modeling.get_key_sentences(
            '\n'.join(df_i['title'].tolist()), num=1)
        print('热点:', title)
        # Visit each distinct content rank of this title cluster once.
        for j in set(df_i['content_rank']):
            df_j = df_i[df_i['content_rank'] == j]
            most_common_words = counter.get_most_common_words(
                df_j['content_cut'], top_n=20, min_frequency=5)
            if len(most_common_words) > 0:
                print('相关词汇:', most_common_words)
Ejemplo n.º 3
0
def show_details():
    """Open a detail window for the hot-topic cluster entered in ``Entry_TopHot``.

    Steps:

    1. Read the cluster number from ``Entry_TopHot``; if it is empty or not
       an integer, show a prompt and return.
    2. Load ``news_non_outliers.csv`` from ``results_path``; if the file is
       missing, show a prompt and return.
    3. Store the rows whose ``rank`` equals the cluster number in the
       module-level ``df_rank_i`` and extract one key sentence from their
       ``title_`` values.
    4. Build a Tk window with buttons for the cluster's hot words / hot
       titles, a word-clustering form, result viewers, and a lookup of the
       words in a given word cluster, then enter the window's main loop.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import ast

    global df_rank_i

    # int('') raises ValueError as well, so a single handler covers both an
    # empty entry and non-numeric text (which previously crashed the callback).
    try:
        top_num = int(Entry_TopHot.get())
    except ValueError:
        messagebox.showinfo('Message', '请输入想查看的热点属于第几簇!')
        return

    try:
        df_non_outliers = news_pandas.load_news(os.path.join(results_path, 'news_non_outliers.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
        return
    # content_cut comes back from CSV as text; literal_eval parses Python
    # literals only, so file contents cannot run arbitrary code like eval().
    # NOTE(review): assumes the column holds list literals - confirm writer.
    df_non_outliers['content_cut'] = df_non_outliers['content_cut'].map(ast.literal_eval)

    # Shared through a global because the button callbacks defined outside
    # this function (e.g. show_hot_titles) read df_rank_i.
    df_rank_i = df_non_outliers[df_non_outliers['rank'] == top_num]
    all_title = '\n'.join(df_rank_i['title_'].tolist())
    hot_title = modeling.get_key_sentences(all_title, num=1)

    detail_tk = tk.Tk()
    detail_tk.option_add("*Font", "helvetica 12 bold")
    detail_tk.geometry("720x540+323+114")
    detail_tk.title("第{}簇热点详情".format(top_num))

    # --- Topic headline and per-cluster viewers ---
    Label_Title = tk.Label(detail_tk, text='''话题:''')
    Label_Title.place(relx=0.2, rely=0.1, height=26, width=66)
    Label_HotTitle = tk.Label(detail_tk, text=hot_title, font=('helvetica', 12, 'bold'), fg='red')
    Label_HotTitle.place(relx=0.25, rely=0.15)
    Button_HotWords = tk.Button(detail_tk, text='''该处热点相关词汇''', command=show_hot_words_details)
    Button_HotWords.place(relx=0.25, rely=0.25, height=26, width=140)
    Button_HotTitles = tk.Button(detail_tk, text='''该处热点热门话题''', command=show_hot_titles)
    Button_HotTitles.place(relx=0.55, rely=0.25, height=26, width=140)

    # --- Word clustering form: "<label> [n] <label> [button]" ---
    Label_L_6 = tk.Label(detail_tk, text='''热点词汇分''')
    Label_L_6.place(relx=0.25, rely=0.4, height=18, width=90)
    n_to_cluster = tk.StringVar()
    Entry_N_Clusters = tk.Entry(detail_tk, textvariable=n_to_cluster)
    Entry_N_Clusters.place(relx=0.37, rely=0.4, height=20, relwidth=0.07)
    Label_R_6 = tk.Label(detail_tk, text='''类聚类''')
    Label_R_6.place(relx=0.44, rely=0.4, height=18, width=50)

    def cluster_word():
        """Cluster the selected cluster's common words with KMeans and save them.

        Reads the number of clusters from ``Entry_N_Clusters``, looks up word
        vectors in the pickled word2vec model under ``models_path`` and
        writes ``word``, ``wordvec`` and ``word_label`` columns to
        ``word_df.csv`` under ``results_path``.
        """
        # Empty and non-numeric input both land in the same prompt.
        try:
            n_clusters = int(Entry_N_Clusters.get())
        except ValueError:
            messagebox.showinfo('Message', '请输入词汇聚类的类别数!')
            return
        top_words_list = counter.get_most_common_words(df_rank_i['content_cut'], top_n=5000, min_frequency=1)
        model = news_pandas.load_element(os.path.join(models_path, 'word2vec_model.pkl'))
        word_list, wordvec_list = modeling.get_word_and_wordvec(model, top_words_list)
        kmeans = modeling.get_cluster(wordvec_list, cluster='KMeans', cluster_args={
            'n_clusters': n_clusters, 'random_state': 9})
        word_df = pd.DataFrame()
        word_df['word'] = word_list
        word_df['wordvec'] = wordvec_list
        word_df['word_label'] = kmeans.labels_
        news_pandas.save_news(word_df, os.path.join(results_path, 'word_df.csv'))
        messagebox.showinfo('Message', '词汇聚类完成!')

    Button_WordsCluster = tk.Button(detail_tk, text='''词汇聚类''', command=cluster_word)
    Button_WordsCluster.place(relx=0.55, rely=0.4, height=26, width=80)

    # --- Word clustering result viewers ---
    Button_Show_Word_Cluster_Result = tk.Button(detail_tk, text='''查看词汇聚类效果''', command=show_word_cluster_result)
    Button_Show_Word_Cluster_Result.place(relx=0.38, rely=0.51, height=26, width=140)
    Button_Word_Barh = tk.Button(detail_tk, text='''查看词汇聚类条形图''', command=show_word_barh)
    Button_Word_Barh.place(relx=0.38, rely=0.61, height=26, width=154)
    Button_Word_Pie = tk.Button(detail_tk, text='''查看词汇聚类饼图''', command=show_word_pie)
    Button_Word_Pie.place(relx=0.38, rely=0.71, height=26, width=140)

    # --- Lookup of the words in one word cluster ---
    Label_L_7 = tk.Label(detail_tk, text='''第''')
    Label_L_7.place(relx=0.3, rely=0.84, height=18, width=16)
    cluster_n = tk.StringVar()
    Entry_Cluster_N = tk.Entry(detail_tk, textvariable=cluster_n)
    Entry_Cluster_N.place(relx=0.34, rely=0.84, height=20, relwidth=0.07)
    Label_R_7 = tk.Label(detail_tk, text='''类词汇''')
    Label_R_7.place(relx=0.42, rely=0.84, height=18, width=50)

    def show_cluster_n_words():
        """Write the words of the requested word cluster to a file and open it.

        Reads the cluster number from ``Entry_Cluster_N``, selects the rows
        of ``word_df.csv`` whose ``word_label`` equals that number minus one,
        saves their words (one per line) to ``cluster_i_words.txt`` under
        ``texts_path`` and passes that path to ``editor``.
        """
        # Empty and non-numeric input both land in the same prompt.
        try:
            n = int(Entry_Cluster_N.get())
        except ValueError:
            messagebox.showinfo('Message', '请先输入想要查看的词汇属于第几类!')
            return
        try:
            word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
        except FileNotFoundError:
            # word_df.csv is written by cluster_word, so the prompt asks for
            # word clustering (the old text pointed at news clustering).
            messagebox.showinfo('Message', '请先对词汇进行聚类!')
            return
        # Only the word and word_label columns are needed here, so the
        # wordvec column is deliberately left unparsed.
        # The entry is treated as 1-based: it is compared against n - 1.
        words_i_df = word_df[word_df['word_label'] == n - 1]
        cluster_i_words = '\n'.join(words_i_df['word'].tolist())
        filename = os.path.join(texts_path, 'cluster_i_words.txt')
        news_pandas.save_text(cluster_i_words, filename)
        editor(filename)

    Button_Show_Cluster_N_Word = tk.Button(detail_tk, text='''查询''', command=show_cluster_n_words)
    Button_Show_Cluster_N_Word.place(relx=0.55, rely=0.84, height=26, width=50)

    detail_tk.mainloop()
Ejemplo n.º 4
0
 def f(text):
     """Clean ``text`` and return the single key sentence extracted from it.

     The text is first passed through ``preprocessing.clean_content``; the
     cleaned result is then given to ``modeling.get_key_sentences`` with
     ``num=1`` and whatever that call returns is returned unchanged.
     """
     cleaned = preprocessing.clean_content(text)
     return modeling.get_key_sentences(cleaned, num=1)