Code Example #1
def crawler():
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    sina_top_n = 0 if sina_top_n == '' else int(sina_top_n)
    sohu_top_n = 0 if sohu_top_n == '' else int(sohu_top_n)
    xinhuanet_top_n = 0 if xinhuanet_top_n == '' else int(xinhuanet_top_n)
    sina_top_n = 0 if sina_top_n <= 0 else sina_top_n
    sohu_top_n = 0 if sohu_top_n <= 0 else sohu_top_n
    xinhuanet_top_n = 0 if xinhuanet_top_n <= 0 else xinhuanet_top_n
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        messagebox.showinfo('Message', '新闻数量不能全部为非正数!')  # "The news counts cannot all be non-positive!"
        return
    news_df_file_path = os.path.join(news_path, 'news_df.csv')
    threaded_crawler(sina_top_n,
                     sohu_top_n,
                     xinhuanet_top_n,
                     save_file_path=news_df_file_path)
    news_df = load_news(news_df_file_path)
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df,
                          os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    # "Crawling the latest news finished! {} valid news items in total!"
    messagebox.showinfo('Message', '爬取即时新闻完成!共{}条有效新闻!'.format(news_num))
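
The first half of crawler() normalizes the three Tkinter Entry values: an empty string or a non-positive number is treated as 0, and the crawl is refused only when all three counts end up as 0. A minimal self-contained sketch of that guard (the helper name parse_top_n is illustrative, not part of the project):

def parse_top_n(raw):
    """Return a non-negative int from a raw Entry string ('' becomes 0)."""
    value = 0 if raw == '' else int(raw)
    return 0 if value <= 0 else value

assert parse_top_n('') == 0
assert parse_top_n('-3') == 0
assert parse_top_n('50') == 50
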
Code Example #2
def cluster_word():
    n_clusters = Entry_N_Clusters.get()
    if n_clusters == '':
        messagebox.showinfo('Message', '请输入词汇聚类的类别数!')  # "Please enter the number of clusters for word clustering!"
        return
    n_clusters = int(n_clusters)
    top_words_list = counter.get_most_common_words(
        df_rank_i['content_cut'], top_n=5000, min_frequency=1)
    model = news_pandas.load_element(
        os.path.join(models_path, 'word2vec_model.pkl'))
    word_list, wordvec_list = modeling.get_word_and_wordvec(
        model, top_words_list)
    kmeans = modeling.get_cluster(wordvec_list,
                                  cluster='KMeans',
                                  cluster_args={
                                      'n_clusters': n_clusters,
                                      'random_state': 9
                                  })
    word_label = kmeans.labels_
    word_df = pd.DataFrame()
    word_df['word'] = word_list
    word_df['wordvec'] = wordvec_list
    word_df['word_label'] = word_label
    news_pandas.save_news(word_df, os.path.join(results_path,
                                                'word_df.csv'))
    messagebox.showinfo('Message', '词汇聚类完成!')  # "Word clustering finished!"
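
cluster_word() delegates the clustering itself to modeling.get_cluster with cluster='KMeans' and the n_clusters/random_state arguments shown above. Assuming that wrapper sits on top of scikit-learn (an assumption; its internals are not shown here), the equivalent direct call is roughly:

# Illustrative sketch only: clustering word vectors with scikit-learn's
# KMeans, mirroring the arguments passed to modeling.get_cluster.
import numpy as np
from sklearn.cluster import KMeans

wordvec_list = np.random.rand(100, 50)    # stand-in for word2vec vectors
kmeans = KMeans(n_clusters=10, random_state=9).fit(wordvec_list)
word_label = kmeans.labels_               # one cluster id per word vector
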
Code Example #3
def preprocess():
    if filter_df0.shape[0] == 0:
        messagebox.showinfo('Message', '未选择新闻数据!')  # "No news data selected!"
        return
    df = filter_df0.copy()
    df['title_'] = df['title'].map(
        lambda x: preprocessing.clean_title_blank(x))
    df['content_'] = df['content'].map(
        lambda x: preprocessing.clean_content(x))
    df['content_'] = df['content_'].map(
        lambda x: preprocessing.get_num_en_ch(x))
    df['content_cut'] = df['content_'].map(lambda x: preprocessing.pseg_cut(
        x, userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.get_words_by_flags(
            x, flags=['n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i']))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.stop_words_cut(
            x, os.path.join(extra_dict_path, 'self_stop_words.txt')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.disambiguation_cut(
            x, os.path.join(extra_dict_path, 'self_disambiguation_dict.json')))
    df['content_cut'] = df['content_cut'].map(
        lambda x: preprocessing.individual_character_cut(
            x,
            os.path.join(extra_dict_path, 'self_individual_character_dict.txt')
        ))
    df['content_'] = df['content_cut'].map(lambda x: ' '.join(x))
    news_pandas.save_news(df, os.path.join(temp_news_path, 'news_cut.csv'))
    messagebox.showinfo('Message', '数据预处理完成!')  # "Data preprocessing finished!"
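
preprocess() builds content_cut through a chain of wrappers: POS-tagged segmentation with a user dictionary, keeping only words with selected POS flags, then stop-word, disambiguation, and single-character filtering. Assuming the segmentation wrapper is built on jieba.posseg (an assumption; the wrapper is not shown), the core of the first two steps could look like this sketch:

# Illustrative sketch: POS-tagged segmentation plus flag filtering, assuming
# the preprocessing wrappers use jieba.posseg under the hood.
import re
import jieba.posseg as pseg

def cut_and_keep_flags(text, flags=('n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i')):
    pattern = re.compile('|'.join('(?:{})$'.format(f) for f in flags))
    return [word for word, flag in pseg.cut(text) if pattern.match(flag)]

print(cut_and_keep_flags('今天发布了一条科技新闻'))
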
Code Example #4
def cluster_content():
    eps_var = Entry_Eps.get()
    min_samples_var = Entry_MinSamples.get()
    if eps_var == '' or min_samples_var == '':
        messagebox.showinfo('Message', '请输全聚类参数!')  # "Please fill in all clustering parameters!"
        return
    eps_var = float(eps_var)
    min_samples_var = int(min_samples_var)
    try:
        df = news_pandas.load_news(os.path.join(temp_news_path,
                                                'news_cut.csv'))
        df['content_cut'] = df['content_cut'].map(eval)  # token lists were saved to CSV as their string repr
        df['content_'] = df['content_'].map(str)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行预处理!')  # "Please preprocess the news content text first!"
        return
    word_library_list = counter.get_word_library(df['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(
        df['content_cut'])
    # vocabulary cap: full word library minus half of the single-occurrence words
    max_features = len(
        word_library_list) - len(single_frequency_words_list) // 2
    matrix = modeling.feature_extraction(df['content_'],
                                         vectorizer='TfidfVectorizer',
                                         vec_args={
                                             'max_df': 0.95,
                                             'min_df': 1,
                                             'max_features': max_features
                                         })
    dbscan = modeling.get_cluster(matrix,
                                  cluster='DBSCAN',
                                  cluster_args={
                                      'eps': eps_var,
                                      'min_samples': min_samples_var,
                                      'metric': 'cosine'
                                  })
    labels = modeling.get_labels(dbscan)
    df['label'] = labels
    ranks = modeling.label2rank(labels)
    df['rank'] = ranks
    news_pandas.save_news(df, os.path.join(results_path, 'news_label.csv'))
    df['matrix'] = matrix.toarray().tolist()
    df_non_outliers = df[df['label'] != -1].copy()  # DBSCAN marks outliers with label -1
    if df_non_outliers.shape[0] == 0:
        messagebox.showinfo('Message', '不能聚类出任何热点,请重新设置聚类参数!')  # "No hot topics could be clustered; please adjust the clustering parameters!"
        return
    data_pca_tsne = modeling.feature_reduction(
        df_non_outliers['matrix'].tolist(),
        pca_n_components=3,
        tsne_n_components=2)
    df_non_outliers['pca_tsne'] = data_pca_tsne.tolist()
    del df_non_outliers['matrix']
    news_pandas.save_news(df_non_outliers,
                          os.path.join(results_path, 'news_non_outliers.csv'))
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    hot_num.set(rank_num)
    messagebox.showinfo('Message', '按照新闻内容聚类完成!')  # "Clustering by news content finished!"
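
cluster_content() extracts TF-IDF features from the preprocessed text and clusters them with DBSCAN under a cosine metric; rows labelled -1 are treated as outliers and dropped before the PCA/t-SNE reduction. Assuming feature_extraction and get_cluster wrap scikit-learn (the wrappers are not shown), the core calls would look roughly like:

# Illustrative sketch of the assumed scikit-learn calls: TF-IDF vectors
# clustered by DBSCAN with cosine distance; label -1 marks noise/outliers.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

docs = ['news about sports', 'sports news today', 'stock market report']
matrix = TfidfVectorizer(max_df=0.95, min_df=1).fit_transform(docs)
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='cosine').fit(matrix)
print(dbscan.labels_)   # -1 entries are outliers, other values are cluster ids
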
Code Example #5
def select_news():
    filename = filedialog.askopenfilename(filetypes=[("csv file", "*.csv")])
    if len(filename) == 0:
        return
    news_df = news_pandas.load_news(filename)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df, os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
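
select_news() loads a CSV the user picks instead of crawling; filedialog.askopenfilename returns an empty value when the dialog is cancelled, which the len(filename) == 0 check guards against. A minimal standalone sketch of that dialog call, outside the project's GUI:

# Minimal sketch: asking for a CSV path with tkinter's file dialog and
# handling the cancelled case the same way select_news() does.
import tkinter as tk
from tkinter import filedialog

root = tk.Tk()
root.withdraw()                      # hide the empty main window
filename = filedialog.askopenfilename(filetypes=[("csv file", "*.csv")])
if len(filename) == 0:               # dialog cancelled
    print('no file selected')
else:
    print('selected:', filename)
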
Code Example #6
def data_filter():
    if filter_df.shape[0] == 0:
        messagebox.showinfo('Message', '未选择任何新闻数据!')  # "No news data selected!"
        return
    date_f = Entry_Date.get()
    day_f = Entry_Day.get()
    if date_f == '' or day_f == '':
        messagebox.showinfo('Message', '请先填写筛选的日期和天数!')  # "Please fill in the filter date and number of days first!"
        return
    global filter_df0
    filter_df0 = preprocessing.get_data(filter_df, last_time=date_f + ' 23:59', delta=int(day_f))
    news_pandas.save_news(filter_df0, os.path.join(temp_news_path, 'filter_news_by_time.csv'))
    news_num = filter_df0.shape[0]
    filter_n.set(news_num)
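
data_filter() passes the entered date (at 23:59) and a day count to preprocessing.get_data to narrow filter_df down to a time window; the wrapper's internals are not shown. Assuming the DataFrame has a datetime-like time column (the column name 'time' is an assumption), an equivalent pandas filter might look like:

# Illustrative sketch (the 'time' column name is an assumption): keep rows
# whose timestamp falls within `delta` days before `last_time`.
import pandas as pd

def filter_by_time(df, last_time, delta):
    end = pd.to_datetime(last_time)
    start = end - pd.Timedelta(days=delta)
    times = pd.to_datetime(df['time'])
    return df[(times > start) & (times <= end)]

demo = pd.DataFrame({'time': ['2019-06-01 10:00', '2019-06-03 09:30'],
                     'title': ['old news', 'recent news']})
print(filter_by_time(demo, '2019-06-03 23:59', delta=1))   # keeps only 'recent news'
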
Code Example #7
def crawler():
    sina_top_n = Entry_Sina.get()
    sohu_top_n = Entry_Sohu.get()
    xinhuanet_top_n = Entry_XinhuaNet.get()
    sina_top_n = 0 if sina_top_n == '' else int(sina_top_n)
    sohu_top_n = 0 if sohu_top_n == '' else int(sohu_top_n)
    xinhuanet_top_n = 0 if xinhuanet_top_n == '' else int(xinhuanet_top_n)
    sina_top_n = 0 if sina_top_n <= 0 else sina_top_n
    sohu_top_n = 0 if sohu_top_n <= 0 else sohu_top_n
    xinhuanet_top_n = 0 if xinhuanet_top_n <= 0 else xinhuanet_top_n
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        messagebox.showinfo('Message', '新闻数量不能全部为非正数!')  # "The news counts cannot all be non-positive!"
        return
    news_crawler.threaded_crawler(sina_top_n, sohu_top_n, xinhuanet_top_n)
    sina_news_df = pd.DataFrame()
    sohu_news_df = pd.DataFrame()
    xinhuanet_news_df = pd.DataFrame()
    if sina_top_n > 0:
        sina_news_df = news_pandas.load_news(
            os.path.join(news_path, 'sina_latest_news.csv'))
    if sohu_top_n > 0:
        sohu_news_df = news_pandas.load_news(
            os.path.join(news_path, 'sohu_latest_news.csv'))
    if xinhuanet_top_n > 0:
        xinhuanet_news_df = news_pandas.load_news(
            os.path.join(news_path, 'xinhuanet_latest_news.csv'))
    news_df = pd.concat([sina_news_df, sohu_news_df, xinhuanet_news_df],
                        ignore_index=True)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df,
                          os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    # "Crawling the latest news finished! {} valid news items in total!"
    messagebox.showinfo('Message', '爬取即时新闻完成!共{}条有效新闻!'.format(news_num))
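
This variant of crawler() calls news_crawler.threaded_crawler and then loads each portal's CSV separately before concatenating them with pd.concat. The crawler internals are not shown; a minimal sketch of running independent crawl jobs in parallel threads, which is presumably what a threaded crawler does (crawl_portal and run_threaded_crawl are placeholder names, not the project's API):

# Illustrative sketch only: run one thread per portal and wait for all of
# them; the crawl itself is stubbed out with a print.
import threading

def crawl_portal(name, top_n):
    print('crawling %d items from %s' % (top_n, name))   # placeholder crawl

def run_threaded_crawl(counts):
    jobs = [threading.Thread(target=crawl_portal, args=(name, n))
            for name, n in counts.items() if n > 0]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()

run_threaded_crawl({'sina': 10, 'sohu': 10, 'xinhuanet': 0})
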