Esempio n. 1
0
def title_cluster(df, save_df=False):
    """Cluster news items by title with DBSCAN and print each cluster's top words.

    All work is done on a copy of ``df`` that is passed through
    ``title_preprocess``.

    Args:
        df: News DataFrame. After ``title_preprocess`` it must expose the
            ``title_cut`` and ``title_`` columns that are read below.
        save_df: When true, drop the ``content``, ``title_`` and
            ``title_label`` columns and save the remainder as
            ``df_title_rank.csv`` under ``results_path``.

    Returns:
        The processed copy carrying the ``title_label`` and ``title_rank``
        columns (without the dropped columns when ``save_df`` is true).
    """
    frame = df.copy()
    frame = title_preprocess(frame)

    # Feature budget: vocabulary size minus half the number of entries
    # returned by ``get_single_frequency_words``.
    vocabulary = counter.get_word_library(frame['title_cut'])
    singletons = counter.get_single_frequency_words(frame['title_cut'])
    half_singletons = len(singletons) // 2
    feature_cap = len(vocabulary) - half_singletons

    vectorizer_options = {'max_df': 1.0, 'min_df': 1, 'max_features': feature_cap}
    matrix = modeling.feature_extraction(
        frame['title_'],
        vectorizer='CountVectorizer',
        vec_args=vectorizer_options,
    )

    dbscan_options = {'eps': 0.4, 'min_samples': 4, 'metric': 'cosine'}
    model = modeling.get_cluster(matrix, cluster='DBSCAN', cluster_args=dbscan_options)

    labels = modeling.get_labels(model)
    frame['title_label'] = labels

    # Count distinct labels among the rows kept by ``get_non_outliers_data``
    # (the printed message describes this as "excluding outliers").
    kept_rows = modeling.get_non_outliers_data(frame, label_column='title_label')
    cluster_count = counter.get_num_of_value_no_repeat(kept_rows['title_label'].tolist())
    print('按新闻标题聚类,一共有%d个簇(不包括离群点)' % cluster_count)

    ranks = modeling.label2rank(labels)
    frame['title_rank'] = ranks

    # Ranks 1..cluster_count are inspected one by one.
    for rank in range(1, cluster_count + 1):
        members = frame[frame['title_rank'] == rank]
        top_words = counter.get_most_common_words(members['title_cut'], top_n=10)
        print(top_words)

    if save_df:
        unwanted = ['content', 'title_', 'title_label']
        frame.drop(unwanted, axis=1, inplace=True)
        target = os.path.join(results_path, 'df_title_rank.csv')
        news_crawler.save_news(frame, target)
    return frame
Esempio n. 2
0
def content_cluster(df, df_save=False):
    """Cluster news items by body text with DBSCAN and print each cluster's top words.

    All work is done on a copy of ``df`` that is passed through
    ``content_preprocess``.

    Args:
        df: News DataFrame. After ``content_preprocess`` it must expose the
            ``content_cut`` and ``content_`` columns that are read below.
        df_save: When true, drop the ``content_`` and ``content_label``
            columns and save the remainder as ``df_content_rank.csv`` under
            ``results_path``.

    Returns:
        The processed copy carrying the ``content_label`` and
        ``content_rank`` columns (without the dropped columns when
        ``df_save`` is true).
    """
    frame = df.copy()
    frame = content_preprocess(frame)

    # Feature budget: vocabulary size minus half the number of entries
    # returned by ``get_single_frequency_words``.
    vocabulary = counter.get_word_library(frame['content_cut'])
    singletons = counter.get_single_frequency_words(frame['content_cut'])
    half_singletons = len(singletons) // 2
    feature_cap = len(vocabulary) - half_singletons

    vectorizer_options = {'max_df': 0.95, 'min_df': 1, 'max_features': feature_cap}
    matrix = modeling.feature_extraction(
        frame['content_'],
        vectorizer='CountVectorizer',
        vec_args=vectorizer_options,
    )

    dbscan_options = {'eps': 0.35, 'min_samples': 4, 'metric': 'cosine'}
    model = modeling.get_cluster(matrix, cluster='DBSCAN', cluster_args=dbscan_options)

    labels = modeling.get_labels(model)
    frame['content_label'] = labels

    # Count distinct labels among the rows kept by ``get_non_outliers_data``
    # (the printed message describes this as "excluding outliers").
    kept_rows = modeling.get_non_outliers_data(frame, label_column='content_label')
    cluster_count = counter.get_num_of_value_no_repeat(kept_rows['content_label'].tolist())
    print('按新闻内容聚类,一共有%d个簇(不包括离群点)' % cluster_count)

    ranks = modeling.label2rank(labels)
    frame['content_rank'] = ranks

    # Ranks 1..cluster_count are inspected one by one.
    for rank in range(1, cluster_count + 1):
        members = frame[frame['content_rank'] == rank]
        top_words = counter.get_most_common_words(
            members['content_cut'], top_n=15, min_frequency=1)
        print(top_words)

    if df_save:
        unwanted = ['content_', 'content_label']
        frame.drop(unwanted, axis=1, inplace=True)
        target = os.path.join(results_path, 'df_content_rank.csv')
        news_crawler.save_news(frame, target)
    return frame
Esempio n. 3
0
def key_content(df, df_save=False):
    """Attach a one-sentence abstract to every news item.

    Each value of the ``content`` column is cleaned with
    ``preprocessing.clean_content`` and then summarised with
    ``modeling.get_key_sentences(..., num=1)``; the result is stored in a
    new ``abstract`` column.

    The work is done on a copy of ``df`` so the caller's frame is never
    modified. Previously the ``abstract`` column was added to, and (with
    ``df_save``) the ``content`` column removed from, the caller's own
    DataFrame as a side effect.

    Args:
        df: News DataFrame with a ``content`` column.
        df_save: When true, drop the ``content`` column from the result and
            save it as ``df_abstract.csv`` under ``results_path``.

    Returns:
        A new DataFrame with the ``abstract`` column (and without
        ``content`` when ``df_save`` is true).
    """

    def summarize(text):
        cleaned = preprocessing.clean_content(text)
        return modeling.get_key_sentences(cleaned, num=1)

    # Copy first: adding/dropping columns below must not leak into the
    # frame owned by the caller.
    df_abstract = df.copy()
    df_abstract['abstract'] = df_abstract['content'].map(summarize)
    if df_save:
        df_abstract.drop(['content'], axis=1, inplace=True)
        news_crawler.save_news(df_abstract, os.path.join(results_path, 'df_abstract.csv'))
    return df_abstract