Esempio n. 1
0
def title_preprocess(df_title):
    """Run word segmentation and filtering on the ``title`` column.

    ``df_title`` is modified in place and also returned. Every step is
    delegated to a helper in ``preprocessing``; the dictionary file paths
    handed to those helpers are built from ``extra_dict_path``.

    Columns written:
        title_cut: result of the chain ``pseg_cut`` ->
            ``get_words_by_flags`` -> ``stop_words_cut`` (HIT list, then
            the ``self_`` list) -> ``disambiguation_cut`` ->
            ``individual_character_cut``, applied to the cleaned title.
        title_: the items of ``title_cut`` joined with single spaces.

    Args:
        df_title: DataFrame-like object (presumably a pandas DataFrame)
            with a ``title`` column.

    Returns:
        The same ``df_title`` object with the columns above added or
        overwritten.
    """
    # These paths are identical for every row, so build them once here
    # rather than once per title inside the mapped callables.
    userdict_path = os.path.join(extra_dict_path, 'self_userdict.txt')
    hit_stop_words_path = os.path.join(extra_dict_path, 'HIT_stop_words.txt')
    self_stop_words_path = os.path.join(
        extra_dict_path, 'self_stop_words.txt')
    disambiguation_path = os.path.join(
        extra_dict_path, 'self_disambiguation_dict.json')
    individual_character_path = os.path.join(
        extra_dict_path, 'self_individual_character_dict.txt')

    cleaned = (df_title['title']
               .map(preprocessing.clean_title)
               .map(preprocessing.get_num_en_ch))
    # Assigned before ``title_cut`` so a freshly created ``title_`` column
    # keeps its position ahead of ``title_cut`` in the frame.
    df_title['title_'] = cleaned

    words = cleaned.map(
        lambda text: preprocessing.pseg_cut(
            text, userdict_path=userdict_path))
    # The flag list is rebuilt per call on purpose: the helper is opaque
    # from here, so it never gets to share one list object across rows.
    words = words.map(
        lambda tagged: preprocessing.get_words_by_flags(
            tagged, flags=['n.*', '.*n', 'v.*', 's', 'j', 'l', 'i', 'eng']))
    words = words.map(
        lambda tokens: preprocessing.stop_words_cut(
            tokens, hit_stop_words_path))
    words = words.map(
        lambda tokens: preprocessing.stop_words_cut(
            tokens, self_stop_words_path))
    words = words.map(
        lambda tokens: preprocessing.disambiguation_cut(
            tokens, disambiguation_path))
    words = words.map(
        lambda tokens: preprocessing.individual_character_cut(
            tokens, individual_character_path))

    df_title['title_cut'] = words
    df_title['title_'] = words.map(' '.join)
    return df_title
def preprocess():
    """Clean and tokenize the selected news and save the result.

    Reads the module-level ``filter_df0`` frame. When it has no rows, an
    info dialog is shown and the function returns without doing anything
    else. Otherwise a copy of the frame is processed and passed to
    ``news_pandas.save_news`` with the path ``news_cut.csv`` under
    ``temp_news_path``, after which a completion dialog is shown.
    ``filter_df0`` itself is not modified.

    Columns written on the copy:
        title_: ``title`` passed through
            ``preprocessing.clean_title_blank``.
        content_cut: result of the chain ``pseg_cut`` ->
            ``get_words_by_flags`` -> ``stop_words_cut`` ->
            ``disambiguation_cut`` -> ``individual_character_cut``,
            applied to the cleaned content.
        content_: the items of ``content_cut`` joined with single spaces.

    Returns:
        None.
    """
    if filter_df0.shape[0] == 0:
        # Dialog text: "No news data selected!"
        messagebox.showinfo('Message', '未选择新闻数据!')
        return

    # These paths are identical for every row, so build them once here
    # rather than once per article inside the mapped callables.
    userdict_path = os.path.join(extra_dict_path, 'self_userdict.txt')
    stop_words_path = os.path.join(extra_dict_path, 'self_stop_words.txt')
    disambiguation_path = os.path.join(
        extra_dict_path, 'self_disambiguation_dict.json')
    individual_character_path = os.path.join(
        extra_dict_path, 'self_individual_character_dict.txt')

    df = filter_df0.copy()
    df['title_'] = df['title'].map(preprocessing.clean_title_blank)

    cleaned = (df['content']
               .map(preprocessing.clean_content)
               .map(preprocessing.get_num_en_ch))
    # Assigned before ``content_cut`` so a freshly created ``content_``
    # column keeps its position ahead of ``content_cut`` in the saved frame.
    df['content_'] = cleaned

    words = cleaned.map(
        lambda text: preprocessing.pseg_cut(
            text, userdict_path=userdict_path))
    # The flag list is rebuilt per call on purpose: the helper is opaque
    # from here, so it never gets to share one list object across rows.
    words = words.map(
        lambda tagged: preprocessing.get_words_by_flags(
            tagged, flags=['n.*', 'v.*', 'eng', 't', 's', 'j', 'l', 'i']))
    words = words.map(
        lambda tokens: preprocessing.stop_words_cut(tokens, stop_words_path))
    words = words.map(
        lambda tokens: preprocessing.disambiguation_cut(
            tokens, disambiguation_path))
    words = words.map(
        lambda tokens: preprocessing.individual_character_cut(
            tokens, individual_character_path))

    df['content_cut'] = words
    df['content_'] = words.map(' '.join)

    news_pandas.save_news(df, os.path.join(temp_news_path, 'news_cut.csv'))
    # Dialog text: "Data preprocessing complete!"
    messagebox.showinfo('Message', '数据预处理完成!')