Code example #1
    def count_word_freq(self,data):
        '''
            Count, for each word, how often it occurs in each stance class.
            Every word gets five statistics:
                1. FAVOR  : number of occurrences in the FAVOR class
                2. AGAINST: number of occurrences in the AGAINST class
                3. NONE   : number of occurrences in the NONE class
                4. FREQ   : occurrences across all classes, i.e. FAVOR+AGAINST+NONE
                5. SUPPORT: highest per-class frequency / FREQ

        :param data: DataFrame with 'WORDS' and 'STANCE' columns
        :return:
        '''
        import numpy as np
        import pandas as pd
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(train_data=data['WORDS'].as_matrix(),
                                         verbose=0,
                                         padding_mode='none',
                                         need_segmented=False,
                                         full_mode=True,
                                         remove_stopword=True,
                                         replace_number=True,
                                         lowercase=True,
                                         remove_url=True,
                                         sentence_padding_length=7,
                                         add_unkown_word=False,
                                         mask_zero=False,
                                         zhs2zht=True,
                                         )

        # print feature_encoder.train_padding_index
        train_X_features = feature_encoder.to_onehot_array()

        np.save('result/train_X_feature',train_X_features)

        print(train_X_features.shape)
        print(train_X_features[:5])
        vocabulary = feature_encoder.vocabulary
        print(','.join(vocabulary))
        print(feature_encoder.vocabulary_size)
        np.save('result/vocabulary',vocabulary)

        # total frequency across all classes
        freq = np.sum(train_X_features, axis=0)
        # per-class frequencies
        favor_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0)
        against_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0)
        none_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0)

        # support: highest per-class frequency / total frequency
        support = np.nan_to_num([max(favor, against, none) / (1.0 * (favor + against + none))
                                 for favor, against, none in zip(favor_freq, against_freq, none_freq)])
        print(freq)
        print(favor_freq)
        print(against_freq)
        print(none_freq)
        count_data = pd.DataFrame(data={
            u'WORD':vocabulary,
            u'FAVOR':favor_freq,
            u'AGAINST':against_freq,
            u'NONE':none_freq,
            u'SUPPORT':support,
            u'FREQ':freq,
        })
        count_data = count_data.sort_values(by=[u'SUPPORT',u'FREQ','WORD'],ascending=False)
        count_data = count_data[[u'WORD',u'FAVOR',u'AGAINST',u'NONE',u'FREQ',u'SUPPORT']]
        count_data.to_csv('result/word_count.csv',
                          sep='\t',
                          index=False,
                          header=True,
                          encoding='utf8',
                          )
        print(count_data.head())
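
SUPPORT measures how strongly a word leans toward a single stance class: the highest per-class count divided by the word's total count, so a value of 1.0 means the word appears in only one class. A standalone worked sketch with hypothetical counts (not taken from the real dataset):

import numpy as np

# Hypothetical per-class counts for three vocabulary words.
favor_freq = np.array([8, 1, 0])
against_freq = np.array([1, 1, 0])
none_freq = np.array([1, 1, 0])

freq = favor_freq + against_freq + none_freq  # FREQ per word: 10, 3, 0
with np.errstate(invalid='ignore'):
    support = np.nan_to_num(
        np.max([favor_freq, against_freq, none_freq], axis=0) / (1.0 * freq))
print(support)  # [0.8, 0.333..., 0.0]; the 0/0 case yields nan, mapped to 0

The 0/0 case is why the method above wraps the list comprehension in np.nan_to_num: a vocabulary entry with zero total count would otherwise propagate nan into the CSV.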
Code example #2
def count_word_freq():
    '''
        Compute word frequencies over the file train_data/TaskAA_all_data_2986.csv.
        Count, for each word, how often it occurs in each stance class.
        Every word gets five statistics:
            1. FAVOR  : number of occurrences in the FAVOR class
            2. AGAINST: number of occurrences in the AGAINST class
            3. NONE   : number of occurrences in the NONE class
            4. FREQ   : occurrences across all classes, i.e. FAVOR+AGAINST+NONE
            5. SUPPORT: highest per-class frequency / FREQ
        Steps:
            1. Convert all sentences to one-hot encoding
            2. Compute the five statistics for each word

    :return:
    '''
    # -------------- region start : 1. Convert all sentences to one-hot encoding and save -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('1. Convert all sentences to one-hot encoding and save')
        print('1. Convert all sentences to one-hot encoding and save')
    # -------------- code start -------------

    import numpy as np
    import pandas as pd
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    # print train_dataA.head()
    print(train_dataA.shape)
    feature_encoder = FeatureEncoder(train_data=train_dataA['WORDS'].as_matrix(),
                                     verbose=0,
                                     padding_mode='none',
                                     need_segmented=False,
                                     full_mode=True,
                                     remove_stopword=True,
                                     replace_number=True,
                                     lowercase=True,
                                     remove_url=True,
                                     sentence_padding_length=7,
                                     add_unkown_word=False,
                                     mask_zero=False,
                                     zhs2zht=True,
                                     )

    # print(feature_encoder.train_padding_index)
    train_X_features = feature_encoder.to_onehot_array()

    np.save('result/train_X_feature', train_X_features)

    print(train_X_features.shape)
    print(train_X_features[:5])
    vocabulary = feature_encoder.vocabulary
    print(','.join(vocabulary))
    print('Vocabulary size: %d' % feature_encoder.vocabulary_size)
    np.save('result/vocabulary', vocabulary)

    # -------------- code end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # -------------- region end : 1. Convert all sentences to one-hot encoding and save ---------------

    # -------------- region start : 2. Compute the five statistics for each word -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('2. Compute the five statistics for each word')
        print('2. Compute the five statistics for each word')
    # -------------- code start -------------

    # total frequency across all classes
    freq = np.sum(train_X_features, axis=0)
    # frequency within the FAVOR class
    favor_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'FAVOR'], axis=0)
    # frequency within the AGAINST class
    against_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'AGAINST'], axis=0)
    # frequency within the NONE class
    none_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'NONE'], axis=0)
    # support: highest per-class frequency / total frequency
    support = np.nan_to_num([max(favor, against, none) / (1.0 * (favor + against + none)) for favor, against, none in
                             zip(favor_freq, against_freq, none_freq)])

    print(freq)
    print(favor_freq)
    print(against_freq)
    print(none_freq)

    count_data = pd.DataFrame(data={
        u'WORD': vocabulary,
        u'FAVOR': favor_freq,
        u'AGAINST': against_freq,
        u'NONE': none_freq,
        u'SUPPORT': support,
        u'FREQ': freq,
    })
    # sort by support, then frequency
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'], ascending=False)
    count_data = count_data[[u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT']]
    # save as tab-separated CSV
    count_data.to_csv('result/word_count_%d.csv' % feature_encoder.vocabulary_size,
                      sep='\t',
                      index=False,
                      header=True,
                      encoding='utf8',
                      )

    print(count_data.head())
    # -------------- code end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
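
The list comprehension that computes SUPPORT makes a Python-level pass over the whole vocabulary and leans on numpy scalar division turning 0/0 into nan before np.nan_to_num cleans it up. The same statistic can be computed fully vectorized; a minimal sketch (the function name and example call are illustrative, not part of the project):

import numpy as np

def support_vectorized(favor_freq, against_freq, none_freq):
    # Stack the per-class counts into a (3, vocab_size) matrix.
    class_freq = np.vstack([favor_freq, against_freq, none_freq])
    freq = class_freq.sum(axis=0).astype(float)
    # 0/0 yields nan under numpy division; map it back to 0.
    with np.errstate(invalid='ignore'):
        return np.nan_to_num(class_freq.max(axis=0) / freq)

print(support_vectorized([8, 0], [1, 0], [1, 0]))  # [0.8, 0.0]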
Code example #3
File: main.py  Project: JDwangmo/coprocessor
feature_encoder = FeatureEncoder(full_mode=True,
                                 remove_stopword=False,
                                 replace_number=True,
                                 lowercase=True,
                                 zhs2zht=True,
                                 remove_url=True,
                                 padding_mode='center',
                                 add_unkown_word=True,
                                 mask_zero=True,
                                 )


train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
feature_encoder.print_model_descibe()

feature_encoder.to_onehot_array()
quit()
train_y = train_data['LABEL_INDEX'].as_matrix()

test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())

test_all_y = test_data['LABEL_INDEX'].as_matrix()

print(train_X_feature.shape)
print(test_all_X_feature.shape)

logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 2. Convert data format and encode features -------------
# ****************************************************************
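
The snippet above follows the usual fit-on-train / transform-on-test encoder contract: fit_transform learns the vocabulary from the training sentences, and transform maps the test sentences through that same vocabulary, so both matrices share columns. A minimal analogue with scikit-learn's CountVectorizer (an assumed stand-in; FeatureEncoder is the project's own class with extra preprocessing options):

from sklearn.feature_extraction.text import CountVectorizer

train_sentences = [u'today is good', u'today is bad']  # hypothetical data
test_sentences = [u'good or bad']

vectorizer = CountVectorizer()
# fit_transform learns the vocabulary from the training sentences only.
train_X = vectorizer.fit_transform(train_sentences)
# transform reuses that vocabulary; unseen test tokens ('or') are dropped.
test_X = vectorizer.transform(test_sentences)
print(train_X.shape, test_X.shape)  # same number of columns: (2, 4) (1, 4)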