Example #1
    def get_feature_encoder(**kwargs):
        '''
            Get the feature encoder for this classifier.

        :param kwargs: configurable parameters [input_length(*), full_mode(#, False), feature_type(#, word), verbose(#, 0)]; * marks a required parameter, # marks an optional one that falls back to the listed default.
        :return:
        '''

        assert 'input_length' in kwargs, 'Please provide a value for input_length'

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
        from data_processing_util.feature_encoder.feature_encoder_merge import FeatureEncoderMerge

        w2v_feature_encoder = FeatureEncoder(
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            need_segmented=True,
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
            word2vec_model_file_path=kwargs.get('word2vec_model_file_path',
                                                None))

        bow_feature_encoder = FeatureEncoder(
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            need_segmented=True,
            full_mode=kwargs.get('full_mode', False),
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            padding_mode='left',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            zhs2zht=True,
            remove_url=True,
            # Set to True to output a one-hot array
            to_onehot_array=True,
            word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
            word2vec_model_file_path=kwargs.get('word2vec_model_file_path',
                                                None))

        return FeatureEncoderMerge(bow_feature_encoder=bow_feature_encoder,
                                   w2v_feature_encoder=w2v_feature_encoder)
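
A minimal usage sketch for the merged encoder above. It assumes get_feature_encoder is exposed as a static method on the classifier class (BowWordClassifier is a hypothetical name) and that FeatureEncoderMerge mirrors the fit_transform/transform interface that plain FeatureEncoder shows in the later examples:

    # Hypothetical call site; only input_length is required, the #-marked kwargs use defaults.
    encoder = BowWordClassifier.get_feature_encoder(input_length=8, feature_type='word')
    # Assumption: the merged encoder exposes the same fit/transform API as FeatureEncoder.
    train_features = encoder.fit_transform(['你好', '测试句子'])
    test_features = encoder.transform(['句子'])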
Example #2
    def get_feature_encoder(**kwargs):
        '''
            Get the feature encoder for this classifier.

        :param kwargs: configurable parameters [input_length(*), full_mode(#, False), feature_type(#, word), verbose(#, 0)]; * marks a required parameter, # marks an optional one that falls back to the listed default.
        :return:
        '''

        assert 'input_length' in kwargs, 'Please provide a value for input_length'

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
        feature_encoder = FeatureEncoder(
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            need_segmented=True,
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word')
        )

        return feature_encoder
Example #3
    def get_feature_encoder(**kwargs):
        '''
            Return the input feature encoder for this model.

        :param kwargs: configurable parameters [input_length(*), full_mode(#, False), feature_type(#, word), verbose(#, 0), word2vec_to_solve_oov(#, False), word2vec_model_file_path(#, None)]; * marks a required parameter, # marks an optional one that falls back to the listed default.

        :return:
        '''

        assert 'input_length' in kwargs, 'Please provide a value for input_length'

        import pprint
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            need_segmented=True,
            full_mode=kwargs.get('full_mode', False),
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            padding_mode='left',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            zhs2zht=True,
            remove_url=True,
            # Set to True to output a one-hot array
            to_onehot_array=True,
            word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
            word2vec_model_file_path=kwargs.get('word2vec_model_file_path',
                                                None))
        if kwargs.get('verbose', 0) > 0:
            pprint.pprint(kwargs)

        return feature_encoder
Example #4
    def get_feature_encoder(**kwargs):
        """
            获取该分类器的特征编码器

        :param kwargs:  可设置参数 [ input_length(*), full_mode(#,False), feature_type(#,word),verbose(#,0)],加*表示必须提供,加#表示可选,不写则默认。
        :return:
        """

        assert 'input_length' in kwargs, 'Please provide a value for input_length'

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
        feature_encoder = FeatureEncoder(
            need_segmented=kwargs.get('need_segmented', True),
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode=kwargs.get('padding_mode', 'center'),
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            vocabulary_including_test_set=kwargs.get(
                'vocabulary_including_test_set', True),
            update_dictionary=kwargs.get('update_dictionary', True))

        return feature_encoder
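
A hedged sketch of overriding the optional kwargs accepted above; only input_length is mandatory, and the enclosing classifier class name OnehotClassifier is assumed for illustration:

    # Hypothetical call site; unset kwargs fall back to the defaults read via kwargs.get().
    encoder = OnehotClassifier.get_feature_encoder(
        input_length=10,
        padding_mode='left',                   # overrides the 'center' default
        vocabulary_including_test_set=False,   # keep the vocabulary train-only
        update_dictionary=False,
    )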
Example #5
    def get_feature_encoder(**kwargs):
        '''
            Get the feature encoder for this classifier.

        :param kwargs: word_input_length, seg_input_length
        :return:
        '''

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
        word_feature_encoder = FeatureEncoder(
            sentence_padding_length=kwargs['word_input_length'],
            verbose=0,
            need_segmented=True,
            full_mode=False,
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            padding_mode='left',
            add_unkown_word=True,
            feature_type='word',
            zhs2zht=True,
            remove_url=True,
            # Set to True to output a one-hot array
            to_onehot_array=True,
        )

        seg_feature_encoder = FeatureEncoder(
            sentence_padding_length=kwargs['seg_input_length'],
            verbose=0,
            need_segmented=True,
            full_mode=False,
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            padding_mode='left',
            add_unkown_word=True,
            feature_type='seg',
            zhs2zht=True,
            remove_url=True,
            # Set to True to output a one-hot array
            to_onehot_array=True,
        )

        return word_feature_encoder, seg_feature_encoder
Example #6
def test_onehot_bow_cnn():
    # Usage example
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子', '你好', '你妹']
    test_y = [2, 3, 0]
    sentence_padding_length = 8
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    word_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='word',
        zhs2zht=True,
        remove_url=True,
        # Set to True to output a one-hot array
        to_onehot_array=True,
    )

    train_X_word_feature = word_feature_encoder.fit_transform(train_X)
    test_X_word_feature = word_feature_encoder.transform(test_X)
    print(','.join(word_feature_encoder.vocabulary))
    print(train_X_word_feature.shape)
    print(train_X_word_feature)

    seg_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='seg',
        zhs2zht=True,
        remove_url=True,
        # Set to True to output a one-hot array
        to_onehot_array=True,
    )

    train_X_seg_feature = seg_feature_encoder.fit_transform(train_X)
    test_X_seg_feature = seg_feature_encoder.transform(test_X)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_X_seg_feature.shape)
    print(train_X_seg_feature)

    # quit()
    onehot_cnn = MultiChannelOnehotBowCNN(
        rand_seed=1377,
        verbose=1,
        feature_encoder=(word_feature_encoder,seg_feature_encoder),
        # optimizers='adadelta',
        optimizers='sgd',
        word_input_length=sentence_padding_length,
        seg_input_length=sentence_padding_length,
        word_input_dim=word_feature_encoder.vocabulary_size,
        seg_input_dim=seg_feature_encoder.vocabulary_size,
        num_labels=5,
        l1_conv_filter_type=[
            [1, 2, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, 3, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, -1, -1, 'bow', (0, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[
            # [16, 2, -1, 'valid',(2,1),0.5, 'relu', 'none']
        ],
        full_connected_layer_units=[
            (50, 0.5, 'relu', 'none'),
        ],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        nb_batch=5,
        earlyStoping_patience=20,
        lr=1e-2,
    )
    onehot_cnn.print_model_descibe()
    # Train the model
    # To load the model from a saved pickle instead:
    # onehot_cnn.model_from_pickle('model/modelA.pkl')
    print(onehot_cnn.fit(([train_X_word_feature, train_X_seg_feature], train_y),
                         ([test_X_word_feature, test_X_seg_feature], test_y)))
    print(train_y)
    # loss, train_accuracy = onehot_cnn.model.evaluate(train_X_feature, train_y)

    # onehot_cnn.accuracy((train_X_word_feature, train_y), transform_input=False)
    print(onehot_cnn.batch_predict([test_X_word_feature,test_X_seg_feature], transform_input=False))
    print(onehot_cnn.batch_predict_bestn([test_X_word_feature,test_X_seg_feature], transform_input=False, bestn=2))
    quit()
    print(onehot_cnn.batch_predict(test_X, transform_input=True))
    print(onehot_cnn.predict(test_X[0], transform_input=True))
    onehot_cnn.accuracy((test_X, test_y), transform_input=True)
    # Save the model
    # onehot_cnn.save_model('model/modelA.pkl')

    print(onehot_cnn.predict('你好吗', transform_input=True))
Example #7
train_X = (train_data['TARGET'] + ',' + train_data['TEXT']).as_matrix()
test_X = (test_data['TARGET'] + ',' + test_data['TEXT']).as_matrix()

train_y = train_data['STANCE'].map(label_to_index).as_matrix()
test_y = test_data['STANCE'].map(label_to_index).as_matrix()
# print set(train_data['STANCE'])
# print train_y
# print test_y
# quit()
feature_encoder = FeatureEncoder(
                                 sentence_padding_length=config['sentence_padding_length'],
                                 verbose=0,
                                 need_segmented=config['need_segmented'],
                                 full_mode=True,
                                 replace_number=True,
                                 remove_stopword=True,
                                 lowercase=True,
                                 padding_mode='center',
                                 add_unkown_word=True,
                                 mask_zero=True,
                                 zhs2zht=True,
                                 remove_url=True,
                                 )

train_X_feature = feature_encoder.fit_transform(train_data=train_X)
test_X_feature = map(feature_encoder.transform_sentence, test_X)

feature_encoder.print_sentence_length_detail()
print(feature_encoder.vocabulary_size)
# print ','.join(sorted(feature_encoder.vocabulary))
# quit()
feature_encoder.print_model_descibe()
Example #8
    def count_word_freq(self, data):
        '''
            Count how often each word appears in each class; each word gets five statistics:
                1. FAVOR:   occurrences in the FAVOR class
                2. AGAINST: occurrences in the AGAINST class
                3. NONE:    occurrences in the NONE class
                4. FREQ:    occurrences across all classes, i.e. FAVOR + AGAINST + NONE
                5. SUPPORT: highest per-class frequency / FREQ

        :param data:
        :return:
        '''
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            train_data=data['WORDS'].as_matrix(),
            verbose=0,
            padding_mode='none',
            need_segmented=False,
            full_mode=True,
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            remove_url=True,
            sentence_padding_length=7,
            add_unkown_word=False,
            mask_zero=False,
            zhs2zht=True,
        )

        # print feature_encoder.train_padding_index
        train_X_features = feature_encoder.to_onehot_array()

        np.save('result/train_X_feature', train_X_features)

        print(train_X_features.shape)
        print(train_X_features[:5])
        vocabulary = feature_encoder.vocabulary
        print(','.join(vocabulary))
        print(feature_encoder.vocabulary_size)
        np.save('result/vocabulary', vocabulary)

        freq = np.sum(train_X_features, axis=0)
        favor_freq = np.sum(
            train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0)
        against_freq = np.sum(
            train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0)
        none_freq = np.sum(
            train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0)

        support = np.nan_to_num([
            max(favor, against, none) / (1.0 * (favor + against + none)) for
            favor, against, none in zip(favor_freq, against_freq, none_freq)
        ])
        print(freq)
        print(favor_freq)
        print(against_freq)
        print(none_freq)
        count_data = pd.DataFrame(
            data={
                u'WORD': vocabulary,
                u'FAVOR': favor_freq,
                u'AGAINST': against_freq,
                u'NONE': none_freq,
                u'SUPPORT': support,
                u'FREQ': freq,
            })
        count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'],
                                            ascending=False)
        count_data = count_data[[
            u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT'
        ]]
        count_data.to_csv(
            'result/word_count.csv',
            sep='\t',
            index=False,
            header=True,
            encoding='utf8',
        )
        print(count_data.head())
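
A self-contained toy illustration of the SUPPORT statistic computed above (the frequency numbers are invented for the example):

    import numpy as np

    favor_freq = np.array([8, 0, 2])    # per-word counts in the FAVOR class
    against_freq = np.array([1, 0, 3])  # per-word counts in the AGAINST class
    none_freq = np.array([1, 0, 5])     # per-word counts in the NONE class
    freq = favor_freq + against_freq + none_freq  # FREQ = FAVOR + AGAINST + NONE
    # SUPPORT = highest per-class count / FREQ; an all-zero word yields nan,
    # which np.nan_to_num squashes to 0, exactly as in the method above.
    support = np.nan_to_num([max(f, a, n) / (1.0 * (f + a + n))
                             for f, a, n in zip(favor_freq, against_freq, none_freq)])
    print(support)  # [0.8, 0., 0.5]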
Example #9
# ****************************************************************
# +++++++++++++ region start : 2. Convert the data format and encode features +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. Convert the data format and encode features')


from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

feature_encoder = FeatureEncoder(
    sentence_padding_length=sentence_padding_length,
    verbose=1,
    need_segmented=True,
    full_mode=full_mode,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    feature_type=feature_type,
)


train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
feature_encoder.print_model_descibe()
feature_encoder.print_sentence_length_detail()

# train_y = train_data['LABEL_INDEX'].as_matrix()

test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())
Example #10

if __name__ == '__main__':
    # Usage example
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['你好', '你好', '你妹']
    test_y = [3, 1, 1]
    sentence_padding_length = 10
    feature_encoder = FeatureEncoder(
                                     sentence_padding_length=sentence_padding_length,
                                     verbose=0,
                                     need_segmented=True,
                                     full_mode=True,
                                     remove_stopword=True,
                                     replace_number=True,
                                     lowercase=True,
                                     zhs2zht=True,
                                     remove_url=True,
                                     padding_mode='center',
                                     add_unkown_word=True,
                                     mask_zero=True
                                     )
    train_X_features = feature_encoder.fit_transform(train_data=train_X)
    test_X_features = feature_encoder.transform(test_X)
    print(train_X_features)
    dcnn = DynamicCNN(
        rand_seed=1337,
        verbose=1,
        batch_size=2,
        vocab_size=feature_encoder.vocabulary_size,
        word_embedding_dim=48,
Example #11
# -------------- code start : begin -------------


# Concatenate the TARGET and TEXT fields
train_X = (train_data['TARGET'] + ',' + train_data['TEXT']).as_matrix()
test_X = (test_data['TARGET'] + ',' + test_data['TEXT']).as_matrix()

train_y = train_data['STANCE'].map(label_to_index).as_matrix()
test_y = test_data['STANCE'].map(label_to_index).as_matrix()
feature_encoder = FeatureEncoder(
    sentence_padding_length=config['sentence_padding_length'],
    verbose=0,
    need_segmented=config['need_segmented'],
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    mask_zero=True
)
train_X_features = feature_encoder.fit_transform(train_data=train_X)
test_X_features = feature_encoder.transform(test_X)

feature_encoder.print_sentence_length_detail()
print(feature_encoder.vocabulary_size)
# print ','.join(sorted(feature_encoder.vocabulary))
# quit()
feature_encoder.print_model_descibe()
# -------------- code start : end -------------
Example #12
logging.debug('=' * 20)
logging.debug('Build the w2v (padded) feature encoder')


from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
from deep_learning.cnn.wordEmbedding_cnn.wordEmbedding_cnn_model import WordEmbeddingCNN

sentence_padding_length = config['padding_length']

feature_encoder = FeatureEncoder(
    sentence_padding_length=sentence_padding_length,
    verbose=1,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    mask_zero=True,
)

train_w2v_features = feature_encoder.fit_transform(train_data['SENTENCE'].as_matrix())
test_w2v_features  = feature_encoder.transform(test_data['SENTENCE'].as_matrix())
train_y = train_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()

print(train_w2v_features[0])
print(test_w2v_features.shape)
Example #13
print('2. Convert the data format for classification')
# -------------- code start : begin -------------

# Concatenate the TARGET and TEXT fields
train_X = (train_data['TARGET'] + ',' + train_data['TEXT']).as_matrix()
test_X = (test_data['TARGET'] + ',' + test_data['TEXT']).as_matrix()

train_y = train_data['STANCE'].map(label_to_index).as_matrix()
test_y = test_data['STANCE'].map(label_to_index).as_matrix()
feature_encoder = FeatureEncoder(
    sentence_padding_length=config['sentence_padding_length'],
    verbose=0,
    need_segmented=config['need_segmented'],
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    mask_zero=True)
train_X_features = feature_encoder.fit_transform(train_data=train_X)
test_X_features = feature_encoder.transform(test_X)

feature_encoder.print_sentence_length_detail()
print(feature_encoder.vocabulary_size)
# print ','.join(sorted(feature_encoder.vocabulary))
# quit()
feature_encoder.print_model_descibe()
# -------------- code start : end -------------
Example #14
# -------------- code start : begin -------------

# Concatenate the TARGET and TEXT fields
train_X = (train_data['TARGET'] + ',' + train_data['TEXT']).as_matrix()
test_X = (test_data['TARGET'] + ',' + test_data['TEXT']).as_matrix()

train_y = train_data['STANCE'].map(label_to_index).as_matrix()
test_y = test_data['STANCE'].map(label_to_index).as_matrix()

feature_encoder = FeatureEncoder(train_data=train_X,
                                 sentence_padding_length=config['sentence_padding_length'],
                                 verbose=0,
                                 need_segmented=config['need_segmented'],
                                 full_mode=True,
                                 replace_number=True,
                                 remove_stopword=True,
                                 lowercase=True,
                                 padding_mode='center',
                                 add_unkown_word=True,
                                 mask_zero=True,
                                 zhs2zht=True,
                                 remove_url=True,
                                 )

train_X_feature = feature_encoder.train_padding_index
test_X_feature = map(feature_encoder.transform_sentence, test_X)

feature_encoder.print_sentence_length_detail()
print(feature_encoder.vocabulary_size)
# print ','.join(sorted(feature_encoder.vocabulary))
# quit()
feature_encoder.print_model_descibe()
Example #15
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. Convert the data format and encode features')


from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

sentence_padding_length = config['padding_length']
feature_encoder = FeatureEncoder(
    sentence_padding_length=sentence_padding_length,
    verbose=1,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    mask_zero=True,
    to_onehot_array=True,
)


train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())

feature_encoder.print_model_descibe()

train_y = train_data['LABEL_INDEX'].as_matrix()
Example #16
def count_word_freq():
    '''
        Count word frequencies for the file train_data/TaskAA_all_data_2986.csv.
        Count how often each word appears in each class; each word gets five statistics:
            1. FAVOR:   occurrences in the FAVOR class
            2. AGAINST: occurrences in the AGAINST class
            3. NONE:    occurrences in the NONE class
            4. FREQ:    occurrences across all classes, i.e. FAVOR + AGAINST + NONE
            5. SUPPORT: highest per-class frequency / FREQ
        Steps:
            1. Convert all sentences to one-hot encoding
            2. Compute the 5 statistics for each word

    :return:
    '''
    # -------------- region start : 1. Convert all sentences to one-hot encoding and save the data -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('1. Convert all sentences to one-hot encoding and save the data')
        print('1. Convert all sentences to one-hot encoding and save the data')
    # -------------- code start : begin -------------

    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    # print train_dataA.head()
    print(train_dataA.shape)
    feature_encoder = FeatureEncoder(train_data=train_dataA['WORDS'].as_matrix(),
                                     verbose=0,
                                     padding_mode='none',
                                     need_segmented=False,
                                     full_mode=True,
                                     remove_stopword=True,
                                     replace_number=True,
                                     lowercase=True,
                                     remove_url=True,
                                     sentence_padding_length=7,
                                     add_unkown_word=False,
                                     mask_zero=False,
                                     zhs2zht=True,
                                     )

    # print feature_encoder.train_padding_index
    train_X_features = feature_encoder.to_onehot_array()

    np.save('result/train_X_feature', train_X_features)

    print(train_X_features.shape)
    print(train_X_features[:5])
    vocabulary = feature_encoder.vocabulary
    print(','.join(vocabulary))
    print('Vocabulary size: %d' % feature_encoder.vocabulary_size)
    np.save('result/vocabulary', vocabulary)

    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # -------------- region end : 1. Convert all sentences to one-hot encoding and save the data ---------------

    # -------------- region start : 2. Compute the 5 statistics for each word -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('2. Compute the 5 statistics for each word')
        print('2. Compute the 5 statistics for each word')
    # -------------- code start : begin -------------

    # Total word frequency
    freq = np.sum(train_X_features, axis=0)
    # Word frequency within the FAVOR class
    favor_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'FAVOR'], axis=0)
    # Word frequency within the AGAINST class
    against_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'AGAINST'], axis=0)
    # Word frequency within the NONE class
    none_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'NONE'], axis=0)
    # Support: highest per-class frequency / total frequency
    support = np.nan_to_num([max(favor, against, none) / (1.0 * (favor + against + none)) for favor, against, none in
                             zip(favor_freq, against_freq, none_freq)])

    print(freq)
    print(favor_freq)
    print(against_freq)
    print(none_freq)

    count_data = pd.DataFrame(data={
        u'WORD': vocabulary,
        u'FAVOR': favor_freq,
        u'AGAINST': against_freq,
        u'NONE': none_freq,
        u'SUPPORT': support,
        u'FREQ': freq,
    })
    # Sort
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'], ascending=False)
    count_data = count_data[[u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT']]
    # Save
    count_data.to_csv('result/word_count_%d.csv' % feature_encoder.vocabulary_size,
                      sep='\t',
                      index=False,
                      header=True,
                      encoding='utf8',
                      )

    print(count_data.head())
    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
Example #17

if __name__ == '__main__':
    # Usage example
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['你好', '你好', '你妹']
    test_y = [3, 1, 1]
    sentence_padding_length = 10
    feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        padding_mode='center',
        add_unkown_word=True,
        mask_zero=True)
    train_X_features = feature_encoder.fit_transform(train_data=train_X)
    print(train_X_features)
    dcnn = DynamicCNN(
        rand_seed=1337,
        verbose=2,
        batch_size=1,
        vocab_size=feature_encoder.vocabulary_size,
        word_embedding_dim=48,
        # input_length=None,
Example #18
    def count_word_freq(self, data):
        '''
            Count how often each word appears in each class; each word gets five statistics:
                1. FAVOR:   occurrences in the FAVOR class
                2. AGAINST: occurrences in the AGAINST class
                3. NONE:    occurrences in the NONE class
                4. FREQ:    occurrences across all classes, i.e. FAVOR + AGAINST + NONE
                5. SUPPORT: highest per-class frequency / FREQ

        :param data:
        :return:
        '''
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(train_data=data['WORDS'].as_matrix(),
                                         verbose=0,
                                         padding_mode='none',
                                         need_segmented=False,
                                         full_mode=True,
                                         remove_stopword=True,
                                         replace_number=True,
                                         lowercase=True,
                                         remove_url=True,
                                         sentence_padding_length=7,
                                         add_unkown_word=False,
                                         mask_zero=False,
                                         zhs2zht=True,
                                         )

        # print feature_encoder.train_padding_index
        train_X_features = feature_encoder.to_onehot_array()

        np.save('result/train_X_feature', train_X_features)

        print(train_X_features.shape)
        print(train_X_features[:5])
        vocabulary = feature_encoder.vocabulary
        print(','.join(vocabulary))
        print(feature_encoder.vocabulary_size)
        np.save('result/vocabulary', vocabulary)

        freq = np.sum(train_X_features, axis=0)
        favor_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0)
        against_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0)
        none_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0)

        support = np.nan_to_num([max(favor, against, none) / (1.0 * (favor + against + none))
                                 for favor, against, none in zip(favor_freq, against_freq, none_freq)])
        print(freq)
        print(favor_freq)
        print(against_freq)
        print(none_freq)
        count_data = pd.DataFrame(data={
            u'WORD': vocabulary,
            u'FAVOR': favor_freq,
            u'AGAINST': against_freq,
            u'NONE': none_freq,
            u'SUPPORT': support,
            u'FREQ': freq,
        })
        count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'], ascending=False)
        count_data = count_data[[u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT']]
        count_data.to_csv('result/word_count.csv',
                          sep='\t',
                          index=False,
                          header=True,
                          encoding='utf8',
                          )
        print(count_data.head())
Example #19
def count_word_freq():
    '''
        Count word frequencies for the file train_data/TaskAA_all_data_2986.csv.
        Count how often each word appears in each class; each word gets five statistics:
            1. FAVOR:   occurrences in the FAVOR class
            2. AGAINST: occurrences in the AGAINST class
            3. NONE:    occurrences in the NONE class
            4. FREQ:    occurrences across all classes, i.e. FAVOR + AGAINST + NONE
            5. SUPPORT: highest per-class frequency / FREQ
        Steps:
            1. Convert all sentences to one-hot encoding
            2. Compute the 5 statistics for each word

    :return:
    '''
    # -------------- region start : 1. Convert all sentences to one-hot encoding and save the data -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('1. Convert all sentences to one-hot encoding and save the data')
        print('1. Convert all sentences to one-hot encoding and save the data')
    # -------------- code start : begin -------------

    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    # print train_dataA.head()
    print(train_dataA.shape)
    feature_encoder = FeatureEncoder(
        train_data=train_dataA['WORDS'].as_matrix(),
        verbose=0,
        padding_mode='none',
        need_segmented=False,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        remove_url=True,
        sentence_padding_length=7,
        add_unkown_word=False,
        mask_zero=False,
        zhs2zht=True,
    )

    # print feature_encoder.train_padding_index
    train_X_features = feature_encoder.to_onehot_array()

    np.save('result/train_X_feature', train_X_features)

    print(train_X_features.shape)
    print(train_X_features[:5])
    vocabulary = feature_encoder.vocabulary
    print(','.join(vocabulary))
    print('Vocabulary size: %d' % feature_encoder.vocabulary_size)
    np.save('result/vocabulary', vocabulary)

    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # -------------- region end : 1. Convert all sentences to one-hot encoding and save the data ---------------

    # -------------- region start : 2. Compute the 5 statistics for each word -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('2. Compute the 5 statistics for each word')
        print('2. Compute the 5 statistics for each word')
    # -------------- code start : begin -------------

    # Total word frequency
    freq = np.sum(train_X_features, axis=0)
    # Word frequency within the FAVOR class
    favor_freq = np.sum(
        train_X_features[train_dataA['STANCE'].as_matrix() == u'FAVOR'],
        axis=0)
    # Word frequency within the AGAINST class
    against_freq = np.sum(
        train_X_features[train_dataA['STANCE'].as_matrix() == u'AGAINST'],
        axis=0)
    # Word frequency within the NONE class
    none_freq = np.sum(
        train_X_features[train_dataA['STANCE'].as_matrix() == u'NONE'], axis=0)
    # Support: highest per-class frequency / total frequency
    support = np.nan_to_num([
        max(favor, against, none) / (1.0 * (favor + against + none))
        for favor, against, none in zip(favor_freq, against_freq, none_freq)
    ])

    print(freq)
    print(favor_freq)
    print(against_freq)
    print(none_freq)

    count_data = pd.DataFrame(
        data={
            u'WORD': vocabulary,
            u'FAVOR': favor_freq,
            u'AGAINST': against_freq,
            u'NONE': none_freq,
            u'SUPPORT': support,
            u'FREQ': freq,
        })
    # Sort
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'],
                                        ascending=False)
    count_data = count_data[[
        u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT'
    ]]
    # Save
    count_data.to_csv(
        'result/word_count_%d.csv' % feature_encoder.vocabulary_size,
        sep='\t',
        index=False,
        header=True,
        encoding='utf8',
    )

    print(count_data.head())
    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)