# Tail of a FeatureEncoder(...) construction — the opening call is outside this chunk.
lowercase=True,
zhs2zht=True,               # simplified -> traditional Chinese conversion
remove_url=True,
padding_mode='center',
add_unkown_word=True,
feature_type=feature_type,
)
# Fit the encoder on the training sentences, then reuse it (without refitting) on test data.
# NOTE(review): DataFrame.as_matrix() was deprecated and removed in pandas 1.0 —
# migrate to .values / .to_numpy() when upgrading pandas.
train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
feature_encoder.print_model_descibe()
feature_encoder.print_sentence_length_detail()
# train_y = train_data['LABEL_INDEX'].as_matrix()
test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())
test_all_y = test_data['LABEL_INDEX'].as_matrix()
print(train_X_feature.shape)
print(test_all_X_feature.shape)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 2. convert data format and feature-encode -------------
# ****************************************************************
# ****************************************************************
# +++++++++++++ region start : 3. model training +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('3. 模型的训练')
def test_onehot_bow_cnn():
    """Usage example: train a two-channel (word + seg) one-hot BoW CNN on a toy corpus.

    Builds one ``FeatureEncoder`` per channel, fits ``MultiChannelOnehotBowCNN``
    on five toy sentences and prints its predictions for a three-sentence test
    set.  Intended as a runnable demo, not an automated assertion-based test.
    """
    # Toy corpus and labels.
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]  # fixed typo: was misspelled 'trian_y'
    test_X = ['句子', '你好', '你妹']
    test_y = [2, 3, 0]
    sentence_padding_length = 8
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    # Word-level channel encoder.
    word_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='word',
        zhs2zht=True,
        remove_url=True,
        # set to True to output one-hot arrays
        to_onehot_array=True,
    )
    train_X_word_feature = word_feature_encoder.fit_transform(train_X)
    test_X_word_feature = word_feature_encoder.transform(test_X)
    print(','.join(word_feature_encoder.vocabulary))
    print(train_X_word_feature.shape)
    print(train_X_word_feature)

    # Segmentation-level channel encoder (identical settings, feature_type='seg').
    seg_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='seg',
        zhs2zht=True,
        remove_url=True,
        # set to True to output one-hot arrays
        to_onehot_array=True,
    )
    train_X_seg_feature = seg_feature_encoder.fit_transform(train_X)
    test_X_seg_feature = seg_feature_encoder.transform(test_X)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_X_seg_feature.shape)
    print(train_X_seg_feature)
    # quit()

    onehot_cnn = MultiChannelOnehotBowCNN(
        rand_seed=1377,
        verbose=1,
        feature_encoder=(word_feature_encoder, seg_feature_encoder),
        # optimizers='adadelta',
        optimizers='sgd',
        word_input_length=sentence_padding_length,
        seg_input_length=sentence_padding_length,
        word_input_dim=word_feature_encoder.vocabulary_size,
        seg_input_dim=seg_feature_encoder.vocabulary_size,
        num_labels=5,
        # presumably each entry describes one conv filter bank — confirm the exact
        # field semantics against MultiChannelOnehotBowCNN's own documentation.
        l1_conv_filter_type=[
            [1, 2, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, 3, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, -1, -1, 'bow', (0, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[
            # [16, 2, -1, 'valid', (2, 1), 0.5, 'relu', 'none']
        ],
        full_connected_layer_units=[
            (50, 0.5, 'relu', 'none'),
        ],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        nb_batch=5,
        earlyStoping_patience=20,
        lr=1e-2,
    )
    onehot_cnn.print_model_descibe()
    # Train the model.  To load a previously pickled model instead:
    # onehot_cnn.model_from_pickle('model/modelA.pkl')
    print(onehot_cnn.fit(([train_X_word_feature, train_X_seg_feature], train_y),
                         ([test_X_word_feature, test_X_seg_feature], test_y)))
    print(train_y)
    # loss, train_accuracy = onehot_cnn.model.evaluate(train_X_feature, train_y)
    # onehot_cnn.accuracy((train_X_word_feature, train_y), transform_input=False)
    print(onehot_cnn.batch_predict([test_X_word_feature, test_X_seg_feature], transform_input=False))
    print(onehot_cnn.batch_predict_bestn([test_X_word_feature, test_X_seg_feature], transform_input=False, bestn=2))
    quit()  # NOTE(review): everything below is unreachable until this early exit is removed
    print(onehot_cnn.batch_predict(test_X, transform_input=True))
    print(onehot_cnn.predict(test_X[0], transform_input=True))
    onehot_cnn.accuracy((test_X, test_y), transform_input=True)
    # Save the trained model:
    # onehot_cnn.save_model('model/modelA.pkl')
    print(onehot_cnn.predict('你好吗', transform_input=True))
# Tail of a FeatureEncoder(...) construction — the opening call is outside this chunk.
sentence_padding_length=sentence_padding_length,
verbose=1,
need_segmented=True,
full_mode=True,
remove_stopword=False,
replace_number=True,
lowercase=True,
zhs2zht=True,
remove_url=True,
padding_mode='center',
add_unkown_word=True,
mask_zero=True,
)
# Encode train/test sentences with the fitted (padded) encoder and pull out labels.
# NOTE(review): DataFrame.as_matrix() was deprecated and removed in pandas 1.0 —
# migrate to .values / .to_numpy() when upgrading pandas.
train_w2v_features = feature_encoder.fit_transform(train_data['SENTENCE'].as_matrix())
test_w2v_features = feature_encoder.transform(test_data['SENTENCE'].as_matrix())
train_y = train_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()
print train_w2v_features[0]
print test_w2v_features.shape
# train_w2v_features = np.transpose(train_w2v_features,(0,2,1))
# test_w2v_features = np.transpose(test_w2v_features,(0,2,1))
logging.debug('=' * 20)
# ------------------------------------------------------------------------------
# -------------- region end : build w2v (padded) feature encoder ---------------
# ------------------------------------------------------------------------------
# Build a word-index feature encoder (centre padding, zero-masked) and encode the data.
feature_encoder = FeatureEncoder(
    sentence_padding_length=sentence_padding_length,
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    mask_zero=True
)
train_X_features = feature_encoder.fit_transform(train_data=train_X)
test_X_features = feature_encoder.transform(test_X)
print(train_X_features)
# Configure the DynamicCNN — this call continues past the end of this chunk.
dcnn = DynamicCNN(
    rand_seed=1337,
    verbose=1,
    batch_size=2,
    vocab_size=feature_encoder.vocabulary_size,
    word_embedding_dim=48,
    # input_length=None,
    input_length=sentence_padding_length,
    num_labels=4,
    # presumably each entry is [nb_filter, filter_width, border_mode] — confirm
    # against DynamicCNN's own documentation.
    conv_filter_type=[[100, 2, 'full'],
                      [100, 4, 'full'],
                      # [100,6,5,'valid'],
                      ],
    ktop=1,
# Tail of a FeatureEncoder(...) construction — the opening call is outside this chunk.
zhs2zht=True,
remove_url=True,
padding_mode="center",
add_unkown_word=True,
mask_zero=True,
to_onehot_array=True,   # emit one-hot arrays rather than index sequences
)
# Fit on training sentences, then reuse the fitted encoder for the test set.
# NOTE(review): DataFrame.as_matrix() was deprecated and removed in pandas 1.0 —
# migrate to .values / .to_numpy() when upgrading pandas.
train_X_feature = feature_encoder.fit_transform(train_data=train_data["SENTENCE"].as_matrix())
feature_encoder.print_model_descibe()
train_y = train_data["LABEL_INDEX"].as_matrix()
test_X_feature = feature_encoder.transform(test_data["SENTENCE"].as_matrix())
test_y = test_data["LABEL_INDEX"].as_matrix()
print (train_X_feature.shape)
print (test_X_feature.shape)
logging.debug("=" * 20)
# ****************************************************************
# ------------- region end : 2. convert data format and feature-encode -------------
# ****************************************************************
# ****************************************************************
# +++++++++++++ region start : 3. build one-hot encoding +++++++++++++
# ****************************************************************
logging.debug("=" * 20)