Code example #1 (score: 0)
File: main.py — Project: JDwangmo/coprocessor
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    feature_type=feature_type,
)


# Encode the training sentences: fit_transform learns the vocabulary from the
# training set; transform (below) reuses that fitted vocabulary for the test set.
# NOTE(review): `feature_encoder`, `train_data`, `test_data` and `logging` are
# defined earlier in the file (not visible in this fragment).
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0; `.values` is
# the modern equivalent if this script is ever run on a recent pandas.
train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
feature_encoder.print_model_descibe()  # (sic) -- the typo is in the project API
feature_encoder.print_sentence_length_detail()

# train_y = train_data['LABEL_INDEX'].as_matrix()

# Encode the test sentences with the vocabulary fitted above (no refit).
test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())

test_all_y = test_data['LABEL_INDEX'].as_matrix()

print(train_X_feature.shape)
print(test_all_X_feature.shape)
logging.debug('=' * 20)
# ****************************************************************
# ------- region end : 2. convert data format and encode features -------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 3. model training +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('3. 模型的训练')
Code example #2 (score: 0)
def test_onehot_bow_cnn():
    """Smoke-test ``MultiChannelOnehotBowCNN`` on a tiny toy corpus.

    Builds two one-hot feature encoders over the same sentences -- one at
    word granularity (``feature_type='word'``) and one at segment
    granularity (``feature_type='seg'``) -- trains a small two-channel
    one-hot bag-of-words CNN on them, and prints the predictions.

    NOTE(review): this is a manual demo relying on project modules
    (``FeatureEncoder``, ``MultiChannelOnehotBowCNN``), not an automated
    unit test.
    """
    # Usage sample: five labelled training sentences, three test sentences.
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]  # fixed typo: was `trian_y`
    test_X = ['句子', '你好', '你妹']
    test_y = [2, 3, 0]
    sentence_padding_length = 8
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    # Word-level one-hot encoder: first CNN input channel.
    word_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='word',
        zhs2zht=True,
        remove_url=True,
        # Set to True so transform() returns a one-hot array.
        to_onehot_array=True,
    )

    train_X_word_feature = word_feature_encoder.fit_transform(train_X)
    test_X_word_feature = word_feature_encoder.transform(test_X)
    print(','.join(word_feature_encoder.vocabulary))
    print(train_X_word_feature.shape)
    print(train_X_word_feature)

    # Segment-level one-hot encoder: second CNN input channel. Identical
    # settings to the word encoder except feature_type='seg'.
    seg_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='seg',
        zhs2zht=True,
        remove_url=True,
        # Set to True so transform() returns a one-hot array.
        to_onehot_array=True,
    )

    train_X_seg_feature = seg_feature_encoder.fit_transform(train_X)
    test_X_seg_feature = seg_feature_encoder.transform(test_X)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_X_seg_feature.shape)
    print(train_X_seg_feature)

    # quit()
    # Two-channel one-hot BOW CNN over the word and segment encoders.
    onehot_cnn = MultiChannelOnehotBowCNN(
        rand_seed=1377,
        verbose=1,
        feature_encoder=(word_feature_encoder, seg_feature_encoder),
        # optimizers='adadelta',
        optimizers='sgd',
        word_input_length=sentence_padding_length,
        seg_input_length=sentence_padding_length,
        word_input_dim=word_feature_encoder.vocabulary_size,
        seg_input_dim=seg_feature_encoder.vocabulary_size,
        num_labels=5,
        # NOTE(review): filter-spec list fields are interpreted by
        # MultiChannelOnehotBowCNN; their exact meaning is not visible here.
        l1_conv_filter_type=[
            [1, 2, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, 3, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, -1, -1, 'bow', (0, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[
            # [16, 2, -1, 'valid',(2,1),0.5, 'relu', 'none']
        ],
        full_connected_layer_units=[
            (50, 0.5, 'relu', 'none'),
        ],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        nb_batch=5,
        earlyStoping_patience=20,
        lr=1e-2,
    )
    onehot_cnn.print_model_descibe()  # (sic) -- the typo is in the project API
    # Train the model (or load a previously pickled one instead).
    # onehot_cnn.model_from_pickle('model/modelA.pkl')
    print(onehot_cnn.fit(([train_X_word_feature, train_X_seg_feature], train_y),
                         ([test_X_word_feature, test_X_seg_feature], test_y)))
    print(train_y)
    # loss, train_accuracy = onehot_cnn.model.evaluate(train_X_feature, train_y)

    # onehot_cnn.accuracy((train_X_word_feature, train_y), transform_input=False)
    print(onehot_cnn.batch_predict([test_X_word_feature, test_X_seg_feature], transform_input=False))
    print(onehot_cnn.batch_predict_bestn([test_X_word_feature, test_X_seg_feature], transform_input=False, bestn=2))
    quit()
    # NOTE(review): everything below is unreachable until the quit() above is
    # removed -- kept to show the raw-text prediction API.
    print(onehot_cnn.batch_predict(test_X, transform_input=True))
    print(onehot_cnn.predict(test_X[0], transform_input=True))
    onehot_cnn.accuracy((test_X, test_y), transform_input=True)
    # Persist the trained model.
    # onehot_cnn.save_model('model/modelA.pkl')

    print(onehot_cnn.predict('你好吗', transform_input=True))
Code example #3 (score: 0)
File: main.py — Project: JDwangmo/coprocessor
    sentence_padding_length=sentence_padding_length,
    verbose=1,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    mask_zero=True,
)

# Encode sentences with the w2v (padded) feature encoder constructed above:
# fit_transform learns from the training set; transform reuses the fit.
# NOTE(review): `feature_encoder`, `train_data`, `test_data` and `logging` are
# defined earlier in the file (not visible in this fragment); as_matrix() was
# removed in pandas 1.0 -- `.values` is the modern equivalent.
train_w2v_features = feature_encoder.fit_transform(train_data['SENTENCE'].as_matrix())
test_w2v_features  = feature_encoder.transform(test_data['SENTENCE'].as_matrix())
train_y = train_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()

print train_w2v_features[0]
print test_w2v_features.shape

# train_w2v_features = np.transpose(train_w2v_features,(0,2,1))
# test_w2v_features = np.transpose(test_w2v_features,(0,2,1))


logging.debug('=' * 20)
# ------------------------------------------------------------------------------
# -------------- region end : build the w2v (padded) feature encoder ---------------
# ------------------------------------------------------------------------------
Code example #4 (score: 0)
 # Build a padded-index feature encoder and encode the train/test sentences.
 # NOTE(review): `FeatureEncoder`, `train_X`, `test_X` and
 # `sentence_padding_length` come from enclosing code not visible here.
 feature_encoder = FeatureEncoder(
                                  sentence_padding_length=sentence_padding_length,
                                  verbose=0,
                                  need_segmented=True,
                                  full_mode=True,
                                  remove_stopword=True,
                                  replace_number=True,
                                  lowercase=True,
                                  zhs2zht=True,
                                  remove_url=True,
                                  padding_mode='center',
                                  add_unkown_word=True,
                                  mask_zero=True
                                  )
 # fit_transform learns the vocabulary from train_X; transform reuses it.
 train_X_features = feature_encoder.fit_transform(train_data=train_X)
 test_X_features = feature_encoder.transform(test_X)
 print(train_X_features)
 dcnn = DynamicCNN(
     rand_seed=1337,
     verbose=1,
     batch_size=2,
     vocab_size=feature_encoder.vocabulary_size,
     word_embedding_dim=48,
     # input_length=None,
     input_length=sentence_padding_length,
     num_labels=4,
     conv_filter_type=[[100, 2, 'full'],
                       [100, 4, 'full'],
                       # [100,6,5,'valid'],
                       ],
     ktop=1,
Code example #5 (score: 0)
File: cv.py — Project: JDwangmo/coprocessor
    zhs2zht=True,
    remove_url=True,
    padding_mode="center",
    add_unkown_word=True,
    mask_zero=True,
    to_onehot_array=True,
)


# Encode the training sentences (fit_transform learns the one-hot vocabulary)
# and the test sentences (transform reuses it).
# NOTE(review): `feature_encoder`, `train_data`, `test_data` and `logging` are
# defined earlier in the file (not visible in this fragment); as_matrix() was
# removed in pandas 1.0 -- `.values` is the modern equivalent.
train_X_feature = feature_encoder.fit_transform(train_data=train_data["SENTENCE"].as_matrix())

feature_encoder.print_model_descibe()  # (sic) -- the typo is in the project API

train_y = train_data["LABEL_INDEX"].as_matrix()

test_X_feature = feature_encoder.transform(test_data["SENTENCE"].as_matrix())

test_y = test_data["LABEL_INDEX"].as_matrix()

print (train_X_feature.shape)
print (test_X_feature.shape)

logging.debug("=" * 20)
# ****************************************************************
# ------- region end : 2. convert data format and encode features -------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 3. build the one-hot encoding +++++++++++++
# ****************************************************************
logging.debug("=" * 20)