Exemple #1
0
def TrainLstmCrf(data_name, model_name):
    n_classes = 4
    max_len = 75
    batch_size = 128
    epoch = 100
    tags = ['S', 'B', 'I', 'E']
    sentences, words = get_sents(datasets=data_name)
    print(len(sentences), len(words))
    word2idx = {w: i + 1 for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}
    vocab_size = len(words)

    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len,
                      sequences=X,
                      padding="post",
                      value=vocab_size - 1)

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len,
                      sequences=y,
                      padding="post",
                      value=tag2idx["E"])
    y = [to_categorical(i, num_classes=n_classes) for i in y]
    # 获得数据
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
    print(len(X_tr), len(y_tr), len(X_te), len(y_te))
    s = np.asarray([max_len] * batch_size, dtype='int32')

    # 建立模型
    word_ids = Input(batch_shape=(batch_size, max_len), dtype='int32')
    sequence_lengths = Input(batch_shape=[batch_size, 1], dtype='int32')
    print(sequence_lengths)
    word_embeddings = Embedding(vocab_size, n_classes)(word_ids)
    blstm = Bidirectional(LSTM(units=50,
                               return_sequences=True))(word_embeddings)
    model = TimeDistributed(Dense(4, activation='tanh'))(blstm)
    crf = CrfModel()
    pred = crf(inputs=[model, sequence_lengths])
    model = Model(inputs=[word_ids, sequence_lengths], outputs=[pred])
    print("word_ids:{}".format(word_ids))
    print("sequence_lengths:{}".format(sequence_lengths))
    model.compile(optimizer="rmsprop", loss=crf.loss, metrics=['accuracy'])

    print(model.summary())

    k = 0
    for batch_x, batch_y in minibatches(X_tr, y_tr, batch_size=batch_size):
        model.fit([batch_x, s],
                  np.array(batch_y),
                  epochs=epoch,
                  batch_size=batch_size)
        k += 1
        if k % 50 == 0:
            model.save("./models/{}_{}".format(k, model_name))
            print("saved")

    # 保存模型
    model.save(model_name)
Exemple #2
0
    pos_emb = Embedding(input_dim=len(pos),
                        output_dim=10,
                        input_length=max_len)(pos_input)
    modified_input = keras.layers.concatenate([word_emb, pos_emb])
    model_1 = Bidirectional(
        LSTM(units=50, return_sequences=True,
             recurrent_dropout=0.1))(modified_input)
    model = TimeDistributed(Dense(50, activation="relu"))(
        model_1)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model([input, pos_input], out)
    model.compile(optimizer="rmsprop",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    print(model.summary())
    history = model.fit([X_tr, X_pos_tr],
                        np.array(y_tr),
                        batch_size=32,
                        epochs=60,
                        validation_split=0.1,
                        verbose=1)
    #Testing
    test_pred = model.predict([X_te, X_pos_te], verbose=1)
    idx2tag = {i: w for w, i in tag2idx.items()}
    pred_labels = pred2label(test_pred)
    test_labels = pred2label(y_te)
    print("Recall, Precision and F-score are",
          get_recall_precision(test_labels, pred_labels, "Destination"))
    model.save("BILSTM+CRF_with_pos_without_embeddings.model")