def train():
    input = Input(shape=(max_len, ))
    model = Embedding(input_dim=n_words, output_dim=50,
                      input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(
        LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_tags, activation='softmax'))(
        model)  # softmax output layer

    model = Model(input, out)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    # checkpoint
    # filepath = "../result/bilstm-weights-{epoch:02d}-{val_acc:.2f}.hdf5"
    # checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    # history=model.fit(X_train,np.array(y_train),batch_size=32,epochs=5,validation_split=0.1,verbose=1,callbacks=[checkpoint])

    history = model.fit(X_train,
                        np.array(y_train),
                        batch_size=32,
                        epochs=5,
                        validation_split=0.1,
                        verbose=1)
    # save the model
    model.save(filepath="../result/bi-lstm.h5")

    hist = pd.DataFrame(history.history)
    plt.figure(figsize=(12, 12))
    plt.plot(hist["acc"])
    plt.plot(hist["val_acc"])
    plt.show()
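Note: the "acc" / "val_acc" history keys used above are the ones produced by older Keras releases; TensorFlow 2.x names them "accuracy" / "val_accuracy". A small defensive variant of the plotting (a sketch, not part of the original, operating on the same hist DataFrame) that works with either naming:

acc_key = "acc" if "acc" in hist else "accuracy"
plt.figure(figsize=(12, 12))
plt.plot(hist[acc_key], label="train")
plt.plot(hist["val_" + acc_key], label="validation")
plt.legend()
plt.show()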
    def train(self):
        input = Input(shape=(120,))
        model = Embedding(input_dim=self.num_words, output_dim=50, input_length=120)(input)
        model = Dropout(0.1)(model)
        model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
        out = TimeDistributed(Dense(self.num_entities, activation="softmax"))(model)

        model = Model(input, out)

        model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

        history = model.fit(x=self.X_train, y=np.array(self.Y_train), batch_size=64, epochs=10,
                            validation_data=(self.X_validation, self.Y_validation))

        model.save("../models/ner_" + str(datetime.utcnow().microsecond))

        test_eval = model.evaluate(self.X_test, self.Y_test, verbose=0)
        print('Test loss:', test_eval[0])
        print('Test accuracy:', test_eval[1])

        return model, history
Example 3
def run():
    sentences = read_train_file(TRAIN_PATH)
    word_map, tag_map = create_word_idx()
    max_len = max([len(s) for s in sentences])
    X = [[word_map[w[0]] for w in s] for s in sentences]
    n_words = len(word_map)
    n_tags = 9
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
    y = [[tag_map[w[2]] for w in s] for s in sentences]
    y_testing = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag_map["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y_testing]

    dev_sentences = read_train_file(DEV_PATH)
    dev_max_len = max([len(s) for s in dev_sentences])
    X_dev = [[word_map[w[0]] for w in s] for s in dev_sentences]
    X_dev = pad_sequences(maxlen=max_len, sequences=X_dev, padding="post", value=n_words - 1)
    y_dev = [[tag_map[w[2]] for w in s] for s in dev_sentences]
    y_dev = pad_sequences(maxlen=max_len, sequences=y_dev, padding="post", value=tag_map["O"])
    y_dev = [to_categorical(i, num_classes=9) for i in y_dev]

    test_sentences = read_train_file(TEST_PATH)
    test_max_len = 33
    X_test = [[word_map[w[0]] for w in s] for s in test_sentences]
    X_test = pad_sequences(maxlen=test_max_len, sequences=X_test, padding="post", value=n_words - 1)
    y_test = [[tag_map[w[2]] for w in s] for s in test_sentences]
    y_test = pad_sequences(maxlen=test_max_len, sequences=y_test, padding="post", value=tag_map["O"])

    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(n_tags, activation="relu"))(model)  # dense (ReLU) layer feeding the CRF below
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output

    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    history = model.fit(X, np.array(y), batch_size=32, epochs=5, verbose=1)
    model.save('simple_model.h5')
    return model, tag_map, X_test, y_test
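A minimal usage sketch, not part of the original, showing how run()'s return values could be inspected. Note that X_test above is padded to test_max_len=33 while the model's input length is max_len, so the sequences are re-padded here (with value 0 rather than the n_words - 1 used in training) before predicting; the inverted index-to-tag map is an assumption.

model, tag_map, X_test, y_test = run()
idx2tag = {idx: tag for tag, idx in tag_map.items()}  # invert the tag -> index map
X_test = pad_sequences(X_test, maxlen=model.input_shape[1], padding="post")  # match the model's input length
pred = np.argmax(model.predict(X_test), axis=-1)
print([idx2tag[t] for t in pred[0]])  # predicted tags for the first test sentence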
Example 4
def train():
    input = Input(shape=(input_max_len, ))
    model = Embedding(vocab_size,
                      embedding_size,
                      weights=[glove_embedding_matrix()],
                      input_length=input_max_len,
                      trainable=False)(input)
    model = Bidirectional(
        LSTM(embedding_size,
             dropout=dropout,
             recurrent_dropout=recurrent_dropout,
             return_sequences=True))(model)
    model = Bidirectional(
        LSTM(2 * embedding_size,
             dropout=dropout,
             recurrent_dropout=recurrent_dropout,
             return_sequences=True))(model)
    model = TimeDistributed(Dense(embedding_size, activation='sigmoid'))(model)
    model = Flatten()(model)
    model = Dense(input_max_len, activation='sigmoid')(model)
    out = model
    model = Model(input, out)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    history = model.fit(padded_features,
                        np.array(final_label_updated),
                        validation_split=validation_split,
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=logging,
                        shuffle=True)
    model.save(model_name)
    metrics(history, model)
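glove_embedding_matrix() is not defined in this fragment. A plausible sketch of such a helper, assuming a GloVe text file at glove_path and a word_index dict mapping tokens to row indices (both names are assumptions, not from the original):

import numpy as np

def glove_embedding_matrix():
    # Load GloVe vectors into a word -> vector dict.
    embeddings = {}
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            embeddings[parts[0]] = np.asarray(parts[1:], dtype="float32")
    # Fill a (vocab_size, embedding_size) matrix; rows for out-of-vocabulary words stay zero.
    matrix = np.zeros((vocab_size, embedding_size))
    for word, idx in word_index.items():
        vector = embeddings.get(word)
        if vector is not None and idx < vocab_size:
            matrix[idx] = vector
    return matrix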
              metrics=["accuracy"])
model.summary()

history = model.fit(X_train,
                    np.array(y_train),
                    batch_size=32,
                    epochs=1,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

loss, accuracy = model.evaluate(X_test, np.array(y_test))

# save model
print('saved model to ', args.output_model_path)
model.save(MODEL_FILE)
with file_io.FileIO(MODEL_FILE, mode='rb') as input_f:
    with file_io.FileIO(args.output_model_path + '/' + MODEL_FILE,
                        mode='wb+') as output_f:
        output_f.write(input_f.read())

# write out metrics
metrics = {
    'metrics': [{
        'name': 'accuracy-score',
        'numberValue': accuracy,
        'format': "PERCENTAGE",
    }]
}

with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
    json.dump(metrics, f)  # write the metrics dict where Kubeflow Pipelines expects it
Example 6
def get_prediction(sentence):
    test_sentence = tokenize(sentence)  # Tokenization
    # Preprocessing
    x_test_sent = pad_sequences(
        sequences=[[word2idx.get(w, 0) for w in test_sentence]],
        padding="post",
        value=word2idx["PAD"],
        maxlen=MAX_LEN)
    # Evaluation
    p = model.predict(np.array([x_test_sent[0]]))
    p = np.argmax(p, axis=-1)
    # Visualization
    print("{:15}||{}".format("Word", "Prediction"))
    print(30 * "=")
    for w, pred in zip(test_sentence, p[0]):
        print("{:15}: {:5}".format(w, idx2tag[pred]))


interact_manual(
    get_prediction,
    sentence=widgets.Textarea(placeholder='Next Monday is Christmas!'))

# Saving Vocab
with open('/path/to/save/word_to_index.pickle', 'wb') as handle:
    pickle.dump(word2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Saving tag map
with open('/path/to/save/tag_to_index.pickle', 'wb') as handle:
    pickle.dump(tag2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

model.save('/path/to/save/lstm_crf_weights')
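A minimal sketch, not part of the original, of the corresponding load path for the artifacts saved above (same placeholder paths; the keras-contrib CRF custom objects are an assumption based on the lstm_crf model name):

import pickle
from keras.models import load_model
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

with open('/path/to/save/word_to_index.pickle', 'rb') as handle:
    word2idx = pickle.load(handle)
with open('/path/to/save/tag_to_index.pickle', 'rb') as handle:
    tag2idx = pickle.load(handle)

# A model containing a keras-contrib CRF layer has to be reloaded with its custom objects.
model = load_model('/path/to/save/lstm_crf_weights',
                   custom_objects={'CRF': CRF, 'crf_loss': crf_loss,
                                   'crf_viterbi_accuracy': crf_viterbi_accuracy})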
Example 7
  monitor = EarlyStopping(monitor = "val_acc", min_delta = 0.0001,
                          patience = 3, verbose = 1, mode = "max");
  board = TensorBoard(log_dir = "log/{}".format(arguments.id));

  model.fit(([inputs, cues] if arguments.cues else inputs), outputs,
            validation_split = arguments.vs,
            batch_size = arguments.bs, epochs = arguments.epochs,
            callbacks = [monitor, board],
            verbose = 1);
  if arguments.debug:
    print("model.evaluate() on training: {}"
          "".format(model.evaluate(([inputs, cues] if arguments.cues
                                    else inputs),
                                   outputs, verbose = 1)));

  model.save(arguments.id + ".h5");

  #
  # in a few, rare circumstances, we allow ourselves to re-interpret variable
  # names, as is the case of .inputs. and .outputs. here: now turning our focus
  # to the evaluation data.
  #
  n = 0;
  unknown = 0;
  inputs = np.zeros((len(test), LENGTH), dtype = int);
  cues = np.zeros((len(test), LENGTH), dtype = int);
  golds = np.zeros((len(test), LENGTH,
                    len(classes) - (2 if arguments.cues else 0)),
                   dtype = int);
  for i, sentence in enumerate(test):
    n += len(sentence["nodes"]);
X = X.reshape((200000, 75))

print("Reshaping emb...")
embedding_matrix = embedding_matrix.reshape((VOCAB_DIM, EMBEDDING_DIM))

print("Reshaping Y...")
Y = Y.reshape((200000, max_len, n_tags))

input = Input(shape=(max_len, ))
model = Embedding(input_dim=VOCAB_DIM,
                  output_dim=EMBEDDING_DIM,
                  input_length=max_len,
                  weights=[embedding_matrix],
                  mask_zero=True,
                  trainable=True)(input)  # 100-dim embedding
model = Bidirectional(
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = Dense(n_tags, activation="softmax")(model)

model = Model(input, out)
model.compile(loss='categorical_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])
model.summary()

# Fit
model.fit(X, Y, batch_size=512, epochs=4, validation_split=0.1, verbose=1)

# Save
model.save('./models/model_lstm_100.h5')
Example 9
crf = CRF(len(labels))  # CRF layer
out = crf(model)  # output

model = Model(input, out)

if not os.path.isfile(model_name):
    model.compile(optimizer="rmsprop",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    history = model.fit(X_tr,
                        np.array(y_tr),
                        batch_size=32,
                        epochs=20,
                        validation_split=0.1,
                        verbose=1)
    model.save(model_name)
else:
    custom_objects = {
        'CRF': CRF,
        'crf_loss': crf_loss,
        'crf_viterbi_accuracy': crf_viterbi_accuracy
    }
    model = load_model(model_name, custom_objects=custom_objects)

# plot_model(model, to_file='lstm_crf.png')

# Evaluation
y_pred = model.predict(X_te)
y_pred = np.argmax(y_pred, axis=-1)
y_test_act = np.argmax(y_te, axis=-1)
dataset_size = train_data.shape[0]
batches_per_epoch = dataset_size / batch_size
lr_decay = (1. / (1 / 32) - 1) / batches_per_epoch
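# Background note (not from the original): with the legacy Keras time-based decay, the learning rate
# at batch t is lr0 / (1 + decay * t), so decay = (1/f - 1) / batches_per_epoch drives it down to
# f * lr0 after one epoch; the expression above corresponds to f = 1/32. Note that the compile call
# below passes decay=0.001 rather than this computed value.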
model.compile(optimizer=Adam(lr=0.016, decay=0.001),
              loss=crf.loss_function,
              metrics=[crf.accuracy])
model.summary()
history = model.fit(X_tr,
                    np.array(y_tr),
                    batch_size=batch_size,
                    epochs=ephochs,
                    validation_data=(X_v, np.array(y_v)),
                    verbose=1)

### Save Model
model.save(root_path + '/models/lstm/w2v_bilstm_crf_1.h5')

from keras.models import load_model
# model = load_model(root_path + '/models/lstm/w2v_bilstm_crf.h5')

# history.history is a dictionary; its keys are val_loss, val_acc, loss and acc
hist = pd.DataFrame(history.history)
fig = plt.figure(figsize=(12, 12))
# add subplots
sub_fig1 = fig.add_subplot(1, 2, 1)  # 1 row 2 cols 1st figure
sub_fig2 = fig.add_subplot(1, 2, 2)
# set titles
sub_fig1.set_title('Accuracy')
sub_fig2.set_title('Loss')
print(hist)
# set values and labels
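# Sketch, not from the original (the example is cut off here): one way to fill in the two subplots
# using the history keys listed above.
sub_fig1.plot(hist["acc"], label="acc")
sub_fig1.plot(hist["val_acc"], label="val_acc")
sub_fig1.legend()
sub_fig2.plot(hist["loss"], label="loss")
sub_fig2.plot(hist["val_loss"], label="val_loss")
sub_fig2.legend()
plt.show()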
Example 11
#split dataset
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

#building the layers of the neural network
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
model = Model(input, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5, validation_split=0.1, verbose=1)

filename = 'ner.sav'
model.save(filename)


#hist = pd.DataFrame(history.history)
#
#plt.figure(figsize=(12,12))
#plt.plot(hist["acc"])
#plt.plot(hist["val_acc"])
#plt.show()
#
##testing some predictions
##use this model to tag a new sentence
i = 2318
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
Example 12
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional


with open('ci_files/x_set', 'rb') as fp:
    X = pickle.load(fp)

with open('ci_files/y_set', 'rb') as fp:
    y = pickle.load(fp)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

input_layer = Input(shape=(56,))
model = Embedding(input_dim=26302, output_dim=56, input_length=56)(input_layer)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(17, activation="softmax"))(model)

model = Model(input_layer, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.fit(X_train, np.array(y_train), batch_size=32, epochs=1, validation_split=0.2, verbose=1)

model.save("model.pt")
def train_eval(data_path, model_name, option='simple', emb_path=None):
    """Train a model with the data in path.
    Save it (and the formatting) as model_name.
    If option is 'emb', emb_path is the path to the embedding to be used.
    """

    # get the data
    try:
        X_train, y_train = get_data(data_path + '/train')
        X_val, y_val = get_data(data_path + '/val')
        X_test, y_test = get_data(data_path + '/test')
    except:
        raise Exception("Some data file does not exist")

    # preprocess the texts
    for X in [X_train, X_val, X_test]:
        preprocess_text(X)

    # Keras needs the sequences to be numerical and padded, as well as the labels
    # We will need all the words and labels for this

    words = list(set([w for sent in X_train + X_val + X_test for w in sent]))
    labels = list(set([l for sent in y_train for l in sent]))

    words.append('--PAD--')
    # labels.append('--PAD--')

    n_labels = len(labels)
    n_words = len(words)

    words2num = {word: i for i, word in enumerate(words)}
    labels2num = {label: i for i, label in enumerate(labels)}

    # a trick for NER...
    if 'O' in labels2num:
        labels2num['--PAD--'] = labels2num['O']
    else:
        labels2num['--PAD--'] = len(labels)  # give the padding label a fresh index

    [X_train_num, X_val_num, X_test_num
     ] = [process_sequences(X, words2num) for X in [X_train, X_val, X_test]]

    [y_train_num, y_val_num, y_test_num
     ] = [process_sequences(y, labels2num) for y in [y_train, y_val, y_test]]
    [y_train_num, y_val_num,
     y_test_num] = [[to_categorical(i, num_classes=n_labels) for i in y]
                    for y in [y_train_num, y_val_num, y_test_num]]

    if option == 'emb':
        try:
            emb_dict = KeyedVectors.load(emb_path)

        except:
            raise Exception("Embedding file does not exist")

        emb_matrix = np.zeros((len(words), emb_dict.vector_size))

        for i, w in enumerate(words):
            # Build a matrix for the indexes with the vector values of corresponding words
            # If the word does not exist in the embedding, keep zeros
            if w in emb_dict:
                emb_matrix[i] = emb_dict[w]

    # We build a Bidirectional LSTM
    input = Input(shape=(None, ))
    if option == 'emb':
        model = Embedding(input_dim=n_words,
                          output_dim=emb_dict.vector_size,
                          weights=[emb_matrix])(input)
    else:
        model = Embedding(input_dim=n_words, output_dim=50)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(
        LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(model)
    if option == 'crf':
        crf = CRF(n_labels)  # CRF output layer, applied to the BiLSTM features
        out = crf(model)
        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss=crf.loss_function,
                      metrics=[crf.accuracy])
    else:
        out = TimeDistributed(Dense(n_labels, activation="softmax"))(
            model)  # TimeDistributed keeps the outputs for each sequence separated
        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss="categorical_crossentropy",
                      metrics=["accuracy"])

    # Fit the model using the validation data
    model.fit(X_train_num,
              np.array(y_train_num),
              batch_size=32,
              epochs=5,
              validation_data=(X_val_num, np.array(y_val_num)),
              verbose=1)

    # Save the model
    model.save('{}.hdf5'.format(model_name), overwrite=True)
    formatter = {
        'labels': labels,
        'words': words,
        'words2num': words2num,
        'labels2num': labels2num
    }
    with open('{}-preproc.json'.format(model_name), 'w+') as f:
        json.dump(formatter, f)

    # Evaluate the model on the test data
    predictions = model.predict(X_test_num)
    results = model.evaluate(X_test_num, np.array(y_test_num))

    print("Overall results for the predictions: {}".format(results))

    # These values are not very informative because of class imbalance
    # Make a better evaluation
    predictions = np.argmax(predictions, axis=-1)
    predictions = [[labels[i] for i in pred] for pred in predictions]
    evaluate(y_test, predictions, labels)

    return (predictions)
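process_sequences() is referenced above but not shown in this fragment. A plausible sketch of such a helper, assuming it maps each token (or label) to its index and post-pads every sequence with the '--PAD--' index (the fixed maxlen default is an assumption):

from keras.preprocessing.sequence import pad_sequences

def process_sequences(sequences, item2num, maxlen=75):
    # Map tokens/labels to indices, falling back to the padding index for unseen items.
    numeric = [[item2num.get(item, item2num['--PAD--']) for item in seq] for seq in sequences]
    # Post-pad every sequence to a common length with the padding index.
    return pad_sequences(numeric, maxlen=maxlen, padding='post', value=item2num['--PAD--'])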
Example 14
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

word_to_index_path = sys.argv[4] + '.pickle'
with open(word_to_index_path, 'wb') as file:
    pickle.dump(word_to_index, file)

tag_to_index_path = sys.argv[5] + '.pickle'
with open(tag_to_index_path, 'wb') as file:
    pickle.dump(tag_to_index, file)

path = sys.argv[3] + '.sav'
model.save(path)

# Evaluation
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Convert the index to tag
y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true]

print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
report = flat_classification_report(y_pred=y_pred,
                                    y_true=y_test_true,
                                    labels=tags)
print(report)
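The metric helpers used above are not imported in this fragment; they most likely come from seqeval and sklearn_crfsuite (an assumption):

from seqeval.metrics import f1_score
from sklearn_crfsuite.metrics import flat_classification_report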
##Training model
model.compile(optimizer='adam',
              loss=crf.loss_function,
              metrics=[crf.accuracy, 'accuracy'])
model.summary()
# filepath="Model Version/ner_{val_accuracy:.2f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
# callbacks_list = [checkpoint]
# history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=3, validation_split=0.05, verbose=1, callbacks=callbacks_list)
history = model.fit(X_train,
                    np.array(y_train),
                    batch_size=512,
                    epochs=3,
                    validation_split=0.05,
                    verbose=1)
model.save("Model Version/ner_kw.hdf5")
plot_history(history)

# #Loading model
# model = k.models.load_model("Model Version/ner_kw.hdf5", custom_objects={'CRF': crf, 'crf_loss': crf.loss_function, 'crf_viterbi_accuracy': crf.accuracy})
# print("Loaded model from disk")
# model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])

i = 0
pred_sing = model.predict(np.array([X_test[i]]))
pred_sing = np.argmax(pred_sing, axis=-1)
gt = np.argmax(y_test[i], axis=-1)
print('\ngt', gt)
print("\n{:14}: ({:5}): {}".format("Word", "True", "Pred"))
for idx, (w, pred) in enumerate(zip(X_test[i], pred_sing[0])):
    print("{:14}: ({:5}): {}".format(words[w], idx2tag[gt[idx]], tags[pred]))
Example 16
# Save the model (weights)
# save_all_weights | load_all_weights: saves model and optimizer weights (save_weights and save)
model.save_weights("pretrained_models\\fulltext_model_weights.h5")  # sentences_model_weights.h5
'''
# `assert_consumed` can be used as validation that all variable values have been restored from the checkpoint. 
# See `tf.train.Checkpoint.restore` for other methods in the Status object.
print(load_status.assert_consumed())

# Check that all of the pretrained weights have been loaded.
for a, b in zip(pretrained.weights, model.weights):
    np.testing.assert_allclose(a.numpy(), b.numpy())
'''

# Save the model (architecture, loss, metrics, optimizer state, weights)
model.save('pretrained_models\\fulltext_bi_lstm_crf_dense_linear.h5')  # sentences_bi_lstm_crf_dense_linear.h5
'''
# Load the model
from keras.models import load_model
model = load_model('pretrained_models\\fulltext_bi_lstm_crf_dense_linear.h5', custom_objects={'CRF': CRF(number_labels), 'num_classes': number_labels})  #  , 'loss': crf.loss, 'metrics': [crf.accuracy]
'''

# ======================================================================================================================
# Count the total running time
# ======================================================================================================================

total_time = str(timedelta(seconds=(time.time() - start_time)))
print("\n--- %s running time ---" % total_time)

# ======================================================================================================================
# Track model loss per epoch