def train(self):
    # token-index input, padded to a fixed length of 120
    input = Input(shape=(120,))
    model = Embedding(input_dim=self.num_words, output_dim=50, input_length=120)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    # per-timestep softmax over the entity tag set
    out = TimeDistributed(Dense(self.num_entities, activation="softmax"))(model)

    model = Model(input, out)

    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(x=self.X_train, y=np.array(self.Y_train), batch_size=64, epochs=10,
                        validation_data=(self.X_validation, self.Y_validation))

    # the microsecond suffix only disambiguates saves within the same second
    model.save("../models/ner_" + str(datetime.utcnow().microsecond))

    test_eval = model.evaluate(self.X_test, self.Y_test, verbose=0)
    print('Test loss:', test_eval[0])
    print('Test accuracy:', test_eval[1])

    return model, history
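# For context, a minimal sketch of the inputs train() expects; the names
# `sentences`, `tags`, `word2idx`, and `tag2idx` are illustrative, not from the
# original. X_* are word-index sequences padded to length 120; Y_* are one-hot
# tag sequences of the same length.
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X = pad_sequences([[word2idx[w] for w in s] for s in sentences],
                  maxlen=120, padding="post", value=word2idx["--PAD--"])
y = pad_sequences([[tag2idx[t] for t in s] for s in tags],
                  maxlen=120, padding="post", value=tag2idx["O"])
y = to_categorical(y, num_classes=num_entities)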
Example 2
def hyperopt_train_test(params):

    epsilon = 10**params['epsilon_exp']
    optimizer = optimizers.Adam(lr=params['learning_rate'], epsilon=epsilon)

    if dmc_parameters["use_embedding_layer"]:
        input = Input(shape=(dmc_parameters["max_seq_len"], ))
        model = Embedding(input_dim=dmc_parameters["one_hot_vector_len"],
                          output_dim=params['embedding_layer_output'],
                          input_length=dmc_parameters["max_seq_len"])(input)
        model = Dropout(rate=params['embedding_dropout'])(model)
    else:
        input = Input(shape=(dmc_parameters["max_seq_len"],
                             dmc_parameters["one_hot_vector_len"]))
        model = input
    if params['bi_lstm1_units'] > 0:
        model = Bidirectional(
            CuDNNLSTM(units=params['bi_lstm1_units'],
                      return_sequences=True))(model)
    if params['bi_lstm2_units'] > 0:
        model = Bidirectional(
            CuDNNLSTM(units=params['bi_lstm2_units'],
                      return_sequences=True))(model)
    if dmc_parameters["use_crf_layer"]:
        crf = CRF(dmc_parameters["num_tags"])  # CRF layer
        out = crf(model)  # output
        model = Model(input, out)
        model.compile(optimizer=optimizer,
                      loss=losses.crf_loss,
                      metrics=[metrics.crf_accuracy,
                               avg_proximity_metric()])
    else:
        out = TimeDistributed(
            Dense(dmc_parameters["num_tags"], activation="softmax"))(model)
        model = Model(input, out)
        model.compile(optimizer=optimizer,
                      loss="categorical_crossentropy",
                      metrics=["accuracy", avg_proximity_metric()])
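    # CRF, crf_loss, and crf_accuracy above follow the keras_contrib API;
    # the snippet's imports are not shown, so this is an assumption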
    model.summary()
    es = EarlyStopping(monitor='val_loss',
                       min_delta=0,
                       patience=dmc_parameters["patience"],
                       verbose=False,
                       mode='min',
                       restore_best_weights=True)
    history = model.fit(X_tr,
                        np.array(y_tr),
                        batch_size=dmc_parameters['batch_size'],
                        epochs=dmc_parameters["epochs"],
                        validation_data=(X_vl, np.array(y_vl)),
                        verbose=False,
                        shuffle=True,
                        callbacks=[es])
    loss, acc, prox = model.evaluate(x=X_vl,
                                     y=np.array(y_vl),
                                     batch_size=dmc_parameters['batch_size'],
                                     verbose=False)
    validation_labels = deepMirCut.pred2label(y_vl, dmc_parameters)
    validation_pred = model.predict(X_vl, verbose=False)
    pred_labels = deepMirCut.pred2label(validation_pred, dmc_parameters)
    fScore = f1_score(validation_labels, pred_labels)
    return loss, acc, prox, fScore
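# A minimal sketch of wiring the objective above into hyperopt's fmin; the
# search-space bounds are illustrative, not from the original:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-4), np.log(1e-2)),
    'epsilon_exp': hp.quniform('epsilon_exp', -10, -4, 1),
    'embedding_layer_output': hp.choice('embedding_layer_output', [32, 64, 128]),
    'embedding_dropout': hp.uniform('embedding_dropout', 0.0, 0.5),
    'bi_lstm1_units': hp.choice('bi_lstm1_units', [0, 64, 128]),
    'bi_lstm2_units': hp.choice('bi_lstm2_units', [0, 64, 128]),
}

def objective(params):
    loss, acc, prox, fScore = hyperopt_train_test(params)
    return {'loss': -fScore, 'status': STATUS_OK}  # fmin minimizes, so negate F1

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)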
Example 3
# `num_tags` is assumed to be the size of the tag set
out = TimeDistributed(Dense(num_tags, activation="softmax"))(model)  # softmax output layer
model = Model(model_input, out)
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

history = model.fit(X_train,
                    np.array(y_train),
                    batch_size=32,
                    epochs=1,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

loss, accuracy = model.evaluate(X_test, np.array(y_test))

# save the model locally, then copy it to the output path
model.save(MODEL_FILE)
print('saved model to', args.output_model_path)
with file_io.FileIO(MODEL_FILE, mode='rb') as input_f:
    with file_io.FileIO(args.output_model_path + '/' + MODEL_FILE,
                        mode='wb+') as output_f:
        output_f.write(input_f.read())

# write out metrics
metrics = {
    'metrics': [{
        'name': 'accuracy-score',
        'numberValue': accuracy,
        'format': "PERCENTAGE",
    }]
}
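# The dict above follows the Kubeflow Pipelines metrics schema; it is normally
# serialized to /mlpipeline-metrics.json so the pipeline UI can display it
# (the path is the conventional one, not taken from the original):
import json
with file_io.FileIO('/mlpipeline-metrics.json', mode='w') as f:
    json.dump(metrics, f)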
Example 4
                metrics = ["accuracy"]);
  print(model.summary());

  monitor = EarlyStopping(monitor = "val_acc", min_delta = 0.0001,
                          patience = 3, verbose = 1, mode = "max");
  board = TensorBoard(log_dir = "log/{}".format(arguments.id));

  model.fit(([inputs, cues] if arguments.cues else inputs), outputs,
            validation_split = arguments.vs,
            batch_size = arguments.bs, epochs = arguments.epochs,
            callbacks = [monitor, board],
            verbose = 1);
  if arguments.debug:
    print("model.evaluate() on training: {}"
          "".format(model.evaluate(([inputs, cues] if arguments.cues
                                    else inputs),
                                   outputs, verbose = 1)));

  model.save(arguments.id + ".h5");

  #
  # in a few, rare circumstances, we allow ourselves to re-interpret variable
  # names, as is the case of 'inputs' and 'outputs' here: now turning our focus
  # to the evaluation data.
  #
  n = 0;
  unknown = 0;
  inputs = np.zeros((len(test), LENGTH), dtype = int);
  cues = np.zeros((len(test), LENGTH), dtype = int);
  golds = np.zeros((len(test), LENGTH,
                    len(classes) - (2 if arguments.cues else 0)),
                   dtype = int);
Example 5
# the input layer and Embedding dims are assumed, inferred from
# embedding_matrix and MAX_SEQUENCE_LENGTH
input = Input(shape=(MAX_SEQUENCE_LENGTH,))
model = Embedding(input_dim=embedding_matrix.shape[0],
                  output_dim=embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  input_length=MAX_SEQUENCE_LENGTH)(input)
model = Bidirectional(LSTM(100, return_sequences=True, dropout=0.50),
                      merge_mode='concat')(model)
model = TimeDistributed(Dense(100, activation='relu'))(model)
model = Flatten()(model)
model = Dense(100, activation='relu')(model)
output = Dense(27, activation='softmax')(model)
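# note: Flatten() collapses the per-timestep features, so this network performs
# sentence-level classification over 27 classes rather than per-token tagging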
model = Model(input, output)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


model.fit(X_train, Y_train, validation_split=0.15, epochs=20, verbose=2)


# evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=2)
print('Accuracy: %f' % (accuracy * 100))


from sklearn.metrics import classification_report, confusion_matrix
Y_pred = model.predict(X_test)
y_pred = np.argmax(Y_pred, axis=1)
# Y_test is one-hot; convert it to class indices before comparing
y_true = np.argmax(Y_test, axis=1)
print('  Classification Report:\n', classification_report(y_true, y_pred),
      '\n')
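# confusion_matrix is imported above but unused; a natural follow-up:
print('  Confusion Matrix:\n', confusion_matrix(y_true, y_pred))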
Example 6
# pred2label converts per-timestep softmax output back to tag strings via
# idx2tag; the function head is assumed from the surviving inner-loop tail
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
    return out


# evaluate the model
#test_pred=model.predict(X_test)

# from sklearn.metrics import classification_report

# y_pred = model.predict(X_test, batch_size=64, verbose=1)
# y_pred_bool = np.argmax(y_pred, axis=1)
# print(y_pred)
#print(classification_report(y_test, y_pred_bool))
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test,
                                                             y_test,
                                                             verbose=0)
print("loss:%f accuracy:%f f1_score:%f precision:%f recall:%f" %
      (loss, accuracy, f1_score, precision, recall))
#pred_labels = pred2label(test_pred)
#test_labels = pred2label(y_test)
#print(pred_labels)
#print(classification_report(test_labels,pred_labels))
# f1,pr,rec=0,0,0
# for i in range(len(test_labels)):
#     pr+=precision_score(test_labels[i],pred_labels[i],average='micro')
#     f1+=f1_score(test_labels[i],pred_labels[i],average='micro')
#     rec+=recall_score(test_labels[i],pred_labels[i],average='micro')
# pr/=len(test_labels)
# rec/=len(test_labels)
# f1/=len(test_labels)
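# `hist`, `sub_fig1`, and `sub_fig2` are not defined in this excerpt; a minimal
# setup consistent with how they are used below, assuming `history` is the
# return value of an earlier model.fit call:
import matplotlib.pyplot as plt
import pandas as pd

hist = pd.DataFrame(history.history)
fig, (sub_fig1, sub_fig2) = plt.subplots(1, 2, figsize=(12, 4))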
# set titles
sub_fig1.set_title('Accuracy')
sub_fig2.set_title('Loss')
print(hist)
# set values and labels
sub_fig1.plot(hist["crf_viterbi_accuracy"], label='acc')
sub_fig1.plot(hist["val_crf_viterbi_accuracy"], label='val_acc')
sub_fig1.legend(loc="lower right")
sub_fig2.plot(hist["loss"], label='loss')
sub_fig2.plot(hist["val_loss"], label='val_loss')
sub_fig2.legend(loc="upper right")
plt.xlabel('epoch')
# show figure
plt.show()

score = model.evaluate(X_te, np.array(y_te), batch_size=batch_size, verbose=1)
print(model.metrics_names)
print("Score:")
print(score)

# ## Prediction on test set
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# print("Input:")
# print(X_te[0])
# print("Supposed output:")
# print(y_te)
# print(np.array(y_te))
test_pred = model.predict([X_te], verbose=1)
# print("Prediction result:")
# print(test_pred[0])
idx2tag = {i: w for w, i in tags2idx.items()}
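# Completing the prediction-on-test-set section: convert predictions and gold
# labels back to tag strings and score them with seqeval (a sketch; the
# original stops after building idx2tag):
pred_labels = [[idx2tag[np.argmax(p)] for p in sent] for sent in test_pred]
true_labels = [[idx2tag[np.argmax(t)] for t in sent] for sent in np.array(y_te)]
print("F1-score: {:.1%}".format(f1_score(true_labels, pred_labels)))
print(classification_report(true_labels, pred_labels))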
Example 7
def train_eval(data_path, model_name, option='simple', emb_path=None):
    """Train a model with the data in path.
    Save it (and the formatting) as model_name.
    If option is 'emb', emb_path is the path to the embedding to be used.
    """

    # get the data
    try:
        X_train, y_train = get_data(data_path + '/train')
        X_val, y_val = get_data(data_path + '/val')
        X_test, y_test = get_data(data_path + '/test')
    except Exception as e:
        raise Exception("Some data file does not exist") from e

    # preprocess the texts
    for X in [X_train, X_val, X_test]:
        preprocess_text(X)

    # Keras needs the sequences to be numerical and padded, as well as the labels
    # We will need all the words and labels for this

    words = list(set([w for sent in X_train + X_val + X_test for w in sent]))
    labels = list(set([l for sent in y_train for l in sent]))

    words.append('--PAD--')
    # labels.append('--PAD--')

    n_labels = len(labels)
    n_words = len(words)

    words2num = {word: i for i, word in enumerate(words)}
    labels2num = {label: i for i, label in enumerate(labels)}

    # a trick for NER: map padding to the 'O' tag if it exists,
    # otherwise give padding its own new index
    if 'O' in labels2num:
        labels2num['--PAD--'] = labels2num['O']
    else:
        labels2num['--PAD--'] = len(labels)

    [X_train_num, X_val_num, X_test_num
     ] = [process_sequences(X, words2num) for X in [X_train, X_val, X_test]]

    [y_train_num, y_val_num, y_test_num
     ] = [process_sequences(y, labels2num) for y in [y_train, y_val, y_test]]
    [y_train_num, y_val_num,
     y_test_num] = [[to_categorical(i, num_classes=n_labels) for i in y]
                    for y in [y_train_num, y_val_num, y_test_num]]

    if option == 'emb':
        try:
            emb_dict = KeyedVectors.load(emb_path)

        except Exception as e:
            raise Exception("Embedding file does not exist") from e

        emb_matrix = np.zeros((len(words), emb_dict.vector_size))

        for i, w in enumerate(words):
            # Build a matrix for the indexes with the vector values of corresponding words
            # If the word does not exist in the embedding, keep zeros
            if w in emb_dict:
                emb_matrix[i] = emb_dict[w]

    # We build a Bidirectional LSTM
    input = Input(shape=(None, ))
    if option == 'emb':
        model = Embedding(input_dim=n_words,
                          output_dim=emb_dict.vector_size,
                          weights=[emb_matrix])(input)
    else:
        model = Embedding(input_dim=n_words, output_dim=50)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(
        LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(model)
    if option == 'crf':
        crf = CRF(n_labels)  # CRF layer on top of the BiLSTM features
        out = crf(model)
        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss=crf.loss_function,
                      metrics=[crf.accuracy])
    else:
        # TimeDistributed keeps the output for each timestep separated
        out = TimeDistributed(Dense(n_labels, activation="softmax"))(model)
        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss="categorical_crossentropy",
                      metrics=["accuracy"])

    # Fit the model using the validation data
    model.fit(X_train_num,
              np.array(y_train_num),
              batch_size=32,
              epochs=5,
              validation_data=(X_val_num, np.array(y_val_num)),
              verbose=1)

    # Save the model
    model.save('{}.hdf5'.format(model_name), overwrite=True)
    formatter = {
        'labels': labels,
        'words': words,
        'words2num': words2num,
        'labels2num': labels2num
    }
    with open('{}-preproc.json'.format(model_name), 'w+') as f:
        json.dump(formatter, f)

    # Evaluate the model on the test data
    predictions = model.predict(X_test_num)
    results = model.evaluate(X_test_num, np.array(y_test_num))

    print("Overall results for the predictions: {}".format(results))

    # These values are not very informative because of class imbalance,
    # so run a better, per-label evaluation
    predictions = np.argmax(predictions, axis=-1)
    predictions = [[labels[i] for i in pred] for pred in predictions]
    evaluate(y_test, predictions, labels)

    return predictions
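# `process_sequences` is not shown above; a minimal sketch consistent with its
# use (map tokens to indices, pad to a common length; an assumption, not the
# original implementation):
from keras.preprocessing.sequence import pad_sequences

def process_sequences(sequences, item2num, maxlen=None):
    num = [[item2num.get(item, item2num['--PAD--']) for item in seq]
           for seq in sequences]
    return pad_sequences(num, maxlen=maxlen, padding='post',
                         value=item2num['--PAD--'])

# Example invocations; paths and model names are illustrative:
predictions = train_eval('data/ner', 'bilstm-ner', option='simple')
predictions = train_eval('data/ner', 'bilstm-ner-emb', option='emb',
                         emb_path='embeddings/word2vec.kv')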