Example no. 1
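# The imports below are not part of the original snippet; this is the set the
# code appears to assume (Keras 2.x plus the keras-contrib CRF extension):
import numpy as np
from keras.models import Model
from keras.layers import LSTM, Input, Embedding, Dense, TimeDistributed, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
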
def bilstm_crf_predict():

    # Re-initialize the model and build its configuration, same as in the training step
    input = Input(shape=(max_len, ))
    model = Embedding(input_dim=n_words + 1,
                      output_dim=20,
                      input_length=max_len,
                      mask_zero=True)(input)  # 20-dim embedding
    model = Bidirectional(
        LSTM(units=50, return_sequences=True,
             recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(
        model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)

    # Restore the saved weights
    save_load_utils.load_all_weights(model, filepath="result/bilstm-crf.h5")

    p = model.predict(np.array([x_test_sent[0]]))
    p = np.argmax(p, axis=-1)
    print("{:15}||{}".format("Word", "Prediction"))
    print(30 * "=")
    for w, pred in zip(test_sentence, p[0]):
        print("{:15}: {:5}".format(w, tags[pred]))
Example no. 2

    def cross_validate(self, X, y):
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
        input = Input(shape=(self.max_len, ))
        model = Embedding(input_dim=self.n_words,
                          output_dim=50,
                          input_length=self.max_len)(input)
        model = Dropout(0.1)(model)
        model = Bidirectional(
            LSTM(units=100, return_sequences=True,
                 recurrent_dropout=0.1))(model)
        out = TimeDistributed(Dense(self.n_labels, activation="softmax"))(
            model)  # softmax output layer

        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss="categorical_crossentropy",
                      metrics=["accuracy"])
        history = model.fit(X_tr,
                            np.array(y_tr),
                            batch_size=32,
                            epochs=1,
                            validation_split=0.1,
                            verbose=1)

        p = model.predict(np.array([X_te[10]]))
        p = np.argmax(p, axis=-1)

        for w, pred in zip(X_te[10], p[0]):
            if self.words[w] != 'PADGARBAGE':
                print("{:15}: {}".format(self.words[w], self.labels[pred]))
Example no. 3
def run(X_train,
        Y_train,
        X_val,
        Y_val,
        embedding_matrix,
        vocab_size,
        maxlen=40,
        emb_dim=300,
        neg_ratio=0,
        hidden_dim=300,
        drop=0.2,
        r_drop=0.1):
    ## build model
    input = Input(shape=(maxlen, ))
    model = Embedding(vocab_size,
                      emb_dim,
                      weights=[embedding_matrix],
                      input_length=maxlen,
                      trainable=False)(input)
    model = Dropout(drop)(model)
    model = Bidirectional(
        LSTM(hidden_dim, return_sequences=True,
             recurrent_dropout=r_drop))(model)
    model = Dropout(drop)(model)
    out = TimeDistributed(Dense(1, activation='sigmoid'))(model)

    model = Model(input, out)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
    history = model.fit(X_train,
                        Y_train,
                        batch_size=64,
                        epochs=10,
                        validation_data=(X_val, Y_val),
                        callbacks=earlyStop)

    pred = model.predict(X_val)
    Y_pred = np.squeeze(pred)
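    # NOTE: `threshold` (below) and `out_dir` (further down) are not defined
    # in this snippet; the original module presumably defines them at global
    # scope.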
    test = [[1 if y >= threshold else 0 for y in x] for x in Y_pred]
    test_arr = np.asarray(test)
    test_arr = np.reshape(test_arr, (-1))
    target = np.reshape(Y_val, (-1))

    print(
        metrics.precision_recall_fscore_support(target,
                                                test_arr,
                                                average=None,
                                                labels=[0, 1]))

    #     Y_pred_ = [[1 if y>=threshold else 0 for y in x] for x in Y_pred]
    Y_val_ = np.squeeze(Y_val)

    print("Evaluate: dev seg exact")
    pred_out_dir = out_dir + 'seg_' + str(neg_ratio) + 'neg'
    gold_dir = '../../data/val_segs/' + 'seg_' + str(neg_ratio) + 'neg'
    p, r, f = seg_exact_match(test, Y_val_, pred_out_dir, gold_dir)

    return model, history, p, r, f
Example no. 4
def run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, maxlen=40, emb_dim=300, neg_ratio=0, hidden_dim=300, drop=0.2, r_drop=0.1):
    ## build model
#     input = Input(shape=(maxlen,))
#     model = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False)(input)
#     model = Dropout(drop)(model)
#     model = Bidirectional(LSTM(hidden_dim, return_sequences=True, recurrent_dropout=r_drop))(model)
#     model = Dropout(drop)(model)
#     out = TimeDistributed(Dense(1, activation='sigmoid'))(model)
    input = Input(shape=(maxlen,))
    model = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False)(input)
    model = Bidirectional(LSTM(hidden_dim, return_sequences=True, recurrent_dropout=r_drop))(model)
    model = TimeDistributed(Dense(hidden_dim//4, activation='relu'))(model)
    model = TimeDistributed(Dropout(drop))(model)
    ## use CRF instead of Dense
    crf = CRF(2)
    out = crf(model)

    model = Model(input, out)
    
    Y_train_2 = keras.utils.to_categorical(Y_train)
    Y_val_2 = keras.utils.to_categorical(Y_val)
    
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy]) 
    earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
    history = model.fit(X_train, Y_train_2, batch_size=64, epochs=10, 
                       validation_data=(X_val, Y_val_2), callbacks=earlyStop)


    preds = model.predict(X_val)
    test = [[np.argmax(y) for y in x] for x in preds]
    test_arr = np.asarray(test)
    test_arr = np.reshape(test_arr, (-1))

    print (metrics.precision_recall_fscore_support(np.reshape(Y_val,(-1)), test_arr, average=None,
                                              labels=[0, 1]))

    
#     Y_pred_ = [[1 if y>=threshold else 0 for y in x] for x in Y_pred]
    Y_val_ = np.squeeze(Y_val)

    print ("Evaluate: dev seg exact")
    pred_out_dir = out_dir+'seg_'+str(neg_ratio)+'neg'
    gold_dir = '../../data/val_segs/'+'seg_'+str(neg_ratio)+'neg'
    p, r, f = seg_exact_match(test, Y_val_, pred_out_dir, gold_dir)
    
    return model, history, p, r, f
Example no. 5
# #
# save_load_utils.save_all_weights(model, 'lstm_crf.model', include_optimizer=False)
#
# hist = pd.DataFrame(history.history)
#
#
# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()


save_load_utils.load_all_weights(model, 'lstm_crf.model')

test_pred = model.predict(X_te, verbose=2)

idx2tag = {i: w for w, i in tag2idx.items()}
# print(idx2tag)
print(test_pred)


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
Example no. 6

# (truncated model.fit(...) call; only its trailing arguments survived)
#                     verbose=1)  #, callbacks = [tbCallBack])

hist = pd.DataFrame(history.history)
hist
plt.style.use('ggplot')
plt.figure(figsize=(12, 12))
plt.plot(hist['crf_viterbi_accuracy'])
plt.plot(hist['val_crf_viterbi_accuracy'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

# pip install seqeval

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

test_pred = model.predict(X_test, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out

Example no. 7
def create_model(x_train, y_train, x_test, y_test):
    args = parse_args()
    set_logger(args.log_path, args.log_level)
    logging.debug('Args:')
    logging.debug(args)
    lang = construct_languages(args.train)
    assert len(lang) == 1
    lang = lang[0]
    game = initialize_game(train_file=lang.train,
                           test_file=lang.test,
                           dev_file=lang.dev,
                           emb_file=lang.emb,
                           budget=args.budget,
                           max_seq_len=args.max_seq_len,
                           max_vocab_size=args.max_vocab_size,
                           emb_size=args.embedding_size,
                           model_name=args.model_name)
    max_len = args.max_seq_len
    input_dim = args.max_vocab_size
    output_dim = args.embedding_size
    embedding_matrix = game.w2v
    logging.debug('building Keras model...')
    input = Input(shape=(max_len, ))
    model = Embedding(input_dim=input_dim,
                      output_dim=output_dim,
                      input_length=max_len,
                      weights=[embedding_matrix],
                      trainable=False)(input)
    model = Dropout(0.1)(model)
    n_units = 128
    model = Bidirectional(
        LSTM(units=n_units, return_sequences=True,
             recurrent_dropout=0.1))(model)
    n_tags = 5
    out = TimeDistributed(Dense(n_tags, activation='softmax'))(model)
    model = Model(input, out)
    logging.debug('Model type: ')
    logging.debug(type(model))
    logging.debug('Model summary: ')
    logging.debug(model.summary())
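    # NOTE: the {{choice([...])}} markup below is hyperas template syntax,
    # substituted during hyperparameter search; this function is meant to be
    # handed to hyperas's optim.minimize rather than called directly.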
    rmsprop = keras.optimizers.RMSprop(lr={{choice([0.0001])}})
    model.compile(optimizer=rmsprop,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    logging.debug('done building model...')
    logging.debug('starting training...')
    num_train_examples = len(x_train)
    for i in range(1, num_train_examples + 1):  # start at 1: fitting on an empty slice would fail
        print('i: ', i)
        model.fit(x_train[:i],
                  y_train[:i],
                  batch_size=200,
                  epochs=20,
                  verbose=0)
    logging.debug('done training...')
    logging.debug('starting testing...')
    num_samples = x_test.shape[0]
    logging.debug('Number of samples: {}'.format(num_samples))
    max_batch_size = 4096
    batch_size = min(num_samples, max_batch_size)
    predictions_probability = model.predict(x_test, batch_size=batch_size)
    predictions = numpy.argmax(predictions_probability, axis=-1)
    fscore = compute_fscore(Y_pred=predictions, Y_true=y_test)
    logging.debug('done testing...')
    return -fscore
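# `compute_fscore` is imported from elsewhere in the original project; a
# plausible stand-in (an assumption: flat integer-tag arrays, macro average):
def compute_fscore(Y_pred, Y_true):
    from sklearn.metrics import f1_score
    return f1_score(numpy.reshape(Y_true, (-1)),
                    numpy.reshape(Y_pred, (-1)),
                    average='macro')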
Example no. 8
bilstm_model.predict(np.zeros((1, 50)))


input = Input(shape=(max_len,))
bilstm_crf_model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
bilstm_crf_model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(bilstm_crf_model)  # variational biLSTM
bilstm_crf_model = TimeDistributed(Dense(50, activation="relu"))(bilstm_crf_model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(bilstm_crf_model)  # output
bilstm_crf_model = Model(input, out)
save_load_utils.load_all_weights(bilstm_crf_model, filepath="result/bilstm-crf.h5")


bilstm_crf_model.predict(np.zeros((1, 50)))
print('test done.')

# Test data
def build_input(test_sentence):
    test_sentence = test_sentence.split(" ")
    x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                                padding="post", value=0, maxlen=max_len)
    return test_sentence, x_test_sent


def bilstm_predict(model, test_sentence, x_test_sent):
    pred = model.predict(np.array([x_test_sent[0]]))
    pred = np.argmax(pred, axis=-1)

    temp = []
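    # (the snippet is truncated here; a plausible completion, following the
    # printing pattern of Example no. 1, would collect (word, tag) pairs:)
    # for w, p in zip(test_sentence, pred[0]):
    #     temp.append((w, tags[p]))
    # return temp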
Example no. 9
def bilstm_crf(train_loc, test_loc):
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_words = len(words_all)
    n_tags = len(tags_all)

    max_len = 130
    word2idx = {w: i for i, w in enumerate(words_all)}
    tag2idx = {t: i for i, t in enumerate(tags_all)}

    X = [[word2idx[w[0]] for w in s] for s in cc_train]
    X = pad_sequences(maxlen=max_len,
                      sequences=X,
                      padding="post",
                      value=n_words - 1)
    X1 = [[word2idx[w[0]] for w in s] for s in cc_test]
    X1 = pad_sequences(maxlen=max_len,
                       sequences=X1,
                       padding="post",
                       value=n_words - 1)
    y = [[tag2idx[w[1]] for w in s] for s in cc_train]
    y = pad_sequences(maxlen=max_len,
                      sequences=y,
                      padding="post",
                      value=tag2idx["O"])
    y1 = [[tag2idx[w[1]] for w in s] for s in cc_test]
    y1 = pad_sequences(maxlen=max_len,
                       sequences=y1,
                       padding="post",
                       value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]

    input = Input(shape=(max_len, ))
    model = Embedding(input_dim=n_words + 1,
                      output_dim=50,
                      input_length=max_len,
                      mask_zero=True)(input)  # 50-dim embedding
    model = Bidirectional(
        LSTM(units=250, return_sequences=True,
             recurrent_dropout=0.2))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(
        model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)
    model.compile(optimizer="adam",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    model.summary()
    history = model.fit(X, np.array(y), batch_size=4, epochs=15, verbose=1)
    test_pred = model.predict(X, verbose=1)
    idx2tag = {i: w for w, i in tag2idx.items()}

    pred_labels = pred2label(test_pred, idx2tag)
    true_labels = pred2label(y, idx2tag)
    f1_train = f1_score(true_labels, pred_labels)
    precision_train = precision_score(true_labels, pred_labels)
    recall_train = recall_score(true_labels, pred_labels)
    train_scores = [f1_train, precision_train, recall_train]

    y1 = [to_categorical(i, num_classes=n_tags) for i in y1]
    test_pred1 = model.predict(X1, verbose=1)
    pred_labels1 = pred2label(test_pred1, idx2tag)
    true_labels1 = pred2label(y1, idx2tag)
    f1_test = f1_score(true_labels1, pred_labels1)
    precision_test = precision_score(true_labels1, pred_labels1)
    recall_test = recall_score(true_labels1, pred_labels1)
    test_scores = [f1_test, precision_test, recall_test]
    print('Testing scores:', test_scores)
    return test_scores
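# `pred2label(pred, idx2tag)` above is an external helper; a minimal sketch
# consistent with the one-argument versions defined in the other examples
# here (an assumption, not the project's original definition):
def pred2label(pred, idx2tag):
    out = []
    for pred_i in pred:
        out.append([idx2tag[np.argmax(p)].replace("PAD", "O") for p in pred_i])
    return out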
Example no. 10
def hyperopt_train_test(params):

    epsilon = 10**params['epsilon_exp']
    optimizer = optimizers.adam(lr=params['learning_rate'], epsilon=epsilon)

    if dmc_parameters["use_embedding_layer"]:
        input = Input(shape=(dmc_parameters["max_seq_len"], ))
        model = Embedding(input_dim=dmc_parameters["one_hot_vector_len"],
                          output_dim=params['embedding_layer_output'],
                          input_length=dmc_parameters["max_seq_len"])(input)
        model = Dropout(rate=params['embedding_dropout'])(model)
    else:
        input = Input(shape=(dmc_parameters["max_seq_len"],
                             dmc_parameters["one_hot_vector_len"]))
        model = input
    if params['bi_lstm1_units'] > 0:
        model = Bidirectional(
            CuDNNLSTM(units=params['bi_lstm1_units'],
                      return_sequences=True))(model)
    if params['bi_lstm2_units'] > 0:
        model = Bidirectional(
            CuDNNLSTM(units=params['bi_lstm2_units'],
                      return_sequences=True))(model)
    if dmc_parameters["use_crf_layer"]:
        crf = CRF(dmc_parameters["num_tags"])  # CRF layer
        out = crf(model)  # output
        model = Model(input, out)
        model.compile(optimizer=optimizer,
                      loss=losses.crf_loss,
                      metrics=[metrics.crf_accuracy,
                               avg_proximity_metric()])
    else:
        out = TimeDistributed(
            Dense(dmc_parameters["num_tags"], activation="softmax"))(model)
        model = Model(input, out)
        model.compile(optimizer=optimizer,
                      loss="categorical_crossentropy",
                      metrics=["accuracy", avg_proximity_metric()])
    model.summary()
    es = EarlyStopping(monitor='val_loss',
                       min_delta=0,
                       patience=dmc_parameters["patience"],
                       verbose=False,
                       mode='min',
                       restore_best_weights=True)
    history = model.fit(X_tr,
                        np.array(y_tr),
                        batch_size=dmc_parameters['batch_size'],
                        epochs=dmc_parameters["epochs"],
                        validation_data=(X_vl, np.array(y_vl)),
                        verbose=False,
                        shuffle=True,
                        callbacks=[es])
    loss, acc, prox = model.evaluate(x=X_vl,
                                     y=np.array(y_vl),
                                     batch_size=dmc_parameters['batch_size'],
                                     verbose=False)
    validation_labels = deepMirCut.pred2label(y_vl, dmc_parameters)
    validation_pred = model.predict(X_vl, verbose=False)
    pred_labels = deepMirCut.pred2label(validation_pred, dmc_parameters)
    fScore = f1_score(validation_labels, pred_labels)
    return loss, acc, prox, fScore
Example no. 11

plot_history(history)
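# `plot_history` is not defined in this snippet; a minimal sketch in the
# style of the plotting code used elsewhere in these examples (assumed, not
# the original definition):
#
#     def plot_history(history):
#         hist = pd.DataFrame(history.history)
#         plt.style.use("ggplot")
#         plt.figure(figsize=(12, 12))
#         plt.plot(hist["acc"], label="train")
#         plt.plot(hist["val_acc"], label="validation")
#         plt.legend()
#         plt.show()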


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
    return out


test_pred = model.predict(X_test, verbose=1)
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)

# pip install seqeval

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

# ! pip install sklearn_crfsuite

from sklearn_crfsuite.metrics import flat_classification_report
report = flat_classification_report(y_pred=pred_labels, y_true=test_labels)
print(report)

TP = {}
Example no. 12
    model = Dense(output_dim=2,
                  kernel_initializer=initializers.glorot_uniform(seed=1),
                  activation='relu')(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='mae', optimizer='adam', metrics=['mse'])
    model.summary()

    epochs = 10
    callback = model.fit(x=train_x,
                         y=train_y,
                         epochs=epochs,
                         validation_split=.3,
                         batch_size=200,
                         verbose=1).history

    test_y = np.rint(model.predict(x=test_x, batch_size=200,
                                   verbose=1)).astype('int')

    seconds = str((datetime.datetime.now() - now).seconds)

    print(seconds)

    with open('test{seconds}.txt'.format(seconds=seconds), 'w') as file:
        file.write('id,good,bad\n')
        for index, data in enumerate(test_y):
            file.write('{},{},{}\n'.format(index, data[0], data[1]))

    with open('record{seconds}.log'.format(seconds=seconds), 'w') as file:
        file.write('result\t\n\n')
        file.write('\t'.join(
            ['index', 'loss\t\t', 'mse\t\t', 'val_loss\t', 'val_mse\t']) +
                   '\n')
Example no. 13
plt.figure(figsize=(8, 8))
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure(figsize=(8, 8))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

# Evaluation
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))

report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)

i = np.random.randint(
    0, X_test.shape[0])  # choose a random number between 0 and len(X_test)
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_test[i], -1)

print("Sample number {} of {} (Test Set)".format(i, X_test.shape[0]))
Example no. 14

model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(len(train_tag_set))  # CRF layer
out = crf(model)  # output
model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=20,
                    validation_split=0.1, verbose=1)
save_name = 'bilstm_crf_ner_weights.h5'
print("Saving model weights to : {}".format(save_name))
model.save_weights(save_name)
pred = model.predict(X_te)
print("Preds")
for i in range(10):
    sent_idx = X_te[i]
    word_arr = idx_to_arr(sent_idx, vocab2)
    #sent = idx_to_arr(sent_idx, vocab)
    print(arr_to_str(word_arr))
    sent_len = get_sent_length(word_arr)
    print(sent_len)
    truth = y_te[i][:sent_len]
    #print(truth[i])
    pred_arr = categorical_pred_to_tags(pred[i][:sent_len], train_tag_list)
    truth_arr = categorical_pred_to_tags(truth, train_tag_list)
    for w, p, t in zip(word_arr, pred_arr, truth_arr):
        print("{} {} {}".format(w, t, p))
Example no. 15

epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='training accuracy')
plt.plot(epochs, val_acc, 'b', label='validation accuracy')
# plt.title('Training and validation accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(8, 8))
plt.plot(epochs, loss, 'bo', label='training loss')
plt.plot(epochs, val_loss, 'b', label='validation loss')
# plt.title('Training and validation loss')
plt.legend()
plt.show()

# Evaluation
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Convert the index to tag
y_pred = [[index2tag[i] for i in row] for row in y_pred]
y_test_true = [[index2tag[i] for i in row] for row in y_test_true]

print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
print("Accuracy is : {:.1%}".format(accuracy_score(y_test_true, y_pred)))
print("Precision is : {:.1%}".format(precision_score(y_test_true, y_pred)))
print("Recall is : {:.1%}".format(recall_score(y_test_true, y_pred)))

report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)
Example no. 16

def main():
    form = ReusableForm(request.form)

    print(form.errors)
    if request.method == 'POST':
        name = request.form['name']
        # name = request.form.getlist('name[]')
        print(name)

    if form.validate():
        # Save the comment here.
        flash('Hello ' + name)

    
    data = pd.read_csv("nano.csv", encoding="latin1", engine = 'python')
    data = data.fillna(method="ffill")
    data.head(10)

    words = list(set(data["Name"].values))
    words.append("ENDPAD")
    n_words = len(words); n_words

    tags = list(set(data["Class"].values))
    n_tags = len(tags); n_tags

    class SentenceGetter(object):
        
        def __init__(self, data):
            self.n_sent = 1
            self.data = data
            self.empty = False
            agg_func = lambda s: [(w, t) for w, t in zip(s["Name"].values.tolist(),
                                                         s["Class"].values.tolist())]
            self.grouped = self.data.groupby("Name").apply(agg_func)
            self.sentences = [s for s in self.grouped]
        
        def get_next(self):
            try:
                s = self.grouped["Sentence: {}".format(self.n_sent)]
                self.n_sent += 1
                return s
            except:
                return None

    getter = SentenceGetter(data)
    sent = getter.get_next()
    #print(sent)

    sentences = getter.sentences
    #print(sentences)

    max_len = 75
    word2idx = {w: i + 1 for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    #word2idx['graphene']
    #tag2idx['NONNANO']

    from keras.preprocessing.sequence import pad_sequences
    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["NANO"])

    from keras.utils.np_utils import to_categorical
    y = [to_categorical(i, n_tags) for i in y]

    from sklearn.model_selection import train_test_split

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

    from keras.models import Model
    from keras.layers import LSTM, Input, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
    from keras_contrib.layers import CRF

    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=50, return_sequences=True,recurrent_dropout=0.2))(model)

    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output

    model = Model(input, out)

    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

    model.summary()

    history = model.fit(X_tr, np.array(y_tr), batch_size=84, epochs=8,
                        validation_split=0.3, verbose=1)

    hist = pd.DataFrame(history.history)
    #print(hist)

    import matplotlib.pyplot as plt
    plt.style.use("ggplot")
    plt.figure(figsize=(12,12))
    plt.plot(hist["acc"])
    plt.plot(hist["val_acc"])
    plt.show()

    i = 0
    p = model.predict(np.array([X_te[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_te[i], -1)
    print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
    print(30 * "=")
    for w, t, pred in zip(X_te[i], true, p[0]):
        if w != 0:
            print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))


    from stanfordcorenlp import StanfordCoreNLP
    import logging
    import json

    class StanfordNLP:
        def __init__(self, host='http://localhost', port=9000):
            self.nlp = StanfordCoreNLP(host, port=port,
                                       timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
            self.props = {
                'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
                'pipelineLanguage': 'en',
                'outputFormat': 'json'
            }

        def word_tokenize(self, sentence):
            return self.nlp.word_tokenize(sentence)

        def tokens_to_dict(_tokens):
            tokens = defaultdict(dict)
            for token in _tokens:
                tokens[int(token['index'])] = {
                    'word': token['word'],
                    'lemma': token['lemma'],
                    'pos': token['pos'],
                    'ner': token['ner']
                }
            return tokens


    if __name__ == '__main__':
        sNLP = StanfordNLP()

    text = "Cantilever Island Atomic distance is 25 nm Force Microscopy Contact Roberts Microscopy"

    print(sNLP.word_tokenize(text))
    test = sNLP.word_tokenize(text)


    x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test]],padding="post", value=0, maxlen=max_len)
    print(x_test_sent)

    p = model.predict(np.array([x_test_sent[0]]))
    p = np.argmax(p, axis=-1)
    print("{:15}||{}".format("Word", "Prediction"))
    print(30 * "=")
    for w, pred in zip(test, p[0]):
        print("{:15}: {:5}".format(w, tags[pred]))

        flash('You have given ' + name + ' as input ')

    else:
        flash('Required: All the form fields are required. ')

    return render_template('hello.html', form=form)
Example no. 17
out = crf(model)

model = Model(input, out)


Y_train_2 = keras.utils.to_categorical(Y_train)
Y_val_2 = keras.utils.to_categorical(Y_val)
Y_test_2 = keras.utils.to_categorical(Y_test)

model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy]) 
earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
history = model.fit(X_train, Y_train_2, batch_size=64, epochs=10, 
                   validation_data=(X_val, Y_val_2), callbacks=earlyStop)


preds = model.predict(X_test)
test = [[np.argmax(y) for y in x] for x in preds]
test_arr = np.asarray(test)
test = np.reshape(test_arr, (-1))

print (metrics.precision_recall_fscore_support(np.reshape(Y_test,(-1)), test, average=None,
                                              labels=[0, 1]))


preds = test_arr
##record the predicted start and end index
with open('../../outputs/CRF_glove_preds', 'w') as fout:
	with open('../../data/test.txt', 'r') as test:
		test_list = test.readlines()
		for i in range(len(preds)):
			sent = test_list[i].strip().split()
Example no. 18

#

# `assert_consumed` can be used as validation that all variable values have been
# restored from the checkpoint. See `tf.train.Checkpoint.restore` for other
# methods in the Status object.
#print(load_status.assert_consumed())

model.summary()

print('AFTER LOADING', model.get_weights())
# ======================================================================================================================
# Predict on validation data
# ======================================================================================================================

print('\nPredicting...')
y_pred_summaries = model.predict(x=test_generator)
print(y_pred_summaries)
print('\nY_PRED SHAPE', np.array(y_pred_summaries, dtype=object).shape)

# ======================================================================================================================
# Set data generators for batch training
# ======================================================================================================================

sentence_model = False  # True  False

if sentence_model:
    # Set batch size, train and test data size
    batch_size = 256  #224  # 1024  # set during pre-processing (set in file preprocessing.py)
    train_data_size = 4136306  #4139868  # 530809  [ THE NUMBER OF TRAIN SENTENCES\DOCS ]  # the total size of train data
    validation_data_size = 156519  #156836  # 20000  [ THE NUMBER OF VALIDATION SENTENCES\DOCS ]  # the total size of test data
    test_data_size = 155801  #156085  # 156085  SEE BELOW [ THE NUMBER OF TEST SENTENCES\DOCS ]  # the total size of test data
Example no. 19
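# NOTE: `CRF_2nd` below is presumably keras_contrib's CRF layer imported
# under an alias; `input`, the bi_* arrays and the BATCH_SIZE/EPOCHS
# constants come from earlier, unshown parts of the original script.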
model = Embedding(input_dim = no_words, output_dim = EMBEDDING, input_length = MAX_LEN, mask_zero=True)(input)
model = Bidirectional(LSTM(units = 50, return_sequences=True, recurrent_dropout=0.1))(model)
model = TimeDistributed(Dense(50, activation="relu"))(model)
crf_lstm = CRF_2nd(no_tags)
out_layer = crf_lstm(model)

model = Model(input, out_layer)
model.compile(optimizer="rmsprop", loss=crf_lstm.loss_function, metrics=[crf_lstm.accuracy])

model.summary()

history = model.fit(bi_train_x, np.array(bi_train_y), batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_split=0.1, verbose=2)

pred_y = model.predict(bi_test_X)
pred_y = np.argmax(pred_y, axis=-1)
y_test_true = np.argmax(bi_test_y, -1)
y_test_true = [[index_to_tag[i] for i in row] for row in y_test_true]
y_test_true = [[x for x in row if x!='PADword'] for row in y_test_true]

pred_y = [[index_to_tag[i] for i in row] for row in pred_y]
pred_y = [[x.replace("PADword", "O") for x in pred_y[index]][: len(y_test_true[index])] for index in range(len(y_test_true))]

print('LSTM Classification Report\n', metrics.flat_classification_report(pred_y, y_test_true, labels=tags_without_o))

# Used four methods for the ensemble but more could easily be added

# flattening function
flatten = lambda l: [item for sublist in l for item in sublist]
Example no. 20
model = Model(input, out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
print(model.summary())

history = model.fit(x_train,
                    np.array(y_train),
                    batch_size=64,
                    epochs=30,
                    validation_split=0.1,
                    verbose=1)

# predict the name entities in the test set
# evaluate the model
from sklearn.metrics import classification_report
y_pred = model.predict(x_test, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            if type(idx2tag[p_i]) != str:
                print(p_i)
                print(idx2tag)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
    return out
Example no. 21
def train(word2Vec, train_df, test_df, max_length, filters, kernel_size,
          pool_size, dense):
    r = random.randint(1, 10000)
    now = datetime.datetime.now()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(word2Vec.wv.vocab.keys())

    train_x = tokenizer.texts_to_sequences(train_df['text'])
    train_x = pad_sequences(train_x, maxlen=max_length)

    # train_y_good = train_df["good"]
    # train_y_bad = train_df["bad"]
    train_y = pd.DataFrame(train_df, columns=["good", "bad"])

    test_x = tokenizer.texts_to_sequences(test_df['text'])
    test_x = pad_sequences(test_x, maxlen=max_length)

    word_index = tokenizer.word_index

    embedding_matrix = np.zeros((len(word_index) + 1, word2Vec.vector_size))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = word2Vec[word]
        except:
            continue

    inputs = Input(shape=(max_length, ))
    model = Embedding(name="embedding",
                      input_dim=len(word_index) + 1,
                      output_dim=word2Vec.vector_size,
                      weights=[embedding_matrix],
                      input_length=max_length)(inputs)
    model = Conv1D(name="conv1D",
                   filters=filters,
                   kernel_size=kernel_size,
                   kernel_initializer=initializers.glorot_uniform(seed=1),
                   padding='same')(model)
    model = MaxPooling1D(name="maxPooling1D",
                         pool_size=pool_size,
                         strides=1,
                         padding='same')(model)
    model = Flatten(name="flatten")(model)
    model = Dense(name="dense",
                  output_dim=dense,
                  kernel_initializer=initializers.glorot_uniform(seed=1),
                  activation='relu')(model)
    model = Dense(name="output",
                  output_dim=2,
                  kernel_initializer=initializers.glorot_uniform(seed=1),
                  activation='relu')(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='mae',
                  optimizer=optimizers.Adam(lr=.001),
                  metrics=['mse'])
    model.summary()

    epochs = 15
    callback = model.fit(x=train_x,
                         y=train_y,
                         epochs=epochs,
                         validation_split=.3,
                         batch_size=20,
                         verbose=1).history

    test_y = np.rint(model.predict(x=test_x, batch_size=10,
                                   verbose=1)).astype('int')

    seconds = str((datetime.datetime.now() - now).seconds)

    with open('test{seconds}_{r}.txt'.format(seconds=seconds, r=r),
              'w') as file:
        file.write('id,good,bad\n')
        for index, data in enumerate(test_y):
            file.write('{},{},{}\n'.format(index, data[0], data[1]))

    with open('record{seconds}_{r}.log'.format(seconds=seconds, r=r),
              'w') as file:
        file.write('result\t\n\n')
        file.write('\t'.join(
            ['index', 'loss\t\t', 'mse\t\t\t', 'val_loss\t\t', 'val_mse\t']) +
                   '\n')
        for index, loss, mse, val_loss, val_mse in zip(
                range(1, epochs + 1), callback['loss'],
                callback['mean_squared_error'], callback['val_loss'],
                callback['val_mean_squared_error']):
            file.write('\t'.join([
                str(index) + '\t', '{:.12f}'.format(loss), '{:.12f}'.format(
                    mse), '{:.12f}'.format(val_loss), '{:.12f}'.format(val_mse)
            ]) + '\n')
        file.write(
            '\nmax_length={max_length}\nmin_count={min_count}, size=270, iter=10, sg=1, workers=10\n'
            .format(max_length=max_length, min_count=min_count))
        file.write('inputs = Input(shape=(max_length,)\n')
        file.write(
            'model = Embedding(name="embedding", input_dim=len(word_index)+1, output_dim=word2Vec.vector_size, weights=[embedding_matrix], input_length=max_length)(inputs)\n'
        )
        file.write(
            'model = Conv1D(name="conv1D_good", filters={filters}, kernel_size={kernel_size}, kernel_initializer=initializers.glorot_uniform(seed=1), padding="same")(model)\n'
            .format(filters=filters, kernel_size=kernel_size))
        file.write(
            'model = MaxPooling1D(name="maxPooling1D", pool_size={pool_size}, strides=1, padding="same")(model)\n'
            .format(pool_size=pool_size))
        file.write(
            'model = Dense(name="dense", output_dim={dense}, kernel_initializer=initializers.glorot_uniform(seed=1), activation="relu")(model)\n'
            .format(dense=dense))
        file.write(
            'model = Dense(name="output", output_dim=2, kernel_initializer=initializers.glorot_uniform(seed=1), activation="relu")(model)\n'
        )
        file.write('model = Model(inputs=inputs, outputs=model)\n')
        file.write(
            'model.compile(loss="mae", optimizer=optimizers.Adam(lr=.001), metrics=["mse"])\n'
        )

    import matplotlib.pyplot as plt
    fig = plt.figure()
    plt.grid(True)
    plt.ylim(0, 40)
    plt.plot(callback['loss'])
    plt.plot(callback['mean_squared_error'])  # same history keys as read in the log loop above
    plt.plot(callback['val_loss'])
    plt.plot(callback['val_mean_squared_error'])
    plt.title('model loss')
    plt.ylabel('loss (mae)')
    plt.xlabel('epoch')
    plt.legend(['train_loss', 'train_mse', 'test_loss', 'test_mse'],
               loc='upper right')
    fig.savefig('{seconds}_{r}.png'.format(seconds=seconds, r=r), dpi=fig.dpi)
Example no. 22
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output
model = Model(input, out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

import time

start = time.time()

history = model.fit(X, np.array(y), batch_size=32, epochs=15, verbose=1)
end = time.time()
print(end - start)

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
test_pred = model.predict(X, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
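# (truncated in the source; the companion examples apply the helper as
# `pred_labels = pred2label(test_pred)` before scoring with seqeval)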

Example no. 23
history = model.fit(X_tr,
                    np.array(y_tr),
                    batch_size=32,
                    epochs=1,
                    validation_split=0.1,
                    verbose=1)
hist = pd.DataFrame(history.history)

# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

test_pred = model.predict(X_te, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out

Example no. 24
        elif "scope" in negation[0]:
          cues[i, j] = 1; # not a cue
          golds[i, j, classes["F"]] = 1;
        else:
          cues[i, j] = 1; # not a cue
          golds[i, j, classes["T"]] = 1;

  if arguments.debug:
    print("evaluation: {} word tokens; {} unknown."
          "".format(n, unknown));
    print("model.evaluate() on evaluation: {}"
          "".format(model.evaluate(([inputs, cues] if arguments.cues
                                    else inputs),
                                   golds, verbose = 1)));

  outputs = model.predict(([inputs, cues] if arguments.cues else inputs),
                          verbose = 1);
  tf.keras.backend.clear_session();

  #
  # convert back from ‘categorical’, one-hot encoding and un-pad;
  # while at it, (wastefully :-) produce two flat lists of labels.
  #
  golds = [np.argmax(gold, axis = 1) for gold in golds];
  outputs = [np.argmax(output, axis = 1) for output in outputs];
  labels = [];
  system = [];
  for i, sentence in enumerate(test):
    golds[i] = golds[i][0:len(sentence["nodes"])];
    labels.extend(golds[i]);
    outputs[i] = outputs[i][0:len(sentence["nodes"])];
    system.extend(outputs[i]);
Example no. 25
new_notes = []
new_beginnings = []

for i in range(len(lengths)):
    for l in range(int(lengths[i] / 0.5)):
        new_notes.append(notes1[i])
        if l == 0:
            new_beginnings.append('b')
        elif l == int(lengths[i] / 0.5) - 1:
            new_beginnings.append('e')
        else:
            new_beginnings.append('c')

test = [str(i) + ' ' + c for i, c in zip(new_notes, new_beginnings)]

test = [note2idx[t] for t in test]

test = pad_sequences(maxlen=max_len,
                     sequences=[test],
                     padding="post",
                     value=n_notes - 1)

p = model.predict(np.array(test))
p = np.argmax(p, axis=-1)

melody_notes = [notes[w] for w in test[0]]
predicted_chords = [chords[pred] for pred in p[0]]

pickle.dump(melody_notes, open(state_name + "predictions/melody.p", "wb"))
pickle.dump(predicted_chords, open(state_name + "predictions/chords.p", "wb"))
Example no. 26
model = Model(input, out)
model.compile(optimizer="rmsprop",
              loss=crf.loss_function,
              metrics=[crf.accuracy])

model.summary()

history = model.fit(X_tr,
                    np.array(y_tr),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=2)

# Eval
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true]

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)

i = np.random.randint(
    0, X_te.shape[0])  # choose a random number between 0 and len(X_te)
print(i)
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
Example no. 27

model = TimeDistributed(Dense(50, activation="relu"))(model)
out = Dense(6, activation='softmax')(model)
#crf = CRF(n_tags+1)
#out = crf(model)

model = Model(input, out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train,
                    numpy.array(y_train),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.2,
                    verbose=2)

#history = model.fit(X_train, numpy.array(y_train), batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.2, verbose = 2)
pred_cat = model.predict(X_test)
pred = numpy.argmax(pred_cat, axis=-1)
y_test_true = numpy.argmax(y_test, -1)
from sklearn_crfsuite.metrics import flat_classification_report
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_test_true_tag = [[idx2tag[i] for i in row] for row in y_test_true]

#from sklearn.metrics import f1_score
#report = f1_score(y_test, pred_cat)
report = flat_classification_report(y_pred=pred_tag, y_true=y_test_true_tag)
print(report)
Example no. 28
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

confE = []
confN = []
confT = []
confJ = []

#precal=[]

yactual = []
ypred = []
for i in range(np.shape(Xtstc)[0]):
    print(i)
    y = model.predict(Xtstc[i].reshape(1, -1))[0]  # take the single row of the (1, 4) prediction batch
    One = int(np.round(y[0]))
    Two = int(np.round(y[1]))
    Three = int(np.round(y[2]))
    Four = int(np.round(y[3]))
    ya = y_testc[i]
    Oneac = int(ya[3])
    Twoac = int(ya[1])
    Threeac = int(ya[2])
    Fourac = int(ya[0])
    ypre = [One, Two, Three, Four]
    yac = [Oneac, Twoac, Threeac, Fourac]
    ypred.append(ypre)
    yactual.append(yac)
    confE.append(confusion_matrix([Four], [Fourac]))
    confN.append(confusion_matrix([Two], [Twoac]))
Example no. 29
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

model = Model(input, out)

model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

truths = []
predictions = []
for i in range(len(y_te)):
    p = model.predict(np.array([X_te[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_te[i], -1)
    for w, t, pred in zip(X_te[i], true, p[0]):
        if w != 0:
            truths.append(t)
            predictions.append(pred)
#print(predictions)
accuracy = accuracy_score(truths, predictions)
print(accuracy)

print(classification_report(truths, predictions,
                            target_names=["D", "O", "T"]))
Example no. 30
crf = CRF_2nd(no_tags)
out_layer = crf(model)

model = Model(input, out_layer)
model.compile(optimizer="rmsprop",
              loss=crf.loss_function,
              metrics=[crf.accuracy])

model.summary()

history = model.fit(train_X,
                    np.array(train_y),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=2)

pred_y = model.predict(test_X)
pred_y = np.argmax(pred_y, axis=-1)
y_test_true = np.argmax(test_y, -1)
y_test_true = [[index_to_tag[i] for i in row] for row in y_test_true]
y_test_true = [[x for x in row if x != 'PADword'] for row in y_test_true]

pred_y = [[index_to_tag[i] for i in row] for row in pred_y]
pred_y = [[x.replace("PADword", "O")
           for x in pred_y[index]][:len(y_test_true[index])]
          for index in range(len(y_test_true))]

print('LSTM Classification Report\n',
      metrics.flat_classification_report(pred_y, y_test_true))