def bilstm_crf_predict():
    # Re-initialize the model with the same configuration as the training step.
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)
    # Restore the trained weights.
    save_load_utils.load_all_weights(model, filepath="result/bilstm-crf.h5")
    p = model.predict(np.array([x_test_sent[0]]))
    p = np.argmax(p, axis=-1)
    print("{:15}||{}".format("Word", "Prediction"))
    print(30 * "=")
    for w, pred in zip(test_sentence, p[0]):
        print("{:15}: {:5}".format(w, tags[pred]))
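# The prediction function above assumes `max_len`, `word2idx`, `tags`,
# `test_sentence`, and `x_test_sent` already exist at module scope. A minimal
# sketch of preparing them for a raw sentence, consistent with the build_input
# helper in a later snippet (the function name here is illustrative):
from keras.preprocessing.sequence import pad_sequences

def build_test_input(sentence, word2idx, max_len):
    tokens = sentence.split(" ")
    x = pad_sequences(sequences=[[word2idx.get(w, 0) for w in tokens]],
                      padding="post", value=0, maxlen=max_len)
    return tokens, x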
def cross_validate(self, X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
    input = Input(shape=(self.max_len,))
    model = Embedding(input_dim=self.n_words, output_dim=50,
                      input_length=self.max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True,
                               recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(self.n_labels, activation="softmax"))(model)  # softmax output layer
    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=1,
                        validation_split=0.1, verbose=1)
    p = model.predict(np.array([X_te[10]]))
    p = np.argmax(p, axis=-1)
    for w, pred in zip(X_te[10], p[0]):
        if self.words[w] != 'PADGARBAGE':
            print("{:15}: {}".format(self.words[w], self.labels[pred]))
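# `categorical_crossentropy` with a time-distributed softmax expects one-hot
# label sequences of shape (batch, timesteps, n_labels). A minimal,
# illustrative sketch of the label preparation this method assumes
# (not part of the original class):
import numpy as np
from keras.utils import to_categorical

y_idx = np.array([[0, 2, 1, 0]])  # one padded tag-index sequence
y_onehot = np.array([to_categorical(s, num_classes=3) for s in y_idx])
print(y_onehot.shape)  # (1, 4, 3): (batch, timesteps, n_labels)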
def run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size,
        maxlen=40, emb_dim=300, neg_ratio=0, hidden_dim=300, drop=0.2,
        r_drop=0.1):
    ## build model
    input = Input(shape=(maxlen,))
    model = Embedding(vocab_size, emb_dim, weights=[embedding_matrix],
                      input_length=maxlen, trainable=False)(input)
    model = Dropout(drop)(model)
    model = Bidirectional(LSTM(hidden_dim, return_sequences=True,
                               recurrent_dropout=r_drop))(model)
    model = Dropout(drop)(model)
    out = TimeDistributed(Dense(1, activation='sigmoid'))(model)

    model = Model(input, out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
    history = model.fit(X_train, Y_train, batch_size=64, epochs=10,
                        validation_data=(X_val, Y_val), callbacks=earlyStop)

    pred = model.predict(X_val)
    Y_pred = np.squeeze(pred)
    test = [[1 if y >= threshold else 0 for y in x] for x in Y_pred]
    test_arr = np.asarray(test)
    test_arr = np.reshape(test_arr, (-1))
    target = np.reshape(Y_val, (-1))
    print(metrics.precision_recall_fscore_support(target, test_arr,
                                                  average=None, labels=[0, 1]))

    # Y_pred_ = [[1 if y>=threshold else 0 for y in x] for x in Y_pred]
    Y_val_ = np.squeeze(Y_val)

    print("Evaluate: dev seg exact")
    pred_out_dir = out_dir + 'seg_' + str(neg_ratio) + 'neg'
    gold_dir = '../../data/val_segs/' + 'seg_' + str(neg_ratio) + 'neg'
    p, r, f = seg_exact_match(test, Y_val_, pred_out_dir, gold_dir)
    return model, history, p, r, f
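# `threshold` is a module-level global in the snippet above. A minimal,
# illustrative sketch of choosing it by sweeping candidate values against
# validation F1 (plain sklearn; not part of the original code):
import numpy as np
from sklearn.metrics import f1_score

def pick_threshold(y_true_flat, y_prob_flat,
                   candidates=np.arange(0.1, 0.9, 0.05)):
    # returns (best_threshold, best_f1) over the flattened token predictions
    scores = [(t, f1_score(y_true_flat, (y_prob_flat >= t).astype(int)))
              for t in candidates]
    return max(scores, key=lambda ts: ts[1])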
def run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size,
        maxlen=40, emb_dim=300, neg_ratio=0, hidden_dim=300, drop=0.2,
        r_drop=0.1):
    ## build model
    # input = Input(shape=(maxlen,))
    # model = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False)(input)
    # model = Dropout(drop)(model)
    # model = Bidirectional(LSTM(hidden_dim, return_sequences=True, recurrent_dropout=r_drop))(model)
    # model = Dropout(drop)(model)
    # out = TimeDistributed(Dense(1, activation='sigmoid'))(model)

    input = Input(shape=(maxlen,))
    model = Embedding(vocab_size, emb_dim, weights=[embedding_matrix],
                      input_length=maxlen, trainable=False)(input)
    model = Bidirectional(LSTM(hidden_dim, return_sequences=True,
                               recurrent_dropout=r_drop))(model)
    model = TimeDistributed(Dense(hidden_dim // 4, activation='relu'))(model)
    model = TimeDistributed(Dropout(drop))(model)

    ## use CRF instead of Dense
    crf = CRF(2)
    out = crf(model)

    model = Model(input, out)

    Y_train_2 = keras.utils.to_categorical(Y_train)
    Y_val_2 = keras.utils.to_categorical(Y_val)

    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

    earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
    history = model.fit(X_train, Y_train_2, batch_size=64, epochs=10,
                        validation_data=(X_val, Y_val_2), callbacks=earlyStop)

    preds = model.predict(X_val)
    test = [[np.argmax(y) for y in x] for x in preds]
    test_arr = np.asarray(test)
    test_arr = np.reshape(test_arr, (-1))
    print(metrics.precision_recall_fscore_support(np.reshape(Y_val, (-1)),
                                                  test_arr, average=None,
                                                  labels=[0, 1]))

    # Y_pred_ = [[1 if y>=threshold else 0 for y in x] for x in Y_pred]
    Y_val_ = np.squeeze(Y_val)

    print("Evaluate: dev seg exact")
    pred_out_dir = out_dir + 'seg_' + str(neg_ratio) + 'neg'
    gold_dir = '../../data/val_segs/' + 'seg_' + str(neg_ratio) + 'neg'
    p, r, f = seg_exact_match(test, Y_val_, pred_out_dir, gold_dir)
    return model, history, p, r, f
# save_load_utils.save_all_weights(model, 'lstm_crf.model', include_optimizer=False)
#
# hist = pd.DataFrame(history.history)
#
# plt.style.use("ggplot")
# plt.figure(figsize=(12, 12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

save_load_utils.load_all_weights(model, 'lstm_crf.model')

test_pred = model.predict(X_te, verbose=2)
idx2tag = {i: w for w, i in tag2idx.items()}
# print(idx2tag)
print(test_pred)

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
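# A minimal usage sketch for pred2label: convert both the predictions and the
# one-hot gold labels to tag strings, then score with seqeval (assumes `y_te`
# holds the one-hot test labels, as in the other snippets here):
from seqeval.metrics import f1_score, classification_report

pred_labels = pred2label(test_pred)
true_labels = pred2label(y_te)
print("F1-score: {:.1%}".format(f1_score(true_labels, pred_labels)))
print(classification_report(true_labels, pred_labels))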
                    verbose=1)  # , callbacks=[tbCallBack])

hist = pd.DataFrame(history.history)
hist

plt.style.use('ggplot')
plt.figure(figsize=(12, 12))
plt.plot(hist['crf_viterbi_accuracy'])
plt.plot(hist['val_crf_viterbi_accuracy'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

# pip install seqeval
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

test_pred = model.predict(X_test, verbose=1)
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
def create_model(x_train, y_train, x_test, y_test):
    args = parse_args()
    set_logger(args.log_path, args.log_level)
    logging.debug('Args:')
    logging.debug(args)
    lang = construct_languages(args.train)
    assert len(lang) == 1
    lang = lang[0]
    game = initialize_game(train_file=lang.train, test_file=lang.test,
                           dev_file=lang.dev, emb_file=lang.emb,
                           budget=args.budget, max_seq_len=args.max_seq_len,
                           max_vocab_size=args.max_vocab_size,
                           emb_size=args.embedding_size,
                           model_name=args.model_name)
    max_len = args.max_seq_len
    input_dim = args.max_vocab_size
    output_dim = args.embedding_size
    embedding_matrix = game.w2v
    logging.debug('building Keras model...')
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=input_dim, output_dim=output_dim,
                      input_length=max_len, weights=[embedding_matrix],
                      trainable=False)(input)
    model = Dropout(0.1)(model)
    n_units = 128
    model = Bidirectional(LSTM(units=n_units, return_sequences=True,
                               recurrent_dropout=0.1))(model)
    n_tags = 5
    out = TimeDistributed(Dense(n_tags, activation='softmax'))(model)
    model = Model(input, out)
    logging.debug('Model type: ')
    logging.debug(type(model))
    logging.debug('Model summary: ')
    logging.debug(model.summary())
    rmsprop = keras.optimizers.RMSprop(lr={{choice([0.0001])}})
    model.compile(optimizer=rmsprop, loss='categorical_crossentropy',
                  metrics=['accuracy'])
    logging.debug('done building model...')
    logging.debug('starting training...')
    num_train_examples = len(x_train)
    for i in range(num_train_examples):
        print('i: ', i)
        model.fit(x_train[:i], y_train[:i], batch_size=200, epochs=20, verbose=0)
    logging.debug('done training...')
    logging.debug('starting testing...')
    num_samples = x_test.shape[0]
    logging.debug('Number of samples: {}'.format(num_samples))
    max_batch_size = 4096
    batch_size = min(num_samples, max_batch_size)
    predictions_probability = model.predict(x_test, batch_size=batch_size)
    predictions = numpy.argmax(predictions_probability, axis=-1)
    fscore = compute_fscore(Y_pred=predictions, Y_true=y_test)
    logging.debug('done testing...')
    return -fscore
bilstm_model.predict(np.zeros((1, 50)))

input = Input(shape=(max_len,))
bilstm_crf_model = Embedding(input_dim=n_words + 1, output_dim=20,
                             input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
bilstm_crf_model = Bidirectional(LSTM(units=50, return_sequences=True,
                                      recurrent_dropout=0.1))(bilstm_crf_model)  # variational biLSTM
bilstm_crf_model = TimeDistributed(Dense(50, activation="relu"))(bilstm_crf_model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(bilstm_crf_model)  # output
bilstm_crf_model = Model(input, out)
save_load_utils.load_all_weights(bilstm_crf_model, filepath="result/bilstm-crf.h5")
bilstm_crf_model.predict(np.zeros((1, 50)))
print('test done.')

# Test data
def build_input(test_sentence):
    test_sentence = test_sentence.split(" ")
    x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                                padding="post", value=0, maxlen=max_len)
    return test_sentence, x_test_sent

def bilstm_predict(model, test_sentence, x_test_sent):
    pred = model.predict(np.array([x_test_sent[0]]))
    pred = np.argmax(pred, axis=-1)
    temp = []
def bilstm_crf(train_loc, test_loc):
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_words = len(words_all)
    n_tags = len(tags_all)
    max_len = 130
    word2idx = {w: i for i, w in enumerate(words_all)}
    tag2idx = {t: i for i, t in enumerate(tags_all)}
    X = [[word2idx[w[0]] for w in s] for s in cc_train]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
    X1 = [[word2idx[w[0]] for w in s] for s in cc_test]
    X1 = pad_sequences(maxlen=max_len, sequences=X1, padding="post", value=n_words - 1)
    y = [[tag2idx[w[1]] for w in s] for s in cc_train]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
    y1 = [[tag2idx[w[1]] for w in s] for s in cc_test]
    y1 = pad_sequences(maxlen=max_len, sequences=y1, padding="post", value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]

    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=50,
                      input_length=max_len, mask_zero=True)(input)  # 50-dim embedding
    model = Bidirectional(LSTM(units=250, return_sequences=True,
                               recurrent_dropout=0.2))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output

    model = Model(input, out)
    model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()

    history = model.fit(X, np.array(y), batch_size=4, epochs=15, verbose=1)

    test_pred = model.predict(X, verbose=1)
    idx2tag = {i: w for w, i in tag2idx.items()}
    pred_labels = pred2label(test_pred, idx2tag)
    true_labels = pred2label(y, idx2tag)
    f1_train = f1_score(true_labels, pred_labels)
    precision_train = precision_score(true_labels, pred_labels)
    recall_train = recall_score(true_labels, pred_labels)
    train_scores = [f1_train, precision_train, recall_train]

    y1 = [to_categorical(i, num_classes=n_tags) for i in y1]
    test_pred1 = model.predict(X1, verbose=1)
    pred_labels1 = pred2label(test_pred1, idx2tag)
    true_labels1 = pred2label(y1, idx2tag)
    f1_test = f1_score(true_labels1, pred_labels1)
    precision_test = precision_score(true_labels1, pred_labels1)
    recall_test = recall_score(true_labels1, pred_labels1)
    test_scores = [f1_test, precision_test, recall_test]
    print('Testing scores:', test_scores)
    return test_scores
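# bilstm_crf above calls a two-argument pred2label(pred, idx2tag) that is
# defined elsewhere. A sketch consistent with the single-argument versions in
# the other snippets (an assumption, not the original definition):
import numpy as np

def pred2label(pred, idx2tag):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[np.argmax(p)].replace("PAD", "O"))
        out.append(out_i)
    return out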
def hyperopt_train_test(params):
    epsilon = 10 ** params['epsilon_exp']
    optimizer = optimizers.adam(lr=params['learning_rate'], epsilon=epsilon)
    if dmc_parameters["use_embedding_layer"]:
        input = Input(shape=(dmc_parameters["max_seq_len"],))
        model = Embedding(input_dim=dmc_parameters["one_hot_vector_len"],
                          output_dim=params['embedding_layer_output'],
                          input_length=dmc_parameters["max_seq_len"])(input)
        model = Dropout(rate=params['embedding_dropout'])(model)
    else:
        input = Input(shape=(dmc_parameters["max_seq_len"],
                             dmc_parameters["one_hot_vector_len"]))
        model = input
    if params['bi_lstm1_units'] > 0:
        model = Bidirectional(CuDNNLSTM(units=params['bi_lstm1_units'],
                                        return_sequences=True))(model)
    if params['bi_lstm2_units'] > 0:
        model = Bidirectional(CuDNNLSTM(units=params['bi_lstm2_units'],
                                        return_sequences=True))(model)
    if dmc_parameters["use_crf_layer"]:
        crf = CRF(dmc_parameters["num_tags"])  # CRF layer
        out = crf(model)  # output
        model = Model(input, out)
        model.compile(optimizer=optimizer, loss=losses.crf_loss,
                      metrics=[metrics.crf_accuracy, avg_proximity_metric()])
    else:
        out = TimeDistributed(Dense(dmc_parameters["num_tags"],
                                    activation="softmax"))(model)
        model = Model(input, out)
        model.compile(optimizer=optimizer, loss="categorical_crossentropy",
                      metrics=["accuracy", avg_proximity_metric()])
    model.summary()
    es = EarlyStopping(monitor='val_loss', min_delta=0,
                       patience=dmc_parameters["patience"], verbose=False,
                       mode='min', restore_best_weights=True)
    history = model.fit(X_tr, np.array(y_tr),
                        batch_size=dmc_parameters['batch_size'],
                        epochs=dmc_parameters["epochs"],
                        validation_data=(X_vl, np.array(y_vl)),
                        verbose=False, shuffle=True, callbacks=[es])
    loss, acc, prox = model.evaluate(x=X_vl, y=np.array(y_vl),
                                     batch_size=dmc_parameters['batch_size'],
                                     verbose=False)
    validation_labels = deepMirCut.pred2label(y_vl, dmc_parameters)
    validation_pred = model.predict(X_vl, verbose=False)
    pred_labels = deepMirCut.pred2label(validation_pred, dmc_parameters)
    fScore = f1_score(validation_labels, pred_labels)
    return loss, acc, prox, fScore
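# A minimal sketch of wiring hyperopt_train_test into hyperopt's fmin. The
# search-space keys match the params used above, but the ranges are
# illustrative assumptions, not the original search space:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def objective(params):
    loss, acc, prox, fscore = hyperopt_train_test(params)
    return {'loss': -fscore, 'status': STATUS_OK}  # maximize F1

space = {
    'learning_rate': hp.loguniform('learning_rate', -9, -4),
    'epsilon_exp': hp.quniform('epsilon_exp', -10, -6, 1),
    'embedding_layer_output': hp.choice('embedding_layer_output', [32, 64, 128]),
    'embedding_dropout': hp.uniform('embedding_dropout', 0.0, 0.5),
    'bi_lstm1_units': hp.choice('bi_lstm1_units', [0, 64, 128]),
    'bi_lstm2_units': hp.choice('bi_lstm2_units', [0, 64, 128]),
}
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)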
plot_history(history)

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
    return out

test_pred = model.predict(X_test, verbose=1)
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)

# pip install seqeval
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

# ! pip install sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report
report = flat_classification_report(y_pred=pred_labels, y_true=test_labels)
print(report)

TP = {}
              output_dim=2,
              kernel_initializer=initializers.glorot_uniform(seed=1),
              activation='relu')(model)
model = Model(inputs=inputs, outputs=model)
model.compile(loss='mae', optimizer='adam', metrics=['mse'])
model.summary()

epochs = 10
callback = model.fit(x=train_x, y=train_y, epochs=epochs,
                     validation_split=.3, batch_size=200, verbose=1).history

test_y = np.rint(model.predict(x=test_x, batch_size=200, verbose=1)).astype('int')
seconds = str((datetime.datetime.now() - now).seconds)
print(seconds)

with open('test{seconds}.txt'.format(seconds=seconds), 'w') as file:
    file.write('id,good,bad\n')
    for index, data in enumerate(test_y):
        file.write('{},{},{}\n'.format(index, data[0], data[1]))

with open('record{seconds}.log'.format(seconds=seconds), 'w') as file:
    file.write('result\t\n\n')
    file.write('\t'.join(['index', 'loss\t\t', 'mse\t\t', 'val_loss\t',
                          'val_mse\t']) + '\n')
plt.figure(figsize=(8, 8))
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure(figsize=(8, 8))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

# Evaluation
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)

i = np.random.randint(0, X_test.shape[0])  # choose a random number between 0 and len(X_test)
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_test[i], -1)
print("Sample number {} of {} (Test Set)".format(i, X_test.shape[0]))
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(len(train_tag_set))  # CRF layer
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=20,
                    validation_split=0.1, verbose=1)

save_name = 'bilstm_crf_ner_weights.h5'
print("Saving model weights to : {}".format(save_name))
model.save_weights(save_name)

pred = model.predict(X_te)
print("Preds")
for i in range(10):
    sent_idx = X_te[i]
    word_arr = idx_to_arr(sent_idx, vocab2)
    # sent = idx_to_arr(sent_idx, vocab)
    print(arr_to_str(word_arr))
    sent_len = get_sent_length(word_arr)
    print(sent_len)
    truth = y_te[i][:sent_len]
    # print(truth[i])
    pred_arr = categorical_pred_to_tags(pred[i][:sent_len], train_tag_list)
    truth_arr = categorical_pred_to_tags(truth, train_tag_list)
    for w, p, t in zip(word_arr, pred_arr, truth_arr):
        print("{} {} {}".format(w, t, p))
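# idx_to_arr, arr_to_str, get_sent_length, and categorical_pred_to_tags are
# helpers defined elsewhere. A sketch of categorical_pred_to_tags consistent
# with how it is called above (an assumption about the real helper):
import numpy as np

def categorical_pred_to_tags(pred_rows, tag_list):
    # pred_rows: (timesteps, n_tags) one-hot rows or per-tag scores
    return [tag_list[int(np.argmax(row))] for row in pred_rows]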
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='training accuracy')
plt.plot(epochs, val_acc, 'b', label='validation accuracy')
# plt.title('Training and validation accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(8, 8))
plt.plot(epochs, loss, 'bo', label='training loss')
plt.plot(epochs, val_loss, 'b', label='validation loss')
# plt.title('Training and validation loss')
plt.legend()
plt.show()

# Evaluation
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Convert the index to tag
y_pred = [[index2tag[i] for i in row] for row in y_pred]
y_test_true = [[index2tag[i] for i in row] for row in y_test_true]

print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
print("Accuracy is : {:.1%}".format(accuracy_score(y_test_true, y_pred)))
print("Precision is : {:.1%}".format(precision_score(y_test_true, y_pred)))
print("Recall is : {:.1%}".format(recall_score(y_test_true, y_pred)))

report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)
def main():
    form = ReusableForm(request.form)
    print(form.errors)
    if request.method == 'POST':
        name = request.form['name']
        # name = request.form.getlist('name[]')
        print(name)

        if form.validate():
            # Save the comment here.
            flash('Hello ' + name)

            data = pd.read_csv("nano.csv", encoding="latin1", engine='python')
            data = data.fillna(method="ffill")
            data.head(10)

            words = list(set(data["Name"].values))
            words.append("ENDPAD")
            n_words = len(words); n_words
            tags = list(set(data["Class"].values))
            n_tags = len(tags); n_tags

            class SentenceGetter(object):
                def __init__(self, data):
                    self.n_sent = 1
                    self.data = data
                    self.empty = False
                    agg_func = lambda s: [(w, t) for w, t in zip(s["Name"].values.tolist(),
                                                                 s["Class"].values.tolist())]
                    self.grouped = self.data.groupby("Name").apply(agg_func)
                    self.sentences = [s for s in self.grouped]

                def get_next(self):
                    try:
                        s = self.grouped["Sentence: {}".format(self.n_sent)]
                        self.n_sent += 1
                        return s
                    except:
                        return None

            getter = SentenceGetter(data)  # instantiation was missing in the original snippet
            sent = getter.get_next()
            # print(sent)
            sentences = getter.sentences
            # print(sentences)

            max_len = 75
            word2idx = {w: i + 1 for i, w in enumerate(words)}
            tag2idx = {t: i for i, t in enumerate(tags)}
            # word2idx['graphene']
            # tag2idx['NONNANO']

            from keras.preprocessing.sequence import pad_sequences
            X = [[word2idx[w[0]] for w in s] for s in sentences]
            X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)
            y = [[tag2idx[w[1]] for w in s] for s in sentences]
            y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                              value=tag2idx["NANO"])

            from keras.utils.np_utils import to_categorical
            y = [to_categorical(i, n_tags) for i in y]

            from sklearn.model_selection import train_test_split
            X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

            from keras.models import Model
            from keras.layers import LSTM, Input, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
            from keras_contrib.layers import CRF

            input = Input(shape=(max_len,))
            model = Embedding(input_dim=n_words + 1, output_dim=20,
                              input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
            model = Bidirectional(LSTM(units=50, return_sequences=True,
                                       recurrent_dropout=0.2))(model)
            model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
            crf = CRF(n_tags)  # CRF layer
            out = crf(model)  # output

            model = Model(input, out)
            model.compile(optimizer="rmsprop", loss=crf.loss_function,
                          metrics=[crf.accuracy])
            model.summary()

            history = model.fit(X_tr, np.array(y_tr), batch_size=84, epochs=8,
                                validation_split=0.3, verbose=1)
            hist = pd.DataFrame(history.history)
            # print(hist)

            import matplotlib.pyplot as plt
            plt.style.use("ggplot")
            plt.figure(figsize=(12, 12))
            plt.plot(hist["acc"])
            plt.plot(hist["val_acc"])
            plt.show()

            i = 0
            p = model.predict(np.array([X_te[i]]))
            p = np.argmax(p, axis=-1)
            true = np.argmax(y_te[i], -1)
            print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
            print(30 * "=")
            for w, t, pred in zip(X_te[i], true, p[0]):
                if w != 0:
                    print("{:15}: {:5} {}".format(words[w - 1], tags[t], tags[pred]))

            from stanfordcorenlp import StanfordCoreNLP
            import logging
            import json

            class StanfordNLP:
                def __init__(self, host='http://localhost', port=9000):
                    self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)
                    # , quiet=False, logging_level=logging.DEBUG)
                    self.props = {
                        'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
                        'pipelineLanguage': 'en',
                        'outputFormat': 'json'
                    }

                def word_tokenize(self, sentence):
                    return self.nlp.word_tokenize(sentence)

            def tokens_to_dict(_tokens):
                tokens = defaultdict(dict)
                for token in _tokens:
                    tokens[int(token['index'])] = {
                        'word': token['word'],
                        'lemma': token['lemma'],
                        'pos': token['pos'],
                        'ner': token['ner']
                    }
                return tokens

            if __name__ == '__main__':
                sNLP = StanfordNLP()
                text = "Cantilever Island Atomic distance is 25 nm Force Microscopy Contact Roberts Microscopy"
                print(sNLP.word_tokenize(text))
                test = sNLP.word_tokenize(text)
                x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test]],
                                            padding="post", value=0, maxlen=max_len)
                print(x_test_sent)
                p = model.predict(np.array([x_test_sent[0]]))
                p = np.argmax(p, axis=-1)
                print("{:15}||{}".format("Word", "Prediction"))
                print(30 * "=")
                for w, pred in zip(test, p[0]):
                    print("{:15}: {:5}".format(w, tags[pred]))

            flash('You have given ' + name + ' as input ')
        else:
            flash('Required: All the form fields are required. ')

    return render_template('hello.html', form=form)
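# The StanfordNLP wrapper above assumes a CoreNLP server is already listening
# on localhost:9000. A typical way to start one (classpath and memory settings
# are assumptions, adjust to your installation):
#
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#        -port 9000 -timeout 30000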
out = crf(model)
model = Model(input, out)

Y_train_2 = keras.utils.to_categorical(Y_train)
Y_val_2 = keras.utils.to_categorical(Y_val)
Y_test_2 = keras.utils.to_categorical(Y_test)

model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
history = model.fit(X_train, Y_train_2, batch_size=64, epochs=10,
                    validation_data=(X_val, Y_val_2), callbacks=earlyStop)

preds = model.predict(X_test)
test = [[np.argmax(y) for y in x] for x in preds]
test_arr = np.asarray(test)
test = np.reshape(test_arr, (-1))
print(metrics.precision_recall_fscore_support(np.reshape(Y_test, (-1)), test,
                                              average=None, labels=[0, 1]))

preds = test_arr

## record the predicted start and end index
with open('../../outputs/CRF_glove_preds', 'w') as fout:
    with open('../../data/test.txt', 'r') as test:
        test_list = test.readlines()
    for i in range(len(preds)):
        sent = test_list[i].strip().split()
# `assert_consumed` can be used as validation that all variable values have
# been restored from the checkpoint. See `tf.train.Checkpoint.restore` for
# other methods in the Status object.
# print(load_status.assert_consumed())

model.summary()
print('AFTER LOADING', model.get_weights())

# ======================================================================================================================
# Predict on validation data
# ======================================================================================================================

print('\nPredicting...')
y_pred_summaries = model.predict(x=test_generator)
print(y_pred_summaries)
print('\nY_PRED SHAPE', np.array(y_pred_summaries, dtype=object).shape)

# ======================================================================================================================
# Set data generators for batch training
# ======================================================================================================================

sentence_model = False  # True False
if sentence_model:
    # Set batch size, train and test data size (set during pre-processing in preprocessing.py)
    batch_size = 256  # 224 # 1024
    train_data_size = 4136306  # 4139868 # 530809  [the number of train sentences/docs]
    validation_data_size = 156519  # 156836 # 20000  [the number of validation sentences/docs]
    test_data_size = 155801  # 156085  [the number of test sentences/docs]
model = Embedding(input_dim=no_words, output_dim=EMBEDDING,
                  input_length=MAX_LEN, mask_zero=True)(input)
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)
model = TimeDistributed(Dense(50, activation="relu"))(model)
crf_lstm = CRF_2nd(no_tags)
out_layer = crf_lstm(model)

model = Model(input, out_layer)
model.compile(optimizer="rmsprop", loss=crf_lstm.loss_function,
              metrics=[crf_lstm.accuracy])
model.summary()

history = model.fit(bi_train_x, np.array(bi_train_y), batch_size=BATCH_SIZE,
                    epochs=EPOCHS, validation_split=0.1, verbose=2)

pred_y = model.predict(bi_test_X)
pred_y = np.argmax(pred_y, axis=-1)
y_test_true = np.argmax(bi_test_y, -1)
y_test_true = [[index_to_tag[i] for i in row] for row in y_test_true]
y_test_true = [[x for x in row if x != 'PADword'] for row in y_test_true]
pred_y = [[index_to_tag[i] for i in row] for row in pred_y]
pred_y = [[x.replace("PADword", "O") for x in pred_y[index]][:len(y_test_true[index])]
          for index in range(len(y_test_true))]
# flat_classification_report expects (y_true, y_pred); the original call
# passed them in the opposite order.
print('LSTM Classification Report\n',
      metrics.flat_classification_report(y_test_true, pred_y,
                                         labels=tags_without_o))

# Used four methods for the ensemble, but more could easily be added.
# Flattening function:
flatten = lambda l: [item for sublist in l for item in sublist]
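# A minimal sketch of a majority-vote ensemble over per-token tag predictions,
# using the flatten helper above (the prediction-list names in the usage line
# are illustrative, not from the original code):
from collections import Counter

def majority_vote(*prediction_lists):
    # each argument: list of sentences, each a list of tag strings,
    # all aligned to the same tokens
    flat_preds = [flatten(p) for p in prediction_lists]
    return [Counter(votes).most_common(1)[0][0] for votes in zip(*flat_preds)]

# ensemble_tags = majority_vote(pred_y_lstm, pred_y_crf, pred_y_hmm, pred_y_spacy)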
model = Model(input, out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
print(model.summary())

history = model.fit(x_train, np.array(y_train), batch_size=64, epochs=30,
                    validation_split=0.1, verbose=1)

# predict the named entities in the test set
# evaluate the model
from sklearn.metrics import classification_report

y_pred = model.predict(x_test, verbose=1)
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            if type(idx2tag[p_i]) != str:
                print(p_i)
                print(idx2tag)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
def train(word2Vec, train_df, test_df, max_length, filters, kernel_size,
          pool_size, dense):
    r = random.randint(1, 10000)
    now = datetime.datetime.now()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(word2Vec.wv.vocab.keys())
    train_x = tokenizer.texts_to_sequences(train_df['text'])
    train_x = pad_sequences(train_x, maxlen=max_length)
    # train_y_good = train_df["good"]
    # train_y_bad = train_df["bad"]
    train_y = pd.DataFrame(train_df, columns=["good", "bad"])
    test_x = tokenizer.texts_to_sequences(test_df['text'])
    test_x = pad_sequences(test_x, maxlen=max_length)

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, word2Vec.vector_size))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = word2Vec[word]
        except:
            continue

    inputs = Input(shape=(max_length,))
    model = Embedding(name="embedding", input_dim=len(word_index) + 1,
                      output_dim=word2Vec.vector_size,
                      weights=[embedding_matrix],
                      input_length=max_length)(inputs)
    model = Conv1D(name="conv1D", filters=filters, kernel_size=kernel_size,
                   kernel_initializer=initializers.glorot_uniform(seed=1),
                   padding='same')(model)
    model = MaxPooling1D(name="maxPooling1D", pool_size=pool_size, strides=1,
                         padding='same')(model)
    model = Flatten(name="flatten")(model)
    model = Dense(name="dense", output_dim=dense,
                  kernel_initializer=initializers.glorot_uniform(seed=1),
                  activation='relu')(model)
    model = Dense(name="output", output_dim=2,
                  kernel_initializer=initializers.glorot_uniform(seed=1),
                  activation='relu')(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='mae', optimizer=optimizers.Adam(lr=.001), metrics=['mse'])
    model.summary()

    epochs = 15
    callback = model.fit(x=train_x, y=train_y, epochs=epochs,
                         validation_split=.3, batch_size=20, verbose=1).history
    test_y = np.rint(model.predict(x=test_x, batch_size=10, verbose=1)).astype('int')
    seconds = str((datetime.datetime.now() - now).seconds)

    with open('test{seconds}_{r}.txt'.format(seconds=seconds, r=r), 'w') as file:
        file.write('id,good,bad\n')
        for index, data in enumerate(test_y):
            file.write('{},{},{}\n'.format(index, data[0], data[1]))

    with open('record{seconds}_{r}.log'.format(seconds=seconds, r=r), 'w') as file:
        file.write('result\t\n\n')
        file.write('\t'.join(['index', 'loss\t\t', 'mse\t\t\t', 'val_loss\t\t',
                              'val_mse\t']) + '\n')
        for index, loss, mse, val_loss, val_mse in zip(
                range(1, epochs + 1), callback['loss'],
                callback['mean_squared_error'], callback['val_loss'],
                callback['val_mean_squared_error']):
            file.write('\t'.join([
                str(index) + '\t', '{:.12f}'.format(loss),
                '{:.12f}'.format(mse), '{:.12f}'.format(val_loss),
                '{:.12f}'.format(val_mse)
            ]) + '\n')
        file.write('\nmax_length={max_length}\nmin_count={min_count}, size=270, iter=10, sg=1, workers=10\n'
                   .format(max_length=max_length, min_count=min_count))
        file.write('inputs = Input(shape=(max_length,)\n')
        file.write('model = Embedding(name="embedding", input_dim=len(word_index)+1, output_dim=word2Vec.vector_size, weights=[embedding_matrix], input_length=max_length)(inputs)\n')
        file.write('model = Conv1D(name="conv1D_good", filters={filters}, kernel_size={kernel_size}, kernel_initializer=initializers.glorot_uniform(seed=1), padding="same")(model)\n'
                   .format(filters=filters, kernel_size=kernel_size))
        file.write('model = MaxPooling1D(name="maxPooling1D", pool_size={pool_size}, strides=1, padding="same")(model)\n'
                   .format(pool_size=pool_size))
        file.write('model = Dense(name="dense", output_dim={dense}, kernel_initializer=initializers.glorot_uniform(seed=1), activation="relu")(model)\n'
                   .format(dense=dense))
        file.write('model = Dense(name="output", output_dim=2, kernel_initializer=initializers.glorot_uniform(seed=1), activation="relu")(model)\n')
        file.write('model = Model(inputs=inputs, outputs=model)\n')
        file.write('model.compile(loss="mae", optimizer=optimizers.Adam(lr=.001), metrics=["mse"])\n')

    import matplotlib.pyplot as plt
    fig = plt.figure()
    plt.grid(True)
    plt.ylim(0, 40)
    plt.plot(callback['loss'])
    # NOTE: depending on the Keras version, the history keys are 'mse'/'val_mse'
    # or 'mean_squared_error'/'val_mean_squared_error'; both spellings appear
    # in this function, so only one of the two code paths will work as-is.
    plt.plot(callback['mse'])
    plt.plot(callback['val_loss'])
    plt.plot(callback['val_mse'])
    plt.title('model loss')
    plt.ylabel('loss (mae)')
    plt.xlabel('epoch')
    plt.legend(['train_loss', 'train_mse', 'test_loss', 'test_mse'],
               loc='upper right')
    fig.savefig('{seconds}_{r}.png'.format(seconds=seconds, r=r), dpi=fig.dpi)
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

import time
start = time.time()
history = model.fit(X, np.array(y), batch_size=32, epochs=15, verbose=1)
end = time.time()
print(end - start)

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

test_pred = model.predict(X, verbose=1)
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=1,
                    validation_split=0.1, verbose=1)

hist = pd.DataFrame(history.history)

# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

test_pred = model.predict(X_te, verbose=1)
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
elif "scope" in negation[0]: cues[i, j] = 1; # not a cue golds[i, j, classes["F"]] = 1; else: cues[i, j] = 1; # not a cue golds[i, j, classes["T"]] = 1; if arguments.debug: print("evaluation: {} word tokens; {} unknown." "".format(n, unknown)); print("model.evaluate() on evaluation: {}" "".format(model.evaluate(([inputs, cues] if arguments.cues else inputs), golds, verbose = 1))); outputs = model.predict(([inputs, cues] if arguments.cues else inputs), verbose = 1); tf.keras.backend.clear_session(); # # convert back from ‘categorical’, one-hot encoding and un-pad; # while at it, (wastefully :-) produce two flat lists of labels. # golds = [np.argmax(gold, axis = 1) for gold in golds]; outputs = [np.argmax(output, axis = 1) for output in outputs]; labels = []; system = []; for i, sentence in enumerate(test): golds[i] = golds[i][0:len(sentence["nodes"])]; labels.extend(golds[i]); outputs[i] = outputs[i][0:len(sentence["nodes"])]; system.extend(outputs[i]);
new_notes = []
new_beginnings = []
for i in range(len(lengths)):
    for l in range(int(lengths[i] / 0.5)):
        new_notes.append(notes1[i])
        if l == 0:
            new_beginnings.append('b')
        elif l == int(lengths[i] / 0.5) - 1:
            new_beginnings.append('e')
        else:
            new_beginnings.append('c')

test = [str(i) + ' ' + c for i, c in zip(new_notes, new_beginnings)]
test = [note2idx[t] for t in test]
test = pad_sequences(maxlen=max_len, sequences=[test], padding="post",
                     value=n_notes - 1)

p = model.predict(np.array(test))
p = np.argmax(p, axis=-1)

melody_notes = [notes[w] for w in test[0]]
predicted_chords = [chords[pred] for pred in p[0]]

pickle.dump(melody_notes, open(state_name + "predictions/melody.p", "wb"))
pickle.dump(predicted_chords, open(state_name + "predictions/chords.p", "wb"))
model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

history = model.fit(X_tr, np.array(y_tr), batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_split=0.1, verbose=2)

# Eval
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true]

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)

i = np.random.randint(0, X_te.shape[0])  # choose a random number between 0 and len(X_te)
print(i)
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
model = TimeDistributed(Dense(50, activation="relu"))(model)
out = Dense(6, activation='softmax')(model)
# crf = CRF(n_tags+1)
# out = crf(model)

model = Model(input, out)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train, numpy.array(y_train), batch_size=BATCH_SIZE,
                    epochs=EPOCHS, validation_split=0.2, verbose=2)
# history = model.fit(X_train, numpy.array(y_train), batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2, verbose=2)

pred_cat = model.predict(X_test)
pred = numpy.argmax(pred_cat, axis=-1)
y_test_true = numpy.argmax(y_test, -1)

from sklearn_crfsuite.metrics import flat_classification_report
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_test_true_tag = [[idx2tag[i] for i in row] for row in y_test_true]
# from sklearn.metrics import f1_score
# report = f1_score(y_test, pred_cat)
report = flat_classification_report(y_pred=pred_tag, y_true=y_test_true_tag)
print(report)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

confE = []
confN = []
confT = []
confJ = []
# precal = []
yactual = []
ypred = []
for i in range(np.shape(Xtstc)[0]):
    print(i)
    y = model.predict(Xtstc[i].reshape(1, -1))
    One = int(np.round(y[0]))
    Two = int(np.round(y[1]))
    Three = int(np.round(y[2]))
    Four = int(np.round(y[3]))
    ya = y_testc[i]
    Oneac = int(ya[3])
    Twoac = int(ya[1])
    Threeac = int(ya[2])
    Fourac = int(ya[0])
    ypre = [One, Two, Three, Four]
    yac = [Oneac, Twoac, Threeac, Fourac]
    ypred.append(ypre)
    yactual.append(yac)
    confE.append(confusion_matrix([Four], [Fourac]))
    confN.append(confusion_matrix([Two], [Twoac]))
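# A minimal sketch of aggregating the per-sample comparisons above into one
# confusion matrix per trait (the index 3 matches the Four/Fourac pairing
# used for confE; labels are assumed binary 0/1, and passing `labels` pins
# the matrix to 2x2 even when a single sample shows only one class):
import numpy as np
from sklearn.metrics import confusion_matrix

confE_total = np.sum([confusion_matrix([a], [p], labels=[0, 1])
                      for a, p in zip((ya[3] for ya in yactual),
                                      (yp[3] for yp in ypred))], axis=0)
print(confE_total)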
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
# NOTE: trained weights are assumed to be loaded (or the model trained)
# elsewhere before predicting below.

truths = []
predictions = []
for i in range(len(y_te)):
    p = model.predict(np.array([X_te[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_te[i], -1)
    for w, t, pred in zip(X_te[i], true, p[0]):
        if w != 0:
            truths.append(t)
            predictions.append(pred)
# print(predictions)

accuracy = accuracy_score(truths, predictions)
print(accuracy)
print(classification_report(truths, predictions, target_names=["D", "O", "T"]))
crf = CRF_2nd(no_tags)
out_layer = crf(model)

model = Model(input, out_layer)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

history = model.fit(train_X, np.array(train_y), batch_size=BATCH_SIZE,
                    epochs=EPOCHS, validation_split=0.1, verbose=2)

pred_y = model.predict(test_X)
pred_y = np.argmax(pred_y, axis=-1)
y_test_true = np.argmax(test_y, -1)
y_test_true = [[index_to_tag[i] for i in row] for row in y_test_true]
y_test_true = [[x for x in row if x != 'PADword'] for row in y_test_true]
pred_y = [[index_to_tag[i] for i in row] for row in pred_y]
pred_y = [[x.replace("PADword", "O") for x in pred_y[index]][:len(y_test_true[index])]
          for index in range(len(y_test_true))]
# flat_classification_report expects (y_true, y_pred); the original call
# passed them in the opposite order.
print('LSTM Classification Report\n',
      metrics.flat_classification_report(y_test_true, pred_y))