def on_epoch_end(self, epoch, logs=None):
    """Keras callback hook: score the validation set after each epoch.

    Runs the model over ``self.generator``, converts the predicted index
    sequences back to label strings, and scores them against
    ``self.valid_labels``. The F1 score is written into ``logs`` under
    ``'valid_f1'`` so other callbacks (checkpointing, CSV logging) can
    monitor it.

    :param epoch: 0-based epoch index (required by the Keras contract).
    :param logs: mutable metrics dict shared with other callbacks; may be
        ``None``, in which case a local dict is used.
    """
    logs = logs or {}
    pred = transform_indices_to_labels(self.index_to_label,
                                       predict(self.model, self.generator))
    f1 = compute_f1(pred, self.valid_labels)
    # BUG FIX: compute_f1 returns (precision, recall, f1) — the print below
    # labels f1[0] as Precision and f1[2] as F1. The original logged f1[0]
    # (precision) under 'valid_f1'; log the actual F1 instead so callbacks
    # monitoring 'valid_f1' track the right metric.
    logs['valid_f1'] = f1[2]
    print(f'Precision: {f1[0]}, Recall: {f1[1]}, F1: {f1[2]}')
def test(model, test_set, idx2Label, package):
    """Evaluate *model* on *test_set* and print precision/recall/F1.

    :param model: trained Keras model to evaluate.
    :param test_set: raw test sentences, batched by ``createBatches``.
    :param idx2Label: mapping from label indices back to label strings.
    :param package: extra context forwarded to ``tag_dataset``.
    :return: tuple ``(precision, recall, f1)`` on the test data.
    """
    batches, batch_lengths = createBatches(test_set)
    predicted, gold = tag_dataset(batches, model, package)
    precision, recall, f1 = compute_f1(predicted, gold, idx2Label)
    print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
          (precision, recall, f1))
    return precision, recall, f1
def train(self):
    """Default training.

    Trains for ``self.epochs`` epochs, recording test/dev F1 after every
    epoch in ``self.f1_test_history`` / ``self.f1_dev_history``, then
    saves the trained weights to an .h5 file named after the
    hyper-parameters and restores the initial weights.
    """
    self.f1_test_history = []
    self.f1_dev_history = []

    for epoch in range(self.epochs):
        print("Epoch {}/{}".format(epoch, self.epochs))

        # One full pass over the training mini-batches.
        for minibatch in iterate_minibatches(self.train_batch,
                                             self.train_batch_len):
            labels, tokens, casing, char = minibatch
            self.model.train_on_batch([tokens, casing, char], labels)

        # F1 on the test split after this epoch.
        predictions, gold = self.tag_dataset(self.test_batch, self.model)
        pre_test, rec_test, f1_test = compute_f1(predictions, gold,
                                                 self.idx2Label)
        self.f1_test_history.append(f1_test)
        print("f1 test ", round(f1_test, 4))

        # F1 on the dev split after this epoch.
        predictions, gold = self.tag_dataset(self.dev_batch, self.model)
        pre_dev, rec_dev, f1_dev = compute_f1(predictions, gold,
                                              self.idx2Label)
        self.f1_dev_history.append(f1_dev)
        print("f1 dev ", round(f1_dev, 4), "\n")

    print("Final F1 test score: ", f1_test)
    print("Training finished.")

    # Persist trained weights under a name that encodes the hyper-parameters.
    self.modelName = "{}_{}_{}_{}_{}_{}_{}".format(
        self.epochs, self.dropout, self.dropout_recurrent,
        self.lstm_state_size, self.conv_size, self.learning_rate,
        self.optimizer.__class__.__name__)
    self.model.save(self.modelName + ".h5")
    print("Model weights saved.")

    # Reset to the initial weights so the object can be re-trained cleanly.
    self.model.set_weights(self.init_weights)
    print("Model weights cleared.")
def evaluate(self, predictions, groundTruths, *args, **kwargs):
    """
    :param predictions: Predictions from predict() function
    :param groundTruths: ground Truth values
    :param args: None
    :param kwargs: None
    :return: A tuple with precision, recall and F1 scores
    """
    # Delegate scoring to the shared compute_f1 helper, decoding label
    # indices via the wrapped model object's mapping.
    scores = compute_f1(predictions, groundTruths,
                        self.cnnBLSTM_Obj.idx2Label)
    precision, recall, f1 = scores
    return (precision, recall, f1)
# Final softmax layer over the label vocabulary, applied per time step.
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
model = Model(inputs=[words_input, casing_input, character_input],
              outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()
# plot_model(model, to_file='model.png')

for epoch in range(epochs):
    print("Epoch %d/%d" % (epoch, epochs))
    progress = Progbar(len(train_batch_len))
    for step, minibatch in enumerate(iterate_minibatches(train_batch,
                                                         train_batch_len)):
        labels, tokens, casing, char = minibatch
        model.train_on_batch([tokens, casing, char], labels)
        progress.update(step)
    # Push the bar to its final position once the epoch is done.
    progress.update(step + 1)
    print(' ')

model.save("models/model.h5")

# Performance on dev dataset
predicted, gold = tag_dataset(dev_batch)
pre_dev, rec_dev, f1_dev = compute_f1(predicted, gold, idx2Label)
print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
      (pre_dev, rec_dev, f1_dev))

# Performance on test dataset
predicted, gold = tag_dataset(test_batch)
pre_test, rec_test, f1_test = compute_f1(predicted, gold, idx2Label)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
      (pre_test, rec_test, f1_test))
def main():
    """Train and evaluate a char-CNN BiLSTM-CRF NER model on CoNLL-2003.

    Loads the train/dev/test splits, builds label/character/word
    vocabularies over all splits, trains with F1-based checkpointing and
    CSV logging, then writes per-token test predictions (token, gold,
    predicted) to ``ner_results.txt``.
    """
    conll_2003_data_dir = './data/conll-2003'
    conll_2003_indices = [0, 3]  # columns read from the CoNLL files
    conll_2003_ignore = ('-DOCSTART-', )
    char_cnn_width = 30

    train_data, train_labels = load_conll(
        os.path.join(conll_2003_data_dir, 'train.txt'), conll_2003_indices,
        conll_2003_ignore)
    validate_data, validate_labels = load_conll(
        os.path.join(conll_2003_data_dir, 'dev.txt'), conll_2003_indices,
        conll_2003_ignore)
    test_data, test_labels = load_conll(
        os.path.join(conll_2003_data_dir, 'test.txt'), conll_2003_indices,
        conll_2003_ignore)

    # Build the label set over all splits so labels that only occur in
    # dev/test still receive indices.
    label_classes = sorted({
        l
        for sent_labels in train_labels + validate_labels + test_labels
        for l in sent_labels
    })
    label_to_index = build_labels_mapping(label_classes)
    index_to_label = build_indices_mapping(label_classes)

    characters = sorted({
        ch
        for sent in train_data + validate_data + test_data
        for word in sent for ch in word
    })
    word_set = {
        w
        for sent in train_data + validate_data + test_data for w in sent
    }
    print(f'{len(word_set)} unique words found.')

    embed = Embeddings('./embeddings/eng/glove.6B.300d.txt', True,
                       word_set=word_set)
    embed_matrix = embed.matrix

    train_inputs = make_ner_inputs(train_data, embed, characters,
                                   char_cnn_width)
    train_outputs = make_ner_one_hot_outputs(train_labels, label_to_index)
    validate_inputs = make_ner_inputs(validate_data, embed, characters,
                                      char_cnn_width)
    test_inputs = make_ner_inputs(test_data, embed, characters,
                                  char_cnn_width)

    # Use the char_cnn_width constant instead of the duplicated literal 30
    # so the inputs and the model can never drift apart.
    model = build_model_char_cnn_lstm_crf(len(label_classes), embed_matrix,
                                          char_cnn_width, len(characters))

    train_generator = DataGenerator(train_inputs, train_outputs, 32)
    evaluator = ModelEval(DataGenerator(validate_inputs), validate_labels,
                          index_to_label)
    # BUG FIX: 'valid_f1' is not a loss, so Keras' default mode='auto'
    # would minimize it and checkpoint the WORST epoch; F1 must be
    # maximized, hence mode='max'.
    model_saver = ModelCheckpoint(filepath='./checkpoints/' +
                                  model.name.replace(' ', '_') +
                                  '_{epoch:02d}.hdf5',
                                  verbose=1,
                                  save_best_only=True,
                                  monitor='valid_f1',
                                  mode='max')
    time_stamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    csv_logger = CSVLogger(f"./logs/NER_log_{time_stamp}.csv", append=False)

    # Resume from a previously saved checkpoint before continuing training.
    model.load_weights('./checkpoints/char_cnn_bilstm_crf17.hdf5')
    model.fit_generator(train_generator,
                        epochs=1,
                        callbacks=[evaluator, model_saver, csv_logger])

    test_pred_indices = predict(model, DataGenerator(test_inputs))
    test_pred_labels = transform_indices_to_labels(index_to_label,
                                                   test_pred_indices)
    print(compute_f1(test_pred_labels, test_labels))

    # Dump "token gold predicted" lines, one sentence per blank-separated
    # block, for offline error analysis.
    with open('ner_results.txt', 'wt') as file:
        for sent, sent_true_labels, sent_pred_labels in zip(
                test_data, test_labels, test_pred_labels):
            file.writelines([
                ' '.join(z) + '\n'
                for z in zip(sent, sent_true_labels, sent_pred_labels)
            ])
            file.write('\n')
def _evaluate_and_report(self):
    """Score the test and dev splits once and print all metrics.

    Appends the epoch's test/dev F1 to ``self.f1_test_history`` /
    ``self.f1_dev_history`` and prints precision, recall and F1 for both
    splits.

    :return: the dev F1 score of this evaluation.
    """
    predLabels, correctLabels = self.tag_dataset(self.test_batch,
                                                 self.model)
    pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels,
                                             self.idx2Label)
    self.f1_test_history.append(f1_test)
    print("\nprec test ", round(pre_test, 4))
    print("rec test", round(rec_test, 4))
    print("f1 test ", round(f1_test, 4))

    predLabels, correctLabels = self.tag_dataset(self.dev_batch, self.model)
    pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels,
                                          self.idx2Label)
    self.f1_dev_history.append(f1_dev)
    print("\nprec dev ", round(pre_dev, 4))
    print("rec dev ", round(rec_dev, 4))
    print("f1 dev ", round(f1_dev, 4), "\n")
    return f1_dev


def train(self):
    """Default training.

    If no saved model exists, trains for ``self.epochs`` epochs
    (evaluating test/dev after each epoch), logs results via
    ``writeToFile`` and saves the weights under ``./models/``. If a saved
    model already exists, only runs a single evaluation. In both cases
    the model is finally reset to its initial weights.

    The duplicated evaluate-and-report code of the original if/else
    branches is factored into ``_evaluate_and_report``.
    """
    self.f1_test_history = []
    self.f1_dev_history = []

    if self.verify_model_exists() is False:
        for epoch in range(self.epochs):
            print("Epoch {}/{}".format(epoch, self.epochs))
            for batch in iterate_minibatches(self.train_batch,
                                             self.train_batch_len):
                labels, tokens, casing, char = batch
                self.model.train_on_batch([tokens, casing, char], labels)
            # compute F1 scores for this epoch
            f1_dev = self._evaluate_and_report()
        print("Final F1 dev score: ", f1_dev)

        self.writeToFile()
        self.model.save("./models/" + self.modelName + ".h5")
        print("Model weights saved.")
        print("Training finished.")
    else:
        # Model already trained: evaluate once without retraining.
        f1_dev = self._evaluate_and_report()
        print("Final F1 dev score: ", f1_dev)

    # Reset to the initial weights so the object can be reused.
    self.model.set_weights(self.init_weights)
    print("Model weights cleared.")
model.summary()

epochs = 80
for epoch in range(epochs):
    print("Epoch %d/%d" % (epoch, epochs))
    a = Progbar(len(train_batch_len))
    for i, batch in enumerate(iterate_minibatches(train_batch,
                                                  train_batch_len)):
        labels, tokens, casing, char = batch
        model.train_on_batch([tokens, casing, char], labels)
        a.update(i)
    print('\n')

# Performance on test dataset
predLabels, correctLabels, sentences = tag_dataset(model, test_batch)

# PERF FIX: the original scanned the entire word2Idx vocabulary for every
# token (O(tokens * |vocab|)). Inverting the mapping once gives identical
# text in O(1) per token (vocabulary indices are unique by construction,
# so the inverse is well-defined).
idx2word = {index: token for token, index in word2Idx.items()}
sents = []
for s in sentences:
    # s[0] holds the sentence's word indices; unknown indices are skipped,
    # and each recovered token is prefixed with a space, as before.
    text = "".join(" " + idx2word[number]
                   for number in s[0] if number in idx2word)
    sents.append(text)

pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels,
                                         idx2Label, sents)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
      (pre_test, rec_test, f1_test))