batch_size=args.b) # train model model.fit(x=train_x, y=train_y, batch_size=args.b, epochs=args.e, validation=(test_x, test_y), callbacks=[conll_cb]) print('Training done.') print('Saving model') model.save(args.model_path) with open(args.model_info_path, 'wb') as fp: info = { 'type': 'seq2seq', 'tags_vocab': dataset.tags_vocab.vocab, 'word_vocab': dataset.word_vocab.vocab, 'char_vocab': dataset.char_vocab.vocab, 'intent_vocab': dataset.intents_vocab.vocab, } pickle.dump(info, fp) # test performance predictions = model.predict(test_x, batch_size=args.b) eval = get_conll_scores( predictions, test_y, {v: k for k, v in dataset.tags_vocab.vocab.items()}) print(eval)
def on_epoch_end(self, epoch, logs=None):
    """Predict on the stored inputs and print the CoNLL F1 value.

    Runs ``self.model.predict`` on ``self.x`` with batch size ``self.bsz``,
    hands the predictions together with ``self.y`` and ``self.y_vocab`` to
    ``get_conll_scores`` and prints the last item of the first element of
    its result, labelled as F1.

    Args:
        epoch: not used by this method.
        logs: not used by this method.
    """
    predicted = self.model.predict(self.x, batch_size=self.bsz)
    scores = get_conll_scores(predicted, self.y, self.y_vocab)
    overall = scores[0]
    # Blank line first so the report starts on its own line.
    print()
    print('Conll eval F1: {}'.format(overall[-1]))
train_features = [words_train, char_train] test_features = [words_test, char_test] else: train_features = words_train test_features = words_test train_labels = [pos_train, chunk_train] test_labels = [pos_test, chunk_test] chunk_f1_cb = ConllCallback(test_features, chunk_test, dataset.chunk_vocab.vocab, batch_size=64) model.fit(train_features, train_labels, epochs=args.e, batch_size=args.b, validation_data=(test_features, test_labels), callbacks=[chunk_f1_cb]) # save model _save_model() # load model model = SequenceChunker(use_cudnn=args.use_cudnn) model.load(model_path) # print evaluation metric chunk_pred = model.predict(test_features, 64) res = get_conll_scores(chunk_pred, chunk_test, dataset.chunk_vocab.reverse_vocab()) print(res)
def on_epoch_end(self, epoch, logs=None):
    """Predict on the stored inputs and print the full CoNLL statistics.

    Runs ``self.model.predict`` on ``self.x`` with batch size ``self.bsz``,
    scores the predictions against ``self.y`` with ``get_conll_scores``
    (using ``self.y_vocab``) and prints whatever that call returns.

    Args:
        epoch: not used by this method.
        logs: not used by this method.
    """
    model_output = self.model.predict(self.x, batch_size=self.bsz)
    conll_stats = get_conll_scores(model_output, self.y, self.y_vocab)
    # Blank line first so the report starts on its own line.
    print()
    message = "Conll eval: \n{}".format(conll_stats)
    print(message)
def run_aspect_sequence_tagging(
    train_file,
    test_file,
    models_path: str,
    logs_path: Path,
    augment_data: bool,
    embedding_model: str,
    word_embedding_dims: int,
    character_embedding_dims: int,
    char_features_lstm_dims: int,
    entity_tagger_lstm_dims: int,
    tagger_fc_dims: int,
    batch_size=10,
    epoch=50,
    tag_num=2,
    sentence_length=30,
    word_length=20,
    dropout=0.2,
    bilstm_layer: bool = True,
    crf_layer: bool = False,
    word_embedding_flag: bool = True,
    char_embedding_flag: bool = True,
    similarity_threshold: float = 0.8,
):
    """Train an ``AspectExtraction`` tagger, score it and pickle a run summary.

    The summary is written to ``<models_path>/model-info-<options>-<train
    file name>.info``, where ``<options>`` is built from the enabled flags.
    If that file already exists the function reports it via ``click.echo``
    and returns without doing any work.

    Args:
        train_file: Training file; forwarded to ``SequentialTaggingDataset``
            and its base name becomes part of the summary file name.
        test_file: Test file; forwarded to ``SequentialTaggingDataset``.
        models_path: Directory that receives the summary file. It is also
            appended to ``'trained'`` and to ``logs_path`` to form two more
            directories, both of which are created.
        logs_path: Base directory for the TensorBoard logs.
        augment_data: Forwarded to the dataset; also adds a
            ``<similarity_threshold>augmented`` tag to the file name.
        embedding_model: Forwarded to ``build`` as
            ``external_embedding_model``.
        word_embedding_dims: Forwarded to ``build``.
        character_embedding_dims: Forwarded to ``build`` as
            ``char_embedding_dims``.
        char_features_lstm_dims: Forwarded to ``build`` as ``word_lstm_dims``.
        entity_tagger_lstm_dims: Forwarded to ``build`` as
            ``tagger_lstm_dims``.
        tagger_fc_dims: Forwarded to ``build``.
        batch_size: Batch size for training and for the ``ConllCallback``.
        epoch: Number of training epochs; also part of the file name.
        tag_num: Forwarded to the dataset as ``tag_field_no``.
        sentence_length: Forwarded to the dataset as ``max_sentence_length``
            and to ``build``.
        word_length: Forwarded to the dataset as ``max_word_length`` and to
            ``build``.
        dropout: Forwarded to ``build``.
        bilstm_layer: Forwarded to ``build``; selects the ``bilstm`` or
            ``lstm`` tag in the file name.
        crf_layer: Forwarded to ``build``; adds the ``crf`` tag when set.
        word_embedding_flag: Feed the word inputs to the model.
        char_embedding_flag: Feed the character inputs to the model.
        similarity_threshold: Forwarded to the dataset.

    Returns:
        None.

    Raises:
        ValueError: If both ``word_embedding_flag`` and
            ``char_embedding_flag`` are false.
    """
    # The enabled options are encoded in the file name, so every
    # configuration gets its own summary file.
    network_params = [
        ('char', char_embedding_flag),
        ('word', word_embedding_flag),
        ('bilstm', bilstm_layer),
        ('lstm', not bilstm_layer),
        ('crf', crf_layer),
        (str(epoch) + 'epochs', True),
        (str(similarity_threshold) + 'augmented', augment_data),
    ]
    network_params_string = '-'.join(
        param for param, flag in network_params if flag)

    # Only referenced by the disabled ModelCheckpoint callback below; the
    # directory is still created, as before.
    trained_models_path = Path('trained', models_path)
    trained_models_path.mkdir(exist_ok=True, parents=True)
    logs_path = logs_path / models_path
    logs_path.mkdir(exist_ok=True, parents=True)

    model_name = ('model-info' + '-' + network_params_string + '-'
                  + basename(train_file) + '.info')
    info_path = join(models_path, model_name)
    if Path(info_path).exists():
        click.echo('Model has been already computed and saved!')
        return

    # The summary is written below ``models_path`` itself, which none of the
    # mkdir calls above creates. Create it now so a missing directory cannot
    # make the final ``open`` fail after the whole training run.
    Path(info_path).parent.mkdir(exist_ok=True, parents=True)

    # Reject an unusable feature selection before loading any data.
    if not (word_embedding_flag or char_embedding_flag):
        raise ValueError('Wrong features')

    dataset = SequentialTaggingDataset(
        train_file=train_file,
        test_file=test_file,
        augment_data=augment_data,
        similarity_threshold=similarity_threshold,
        max_sentence_length=sentence_length,
        max_word_length=word_length,
        tag_field_no=tag_num)

    # Select the model inputs according to the enabled embeddings.
    x_train, x_char_train, y_train = dataset.train
    x_test, x_char_test, y_test = dataset.test
    if word_embedding_flag and char_embedding_flag:
        x_train = [x_train, x_char_train]
        x_test = [x_test, x_char_test]
    elif char_embedding_flag:
        x_train = x_char_train
        x_test = x_char_test
    # Word-only: x_train / x_test already hold the word inputs.

    # Each size is one larger than the corresponding dataset count.
    num_y_labels = len(dataset.y_labels) + 1
    vocabulary_size = dataset.word_vocab_size + 1
    char_vocabulary_size = dataset.char_vocab_size + 1

    y_test = to_categorical(y_test, num_y_labels)
    y_train = to_categorical(y_train, num_y_labels)

    aspect_model = AspectExtraction()
    aspect_model.build(
        sentence_length,
        word_length,
        num_y_labels,
        dataset.word_vocab,
        vocabulary_size,
        char_vocabulary_size,
        word_embedding_dims=word_embedding_dims,
        char_embedding_dims=character_embedding_dims,
        word_lstm_dims=char_features_lstm_dims,
        tagger_lstm_dims=entity_tagger_lstm_dims,
        tagger_fc_dims=tagger_fc_dims,
        dropout=dropout,
        external_embedding_model=embedding_model,
        bilstm_layer=bilstm_layer,
        crf_layer=crf_layer,
        word_embedding_flag=word_embedding_flag,
        char_embedding_flag=char_embedding_flag,
    )

    # Callbacks: per-epoch CoNLL report, TensorBoard logging and early
    # stopping on val_loss. Saving the best model is currently disabled.
    tensorboard_path = (logs_path / ('tensorboard-' + model_name)).as_posix()
    print('Tensorboard: ' + tensorboard_path)
    callbacks = [
        ConllCallback(x_test, y_test, dataset.y_labels, batch_size=batch_size),
        TensorBoard(log_dir=tensorboard_path),
        EarlyStopping(monitor='val_loss', patience=2),
        # ModelCheckpoint(
        #     filepath=(trained_models_path / '{}-best_model.h5'.format(model_name)).as_posix(),
        #     monitor='val_loss',
        #     save_best_only=True
        # )
    ]
    aspect_model.fit(x=x_train, y=y_train, batch_size=batch_size,
                     epochs=epoch, callbacks=callbacks,
                     validation=(x_test, y_test))

    # Final evaluation on the test inputs (batch size 1, as before).
    predictions = aspect_model.predict(x=x_test, batch_size=1)
    labels_id_to_word = {v: k for k, v in dataset.y_labels.items()}
    eval_scores = get_conll_scores(predictions, y_test, labels_id_to_word)
    pprint.PrettyPrinter(indent=4).pprint(eval_scores)

    # Persist the run summary. word_vocab, test_raw_sentences,
    # data_augmentation, augment_data, predictions, y_test and y_labels were
    # commented out of this dict in the original and remain unsaved.
    with open(info_path, 'wb') as fp:
        info = {
            'sentence_len': sentence_length,
            'word_len': word_length,
            'num_of_labels': num_y_labels,
            'labels_id_to_word': labels_id_to_word,
            'epoch': epoch,
            'vocab_size': vocabulary_size,
            'char_vocab_size': char_vocabulary_size,
            'char_vocab': dataset.char_vocab,
            'word_embedding_dims': word_embedding_dims,
            'char_embedding_dims': character_embedding_dims,
            'word_lstm_dims': char_features_lstm_dims,
            'tagger_lstm_dims': entity_tagger_lstm_dims,
            'dropout': dropout,
            'external_embedding_model': embedding_model,
            'train_file': train_file,
            'test_file': test_file,
            'eval': eval_scores,
            'similarity_threshold': similarity_threshold,
            'bilstm_layer': bilstm_layer,
            'crf_layer': crf_layer,
            'word_embedding_layer': word_embedding_flag,
            'char_embedding_layer': char_embedding_flag,
        }
        print('Save model in: ' + info_path)
        pickle.dump(info, fp)
# Train with a ConllCallback built from the test inputs and labels.
fit_callbacks = [
    ConllCallback(test_inputs, y_test, dataset.y_labels.vocab, batch_size=args.b),
]
ner_model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=args.b,
    epochs=args.e,
    callbacks=fit_callbacks,
    validation=(test_inputs, y_test),
)

# Save the model first, then pickle the three vocabularies to the info path.
ner_model.save(args.model_path)
with open(args.model_info_path, "wb") as info_file:
    vocab_info = {
        "y_vocab": dataset.y_labels.vocab,
        "word_vocab": dataset.word_vocab.vocab,
        "char_vocab": dataset.char_vocab.vocab,
    }
    pickle.dump(vocab_info, info_file)

# Score the test inputs; the label vocabulary is inverted (value -> key)
# before it is handed to get_conll_scores.
test_predictions = ner_model.predict(x=test_inputs, batch_size=args.b)
inverted_labels = {idx: label for label, idx in dataset.y_labels.vocab.items()}
conll_scores = get_conll_scores(test_predictions, y_test, inverted_labels)
print(conll_scores)
# NOTE(review): the control flow enclosing these statements is outside this
# view, so they are laid out flat here.
print('Creating new model, starting to train from scratch')
model.build(
    args.sentence_length,
    dataset.vocab_size,
    dataset.label_vocab_size,
    args.token_emb_size,
    args.encoder_depth,
    args.decoder_depth,
    args.lstm_hidden_size,
    args.encoder_dropout,
    # Fix: the original passed `args.decoder_depth` a second time in this
    # slot, directly after the encoder dropout, so the decoder's dropout
    # received the depth value.
    # NOTE(review): confirm the argument parser defines `decoder_dropout`.
    args.decoder_dropout,
    args.embedding_model,
)

# Callbacks: a ConllCallback on the test data and a periodic checkpoint.
conll_cb = ConllCallback(test_x, test_y, dataset.labels_vocab, batch_size=args.b)
cp_cb = ModelCheckpoint(model_path, verbose=1, period=args.save_epochs)

# train model
model.fit(
    x=train_x,
    y=train_y,
    batch_size=args.b,
    epochs=args.e,
    validation=(test_x, test_y),
    callbacks=[conll_cb, cp_cb],
)
print('Training done.')

# test performance; the label vocabulary is inverted (value -> key) before
# it is handed to get_conll_scores.
predictions = model.predict(test_x, batch_size=args.b)
id_to_label = {v: k for k, v in dataset.labels_vocab.items()}
# Named eval_scores rather than `eval` so the builtin is not shadowed.
eval_scores = get_conll_scores(predictions, test_y, id_to_label)
if args.full_eval is True:
    print(eval_scores)
else:
    # Only the first element of the result is printed by default.
    print(eval_scores[0])
epochs=args.e, callbacks=[conll_cb], validation=([x_test, x_char_test], y_test)) # saving model ner_model.save(args.model_path) with open(args.model_info_path, 'wb') as fp: info = { 'sentence_len': args.sentence_length, 'word_len': args.word_length, 'num_of_labels': num_y_labels, 'labels_id_to_word': {v: k for k, v in dataset.y_labels.items()}, 'word_vocab': dataset.word_vocab, 'vocab_size': vocabulary_size, 'char_vocab_size': char_vocabulary_size, 'char_vocab': dataset.char_vocab, 'word_embedding_dims': args.word_embedding_dims, 'char_embedding_dims': args.character_embedding_dims, 'word_lstm_dims': args.char_features_lstm_dims, 'tagger_lstm_dims': args.entity_tagger_lstm_dims, 'dropout': args.dropout, 'external_embedding_model': args.embedding_model } pickle.dump(info, fp) # running predictions predictions = ner_model.predict(x=[x_test, x_char_test], batch_size=1) eval = get_conll_scores(predictions, y_test, {v: k for k, v in dataset.y_labels.items()}) pp = pprint.PrettyPrinter(indent=4) pp.pprint(eval)
epochs=args.epochs, cost=cost, callbacks=callbacks) # save model model_settings = { 'sentence_len': args.sentence_len, 'use_embeddings': args.embedding_model is not None, 'pos': args.use_pos, 'char_rnn': args.use_char_rnn, 'y_vocab': dataset.y_vocab, 'vocabs': dataset.vocabs, } with open(settings_path + '.dat', 'wb') as fp: pickle.dump(model_settings, fp) model.save(model_path) # tagging accuracy y_preds = model.predict(test_set) predictions = y_preds.argmax(2) truth_labels = test_set.y.reshape(-1, args.sentence_len) eval = get_conll_scores(predictions, truth_labels, {v + 1: k for k, v in dataset.y_vocab.items()}) if args.print_np_perf is True: print('NP performance: {}'.format(eval[1]['NP'])) else: print('Global performance: {}'.format(eval[0]))
# Train on the training set with the optimizer, cost and callbacks set up earlier.
model.fit(
    train_set,
    optimizer=optimizer,
    epochs=args.epochs,
    cost=cost,
    callbacks=callbacks,
)

# Pickle the settings to '<settings_path>.dat', then save the model itself.
model_settings = dict(
    sentence_len=args.sentence_len,
    use_embeddings=args.embedding_model is not None,
    pos=args.use_pos,
    char_rnn=args.use_char_rnn,
    y_vocab=dataset.y_vocab,
    vocabs=dataset.vocabs,
)
with open(settings_path + '.dat', 'wb') as settings_file:
    pickle.dump(model_settings, settings_file)
model.save(model_path)

# Tagging accuracy: argmax over axis 2 of the model output, compared with
# the test labels reshaped to rows of args.sentence_len entries.
predicted_ids = model.predict(test_set).argmax(2)
gold_ids = test_set.y.reshape(-1, args.sentence_len)
# The vocabulary is inverted with every id shifted up by one.
# NOTE(review): presumably id 0 is reserved (e.g. padding) - confirm.
id_to_label = {idx + 1: label for label, idx in dataset.y_vocab.items()}
conll_scores = get_conll_scores(predicted_ids, gold_ids, id_to_label)

if args.print_np_perf is True:
    report = 'NP performance: {}'.format(conll_scores[1]['NP'])
else:
    report = 'Global performance: {}'.format(conll_scores[0])
print(report)