train_features = [words_train, char_train] test_features = [words_test, char_test] else: train_features = words_train test_features = words_test train_labels = [pos_train, chunk_train] test_labels = [pos_test, chunk_test] chunk_f1_cb = ConllCallback(test_features, chunk_test, dataset.chunk_vocab.vocab, batch_size=64) model.fit(train_features, train_labels, epochs=args.e, batch_size=args.b, validation_data=(test_features, test_labels), callbacks=[chunk_f1_cb]) # save model _save_model() # load model model = SequenceChunker(use_cudnn=args.use_cudnn) model.load(model_path) # print evaluation metric chunk_pred = model.predict(test_features, 64) res = get_conll_scores(chunk_pred, chunk_test, dataset.chunk_vocab.reverse_vocab()) print(res)
# Inference driver: load a trained SequenceChunker together with its pickled
# vocabularies, tokenize every line of ``args.input_file`` and print the
# predicted chunk label of each token.
validate_existing_filepath(settings_path)

# load model and parameters
model = SequenceChunker()
model.load(model_path)
word_length = model.max_word_len
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# settings_path must only ever point at a settings file from a trusted source.
with open(settings_path, 'rb') as fp:
    model_params = pickle.load(fp)
word_vocab = model_params['word_vocab']
chunk_vocab = model_params['chunk_vocab']
# the character vocabulary is optional in the settings; None when absent
char_vocab = model_params.get('char_vocab', None)

# parse documents and get tokens
nlp = SpacyInstance(
    disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
with open(args.input_file) as fp:
    # iterate the file object directly instead of materialising every line
    # with readlines() first; one input line is treated as one document
    document_texts = [nlp.tokenize(line.strip()) for line in fp]

# vectorize input tokens and run inference
doc_vecs = vectorize(document_texts, word_vocab, char_vocab)
document_annotations = []
for vec in doc_vecs:
    doc_chunks = model.predict(vec, batch_size=args.b)
    # take the arg-max over axis 2 of the prediction, flatten it, and map
    # every resulting id back through the chunk vocabulary
    chunk_a = [
        chunk_vocab.id_to_word(label_id)
        for label_id in doc_chunks.argmax(2).flatten()
    ]
    document_annotations.append(chunk_a)

# print document text and annotations
build_annotation(document_texts, document_annotations)
# Training driver: optionally seed the model with external word embeddings,
# fit it on the POS and chunk labels, persist it, then report a CoNLL F1 score.

# initialize word embedding if external model selected
if args.embedding_model is not None:
    embedding_model, _ = load_word_embeddings(args.embedding_model)
    embedding_mat = get_embedding_matrix(embedding_model,
                                         dataset.word_vocab)
    model.load_embedding_weights(embedding_mat)

# train the model
chunk_f1_cb = ConllCallback(
    words_test,
    chunk_test,
    dataset.chunk_vocab.vocab,
    batch_size=64,
)
model.fit(
    words_train,
    [pos_train, chunk_train],
    epochs=args.e,
    batch_size=args.b,
    callbacks=[chunk_f1_cb],
)

# save model
_save_model()

# print evaluation metric
model.chunk_inference_mode()
chunk_pred = model.predict(words_test, 64)
# re-read the chunk labels from the dataset's test set before scoring
_, _, chunk_test = dataset.test_set
chunk_labels = dataset.chunk_vocab.reverse_vocab()
res = get_conll_scores(chunk_pred, chunk_test, chunk_labels)
# res[1] holds the per-label scores and res[0] the overall ones; the last
# entry of each is what gets reported as F1
print('NP F1: {}'.format(res[1]['NP'][-1])
      if args.print_np is True
      else 'Chunk F1: {}'.format(res[0][-1]))
epochs=args.epochs, cost=cost, callbacks=callbacks) # save model model_settings = { 'sentence_len': args.sentence_len, 'use_embeddings': args.embedding_model is not None, 'pos': args.use_pos, 'char_rnn': args.use_char_rnn, 'y_vocab': dataset.y_vocab, 'vocabs': dataset.vocabs, } with open(settings_path + '.dat', 'wb') as fp: pickle.dump(model_settings, fp) model.save(model_path) # tagging accuracy y_preds = model.predict(test_set) predictions = y_preds.argmax(2) truth_labels = test_set.y.reshape(-1, args.sentence_len) eval = get_conll_scores(predictions, truth_labels, {v + 1: k for k, v in dataset.y_vocab.items()}) if args.print_np_perf is True: print('NP performance: {}'.format(eval[1]['NP'])) else: print('Global performance: {}'.format(eval[0]))
# Inference driver: load a trained SequenceChunker together with its pickled
# vocabularies, tokenize every line of ``args.input_file`` and print the
# predicted POS tag and chunk label of each token.
validate_existing_filepath(settings_path)

# load model and parameters
model = SequenceChunker()
model.load(model_path)
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# settings_path must only ever point at a settings file from a trusted source.
with open(settings_path, 'rb') as fp:
    model_params = pickle.load(fp)
word_vocab = model_params['word_vocab']
chunk_vocab = model_params['chunk_vocab']
pos_vocab = model_params['pos_vocab']

# parse documents and get tokens
nlp = SpacyInstance(
    disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
with open(args.input_file) as fp:
    # iterate the file object directly instead of materialising every line
    # with readlines() first; one input line is treated as one document
    document_texts = [nlp.tokenize(line.strip()) for line in fp]

# vectorize input tokens and run inference
doc_vecs = vectorize(document_texts, word_vocab)
document_annotations = []
for vec in doc_vecs:
    # the model returns two outputs: POS predictions and chunk predictions
    doc_pos, doc_chunks = model.predict(vec, batch_size=1)
    # take the arg-max over axis 2 of each output, flatten it, and map every
    # resulting id back through the matching vocabulary
    pos_a = [
        pos_vocab.id_to_word(label_id)
        for label_id in doc_pos.argmax(2).flatten()
    ]
    chunk_a = [
        chunk_vocab.id_to_word(label_id)
        for label_id in doc_chunks.argmax(2).flatten()
    ]
    document_annotations.append((pos_a, chunk_a))

# print document text and annotations
build_annotation(document_texts, document_annotations)
# Training driver: fit the model, persist its settings and weights, then score
# the test-set predictions with the CoNLL metric.
model.fit(
    train_set,
    optimizer=optimizer,
    epochs=args.epochs,
    cost=cost,
    callbacks=callbacks,
)

# save model
model_settings = {
    'sentence_len': args.sentence_len,
    'use_embeddings': args.embedding_model is not None,
    'pos': args.use_pos,
    'char_rnn': args.use_char_rnn,
    'y_vocab': dataset.y_vocab,
    'vocabs': dataset.vocabs,
}
# the settings are written next to the model under settings_path + '.dat'
with open(settings_path + '.dat', 'wb') as fp:
    pickle.dump(model_settings, fp)
model.save(model_path)

# tagging accuracy
y_preds = model.predict(test_set)
predictions = y_preds.argmax(2)
# reshape the gold labels to one row per sentence_len entries
truth_labels = test_set.y.reshape(-1, args.sentence_len)
# the scorer's id -> label map uses the y_vocab ids shifted up by one
# NOTE(review): presumably id 0 is reserved (e.g. padding) - confirm
id_to_label = {v + 1: k for k, v in dataset.y_vocab.items()}
# named conll_scores rather than ``eval`` so the built-in eval() is not
# rebound at script scope
conll_scores = get_conll_scores(predictions, truth_labels, id_to_label)
if args.print_np_perf is True:
    print('NP performance: {}'.format(conll_scores[1]['NP']))
else:
    print('Global performance: {}'.format(conll_scores[0]))