コード例 #1
0
        train_features = [words_train, char_train]
        test_features = [words_test, char_test]
    else:
        train_features = words_train
        test_features = words_test
    # Label lists passed to fit(): POS tags first, chunk tags second.
    train_labels = [pos_train, chunk_train]
    test_labels = [pos_test, chunk_test]

    # Callback built from the test features, the gold chunk tags and the
    # chunk vocabulary; it is handed to fit() below.
    chunk_f1_cb = ConllCallback(
        test_features,
        chunk_test,
        dataset.chunk_vocab.vocab,
        batch_size=64,
    )
    model.fit(
        train_features,
        train_labels,
        epochs=args.e,
        batch_size=args.b,
        validation_data=(test_features, test_labels),
        callbacks=[chunk_f1_cb],
    )

    # Save the trained model.
    _save_model()

    # Build a fresh SequenceChunker and restore it from model_path.
    model = SequenceChunker(use_cudnn=args.use_cudnn)
    model.load(model_path)

    # Predict on the test features (the positional 64 is presumably the
    # batch size -- confirm against SequenceChunker.predict) and print the
    # scores computed against the gold chunk tags.
    chunk_pred = model.predict(test_features, 64)
    res = get_conll_scores(
        chunk_pred, chunk_test, dataset.chunk_vocab.reverse_vocab())
    print(res)
コード例 #2
0
ファイル: inference.py プロジェクト: neuroph12/intel_nlp
    validate_existing_filepath(settings_path)

    # Restore the chunker from model_path and remember its max word length.
    model = SequenceChunker()
    model.load(model_path)
    word_length = model.max_word_len

    # Read the pickled parameter dict and pull the vocabularies out of it.
    # NOTE(review): pickle.load can execute arbitrary code -- only use
    # settings files that come from a trusted source.
    with open(settings_path, 'rb') as settings_file:
        model_params = pickle.load(settings_file)
    word_vocab = model_params['word_vocab']
    chunk_vocab = model_params['chunk_vocab']
    # The character vocabulary is optional: None when the key is absent.
    char_vocab = model_params.get('char_vocab', None)

    # Tokenize every stripped line of the input file; the listed spaCy
    # components are passed as disabled.
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    with open(args.input_file) as input_file:
        document_texts = [nlp.tokenize(line.strip()) for line in input_file]

    # Vectorize the tokens, then run the model on each document vector.
    doc_vecs = vectorize(document_texts, word_vocab, char_vocab)
    document_annotations = []
    for vec in doc_vecs:
        doc_chunks = model.predict(vec, batch_size=args.b)
        # argmax over axis 2 picks one id per position; flatten to a 1-D
        # sequence before mapping ids back through the chunk vocabulary.
        chunk_ids = doc_chunks.argmax(2).flatten()
        chunk_a = [chunk_vocab.id_to_word(chunk_id) for chunk_id in chunk_ids]
        document_annotations.append(chunk_a)

    # Hand the texts and their annotations to build_annotation.
    build_annotation(document_texts, document_annotations)
コード例 #3
0
ファイル: train.py プロジェクト: xcgfth/nlp-architect
    # When an external embedding model is given, load it, build an
    # embedding matrix against the dataset word vocabulary and push that
    # matrix into the model.
    if args.embedding_model is not None:
        embedding_model, _ = load_word_embeddings(args.embedding_model)
        embedding_mat = get_embedding_matrix(
            embedding_model, dataset.word_vocab)
        model.load_embedding_weights(embedding_mat)

    # Train on the word inputs with POS and chunk labels as targets; the
    # callback receives the test words, gold chunk tags and chunk vocabulary.
    chunk_f1_cb = ConllCallback(
        words_test,
        chunk_test,
        dataset.chunk_vocab.vocab,
        batch_size=64,
    )
    model.fit(
        words_train,
        [pos_train, chunk_train],
        epochs=args.e,
        batch_size=args.b,
        callbacks=[chunk_f1_cb],
    )

    # Save the trained model.
    _save_model()

    # Switch the model to chunk inference mode before predicting.
    model.chunk_inference_mode()
    chunk_pred = model.predict(words_test, 64)
    # Re-read the gold chunk tags (third element of the test set).
    _, _, chunk_test = dataset.test_set

    res = get_conll_scores(
        chunk_pred, chunk_test, dataset.chunk_vocab.reverse_vocab())
    # Report either the last entry of the 'NP' scores or the last entry of
    # the overall scores, depending on the print_np flag.
    if args.print_np is True:
        report = 'NP F1: {}'.format(res[1]['NP'][-1])
    else:
        report = 'Chunk F1: {}'.format(res[0][-1])
    print(report)
コード例 #4
0
              epochs=args.epochs,
              cost=cost,
              callbacks=callbacks)

    # Save model: the settings dict is pickled to settings_path + '.dat'
    # and the model itself is written to model_path.
    model_settings = {
        'sentence_len': args.sentence_len,
        'use_embeddings': args.embedding_model is not None,
        'pos': args.use_pos,
        'char_rnn': args.use_char_rnn,
        'y_vocab': dataset.y_vocab,
        'vocabs': dataset.vocabs,
    }

    with open(settings_path + '.dat', 'wb') as fp:
        pickle.dump(model_settings, fp)
    model.save(model_path)

    # Tagging accuracy: take the argmax over axis 2 of the predictions and
    # compare with the gold labels reshaped to rows of sentence_len.
    y_preds = model.predict(test_set)
    predictions = y_preds.argmax(2)
    truth_labels = test_set.y.reshape(-1, args.sentence_len)

    # Inverse label map: each y_vocab id shifted by one -> its label.
    id_to_label = {v + 1: k for k, v in dataset.y_vocab.items()}
    # Named conll_scores rather than `eval` so the builtin eval() is not
    # shadowed.
    conll_scores = get_conll_scores(predictions, truth_labels, id_to_label)
    if args.print_np_perf is True:
        print('NP performance: {}'.format(conll_scores[1]['NP']))
    else:
        print('Global performance: {}'.format(conll_scores[0]))
コード例 #5
0
    validate_existing_filepath(settings_path)

    # Restore the chunker from model_path.
    model = SequenceChunker()
    model.load(model_path)

    # Read the pickled parameter dict and pull the vocabularies out of it.
    # NOTE(review): pickle.load can execute arbitrary code -- only use
    # settings files that come from a trusted source.
    with open(settings_path, 'rb') as settings_file:
        model_params = pickle.load(settings_file)
    word_vocab = model_params['word_vocab']
    chunk_vocab = model_params['chunk_vocab']
    pos_vocab = model_params['pos_vocab']

    # Tokenize every stripped line of the input file; the listed spaCy
    # components are passed as disabled.
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    with open(args.input_file) as input_file:
        document_texts = [nlp.tokenize(line.strip()) for line in input_file]

    # Vectorize the tokens, then run the model on each document vector.
    doc_vecs = vectorize(document_texts, word_vocab)
    document_annotations = []
    for vec in doc_vecs:
        # predict() yields a pair here: POS output first, chunk output second.
        doc_pos, doc_chunks = model.predict(vec, batch_size=1)
        # argmax over axis 2 picks one id per position; ids are flattened
        # and mapped back through the matching vocabulary.
        pos_ids = doc_pos.argmax(2).flatten()
        pos_a = [pos_vocab.id_to_word(pos_id) for pos_id in pos_ids]
        chunk_ids = doc_chunks.argmax(2).flatten()
        chunk_a = [chunk_vocab.id_to_word(chunk_id) for chunk_id in chunk_ids]
        document_annotations.append((pos_a, chunk_a))

    # Hand the texts and their annotations to build_annotation.
    build_annotation(document_texts, document_annotations)
コード例 #6
0
ファイル: train.py プロジェクト: cdj0311/nlp-architect
    # Train on train_set with the optimizer, cost and callbacks configured
    # earlier.
    model.fit(
        train_set,
        optimizer=optimizer,
        epochs=args.epochs,
        cost=cost,
        callbacks=callbacks,
    )

    # Save model: the settings dict is pickled to settings_path + '.dat'
    # and the model itself is written to model_path.
    model_settings = {
        'sentence_len': args.sentence_len,
        'use_embeddings': args.embedding_model is not None,
        'pos': args.use_pos,
        'char_rnn': args.use_char_rnn,
        'y_vocab': dataset.y_vocab,
        'vocabs': dataset.vocabs,
    }

    with open(settings_path + '.dat', 'wb') as fp:
        pickle.dump(model_settings, fp)
    model.save(model_path)

    # Tagging accuracy: take the argmax over axis 2 of the predictions and
    # compare with the gold labels reshaped to rows of sentence_len.
    y_preds = model.predict(test_set)
    predictions = y_preds.argmax(2)
    truth_labels = test_set.y.reshape(-1, args.sentence_len)

    # Inverse label map: each y_vocab id shifted by one -> its label.
    id_to_label = {v + 1: k for k, v in dataset.y_vocab.items()}
    # Named conll_scores rather than `eval` so the builtin eval() is not
    # shadowed.
    conll_scores = get_conll_scores(predictions, truth_labels, id_to_label)
    if args.print_np_perf is True:
        print('NP performance: {}'.format(conll_scores[1]['NP']))
    else:
        print('Global performance: {}'.format(conll_scores[0]))