Example #1
    @classmethod
    def load(cls,
             model_path: str,
             parameter_path: str,
             batch_size: int = 32,
             use_cudnn: bool = False):
        """
        Load an NPAnnotator annotator

        Args:
            model_path (str): path to trained model
            parameter_path (str): path to model parameters
            batch_size (int, optional): inference batch size
            use_cudnn (bool, optional): use GPU for inference (CuDNN cells)

        Returns:
            NPAnnotator class with loaded model
        """

        _model_path = path.join(path.dirname(path.realpath(__file__)),
                                model_path)
        validate_existing_filepath(_model_path)
        _parameter_path = path.join(path.dirname(path.realpath(__file__)),
                                    parameter_path)
        validate_existing_filepath(_parameter_path)

        model = SequenceChunker(use_cudnn=use_cudnn)
        model.load(_model_path)
        with open(_parameter_path, "rb") as fp:
            model_params = pickle.load(fp)
            word_vocab = model_params["word_vocab"]
            chunk_vocab = model_params["chunk_vocab"]
            char_vocab = model_params.get("char_vocab", None)
        return cls(model, word_vocab, char_vocab, chunk_vocab, batch_size)
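A minimal usage sketch for the loader above; the import path and file names below are assumptions, and both files are resolved relative to the module that defines the class, as the path handling in load() shows:

    from nlp_architect.pipelines.spacy_np_annotator import NPAnnotator  # import path is an assumption

    # hypothetical file names; load() looks them up next to the defining module
    annotator = NPAnnotator.load('np_chunker.h5', 'np_chunker.params',
                                 batch_size=64, use_cudnn=False)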
Example #2
    word_vocab_size = len(dataset.word_vocab) + 2
    char_vocab_size = None
    if args.char_features:
        char_train = train_set[3]
        char_test = test_set[3]
        char_vocab_size = len(dataset.char_vocab) + 2

    pos_train = keras.utils.to_categorical(pos_train, num_classes=pos_labels)
    chunk_train = keras.utils.to_categorical(chunk_train,
                                             num_classes=chunk_labels)
    pos_test = keras.utils.to_categorical(pos_test, num_classes=pos_labels)
    chunk_test = keras.utils.to_categorical(chunk_test,
                                            num_classes=chunk_labels)

    # build model with input parameters
    model = SequenceChunker(use_cudnn=args.use_cudnn)
    model.build(word_vocab_size,
                pos_labels,
                chunk_labels,
                char_vocab_size=char_vocab_size,
                max_word_len=args.max_word_length,
                feature_size=args.feature_size,
                classifier=args.classifier)

    # initialize word embedding if external model selected
    if args.embedding_model is not None:
        embedding_model, _ = load_word_embeddings(args.embedding_model)
        embedding_mat = get_embedding_matrix(embedding_model,
                                             dataset.word_vocab)
        model.load_embedding_weights(embedding_mat)
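The POS and chunk labels are turned into one-hot vectors with keras.utils.to_categorical before training. A small self-contained illustration of that call; the +1 on the label counts presumably reserves id 0 for padding, which is an assumption about this dataset:

    import numpy as np
    from tensorflow import keras

    chunk_labels = 3 + 1                      # three real tags, id 0 assumed to be padding
    y = np.array([1, 3, 2, 0])                # one tag id per token
    y_onehot = keras.utils.to_categorical(y, num_classes=chunk_labels)
    print(y_onehot.shape)                     # (4, 4): one row per token, one column per class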
Example #3
                        help='Model name (used for saving the model)')
    parser.add_argument('-b',
                        type=int,
                        action=check_size(1, 9999),
                        default=1,
                        help='inference batch size')
    args = parser.parse_args()
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           '{}.h5'.format(str(args.model_name)))
    settings_path = path.join(path.dirname(path.realpath(__file__)),
                              '{}.params'.format(str(args.model_name)))
    validate_existing_filepath(model_path)
    validate_existing_filepath(settings_path)

    # load model and parameters
    model = SequenceChunker()
    model.load(model_path)
    word_length = model.max_word_len
    with open(settings_path, 'rb') as fp:
        model_params = pickle.load(fp)
        word_vocab = model_params['word_vocab']
        chunk_vocab = model_params['chunk_vocab']
        char_vocab = model_params.get('char_vocab', None)

    # parse documents and get tokens
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    with open(args.input_file) as fp:
        document_texts = [nlp.tokenize(t.strip()) for t in fp.readlines()]

    # vectorize input tokens and run inference
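The snippet is cut off after the last comment. As a rough sketch of what that elided step tends to look like, assuming word_vocab behaves like a dict from token to integer id and the model exposes a Keras-style predict (both are assumptions; the helper below is illustrative, not the original code):

    from tensorflow.keras.preprocessing.sequence import pad_sequences

    def vectorize(docs, vocab, max_len=100):
        # map tokens to ids (0 for out-of-vocabulary) and pad to a fixed length
        return pad_sequences([[vocab.get(t, 0) for t in doc] for doc in docs],
                             maxlen=max_len)

    doc_vecs = vectorize(document_texts, word_vocab)        # assumes a dict-like vocab
    chunk_pred = model.predict(doc_vecs, batch_size=args.b)  # assumes Keras-style predict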
Example #4
    words_train, pos_train, chunk_train = dataset.train_set
    words_test, pos_test, chunk_test = dataset.test_set

    # get label sizes, transform y's into 1-hot encoding
    chunk_labels = len(dataset.chunk_vocab) + 1
    pos_labels = len(dataset.pos_vocab) + 1
    word_vocab_size = len(dataset.word_vocab) + 2
    pos_train = keras.utils.to_categorical(pos_train, num_classes=pos_labels)
    chunk_train = keras.utils.to_categorical(chunk_train,
                                             num_classes=chunk_labels)
    pos_test = keras.utils.to_categorical(pos_test, num_classes=pos_labels)
    chunk_test = keras.utils.to_categorical(chunk_test,
                                            num_classes=chunk_labels)

    # build model with input parameters
    model = SequenceChunker(use_gpu=args.use_gpu)
    model.build(word_vocab_size,
                pos_labels,
                chunk_labels,
                feature_size=args.feature_size)

    # initialize word embedding if external model selected
    if args.embedding_model is not None:
        embedding_model, _ = load_word_embeddings(args.embedding_model)
        embedding_mat = get_embedding_matrix(embedding_model,
                                             dataset.word_vocab)
        model.load_embedding_weights(embedding_mat)

    # train the model
    chunk_f1_cb = ConllCallback(words_test,
                                chunk_test,
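This example is cut off in the middle of constructing the ConllCallback, so its full argument list isn't visible here. The general pattern it follows, a Keras callback that scores the held-out set at the end of every epoch, can be sketched like this (an illustrative stand-in, not the original class):

    import tensorflow as tf

    class EpochEvalCallback(tf.keras.callbacks.Callback):
        """Illustrative stand-in: score chunk predictions after each epoch."""

        def __init__(self, x, y, batch_size=32):
            super().__init__()
            self.x, self.y, self.batch_size = x, y, batch_size

        def on_epoch_end(self, epoch, logs=None):
            preds = self.model.predict(self.x, batch_size=self.batch_size)
            # collapse one-hot vectors back to tag ids and compare
            acc = (preds.argmax(-1) == self.y.argmax(-1)).mean()
            print('epoch {}: token-level tag accuracy {:.4f}'.format(epoch + 1, acc))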
Example #5
    dataset = CONLL2000(sentence_length=args.sentence_len,
                        vocab_size=args.vocab_size,
                        use_pos=args.use_pos,
                        use_chars=args.use_char_rnn,
                        chars_len=args.max_char_word_length,
                        embedding_model_path=args.embedding_model)
    train_set = dataset.train_iter
    test_set = dataset.test_iter

    model = SequenceChunker(sentence_length=args.sentence_len,
                            num_labels=dataset.y_size,
                            token_vocab_size=args.vocab_size,
                            pos_vocab_size=pos_vocab_size,
                            char_vocab_size=char_vocab_size,
                            max_char_word_length=args.max_char_word_length,
                            token_embedding_size=args.token_embedding_size,
                            pos_embedding_size=args.pos_embedding_size,
                            char_embedding_size=args.char_hidden_size,
                            lstm_hidden_size=args.lstm_hidden_size,
                            num_lstm_layers=args.lstm_depth,
                            use_external_embedding=args.embedding_model)

    cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))
    optimizer = RMSProp(stochastic_round=args.rounding)
    callbacks = Callbacks(model.get_model(), eval_set=test_set, **args.callback_args)
    model.fit(train_set,
              optimizer=optimizer,
              epochs=args.epochs,
              cost=cost,
              callbacks=callbacks)
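The loading snippets above (Examples #1 and #3) expect the trained weights and the vocabularies to be saved side by side, with the vocabularies pickled into a .params file. A sketch of that final step, assuming the model object exposes a save() method and that the vocab objects are picklable (file and attribute names are illustrative):

    import pickle
    from os import path

    out_dir = path.dirname(path.realpath(__file__))
    model.save(path.join(out_dir, 'chunker_model.h5'))        # assumes a save() method
    with open(path.join(out_dir, 'chunker_model.params'), 'wb') as fp:
        pickle.dump({'word_vocab': dataset.word_vocab,
                     'chunk_vocab': dataset.chunk_vocab,
                     'char_vocab': getattr(dataset, 'char_vocab', None)}, fp)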