Ejemplo n.º 1
0
def run_inference(model_dir: str, epoch: Optional[int], device: int, metric: str):
    """Run batch inference over the test split and write predictions to disk.

    Loads a trained BiLSTM-CRF snapshot from ``model_dir``, runs it over the
    "test" iterator, and writes one ``word gold predicted`` line per token
    (blank line between sentences) to the prediction file chosen by
    ``select_snapshot``.

    Args:
        model_dir: Directory containing the saved ``args`` config and snapshots.
        epoch: Specific epoch snapshot to load, or None to select by metric.
        device: GPU device id; a negative value means CPU.
        metric: Metric name used by ``select_snapshot`` to pick the snapshot.
    """
    # Inference mode: disables train-only behavior (e.g. dropout) globally.
    chainer.config.train = False

    if device >= 0:
        cuda.get_device(device).use()

    set_seed()

    # Use a context manager so the config file handle is closed promptly.
    with open(os.path.join(model_dir, "args"), encoding="utf-8") as config_file:
        configs = json.load(config_file)
    snapshot_file, prediction_path = select_snapshot(epoch, metric, model_dir)
    logger.debug(f"create prediction into {prediction_path}")

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = configs["num_word_vocab"]
    num_char_vocab = configs["num_char_vocab"]
    num_tag_vocab = configs["num_tag_vocab"]

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = os.path.join(model_dir, snapshot_file)
    chainer.serializers.load_npz(model_path, model)
    logger.debug(f"load {snapshot_file}")

    if device >= 0:
        model.to_gpu(device)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform
    test_iterator = create_iterator(
        vocab, configs, "test", transform, return_original_sentence=True
    )

    with open(prediction_path, "w", encoding="utf-8") as file:
        for batch in test_iterator:
            # Each item pairs a transformed example with its original sentence.
            batch, original_sentences = zip(*batch)
            in_arrays, t_arrays = converter(batch, device)
            p_arrays = model.predict(in_arrays)

            # Map id sequences back to surface words / tag strings.
            word_sentences, t_tag_sentences = zip(
                *transformer.itransform(in_arrays[0], t_arrays)
            )
            _, p_tag_sentences = zip(
                *transformer.itransform(in_arrays[0], p_arrays)
            )

            sentence_gen = zip(
                word_sentences,
                t_tag_sentences,
                p_tag_sentences,
                original_sentences,
            )  # NOQA
            for ws, ts, ps, _os in sentence_gen:
                for w, t, p, o in zip(ws, ts, ps, _os):
                    # Escape spaces so each output line stays 3 columns wide.
                    w = w.replace(" ", "<WHITESPACE>")
                    o = o.replace(" ", "<WHITESPACE>")
                    if w != o:
                        # Vocabulary may normalize tokens; keep the original
                        # form in parentheses when it differs.
                        w = f"{w}({o})"
                    print(f"{w} {t} {p}", file=file)
                # Blank line terminates each sentence (CoNLL-style output).
                print(file=file)
Ejemplo n.º 2
0
def run_inference(
        model_dir: str,
        epoch: Optional[int],
        device: int,
        metric: str,
        tokenizer: str):
    """Tag sentences read from stdin with a trained BiLSTM-CRF model.

    Loads a snapshot from ``model_dir``, then for every line on stdin
    tokenizes it, predicts a tag sequence, and prints ``word/tag`` pairs
    separated by spaces.

    Args:
        model_dir: Directory containing the saved ``args`` config and snapshots.
        epoch: Specific epoch snapshot to load, or None to select by metric.
        device: GPU device id; a negative value means CPU.
        metric: Metric name used by ``select_snapshot`` to pick the snapshot.
        tokenizer: Name of the word tokenizer passed to ``WordTokenizer``.
    """
    # Inference mode: disables train-only behavior (e.g. dropout) globally.
    chainer.config.train = False

    if device >= 0:
        chainer.get_device(device).use()

    set_seed()

    # Use a context manager so the config file handle is closed promptly.
    with open(os.path.join(model_dir, "args"), encoding="utf-8") as config_file:
        config = json.load(config_file)
    snapshot_file, prediction_path = select_snapshot(epoch, metric, model_dir)
    logger.debug(f"create prediction into {prediction_path}")

    vocab = Vocabulary.prepare(config)
    num_word_vocab = config["num_word_vocab"]
    num_char_vocab = config["num_char_vocab"]
    num_tag_vocab = config["num_tag_vocab"]

    model = BiLSTM_CRF(config, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = os.path.join(model_dir, snapshot_file)
    logger.debug(f"load {snapshot_file}")
    chainer.serializers.load_npz(model_path, model)

    if device >= 0:
        model.to_gpu(device)

    transformer = DatasetTransformer(vocab)
    word_tokenizer = WordTokenizer(tokenizer=tokenizer)

    for line in sys.stdin:
        input_sentence = [str(t) for t in word_tokenizer.tokenize(line)]
        # No gold tags at inference time, hence the ``None`` label argument.
        batch = transformer.transform(input_sentence, None)
        in_arr, _ = converter([batch])
        pd_arr = model.predict(in_arr)
        # Single-sentence batch: unpack the one (words, tags) pair.
        (_, tag_sequence), = transformer.itransform(in_arr[0], pd_arr)
        print(' '.join(f"{word}/{tag}" for word, tag in zip(input_sentence, tag_sequence)))  # NOQA
Ejemplo n.º 3
0
    # NOTE(review): fragment — the enclosing function's signature is outside
    # this view; `configs`, `args`, `model_dir`, `snapshot_file`,
    # `prediction_path`, `num_*_vocab` and `vocab` are bound earlier in it.
    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    # `model_dir` is presumably a pathlib.Path here (uses `/` and .as_posix()).
    model_path = model_dir / snapshot_file
    logger.debug(f'load {snapshot_file}')
    chainer.serializers.load_npz(model_path.as_posix(), model)

    if args.device >= 0:
        model.to_gpu(args.device)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform
    test_iterator = create_iterator(vocab, configs, 'test', transform)

    # Write one "word gold predicted" line per token, blank line between
    # sentences (CoNLL-style).
    with open(prediction_path, 'w', encoding='utf-8') as file:
        for batch in test_iterator:
            in_arrays, t_arrays = converter(batch, args.device)
            p_arrays = model.predict(in_arrays)

            # Map id sequences back to surface words / tag strings.
            word_sentences, t_tag_sentences = list(
                zip(*transformer.itransform(in_arrays[0], t_arrays)))
            _, p_tag_sentences = list(
                zip(*transformer.itransform(in_arrays[0], p_arrays)))

            sentence_gen = zip(word_sentences, t_tag_sentences,
                               p_tag_sentences)  # NOQA
            for ws, ts, ps in sentence_gen:
                for w, t, p in zip(ws, ts, ps):
                    print(f'{w} {t} {p}', file=file)
                print(file=file)