Example 1
def run_inference(model_dir: str, epoch: Optional[int], device: int, metric: str):
    chainer.config.train = False

    if device >= 0:
        cuda.get_device(device).use()

    set_seed()

    configs = json.load(open(os.path.join(model_dir, "args")))
    snapshot_file, prediction_path = select_snapshot(epoch, metric, model_dir)
    logger.debug(f"creat prediction into {prediction_path}")

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = configs["num_word_vocab"]
    num_char_vocab = configs["num_char_vocab"]
    num_tag_vocab = configs["num_tag_vocab"]

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = os.path.join(model_dir, snapshot_file)
    chainer.serializers.load_npz(model_path, model)
    logger.debug(f"load {snapshot_file}")

    if device >= 0:
        model.to_gpu(device)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform
    test_iterator = create_iterator(
        vocab, configs, "test", transform, return_original_sentence=True
    )

    with open(prediction_path, "w", encoding="utf-8") as file:
        for batch in test_iterator:
            batch, original_sentences = list(zip(*batch))
            in_arrays, t_arrays = converter(batch, device)
            p_arrays = model.predict(in_arrays)

            word_sentences, t_tag_sentences = list(
                zip(*transformer.itransform(in_arrays[0], t_arrays))
            )
            _, p_tag_sentences = list(
                zip(*transformer.itransform(in_arrays[0], p_arrays))
            )

            sentence_gen = zip(
                word_sentences,
                t_tag_sentences,
                p_tag_sentences,
                original_sentences,
            )  # NOQA
            for ws, ts, ps, _os in sentence_gen:
                for w, t, p, o in zip(ws, ts, ps, _os):
                    w = w.replace(" ", "<WHITESPACE>")
                    o = o.replace(" ", "<WHITESPACE>")
                    if w != o:
                        w = f"{w}({o})"
                    print(f"{w} {t} {p}", file=file)
                print(file=file)
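For reference, the loop above writes CoNLL-style, three-column output: one token per line as "token gold-tag predicted-tag", with a blank line between sentences. Whitespace inside a token is escaped as <WHITESPACE>, and a token whose normalized form differs from its original surface form is printed as normalized(original). With hypothetical tokens and tags, a sentence might render as:

obama(Obama) B-PER B-PER
visited O O
kyoto(Kyoto) B-LOC B-LOC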
Example 2
def run_inference(
        model_dir: str,
        epoch: Optional[int],
        device: int,
        metric: str,
        tokenizer: str):
    chainer.config.train = False

    if device >= 0:
        chainer.get_device(device).use()

    set_seed()

    config = json.load(open(os.path.join(model_dir, "args")))
    snapshot_file, prediction_path = select_snapshot(epoch, metric, model_dir)
    logger.debug(f"creat prediction into {prediction_path}")

    vocab = Vocabulary.prepare(config)
    num_word_vocab = config["num_word_vocab"]
    num_char_vocab = config["num_char_vocab"]
    num_tag_vocab = config["num_tag_vocab"]

    model = BiLSTM_CRF(config, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = os.path.join(model_dir, snapshot_file)
    logger.debug(f"load {snapshot_file}")
    chainer.serializers.load_npz(model_path, model)

    if device >= 0:
        model.to_gpu(device)

    transformer = DatasetTransformer(vocab)
    word_tokenizer = WordTokenizer(tokenizer=tokenizer)

    for line in sys.stdin:
        input_sentence = [str(t) for t in word_tokenizer.tokenize(line)]
        batch = transformer.transform(input_sentence, None)
        in_arr, _ = converter([batch])
        pd_arr = model.predict(in_arr)
        (_, tag_sequence), = transformer.itransform(in_arr[0], pd_arr)
        print(' '.join(f"{word}/{tag}" for word, tag in zip(input_sentence, tag_sequence)))  # NOQA
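This variant tags sentences read from standard input and prints word/tag pairs. A minimal invocation sketch (all argument values here are hypothetical; the surrounding CLI wiring is not shown in the listing):

run_inference(
    model_dir="path/to/model_dir",    # directory containing "args" and snapshot files
    epoch=None,                       # let select_snapshot choose a snapshot by metric
    device=-1,                        # CPU; a value >= 0 selects that GPU
    metric="validation/main/fscore",  # assumed metric key
    tokenizer="mecab",                # tokenizer name passed to WordTokenizer (assumption)
)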
Example 3
File: train.py Project: tatHi/pyner
def run_training(config: str, device: int, seed: int):
    configs = ConfigParser.parse(config)
    params = yaml.load(open(config, encoding="utf-8"), Loader=yaml.SafeLoader)

    if device >= 0:
        cuda.get_device(device).use()

    set_seed(seed, device)

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = max(vocab.dictionaries["word2idx"].values()) + 1
    num_char_vocab = max(vocab.dictionaries["char2idx"].values()) + 1
    num_tag_vocab = max(vocab.dictionaries["tag2idx"].values()) + 1

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform

    external_configs = configs["external"]
    if "word_vector" in external_configs:
        syn0 = model.embed_word.W.data
        _, word_dim = syn0.shape
        pre_word_dim = vocab.gensim_model.vector_size
        if word_dim != pre_word_dim:
            msg = "Mismatch vector size between model and pre-trained word vectors"  # NOQA
            msg += f"(model: \x1b[31m{word_dim}\x1b[0m"
            msg += f", pre-trained word vector: \x1b[31m{pre_word_dim}\x1b[0m"
            raise Exception(msg)

        word2idx = vocab.dictionaries["word2idx"]
        syn0 = prepare_pretrained_word_vector(word2idx, vocab.gensim_model,
                                              syn0, num_word_vocab)
        model.set_pretrained_word_vectors(syn0)

    train_iterator = create_iterator(vocab, configs, "train", transform)
    valid_iterator = create_iterator(vocab, configs, "valid", transform)
    test_iterator = create_iterator(vocab, configs, "test", transform)

    if device >= 0:
        model.to_gpu(device)

    optimizer = create_optimizer(configs)
    optimizer.setup(model)
    optimizer = add_hooks(optimizer, configs)

    updater = T.StandardUpdater(train_iterator,
                                optimizer,
                                converter=converter,
                                device=device)

    params = configs.export()
    params["num_word_vocab"] = num_word_vocab
    params["num_char_vocab"] = num_char_vocab
    params["num_tag_vocab"] = num_tag_vocab

    epoch = configs["iteration"]["epoch"]
    trigger = (epoch, "epoch")

    model_path = configs["output"]
    timestamp = datetime.datetime.now()
    timestamp_str = timestamp.isoformat()
    output_path = Path(f"{model_path}.{timestamp_str}")

    trainer = T.Trainer(updater, trigger, out=output_path)
    save_args(params, output_path)
    msg = f"Create \x1b[31m{output_path}\x1b[0m for saving model snapshots"
    logger.debug(msg)

    entries = ["epoch", "iteration", "elapsed_time", "lr", "main/loss"]
    entries += ["validation/main/loss", "validation/main/fscore"]
    entries += ["validation_1/main/loss", "validation_1/main/fscore"]

    valid_evaluator = NamedEntityEvaluator(valid_iterator,
                                           model,
                                           transformer.itransform,
                                           converter,
                                           device=device)

    test_evaluator = NamedEntityEvaluator(test_iterator,
                                          model,
                                          transformer.itransform,
                                          converter,
                                          device=device)

    epoch_trigger = (1, "epoch")
    snapshot_filename = "snapshot_epoch_{.updater.epoch:04d}"
    trainer.extend(valid_evaluator, trigger=epoch_trigger)
    trainer.extend(test_evaluator, trigger=epoch_trigger)
    trainer.extend(E.observe_lr(), trigger=epoch_trigger)
    trainer.extend(E.LogReport(trigger=epoch_trigger))
    trainer.extend(E.PrintReport(entries=entries), trigger=epoch_trigger)
    trainer.extend(E.ProgressBar(update_interval=20))
    trainer.extend(E.snapshot_object(model, filename=snapshot_filename),
                   trigger=(1, "epoch"))

    if "learning_rate_decay" in params:
        logger.debug("Enable Learning Rate decay")
        trainer.extend(
            LearningRateDecay("lr", params["learning_rate"],
                              params["learning_rate_decay"]),
            trigger=epoch_trigger,
        )

    trainer.run()
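set_seed is called in every entry point above but never defined in these listings. A minimal sketch of what such a helper typically does in a Chainer project (an assumption, not pyner's actual code; the default seed value is arbitrary):

import random
import numpy

def set_seed(seed=0, device=-1):
    # Seed Python's and NumPy's RNGs; seed CuPy as well when a GPU is in use.
    random.seed(seed)
    numpy.random.seed(seed)
    if device >= 0:
        import cupy
        cupy.random.seed(seed)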
Example 4
def test_all_normalize(self):
    vocab = Vocabulary.prepare(params3)
    result = vocab.vocab_arr['word']
    expect = sorted(['apple00', 'wine', 'apple'])
    print('result: ', result)
    self.assertEqual(expect, result)
Example 5
    logging.basicConfig(level=logging.DEBUG, format=fmt)
    args = parse_inference_args()
    chainer.config.train = False

    if args.device >= 0:
        chainer.cuda.get_device(args.device).use()
    set_seed()

    model_dir = pathlib.Path(args.model)
    configs = json.load(open(model_dir / 'args'))

    metric = args.metric.replace('/', '.')
    snapshot_file, prediction_path = select_snapshot(args, model_dir)
    logger.debug(f'creating predictions in {prediction_path}')

    vocab = Vocabulary.prepare(configs)
    num_word_vocab = configs['num_word_vocab']
    num_char_vocab = configs['num_char_vocab']
    num_tag_vocab = configs['num_tag_vocab']

    model = BiLSTM_CRF(configs, num_word_vocab, num_char_vocab, num_tag_vocab)

    model_path = model_dir / snapshot_file
    logger.debug(f'load {snapshot_file}')
    chainer.serializers.load_npz(model_path.as_posix(), model)

    if args.device >= 0:
        model.to_gpu(args.device)

    transformer = DatasetTransformer(vocab)
    transform = transformer.transform
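select_snapshot, used in Examples 1, 2, and 5, is likewise not shown. A plausible sketch, assuming the training run in Example 3 left Chainer's LogReport output as a JSON file named "log" in the model directory and that snapshots follow the snapshot_epoch_NNNN naming from Example 3 (a hypothetical helper, not pyner's actual API):

import json
import os

def select_best_snapshot(model_dir, metric="validation/main/fscore"):
    # Read the log written by E.LogReport during training and return the
    # filename of the snapshot whose epoch scored highest on the metric.
    with open(os.path.join(model_dir, "log")) as f:
        log = json.load(f)
    best = max((e for e in log if metric in e), key=lambda e: e[metric])
    return f"snapshot_epoch_{best['epoch']:04d}"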
Example 6
def test_all_normalize():
    vocab = Vocabulary.prepare(PARAMS3)
    result = vocab.vocab_arr["word"]
    expect = sorted(["apple00", "wine", "apple"])
    print("result: ", result)
    assert expect == result
Example 7
def test_all_normalize():
    vocab = Vocabulary.prepare(PARAMS3)
    result = vocab.vocab_arr['word']
    expect = sorted(['apple00', 'wine', 'apple'])
    print('result: ', result)
    assert expect == result