Example #1
def test(model_file, test_file, filter_type=True, limit=-1, device=-1):
    context = utils.Saver.load_context(model_file)
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    loader = context.builder.loader
    loader.filter_coord = filter_type
    encoder_input = context.encoder_input

    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None

    test_dataset = loader.load_with_external_resources(
        test_file, train=False, bucketing=False,
        size=None if limit < 0 else limit,
        use_external_postags=True,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext,
        logger=logger)
    logger.info('{} samples loaded for test'.format(len(test_dataset)))

    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:  # device >= 0 selects a GPU id; -1 keeps everything on CPU
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    parser = parsers.build_parser(loader, model)
    evaluator = eval_module.Evaluator(
        parser, logger=logger, report_details=True)
    reporter = training.listeners.Reporter(logger)

    logger.info('Start decoding')
    utils.chainer_train_off()
    evaluator.on_epoch_validate_begin({'epoch': -1})
    pbar = tqdm(total=len(test_dataset))
    for batch in test_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        # colwise batch: all columns but the last are inputs, the last holds targets
        xs, ts = batch[:-1], batch[-1]
        ys = model.forward(*xs)
        loss = model.compute_loss(ys, ts)
        with reporter:
            values = dict(loss=float(chainer.cuda.to_cpu(loss.data)))
            model.compute_accuracy(ys, ts)
            for k, v in model.result.items():
                if 'loss' in k:
                    values[k] = float(chainer.cuda.to_cpu(v.data))
                elif 'accuracy' in k:
                    values[k] = v
            reporter.report(values)
        evaluator.on_batch_end({'train': False, 'xs': xs, 'ts': ts})
        pbar.update(len(ts))
    pbar.close()
    reporter._output_log("testing", reporter.get_summary(),
                         {'epoch': -1, 'size': len(test_dataset)})
    evaluator.on_epoch_validate_end({'epoch': -1})
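A minimal invocation sketch for test() above. The import path and file paths are placeholders (the source does not show which module the function lives in), not part of the original code:

# Hypothetical usage; module name and paths are placeholders, not from the source.
from evaluate import test  # wherever test() is defined in this codebase

test(model_file='models/20200101-access1.npz',  # snapshot written by utils.Saver
     test_file='data/test.tree',                # held-out data in the training format
     filter_type=True,                          # forwarded to loader.filter_coord
     limit=-1,                                  # -1 means load every sample
     device=-1)                                 # -1 = CPU; >= 0 selects a GPU id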
Example #2
def parse(model_file,
          target_file,
          contextualized_embed_file=None,
          n_best=1,
          device=-1):
    context = utils.Saver.load_context(model_file)
    logging.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    loader = context.builder.loader
    encoder_input = context.encoder_input
    use_cont_embed = _get_cont_embed_file_ext(encoder_input) is not None
    if use_cont_embed and contextualized_embed_file is None:
        raise ValueError(
            "contextualized_embed_file must be specified when using "
            "a model trained with contextualized embeddings")
    elif not use_cont_embed and contextualized_embed_file is not None:
        raise ValueError(
            "contextualized_embed_file must not be specified when using "
            "a model trained without contextualized embeddings")

    target_dataset = loader.load_from_tagged_file(target_file,
                                                  contextualized_embed_file)
    logging.info('{} samples loaded for parsing'.format(len(target_dataset)))

    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    parser = parsers.build_parser(loader, model)

    logging.info('Start parsing')
    utils.chainer_train_off()
    pbar = tqdm(total=len(target_dataset))
    for batch in target_dataset.batch(context.batch_size,
                                      colwise=True,
                                      shuffle=False):
        # the last three columns carry raw tokens, quote flags, and sentence ids
        xs, (words, is_quote, sentence_id) = batch[:-3], batch[-3:]
        parsed = parser.parse(*xs, n_best)
        for results, words_i, is_quote_i, sentence_id_i \
                in zip(parsed, words, is_quote, sentence_id):
            raw_sentence = ' '.join(words_i)
            for best_k, (coords, score) in enumerate(results):
                output = [
                    "#!RAW: {}".format(raw_sentence),
                    "SENTENCE: {}".format(sentence_id_i),
                    "CANDIDATE: #{}".format(best_k),
                    "SCORE: {}".format(score),
                ]
                coords = dataset.post_process(coords, is_quote_i)
                for cc, coord in sorted(coords.items()):
                    output.append("CC: {} {}".format(cc, words_i[cc]))
                    if coord is not None:
                        b, e = coord.conjuncts[0][0], coord.conjuncts[-1][1]
                        output.append("COORD: {} {} {}".format(
                            b, e, ' '.join(words_i[b:e + 1])))
                        for (b, e) in coord.conjuncts:
                            output.append("CONJ: {} {} {}".format(
                                b, e, ' '.join(words_i[b:e + 1])))
                    else:
                        output.append("COORD: None")
                print('\n'.join(output) + '\n')
        pbar.update(len(sentence_id))
    pbar.close()
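Similarly, a hedged usage sketch for parse(); the import location and paths are placeholders. Note that contextualized_embed_file must match how the model was trained, which the two ValueError branches above enforce:

# Hypothetical usage; module name and paths are placeholders, not from the source.
from parse import parse  # wherever parse() is defined in this codebase

parse(model_file='models/20200101-access1.npz',  # snapshot written by utils.Saver
      target_file='data/input.tagged',           # POS-tagged sentences to parse
      contextualized_embed_file=None,            # required iff the model was trained with one
      n_best=3,                                  # emit the 3 highest-scoring analyses
      device=-1)                                 # -1 = CPU; >= 0 selects a GPU id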
Example #3
def train(train_file,
          test_file=None,
          format='tree',
          embed_file=None,
          n_epoch=20,
          batch_size=20,
          lr=0.001,
          limit=-1,
          l2_lambda=0.0,
          grad_clip=5.0,
          encoder_input=('char', 'postag'),
          model_config=None,
          device=-1,
          save_dir=None,
          seed=None,
          cache_dir='',
          refresh_cache=False,
          bert_model=0,
          bert_dir=''):
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    # logger.configure(filename='log.txt', logdir=save_dir)
    assert isinstance(logger, logging.AppLogger)
    if model_config is None:
        model_config = {}
    model_config['bert_model'] = bert_model
    model_config['bert_dir'] = bert_dir

    if save_dir is not None:  # save_dir defaults to None; guard as the Saver block below does
        os.makedirs(save_dir, exist_ok=True)

    read_genia = format == 'genia'
    loader = dataset.DataLoader.build(
        postag_embed_size=model_config.get('postag_embed_size', 50),
        char_embed_size=model_config.get('char_embed_size', 10),
        word_embed_file=embed_file,
        filter_coord=(not read_genia),
        refresh_cache=refresh_cache,
        format=format,
        cache_options=dict(dir=cache_dir, mkdir=True, logger=logger),
        extra_ids=(git.hash(), ))

    use_external_postags = not read_genia
    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None

    train_dataset = loader.load_with_external_resources(
        train_file,
        train=True,
        bucketing=False,
        size=None if limit < 0 else limit,
        refresh_cache=refresh_cache,
        use_external_postags=use_external_postags,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext)
    logger.info('{} samples loaded for training'.format(len(train_dataset)))
    test_dataset = None
    if test_file is not None:
        test_dataset = loader.load_with_external_resources(
            test_file,
            train=False,
            bucketing=False,
            size=None if limit < 0 else limit // 10,
            refresh_cache=refresh_cache,
            use_external_postags=use_external_postags,
            use_contextualized_embed=use_cont_embed,
            contextualized_embed_file_ext=cont_embed_file_ext)
        logger.info('{} samples loaded for validation'.format(
            len(test_dataset)))

    builder = models.CoordSolverBuilder(loader,
                                        inputs=encoder_input,
                                        **model_config)
    logger.info("{}".format(builder))
    model = builder.build()
    logger.trace("Model: {}".format(model))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    if bert_model == 1:
        optimizer = chainer.optimizers.AdamW(alpha=lr)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(1.))
    else:
        optimizer = chainer.optimizers.AdamW(alpha=lr,
                                             beta1=0.9,
                                             beta2=0.999,
                                             eps=1e-08)
        optimizer.setup(model)
        if l2_lambda > 0.0:
            optimizer.add_hook(chainer.optimizer.WeightDecay(l2_lambda))
        if grad_clip > 0.0:
            optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))

    def _report(y, t):
        values = {}
        model.compute_accuracy(y, t)
        for k, v in model.result.items():
            if 'loss' in k:
                values[k] = float(chainer.cuda.to_cpu(v.data))
            elif 'accuracy' in k:
                values[k] = v
        training.report(values)

    trainer = training.Trainer(optimizer, model, loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    trainer.add_listener(
        training.listeners.ProgressBar(lambda n: tqdm(total=n)), priority=200)
    trainer.add_hook(training.BATCH_END,
                     lambda data: _report(data['ys'], data['ts']))
    if test_dataset:
        parser = parsers.build_parser(loader, model)
        evaluator = eval_module.Evaluator(parser,
                                          logger=logger,
                                          report_details=False)
        trainer.add_listener(evaluator)

    if bert_model == 2:
        # hard-coded total/warmup step counts for the BERT fine-tuning schedule
        num_train_steps = 20000 * 5 / 20
        num_warmup_steps = 10000 / 20
        learning_rate = 2e-5
        # learning rate (eta) scheduling in Adam
        lr_decay_init = learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.add_hook(
            training.BATCH_END,
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps),
                optimizer=optimizer))
        trainer.add_hook(
            training.BATCH_END,
            extensions.WarmupShift(  # warmup
                'eta',
                0.,
                num_warmup_steps,
                learning_rate,
                optimizer=optimizer))

    if save_dir is not None:
        accessid = logging.getLogger().accessid
        date = logging.getLogger().accesstime.strftime('%Y%m%d')
        # metric = 'whole' if isinstance(model, models.Teranishi17) else 'inner'
        metric = 'exact'
        trainer.add_listener(
            utils.Saver(
                model,
                basename="{}-{}".format(date, accessid),
                context=dict(App.context, builder=builder),
                directory=save_dir,
                logger=logger,
                save_best=True,
                # NOTE: assumes test_file was given, since `evaluator` only
                # exists when a test_dataset was loaded above
                evaluate=(lambda _: evaluator.get_overall_score(metric))))

    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)
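Finally, a usage sketch for train(), with every path and value illustrative only. save_dir should be given together with test_file so that the best-model saver has an evaluator to query (see the NOTE in the Saver block above):

# Hypothetical usage; module name and paths are placeholders, not from the source.
from train import train  # wherever train() is defined in this codebase

train(train_file='data/train.tree',  # coordination-annotated training trees
      test_file='data/dev.tree',     # enables per-epoch evaluation and best-model saving
      format='tree',                 # or 'genia' for GENIA-format input
      n_epoch=20,
      batch_size=20,
      lr=0.001,
      device=0,                      # GPU id; use -1 for CPU
      save_dir='results',            # model snapshots are written here
      seed=42)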