Example #1
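    # Entry point for the application: parse the command-line arguments,
    # install SIGINT/SIGTERM handlers that raise SystemExit, and drive the
    # instance through _initialize/_preprocess/_process/_postprocess/_finalize,
    # logging any exception before the logger is finalized.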
    def run(cls, command=None):
        cls.configure()
        command, command_args, common_args \
            = cls._parse_args(command)
        try:
            def handler(signum, frame):
                raise SystemExit("Signal(%d) received: "
                                 "The program %s will be closed"
                                 % (signum, __file__))
            signal.signal(signal.SIGINT, handler)
            signal.signal(signal.SIGTERM, handler)
            os.umask(0)

            self = cls._get_instance()
            self._initialize(command, command_args, common_args)
            self._preprocess()
            self._process()
            self._postprocess()
            self._finalize()
        except Exception:
            logging.e("Exception occurred during execution:",
                      exc_info=True, stack_info=cls.debug)
        except SystemExit as e:
            logging.w(e)
        finally:
            logging.getLogger().finalize()
Example #2
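# Evaluate a saved parser: restore the training context and model weights,
# run model.forward over the test set batch by batch, and let the Evaluator
# report scores at the end of the pass.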
def test(model_file, test_file, device=-1):
    context = utils.Saver.load_context(model_file)
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    test_dataset = context.loader.load(test_file, train=False, bucketing=True)
    kwargs = dict(context)
    if context.model_config is not None:
        kwargs.update(context.model_config)
    model = _build_parser(**kwargs)
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    pbar = training.listeners.ProgressBar(lambda n: tqdm(total=n))
    pbar.init(len(test_dataset))
    evaluator = Evaluator(model, context.loader.rel_map, test_file,
                          logging.getLogger())
    utils.chainer_train_off()
    for batch in test_dataset.batch(context.batch_size,
                                    colwise=True,
                                    shuffle=False):
        xs, ts = batch[:-1], batch[-1]
        ys = model.forward(*xs)
        evaluator.on_batch_end({'train': False, 'xs': xs, 'ys': ys, 'ts': ts})
        pbar.update(len(ts))
    evaluator.on_epoch_validate_end({})
Example #3
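# Variant of the evaluation loop that decodes with model.parse() and collects
# (tokens, parsed) pairs in the Evaluator before calling report().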
def test(model_file, test_file, device=-1):
    context = utils.Saver.load_context(model_file)
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    test_dataset = context.loader.load(test_file, train=False, bucketing=True)
    model = _build_parser(**dict(context))
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    pbar = training.listeners.ProgressBar(lambda n: tqdm(total=n))
    pbar.init(len(test_dataset))
    evaluator = Evaluator(model, context.loader.rel_map, test_file,
                          logging.getLogger())
    utils.chainer_train_off()
    for batch in test_dataset.batch(context.batch_size,
                                    colwise=True,
                                    shuffle=False):
        xs, ts = batch[:-1], batch[-1]
        parsed = model.parse(*xs)
        evaluator.append([tokens[1:] for tokens in xs[-1]], parsed)
        pbar.update(len(ts))
    evaluator.report(show_details=False)
Example #4
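# Evaluate a saved model: load the test data with external POS tags (and
# contextualized embeddings if the model was trained with them), report
# per-batch loss/accuracy through a Reporter, and summarize with the Evaluator.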
def test(model_file, test_file, filter_type=True, limit=-1, device=-1):
    context = utils.Saver.load_context(model_file)
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    loader = context.builder.loader
    loader.filter_coord = filter_type
    encoder_input = context.encoder_input

    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None

    test_dataset = loader.load_with_external_resources(
        test_file, train=False, bucketing=False,
        size=None if limit < 0 else limit,
        use_external_postags=True,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext,
        logger=logger)
    logger.info('{} samples loaded for test'.format(len(test_dataset)))

    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    parser = parsers.build_parser(loader, model)
    evaluator = eval_module.Evaluator(
        parser, logger=logger, report_details=True)
    reporter = training.listeners.Reporter(logger)

    logger.info('Start decoding')
    utils.chainer_train_off()
    evaluator.on_epoch_validate_begin({'epoch': -1})
    pbar = tqdm(total=len(test_dataset))
    for batch in test_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        xs, ts = batch[:-1], batch[-1]
        ys = model.forward(*xs)
        loss = model.compute_loss(ys, ts)
        with reporter:
            values = dict(loss=float(chainer.cuda.to_cpu(loss.data)))
            model.compute_accuracy(ys, ts)
            for k, v in model.result.items():
                if 'loss' in k:
                    values[k] = float(chainer.cuda.to_cpu(v.data))
                elif 'accuracy' in k:
                    values[k] = v
            reporter.report(values)
        evaluator.on_batch_end({'train': False, 'xs': xs, 'ts': ts})
        pbar.update(len(ts))
    pbar.close()
    reporter._output_log("testing", reporter.get_summary(),
                         {'epoch': -1, 'size': len(test_dataset)})
    evaluator.on_epoch_validate_end({'epoch': -1})
Example #5
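# Sanity-check the coordination grammar: parse with a CKY parser driven by a
# gold-scoring model, count sentences whose predicted coordinations match the
# gold annotations exactly, and print the mismatches.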
def check_grammar(test_file, limit=-1, grammar_type=1):
    logger = logging.getLogger()
    loader = dataset.DataLoader(filter_coord=True)
    test_dataset = loader.load(test_file, train=True, bucketing=False,
                               size=None if limit < 0 else limit)
    word_vocab = loader.get_processor('word').vocab
    from models.gold import GoldModel
    model = GoldModel()
    if grammar_type == 1:
        cfg = parsers.Grammar.CFG_COORD_1 + parsers.Grammar.CFG
    elif grammar_type == 2:
        cfg = parsers.Grammar.CFG_COORD_2 + parsers.Grammar.CFG
    else:
        raise ValueError("Invalid grammar type: {}".format(grammar_type))
    grammar = parsers.Grammar(word_vocab, cfg)
    parser = parsers.CkyParser(model, grammar)
    evaluator = eval_module.Evaluator(
        parser, logger=logger, report_details=False)
    n_corrects = 0
    pbar = tqdm(total=len(test_dataset))
    for batch in test_dataset.batch(size=20, colwise=True, shuffle=False):
        xs, ts = batch[:-1], batch[-1]
        true_coords_batch = ts
        model.set_gold(true_coords_batch)
        pred_coords_batch = evaluator._parser.parse(*xs, n_best=1)
        for i, (pred_coord_entries, true_coords) in \
                enumerate(zip(pred_coords_batch, true_coords_batch)):
            pred_coords, _score = pred_coord_entries[0]
            true_coords = {ckey: coord for ckey, coord
                           in true_coords.items() if coord is not None}
            for k, v in tuple(pred_coords.items()):
                if v is None:
                    del pred_coords[k]
            if pred_coords == true_coords:
                n_corrects += 1
            else:
                sentence = ' '.join(
                    [word_vocab.lookup(word_id) for word_id in xs[0][i]])
                print("SENTENCE: {}\nPRED: {}\nTRUE: {}\n-"
                      .format(sentence, pred_coords, true_coords))
            evaluator.add(pred_coords, true_coords)
        pbar.update(len(ts))
    pbar.close()
    evaluator.report()
    logger.info("Number of correct tree: {}/{}"
                .format(n_corrects, len(test_dataset)))
Example #6
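    # Keras-style fit(): wrap (x, y) data in Dataset objects, attach a Reporter
    # and an optional accuracy hook, then run the epoch loop, notifying
    # listeners at train/epoch boundaries and validating when valid_data is given.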
    def fit(self, data, valid_data=None, epochs=10, batch_size=32,
            valid_batch_size=None):
        if isinstance(data, Dataset):
            train_dataset = data
        elif isinstance(data, (tuple, list)) and len(data) == 2:
            train_dataset = Dataset(*data)
        else:
            raise ValueError('invalid data: {}'.format(type(data)))

        if valid_data:
            do_validation = True
            if isinstance(valid_data, Dataset):
                val_dataset = valid_data
            elif isinstance(valid_data, (tuple, list)) \
                    and len(valid_data) == 2:
                val_dataset = Dataset(*valid_data)
            else:
                raise ValueError('When passing valid_data, '
                                 'it must be dataset or contain '
                                 'two (x_val, y_val) items: {}'
                                 .format(type(valid_data)))
            if valid_batch_size is None:
                valid_batch_size = batch_size
        else:
            do_validation = False

        self._reporter = listeners.Reporter(logging.getLogger())
        self.add_listener(self._reporter, priority=110)
        if self._acc_func is not None:
            def _report_accuracy(data):
                listeners.report(
                    {"accuracy": self._acc_func(data['ys'], data['ts'])})
            self.add_hook(TrainEvent.BATCH_END, _report_accuracy, priority=120)

        forward = self._forward
        if not callable(forward):
            if hasattr(self._forward, 'forward'):
                forward = self._forward.forward
            else:
                raise RuntimeError('`forward` is not callable')
        lossfun = self._loss_func
        convert = (self._converter if callable(self._converter)
                   else lambda x: x)

        history = []

        self.notify(TrainEvent.TRAIN_BEGIN)

        def main_loop():
            for epoch in range(1, epochs + 1):
                epoch_logs = PseudoImmutableMap(
                    epoch=epoch,
                    size=train_dataset.size,
                )
                self.notify(TrainEvent.EPOCH_BEGIN, epoch_logs)

                self._process(forward, train_dataset, lossfun, convert,
                              batch_size, epoch_logs.copy(), train=True)
                if do_validation:
                    self._process(
                        forward, val_dataset, lossfun, convert,
                        valid_batch_size, epoch_logs.copy(), train=False)

                self.notify(TrainEvent.EPOCH_END, epoch_logs)

        if self._reporter is not None:
            with self._reporter:
                main_loop()
        else:
            main_loop()

        self.notify(TrainEvent.TRAIN_END)

        return history
Example #7
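# Training entry point: build the data loader and model from the given
# configuration, set up AdamW with optional weight decay and gradient clipping
# (plus a warmup/linear-decay schedule when bert_model == 2), report accuracy
# every batch, evaluate on the test set, and save the best checkpoint.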
def train(train_file,
          test_file=None,
          format='tree',
          embed_file=None,
          n_epoch=20,
          batch_size=20,
          lr=0.001,
          limit=-1,
          l2_lambda=0.0,
          grad_clip=5.0,
          encoder_input=('char', 'postag'),
          model_config=None,
          device=-1,
          save_dir=None,
          seed=None,
          cache_dir='',
          refresh_cache=False,
          bert_model=0,
          bert_dir=''):
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    # logger.configure(filename='log.txt', logdir=save_dir)
    assert isinstance(logger, logging.AppLogger)
    if model_config is None:
        model_config = {}
    model_config['bert_model'] = bert_model
    model_config['bert_dir'] = bert_dir

    os.makedirs(save_dir, exist_ok=True)

    read_genia = format == 'genia'
    loader = dataset.DataLoader.build(
        postag_embed_size=model_config.get('postag_embed_size', 50),
        char_embed_size=model_config.get('char_embed_size', 10),
        word_embed_file=embed_file,
        filter_coord=(not read_genia),
        refresh_cache=refresh_cache,
        format=format,
        cache_options=dict(dir=cache_dir, mkdir=True, logger=logger),
        extra_ids=(git.hash(), ))

    use_external_postags = not read_genia
    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None

    train_dataset = loader.load_with_external_resources(
        train_file,
        train=True,
        bucketing=False,
        size=None if limit < 0 else limit,
        refresh_cache=refresh_cache,
        use_external_postags=use_external_postags,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext)
    logging.info('{} samples loaded for training'.format(len(train_dataset)))
    test_dataset = None
    if test_file is not None:
        test_dataset = loader.load_with_external_resources(
            test_file,
            train=False,
            bucketing=False,
            size=None if limit < 0 else limit // 10,
            refresh_cache=refresh_cache,
            use_external_postags=use_external_postags,
            use_contextualized_embed=use_cont_embed,
            contextualized_embed_file_ext=cont_embed_file_ext)
        logging.info('{} samples loaded for validation'.format(
            len(test_dataset)))

    builder = models.CoordSolverBuilder(loader,
                                        inputs=encoder_input,
                                        **model_config)
    logger.info("{}".format(builder))
    model = builder.build()
    logger.trace("Model: {}".format(model))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    if bert_model == 1:
        optimizer = chainer.optimizers.AdamW(alpha=lr)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(1.))
    else:
        optimizer = chainer.optimizers.AdamW(alpha=lr,
                                             beta1=0.9,
                                             beta2=0.999,
                                             eps=1e-08)
        optimizer.setup(model)
        if l2_lambda > 0.0:
            optimizer.add_hook(chainer.optimizer.WeightDecay(l2_lambda))
        if grad_clip > 0.0:
            optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))

    def _report(y, t):
        values = {}
        model.compute_accuracy(y, t)
        for k, v in model.result.items():
            if 'loss' in k:
                values[k] = float(chainer.cuda.to_cpu(v.data))
            elif 'accuracy' in k:
                values[k] = v
        training.report(values)

    trainer = training.Trainer(optimizer, model, loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    trainer.add_listener(
        training.listeners.ProgressBar(lambda n: tqdm(total=n)), priority=200)
    trainer.add_hook(training.BATCH_END,
                     lambda data: _report(data['ys'], data['ts']))
    if test_dataset:
        parser = parsers.build_parser(loader, model)
        evaluator = eval_module.Evaluator(parser,
                                          logger=logger,
                                          report_details=False)
        trainer.add_listener(evaluator)

    if bert_model == 2:
        num_train_steps = 20000 * 5 / 20
        num_warmup_steps = 10000 / 20
        learning_rate = 2e-5
        # learning rate (eta) scheduling in Adam
        lr_decay_init = learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.add_hook(
            training.BATCH_END,
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps),
                optimizer=optimizer))
        trainer.add_hook(
            training.BATCH_END,
            extensions.WarmupShift(  # warmup
                'eta',
                0.,
                num_warmup_steps,
                learning_rate,
                optimizer=optimizer))

    if save_dir is not None:
        accessid = logging.getLogger().accessid
        date = logging.getLogger().accesstime.strftime('%Y%m%d')
        # metric = 'whole' if isinstance(model, models.Teranishi17) else 'inner'
        metric = 'exact'
        trainer.add_listener(
            utils.Saver(
                model,
                basename="{}-{}".format(date, accessid),
                context=dict(App.context, builder=builder),
                directory=save_dir,
                logger=logger,
                save_best=True,
                evaluate=(lambda _: evaluator.get_overall_score(metric))))

    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)
Example #8
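# Decode new text with a saved model: load the target file (with external POS
# tags), parse each batch n-best, and print the raw sentence, score, and
# CC/COORD/CONJ spans for every candidate.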
def parse(model_file, target_file, contextualized_embed_file=None,
          n_best=1, device=-1):
    context = utils.Saver.load_context(model_file)
    logger = logging.getLogger()
    logger.trace('# context: {}'.format(context))
    if context.seed is not None:
        utils.set_random_seed(context.seed, device)

    loader = context.builder.loader
    encoder_input = context.encoder_input
    use_cont_embed = _get_cont_embed_file_ext(encoder_input) is not None
    if use_cont_embed and contextualized_embed_file is None:
        raise ValueError(
            "contextualized_embed_file must be specified when using "
            "a model trained with contextualized embeddings")
    elif not use_cont_embed and contextualized_embed_file is not None:
        raise ValueError(
            "contextualized_embed_file must not be specified when using "
            "a model trained without contextualized embeddings")

    if target_file.endswith('.txt'):
        loader.init_reader(format='default')
    loader.set_contextualized_embed_file(contextualized_embed_file)
    target_dataset = loader.load_with_external_resources(
        target_file, mode='parse', use_external_postags=True, logger=logger)
    logger.info('{} samples loaded for parsing'.format(len(target_dataset)))

    model = context.builder.build()
    chainer.serializers.load_npz(model_file, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    parser = parsers.build_parser(loader, model)

    logger.info('Start parsing')
    utils.chainer_train_off()
    pbar = tqdm(total=len(target_dataset))
    for batch in target_dataset.batch(
            context.batch_size, colwise=True, shuffle=False):
        xs, (words, indices, sentence_id) = batch[:-3], batch[-3:]
        parsed = parser.parse(*xs, n_best)
        for results, words_i, indices_i, sentence_id_i \
                in zip(parsed, words, indices, sentence_id):
            raw_sentence = ' '.join(words_i)
            for best_k, (coords, score) in enumerate(results):
                output = [
                    "#!RAW: {}".format(raw_sentence),
                    "SENTENCE: {}".format(sentence_id_i),
                    "CANDIDATE: #{}".format(best_k),
                    "SCORE: {}".format(score),
                ]
                if indices_i is not None:
                    coords = dataset.postprocess(coords, indices_i)
                for cc, coord in sorted(coords.items()):
                    output.append("CC: {} {}".format(cc, words_i[cc]))
                    if coord is not None:
                        b, e = coord.conjuncts[0][0], coord.conjuncts[-1][1]
                        output.append("COORD: {} {} {}".format(
                            b, e, ' '.join(words_i[b:e + 1])))
                        for (b, e) in coord.conjuncts:
                            output.append("CONJ: {} {} {}".format(
                                b, e, ' '.join(words_i[b:e + 1])))
                    else:
                        output.append("COORD: None")
                print('\n'.join(output) + '\n')
        pbar.update(len(sentence_id))
    pbar.close()
Example #9
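# Training entry point for the dependency parser: build the loader and model,
# set up Adam with gradient clipping and exponential learning-rate decay,
# report arc/rel accuracy per batch, and save the best model by UAS.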
def train(train_file,
          test_file=None,
          embed_file=None,
          n_epoch=20,
          batch_size=5000,
          lr=2e-3,
          model_config=None,
          device=-1,
          save_dir=None,
          seed=None,
          cache_dir='',
          refresh_cache=False):
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    assert isinstance(logger, logging.AppLogger)
    if model_config is None:
        model_config = {}

    loader = dataset.DataLoader.build(input_file=train_file,
                                      word_embed_file=embed_file,
                                      refresh_cache=refresh_cache,
                                      extra_ids=(git.hash(), ),
                                      cache_options=dict(dir=cache_dir,
                                                         mkdir=True,
                                                         logger=logger))
    train_dataset = loader.load(train_file,
                                train=True,
                                bucketing=True,
                                refresh_cache=refresh_cache)
    test_dataset = None
    if test_file is not None:
        test_dataset = loader.load(test_file,
                                   train=False,
                                   bucketing=True,
                                   refresh_cache=refresh_cache)

    model = _build_parser(loader, **model_config)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)
    optimizer = chainer.optimizers.Adam(alpha=lr,
                                        beta1=0.9,
                                        beta2=0.9,
                                        eps=1e-12)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5.0))
    optimizer.add_hook(
        optimizers.ExponentialDecayAnnealing(initial_lr=lr,
                                             decay_rate=0.75,
                                             decay_step=5000,
                                             lr_key='alpha'))

    def _report(y, t):
        arc_accuracy, rel_accuracy = model.compute_accuracy(y, t)
        training.report({
            'arc_accuracy': arc_accuracy,
            'rel_accuracy': rel_accuracy
        })

    trainer = training.Trainer(optimizer, model, loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    trainer.add_listener(
        training.listeners.ProgressBar(lambda n: tqdm(total=n)), priority=200)
    trainer.add_hook(training.BATCH_END,
                     lambda data: _report(data['ys'], data['ts']))
    if test_dataset:
        evaluator = Evaluator(model, loader.rel_map, test_file, logger)
        trainer.add_listener(evaluator, priority=128)
        if save_dir is not None:
            accessid = logger.accessid
            date = logger.accesstime.strftime('%Y%m%d')
            trainer.add_listener(
                utils.Saver(model,
                            basename="{}-{}".format(date, accessid),
                            context=dict(App.context, loader=loader),
                            directory=save_dir,
                            logger=logger,
                            save_best=True,
                            evaluate=(lambda _: evaluator._parsed['UAS'])))
    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)