Example #1
    def test_with_optimizer(self):
        optimizer = mock.Mock()
        optimizer.x = 0
        extension = extensions.WarmupShift('x', self.warmup_start,
                                           self.warmup_iter, self.init,
                                           optimizer)
        self._run_trainer(extension, self.expect, optimizer)
Example #2
    def setUp(self):
        self.optimizer = mock.MagicMock()
        self.extension = extensions.WarmupShift('x', self.warmup_start,
                                                self.warmup_iter, self.init,
                                                self.optimizer)

        self.interval = 1
        self.expect = [e for e in self.expect for _ in range(self.interval)]
        self.trigger = util.get_trigger((self.interval, 'iteration'))

        self.trainer = testing.get_trainer_with_mock_updater(self.trigger)
        self.trainer.updater.get_optimizer.return_value = self.optimizer
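For reference, the tests above compare the optimizer attribute against self.expect after each iteration. Below is a minimal sketch of the schedule they appear to exercise, assuming WarmupShift moves the attribute linearly from warmup_start to init over warmup_iter iterations and holds it at init afterwards; the parameter values are placeholders, not taken from the test class.

# Sketch only: assumed linear-warmup rule behind self.expect in the tests above.
def warmup_value(t, warmup_start, warmup_iter, init):
    # value after t iterations: linear ramp during warmup, constant afterwards
    if t >= warmup_iter:
        return init
    return warmup_start + (init - warmup_start) * t / warmup_iter

# placeholder parameters: warmup_start=0.0, warmup_iter=5, init=0.1
expect = [warmup_value(t, 0.0, 5, 0.1) for t in range(1, 8)]
# approximately [0.02, 0.04, 0.06, 0.08, 0.1, 0.1, 0.1]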
Example #3
def main():
    if not FLAGS.do_train and not FLAGS.do_predict and not FLAGS.do_print_test:
        raise ValueError(
            "At least one of `do_train`, `do_predict` or `do_print_test` "
            "must be True.")

    if FLAGS.do_train:
        if not FLAGS.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if FLAGS.do_predict:
        if not FLAGS.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = read_squad_examples(input_file=FLAGS.train_file,
                                             is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      FLAGS.max_seq_length,
                                                      FLAGS.doc_stride,
                                                      FLAGS.max_query_length,
                                                      is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertSQuAD(bert)
    if FLAGS.do_train:
        # If training, load BERT parameters only.
        ignore_names = ['output/W', 'output/b']
    else:
        # If only do_predict, load all parameters.
        ignore_names = None
    chainer.serializers.load_npz(FLAGS.init_checkpoint,
                                 model,
                                 ignore_names=ignore_names)

    if FLAGS.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        model.to_gpu()

    if FLAGS.do_train:
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha. instead, use eta as actual lr
            eps=1e-6,
            weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        train_iter = chainer.iterators.SerialIterator(train_features,
                                                      FLAGS.train_batch_size)
        converter = Converter(is_training=True)
        updater = training.updaters.StandardUpdater(
            train_iter,
            optimizer,
            converter=converter,
            device=FLAGS.gpu,
            loss_func=model.compute_loss)
        trainer = training.Trainer(updater, (num_train_steps, 'iteration'),
                                   out=FLAGS.output_dir)

        # learning rate (eta) scheduling in Adam
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps)))
        trainer.extend(
            extensions.WarmupShift(  # warmup
                'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta', lambda trainer: trainer.updater.get_optimizer('main').eta),
                       trigger=(100, 'iteration'))  # logging

        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_iter_{.updater.iteration}.npz'),
                       trigger=(num_train_steps // 2, 'iteration'))  # TODO
        trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PrintReport([
                'iteration', 'main/loss', 'main/accuracy', 'elapsed_time',
                'eta'
            ]))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

    if FLAGS.do_predict:
        eval_examples = read_squad_examples(input_file=FLAGS.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(eval_examples,
                                                     tokenizer,
                                                     FLAGS.max_seq_length,
                                                     FLAGS.doc_stride,
                                                     FLAGS.max_query_length,
                                                     is_training=False)
        test_iter = chainer.iterators.SerialIterator(eval_features,
                                                     FLAGS.predict_batch_size,
                                                     repeat=False,
                                                     shuffle=False)
        converter = Converter(is_training=False)

        print('Evaluating ...')
        evaluate(eval_examples,
                 test_iter,
                 model,
                 converter=converter,
                 device=FLAGS.gpu,
                 predict_func=model.predict)
        print('Finished.')
Example #4
def train(train_file,
          test_file=None,
          format='tree',
          embed_file=None,
          n_epoch=20,
          batch_size=20,
          lr=0.001,
          limit=-1,
          l2_lambda=0.0,
          grad_clip=5.0,
          encoder_input=('char', 'postag'),
          model_config=None,
          device=-1,
          save_dir=None,
          seed=None,
          cache_dir='',
          refresh_cache=False,
          bert_model=0,
          bert_dir=''):
    if seed is not None:
        utils.set_random_seed(seed, device)
    logger = logging.getLogger()
    # logger.configure(filename='log.txt', logdir=save_dir)
    assert isinstance(logger, logging.AppLogger)
    if model_config is None:
        model_config = {}
    model_config['bert_model'] = bert_model
    model_config['bert_dir'] = bert_dir

    os.makedirs(save_dir, exist_ok=True)

    read_genia = format == 'genia'
    loader = dataset.DataLoader.build(
        postag_embed_size=model_config.get('postag_embed_size', 50),
        char_embed_size=model_config.get('char_embed_size', 10),
        word_embed_file=embed_file,
        filter_coord=(not read_genia),
        refresh_cache=refresh_cache,
        format=format,
        cache_options=dict(dir=cache_dir, mkdir=True, logger=logger),
        extra_ids=(git.hash(), ))

    use_external_postags = not read_genia
    cont_embed_file_ext = _get_cont_embed_file_ext(encoder_input)
    use_cont_embed = cont_embed_file_ext is not None

    train_dataset = loader.load_with_external_resources(
        train_file,
        train=True,
        bucketing=False,
        size=None if limit < 0 else limit,
        refresh_cache=refresh_cache,
        use_external_postags=use_external_postags,
        use_contextualized_embed=use_cont_embed,
        contextualized_embed_file_ext=cont_embed_file_ext)
    logging.info('{} samples loaded for training'.format(len(train_dataset)))
    test_dataset = None
    if test_file is not None:
        test_dataset = loader.load_with_external_resources(
            test_file,
            train=False,
            bucketing=False,
            size=None if limit < 0 else limit // 10,
            refresh_cache=refresh_cache,
            use_external_postags=use_external_postags,
            use_contextualized_embed=use_cont_embed,
            contextualized_embed_file_ext=cont_embed_file_ext)
        logging.info('{} samples loaded for validation'.format(
            len(test_dataset)))

    builder = models.CoordSolverBuilder(loader,
                                        inputs=encoder_input,
                                        **model_config)
    logger.info("{}".format(builder))
    model = builder.build()
    logger.trace("Model: {}".format(model))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu(device)

    if bert_model == 1:
        optimizer = chainer.optimizers.AdamW(alpha=lr)
        optimizer.setup(model)
        # optimizer.add_hook(chainer.optimizer.GradientClipping(1.))
    else:
        optimizer = chainer.optimizers.AdamW(alpha=lr,
                                             beta1=0.9,
                                             beta2=0.999,
                                             eps=1e-08)
        optimizer.setup(model)
        if l2_lambda > 0.0:
            optimizer.add_hook(chainer.optimizer.WeightDecay(l2_lambda))
        if grad_clip > 0.0:
            optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))

    def _report(y, t):
        values = {}
        model.compute_accuracy(y, t)
        for k, v in model.result.items():
            if 'loss' in k:
                values[k] = float(chainer.cuda.to_cpu(v.data))
            elif 'accuracy' in k:
                values[k] = v
        training.report(values)

    trainer = training.Trainer(optimizer, model, loss_func=model.compute_loss)
    trainer.configure(utils.training_config)
    trainer.add_listener(
        training.listeners.ProgressBar(lambda n: tqdm(total=n)), priority=200)
    trainer.add_hook(training.BATCH_END,
                     lambda data: _report(data['ys'], data['ts']))
    if test_dataset:
        parser = parsers.build_parser(loader, model)
        evaluator = eval_module.Evaluator(parser,
                                          logger=logging,
                                          report_details=False)
        trainer.add_listener(evaluator)

    if bert_model == 2:
        num_train_steps = 20000 * 5 / 20
        num_warmup_steps = 10000 / 20
        learning_rate = 2e-5
        # learning rate (eta) scheduling in Adam
        lr_decay_init = learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.add_hook(
            training.BATCH_END,
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps),
                optimizer=optimizer))
        trainer.add_hook(
            training.BATCH_END,
            extensions.WarmupShift(  # warmup
                'eta',
                0.,
                num_warmup_steps,
                learning_rate,
                optimizer=optimizer))

    if save_dir is not None:
        accessid = logging.getLogger().accessid
        date = logging.getLogger().accesstime.strftime('%Y%m%d')
        # metric = 'whole' if isinstance(model, models.Teranishi17) else 'inner'
        metric = 'exact'
        trainer.add_listener(
            utils.Saver(
                model,
                basename="{}-{}".format(date, accessid),
                context=dict(App.context, builder=builder),
                directory=save_dir,
                logger=logger,
                save_best=True,
                evaluate=(lambda _: evaluator.get_overall_score(metric))))

    trainer.fit(train_dataset, test_dataset, n_epoch, batch_size)
Example #5
def main():
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    def _get_text_file(text_dir):
        import glob
        #file_list = glob.glob(f'{text_dir}/**/*')
        # for seq length 512
        #file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/all']
        # for seq length 128
        file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/all_seq128']
        # debug
        #file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/AA/wiki_00']
        files = ",".join(file_list)
        return files
    input_files = _get_text_file(FLAGS.input_file).split(',')

    #  model_fn = model_fn_builder(
    #      bert_config=bert_config,
    #      init_checkpoint=FLAGS.init_checkpoint,
    #      learning_rate=FLAGS.learning_rate,
    #      num_train_steps=FLAGS.num_train_steps,
    #      num_warmup_steps=FLAGS.num_warmup_steps,
    #      use_tpu=FLAGS.use_tpu,
    #      use_one_hot_embeddings=FLAGS.use_tpu)

    if FLAGS.do_train:
        input_files = input_files
    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertPretrainer(bert)
    if FLAGS.init_checkpoint:
        serializers.load_npz(FLAGS.init_checkpoint, model)
        model = modeling.BertPretrainer(model.bert)
    if FLAGS.gpu >= 0:
        pass
        #chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        #model.to_gpu()

    if FLAGS.do_train:
        """chainerでのpretrainを記述。BERTClassificationに変わるものを作成し、BERTの出力をこねこねしてmodel_fnが返すものと同じものを返すようにすれば良いか?"""
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha. instead, use eta as actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        """ ConcatenatedDatasetはon memolyなため、巨大データセットのPickleを扱えない
        input_files = sorted(input_files)[:len(input_files) // 2]
        input_files = sorted(input_files)[:200]
        import concurrent.futures
        train_examples = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for train_example in executor.map(_load_data_using_dataset_api, input_files):
                train_examples.append(train_example)
        train_examples = ConcatenatedDataset(*train_examples)
        """
        train_examples = _load_data_using_dataset_api(input_files[0])

        train_iter = chainer.iterators.SerialIterator(
            train_examples, FLAGS.train_batch_size)
        converter = Converter()
        if False:
            updater = training.updaters.StandardUpdater(
                train_iter, optimizer,
                converter=converter,
                device=FLAGS.gpu)
        else:
            updater = training.updaters.ParallelUpdater(
                iterator=train_iter,
                optimizer=optimizer,
                converter=converter,
                # The device of the name 'main' is used as a "master", while others are
                # used as slaves. Names other than 'main' are arbitrary.
                devices={'main': 0,
                         '1': 1,
                         '2': 2,
                         '3': 3,
                         '4': 4,
                         '5': 5,
                         '6': 6,
                         '7': 7,
                         },
            )
        # learning rate (eta) scheduling in Adam
        num_warmup_steps = FLAGS.num_warmup_steps
        num_train_steps = FLAGS.num_train_steps
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta', lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(50, 'iteration'))  # logging

        trainer.extend(extensions.snapshot_object(
            model, 'seq_128_model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(1000, 'iteration'))
        trainer.extend(extensions.LogReport(
            trigger=(1, 'iteration')))
        #trainer.extend(extensions.PlotReport(
        #    [
        #        'main/next_sentence_loss',
        #        'main/next_sentence_accuracy',
        #     ], (3, 'iteration'), file_name='next_sentence.png'))
        #trainer.extend(extensions.PlotReport(
        #    [
        #        'main/masked_lm_loss',
        #        'main/masked_lm_accuracy',
        #     ], (3, 'iteration'), file_name='masked_lm.png'))
        trainer.extend(extensions.PlotReport(
            y_keys=[
                'main/loss',
                'main/next_sentence_loss',
                'main/next_sentence_accuracy',
                'main/masked_lm_loss',
                'main/masked_lm_accuracy',
             ], x_key='iteration', trigger=(100, 'iteration'), file_name='loss.png'))
        trainer.extend(extensions.PrintReport(
            ['iteration',
             'main/loss',
             'main/masked_lm_loss', 'main/masked_lm_accuracy',
             'main/next_sentence_loss', 'main/next_sentence_accuracy',
             'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=20))

        trainer.run()

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #6
def main():
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "livedoor": LivedoorProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_print_test:
        raise ValueError("At least one of `do_train` or `do_eval` "
                         "or `do_print_test` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(model_file=FLAGS.model_file,
                                           vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    # TODO: use special Adam from "optimization.py"
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    bert = modeling.BertModel(config=bert_config)
    pretrained = modeling.BertPretrainer(bert)
    chainer.serializers.load_npz(FLAGS.init_checkpoint, pretrained)

    model = modeling.BertClassifier(pretrained.bert,
                                    num_labels=len(label_list))

    if FLAGS.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        model.to_gpu()

    if FLAGS.do_train:
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha. instead, use eta as actual lr
            eps=1e-6,
            weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        train_iter = chainer.iterators.SerialIterator(train_examples,
                                                      FLAGS.train_batch_size)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        updater = training.updaters.StandardUpdater(train_iter,
                                                    optimizer,
                                                    converter=converter,
                                                    device=FLAGS.gpu)
        trainer = training.Trainer(updater, (num_train_steps, 'iteration'),
                                   out=FLAGS.output_dir)

        # learning rate (eta) scheduling in Adam
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(
            extensions.LinearShift(  # decay
                'eta', (lr_decay_init, 0.),
                (num_warmup_steps, num_train_steps)))
        trainer.extend(
            extensions.WarmupShift(  # warmup
                'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta', lambda trainer: trainer.updater.get_optimizer('main').eta),
                       trigger=(50, 'iteration'))  # logging

        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_iter_{.updater.iteration}.npz'),
                       trigger=(num_train_steps, 'iteration'))
        trainer.extend(extensions.LogReport(trigger=(50, 'iteration')))
        trainer.extend(
            extensions.PrintReport(
                ['iteration', 'main/loss', 'main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        test_iter = chainer.iterators.SerialIterator(eval_examples,
                                                     FLAGS.train_batch_size *
                                                     2,
                                                     repeat=False,
                                                     shuffle=False)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        evaluator = extensions.Evaluator(test_iter,
                                         model,
                                         converter=converter,
                                         device=FLAGS.gpu)
        results = evaluator()
        print(results)

    # if you wanna see some output arrays for debugging
    if FLAGS.do_print_test:
        short_eval_examples = processor.get_dev_examples(FLAGS.data_dir)[:3]
        short_eval_examples = short_eval_examples[:FLAGS.eval_batch_size]
        short_test_iter = chainer.iterators.SerialIterator(
            short_eval_examples,
            FLAGS.eval_batch_size,
            repeat=False,
            shuffle=False)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        evaluator = extensions.Evaluator(short_test_iter,
                                         model,
                                         converter=converter,
                                         device=FLAGS.gpu)

        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                data = short_test_iter.__next__()
                out = model.bert.get_pooled_output(
                    *converter(data, FLAGS.gpu)[:-1])
                print(out)
                print(out.shape)
            print(converter(data, -1))
Example #7
    def test_without_init(self):
        self.optimizer.x = self.warmup_start
        extension = extensions.WarmupShift('x', self.warmup_start,
                                           self.warmup_iter, self.init,
                                           self.optimizer)
        self._run_trainer(extension, self.expect)
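To close, a minimal self-contained sketch (not taken from any example above) of the wiring pattern the training scripts share: Adam is set up with a fixed alpha while eta is treated as the effective learning rate, and WarmupShift ramps eta from 0 to the target rate over the first iterations. The toy dataset, model, and step counts are placeholders.

# Sketch only: standalone WarmupShift usage with a toy model and dataset.
import numpy as np
import chainer
import chainer.links as L
from chainer import training
from chainer.training import extensions

# toy classification data: 100 samples, 10 features, 3 classes (placeholders)
x = np.random.rand(100, 10).astype(np.float32)
t = np.random.randint(0, 3, size=100).astype(np.int32)
dataset = chainer.datasets.TupleDataset(x, t)

model = L.Classifier(L.Linear(10, 3))
optimizer = chainer.optimizers.Adam(alpha=1.0)  # eta acts as the actual learning rate
optimizer.setup(model)

train_iter = chainer.iterators.SerialIterator(dataset, batch_size=10)
updater = training.updaters.StandardUpdater(train_iter, optimizer)

num_train_steps = 200   # placeholder
num_warmup_steps = 20   # placeholder
trainer = training.Trainer(updater, (num_train_steps, 'iteration'), out='result')

# ramp Adam's eta from 0.0 up to 1e-3 over the first num_warmup_steps iterations
trainer.extend(extensions.WarmupShift('eta', 0.0, num_warmup_steps, 1e-3))

trainer.run()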