def main():
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    def _get_text_file(text_dir):
        import glob
        # file_list = glob.glob(f'{text_dir}/**/*')
        # seq length 512
        # file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/all']
        # seq length 128
        file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/all_seq128']
        # debug
        # file_list = ['/nfs/ai16storage01/sec/akp2/1706nasubi/inatomi/benchmark/bert-chainer/data/wiki_data_pickle/AA/wiki_00']
        files = ",".join(file_list)
        return files

    input_files = _get_text_file(FLAGS.input_file).split(',')

    # model_fn = model_fn_builder(
    #     bert_config=bert_config,
    #     init_checkpoint=FLAGS.init_checkpoint,
    #     learning_rate=FLAGS.learning_rate,
    #     num_train_steps=FLAGS.num_train_steps,
    #     num_warmup_steps=FLAGS.num_warmup_steps,
    #     use_tpu=FLAGS.use_tpu,
    #     use_one_hot_embeddings=FLAGS.use_tpu)

    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertPretrainer(bert)
    if FLAGS.init_checkpoint:
        serializers.load_npz(FLAGS.init_checkpoint, model)
        model = modeling.BertPretrainer(model.bert)
    if FLAGS.gpu >= 0:
        pass
        # chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        # model.to_gpu()

    if FLAGS.do_train:
        """Pretraining in Chainer: build a counterpart of BertClassifier that
        reshapes the BERT outputs into the same quantities that model_fn
        returns in the TensorFlow version."""
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha; instead, use eta as the actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        """
        ConcatenatedDataset is held in memory, so it cannot handle pickles of
        huge datasets.
        input_files = sorted(input_files)[:len(input_files) // 2]
        input_files = sorted(input_files)[:200]
        import concurrent.futures
        train_examples = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for train_example in executor.map(_load_data_using_dataset_api,
                                              input_files):
                train_examples.append(train_example)
        train_examples = ConcatenatedDataset(*train_examples)
        """
        train_examples = _load_data_using_dataset_api(input_files[0])
        train_iter = chainer.iterators.SerialIterator(
            train_examples, FLAGS.train_batch_size)
        converter = Converter()
        if False:
            updater = training.updaters.StandardUpdater(
                train_iter, optimizer,
                converter=converter,
                device=FLAGS.gpu)
        else:
            updater = training.updaters.ParallelUpdater(
                iterator=train_iter,
                optimizer=optimizer,
                converter=converter,
                # The device named 'main' is used as the "master", while the
                # others are used as slaves. Names other than 'main' are arbitrary.
                devices={'main': 0, '1': 1, '2': 2, '3': 3,
                         '4': 4, '5': 5, '6': 6, '7': 7},
            )

        # learning rate (eta) scheduling in Adam
        num_warmup_steps = FLAGS.num_warmup_steps
        num_train_steps = FLAGS.num_train_steps
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta',
            lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(50, 'iteration'))

        # logging
        trainer.extend(extensions.snapshot_object(
            model, 'seq_128_model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(1000, 'iteration'))
        trainer.extend(extensions.LogReport(trigger=(1, 'iteration')))
        # trainer.extend(extensions.PlotReport(
        #     [
        #         'main/next_sentence_loss',
        #         'main/next_sentence_accuracy',
        #     ], (3, 'iteration'), file_name='next_sentence.png'))
        # trainer.extend(extensions.PlotReport(
        #     [
        #         'main/masked_lm_loss',
        #         'main/masked_lm_accuracy',
        #     ], (3, 'iteration'), file_name='masked_lm.png'))
        trainer.extend(extensions.PlotReport(
            y_keys=[
                'main/loss',
                'main/next_sentence_loss',
                'main/next_sentence_accuracy',
                'main/masked_lm_loss',
                'main/masked_lm_accuracy',
            ],
            x_key='iteration',
            trigger=(100, 'iteration'), file_name='loss.png'))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'main/loss',
             'main/masked_lm_loss', 'main/masked_lm_accuracy',
             'main/next_sentence_loss', 'main/next_sentence_accuracy',
             'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=20))

        trainer.run()

    if FLAGS.do_eval:
        # NOTE: this evaluation path is still the TensorFlow estimator code
        # from the original implementation and has not been ported to Chainer.
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
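# Editor's sketch (not part of the original script): the WarmupShift and
# LinearShift extensions registered above are set up to produce BERT's
# learning-rate schedule on the Adam `eta` attribute -- linear warmup from 0
# to FLAGS.learning_rate over num_warmup_steps iterations, then linear decay
# that reaches 0 at num_train_steps. A reference implementation of that
# intended schedule:
def eta_at(step, learning_rate, num_warmup_steps, num_train_steps):
    if step < num_warmup_steps:
        # warmup phase: 0 -> learning_rate
        return learning_rate * step / num_warmup_steps
    # decay phase: equivalent to interpolating from lr_decay_init (the value
    # at the end of warmup) down to 0 at num_train_steps
    return max(0., learning_rate * (num_train_steps - step) / num_train_steps)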
def main():
    if not FLAGS.do_train and not FLAGS.do_predict and not FLAGS.do_print_test:
        raise ValueError(
            "At least one of `do_train`, `do_predict` or `do_print_test` "
            "must be True.")

    if FLAGS.do_train:
        if not FLAGS.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if FLAGS.do_predict:
        if not FLAGS.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = read_squad_examples(
            input_file=FLAGS.train_file, is_training=True)
        train_features = convert_examples_to_features(
            train_examples, tokenizer, FLAGS.max_seq_length,
            FLAGS.doc_stride, FLAGS.max_query_length, is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertSQuAD(bert)
    if FLAGS.do_train:
        # If training, load BERT parameters only.
        ignore_names = ['output/W', 'output/b']
    else:
        # If only do_predict, load all parameters.
        ignore_names = None
    chainer.serializers.load_npz(
        FLAGS.init_checkpoint, model, ignore_names=ignore_names)

    if FLAGS.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        model.to_gpu()

    if FLAGS.do_train:
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha; instead, use eta as the actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        train_iter = chainer.iterators.SerialIterator(
            train_features, FLAGS.train_batch_size)
        converter = Converter(is_training=True)
        updater = training.updaters.StandardUpdater(
            train_iter, optimizer,
            converter=converter,
            device=FLAGS.gpu,
            loss_func=model.compute_loss)
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)

        # learning rate (eta) scheduling in Adam
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta',
            lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(100, 'iteration'))

        # logging
        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(num_train_steps // 2, 'iteration'))  # TODO
        trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'main/loss', 'main/accuracy', 'elapsed_time', 'eta']))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

    if FLAGS.do_predict:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, FLAGS.max_seq_length,
            FLAGS.doc_stride, FLAGS.max_query_length, is_training=False)

        test_iter = chainer.iterators.SerialIterator(
            eval_features, FLAGS.predict_batch_size,
            repeat=False, shuffle=False)
        converter = Converter(is_training=False)
        print('Evaluating ...')
        evaluate(eval_examples, test_iter, model,
                 converter=converter, device=FLAGS.gpu,
                 predict_func=model.predict)
        print('Finished.')
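# Worked example (editor's addition, hypothetical numbers): how the step counts
# above are derived. With roughly 88,000 SQuAD training examples,
# train_batch_size=32, num_train_epochs=2.0 and warmup_proportion=0.1:
#   num_train_steps  = int(88000 / 32 * 2.0) = 5500
#   num_warmup_steps = int(5500 * 0.1)       = 550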
def main():
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "livedoor": LivedoorProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_print_test:
        raise ValueError("At least one of `do_train` or `do_eval` "
                         "or `do_print_test` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.isdir(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        model_file=FLAGS.model_file, vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # TODO: use special Adam from "optimization.py"
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    bert = modeling.BertModel(config=bert_config)
    pretrained = modeling.BertPretrainer(bert)
    chainer.serializers.load_npz(FLAGS.init_checkpoint, pretrained)
    model = modeling.BertClassifier(pretrained.bert, num_labels=len(label_list))

    if FLAGS.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use()
        model.to_gpu()

    if FLAGS.do_train:
        # Adam with weight decay only for 2D matrices
        optimizer = optimization.WeightDecayForMatrixAdam(
            alpha=1.,  # ignore alpha; instead, use eta as the actual lr
            eps=1e-6, weight_decay_rate=0.01)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(1.))

        train_iter = chainer.iterators.SerialIterator(
            train_examples, FLAGS.train_batch_size)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        updater = training.updaters.StandardUpdater(
            train_iter, optimizer,
            converter=converter, device=FLAGS.gpu)
        trainer = training.Trainer(
            updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir)

        # learning rate (eta) scheduling in Adam
        lr_decay_init = FLAGS.learning_rate * \
            (num_train_steps - num_warmup_steps) / num_train_steps
        trainer.extend(extensions.LinearShift(  # decay
            'eta', (lr_decay_init, 0.), (num_warmup_steps, num_train_steps)))
        trainer.extend(extensions.WarmupShift(  # warmup
            'eta', 0., num_warmup_steps, FLAGS.learning_rate))
        trainer.extend(extensions.observe_value(
            'eta',
            lambda trainer: trainer.updater.get_optimizer('main').eta),
            trigger=(50, 'iteration'))

        # logging
        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_iter_{.updater.iteration}.npz'),
            trigger=(num_train_steps, 'iteration'))
        trainer.extend(extensions.LogReport(trigger=(50, 'iteration')))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'main/loss', 'main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.run()

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        test_iter = chainer.iterators.SerialIterator(
            eval_examples, FLAGS.train_batch_size * 2,
            repeat=False, shuffle=False)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        evaluator = extensions.Evaluator(
            test_iter, model, converter=converter, device=FLAGS.gpu)
        results = evaluator()
        print(results)

    # if you want to inspect some output arrays for debugging
    if FLAGS.do_print_test:
        short_eval_examples = processor.get_dev_examples(FLAGS.data_dir)[:3]
        short_eval_examples = short_eval_examples[:FLAGS.eval_batch_size]
        short_test_iter = chainer.iterators.SerialIterator(
            short_eval_examples, FLAGS.eval_batch_size,
            repeat=False, shuffle=False)
        converter = Converter(label_list, FLAGS.max_seq_length, tokenizer)
        evaluator = extensions.Evaluator(
            short_test_iter, model, converter=converter, device=FLAGS.gpu)

        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                data = next(short_test_iter)
                out = model.bert.get_pooled_output(
                    *converter(data, FLAGS.gpu)[:-1])
                print(out)
                print(out.shape)
                print(converter(data, -1))
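# Editor's sketch (not part of the original scripts): the fine-tuned classifier
# snapshot saved by snapshot_object above (e.g.
# FLAGS.output_dir + '/model_snapshot_iter_<num_train_steps>.npz') can later be
# restored for inference like this; names follow the code above.
def load_finetuned_classifier(bert_config, label_list, snapshot_path):
    bert = modeling.BertModel(config=bert_config)
    model = modeling.BertClassifier(bert, num_labels=len(label_list))
    chainer.serializers.load_npz(snapshot_path, model)
    return model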