Example 1
    def test_trainer_can_resume_with_lr_scheduler(self):
        lr_scheduler = CosineWithRestarts(self.optimizer, t_initial=5)
        trainer = GradientDescentTrainer(
            model=self.model,
            optimizer=self.optimizer,
            data_loader=self.data_loader,
            learning_rate_scheduler=lr_scheduler,
            validation_data_loader=self.validation_data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
        )
        trainer.train()

        new_lr_scheduler = CosineWithRestarts(self.optimizer, t_initial=5)
        new_trainer = GradientDescentTrainer(
            model=self.model,
            optimizer=self.optimizer,
            data_loader=self.data_loader,
            learning_rate_scheduler=new_lr_scheduler,
            validation_data_loader=self.validation_data_loader,
            num_epochs=4,
            serialization_dir=self.TEST_DIR,
        )
        # Restoring from the serialization directory should report that two
        # epochs have completed and should also restore the scheduler state.
        epoch = new_trainer._restore_checkpoint()
        assert epoch == 2
        assert new_trainer._learning_rate_scheduler.last_epoch == 1
        new_trainer.train()
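
The test above depends on GradientDescentTrainer checkpointing the learning rate scheduler together with the model and optimizer. As a minimal sketch of the same resume pattern in plain PyTorch, using torch's built-in CosineAnnealingWarmRestarts rather than the CosineWithRestarts class under test:

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)

# Train for two "epochs", stepping the scheduler once per epoch.
for _ in range(2):
    optimizer.step()
    scheduler.step()

# Checkpoint everything needed to resume mid-schedule.
torch.save({
    "model": model.state_dict(),
    "optimizer": optimizer.state_dict(),
    "scheduler": scheduler.state_dict(),
}, "checkpoint.pt")

# A fresh scheduler resumes mid-cycle once its state dict is restored.
state = torch.load("checkpoint.pt")
new_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
new_scheduler.load_state_dict(state["scheduler"])
assert new_scheduler.last_epoch == scheduler.last_epoch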
Example 2
    def test_schedules(self):
        # Each case is (num_epochs, t_max, factor, [(step, expected_lr), ...]).
        cosine_schedule_cases = [
            (30, 30, 1.0, [(0, 1.0), (15, 0.5000000000000001),
                           (29, 0.0027390523158632996)]),
            (10, 1, 2.0, [(0, 1.0), (1, 1.0), (2, 0.5), (3, 1.0)]),
            (30, 1, 1.0, [(0, 1.0), (15, 1.0), (29, 1.0)]),
            (60, 30, 1.0, [(0, 1.0), (15, 0.5000000000000001),
                           (29, 0.0027390523158632996), (30, 1.0),
                           (45, 0.5000000000000001),
                           (59, 0.0027390523158632996)]),
            (100, 30, 1.5, [(0, 1.0), (29, 0.0027390523158632996), (30, 1.0),
                            (74, 0.0012179748700879012)]),
            (210, 30, 2, [(0, 1.0), (29, 0.0027390523158632996), (30, 1.0),
                          (89, 0.0006852326227130834), (90, 1.0),
                          (209, 0.00017133751222137006)]),
            (150, 30, 1, [(0, 1.0), (29, 0.0027390523158632996), (30, 1.0),
                          (59, 0.0027390523158632996), (60, 1.0),
                          (89, 0.0027390523158632996), (90, 1.0)]),
        ]

        for epochs, t_max, factor, lr_checks in cosine_schedule_cases:
            optimizer = self._get_optimizer()
            scheduler = CosineWithRestarts(optimizer, t_max, factor=factor)
            lrs = [optimizer.param_groups[0]["lr"]]
            for epoch in range(epochs):
                scheduler.step(epoch)
                lrs.append(optimizer.param_groups[0]["lr"])

            for it, lr in lr_checks:
                assert lrs[it] == lr
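
The expected multipliers in these cases follow the standard cosine annealing curve, eta_t = eta_min + (eta_max - eta_min) / 2 * (1 + cos(pi * t / t_max)): halfway through a 30-step cycle the factor is about 0.5, and one step before a restart it is about 0.00274. A short sketch of that formula (the exact floating point digits in the test depend on the implementation's order of operations):

import math

def cosine_multiplier(step, t_max):
    # Standard cosine annealing multiplier within a single cycle,
    # assuming eta_min = 0 and a base multiplier of 1.0.
    return 0.5 * (1.0 + math.cos(math.pi * (step % t_max) / t_max))

print(cosine_multiplier(15, 30))  # ~0.5
print(cosine_multiplier(29, 30))  # ~0.0027390523...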
Example 3
    def test_model_training(self):
        training_dataset = self.sample_instances if self.sample_only else self.train_instances
        #training_dataset = training_dataset[:500]
        validation_dataset = self.sample_instances if self.sample_only else self.test_instances
        serialization_dir = self.TEST_DATA_ROOT / "serialized_sample" if self.sample_only else "serialized"
        tensorboard_dir = self.TEST_DATA_ROOT / "tensorboard.seq2seq"

        batch_size = 64

        train_iterator = BucketIterator(sorting_keys=[("source_tokens",
                                                       "num_tokens")],
                                        padding_noise=0.1,
                                        batch_size=batch_size)
        train_iterator.index_with(vocab=self.vocab)
        multiproc_iterator = MultiprocessIterator(train_iterator,
                                                  num_workers=4,
                                                  output_queue_size=6000)

        tensorboard = TensorboardWriter(get_batch_num_total=lambda: np.ceil(
            len(training_dataset) / batch_size),
                                        serialization_dir=tensorboard_dir,
                                        summary_interval=5,
                                        histogram_interval=5,
                                        should_log_parameter_statistics=True,
                                        should_log_learning_rate=True)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        scheduler = CosineWithRestarts(optimizer=optimizer, t_initial=5)

        # CallbackTrainer drives the training loop; the callbacks below handle
        # TensorBoard logging, validation, metric tracking, LR scheduling, and
        # validation-output logging.
        trainer = CallbackTrainer(
            model=self.model,
            serialization_dir=serialization_dir,
            iterator=multiproc_iterator,
            training_data=self.train_instances,
            num_epochs=100,
            cuda_device=0,
            optimizer=optimizer,
            callbacks=[
                LogToTensorboard(tensorboard),
                Validate(validation_data=self.test_instances,
                         validation_iterator=multiproc_iterator),
                TrackMetrics(),
                ResetMetricsCallback(),
                UpdateLearningRate(scheduler),
                ValidationLogCallback(self.train_reader, self.test_instances)
            ])

        # trainer = Trainer(model=self.model,
        #                   serialization_dir=serialization_dir,
        #                   iterator=train_iterator,
        #                   train_dataset=training_dataset,
        #                   num_epochs=1,
        #                   cuda_device=0,
        #                   optimizer=torch.optim.Adam(self.model.parameters(), lr=1e-3),
        #                   validation_dataset=training_dataset,
        #                   validation_iterator=train_iterator,
        #                   should_log_learning_rate=True,
        #                   learning_rate_scheduler=scheduler
        #                   )

        # for i in range(50):
        #     print('Epoch: {}'.format(i))
        #     trainer.train()
        #
        #     import itertools
        #
        #     predictor = Seq2SeqPredictor(self.model, self.train_reader)
        #
        #     for instance in itertools.islice(training_dataset, 10):
        #         print('SOURCE:', instance.fields['source_tokens'].tokens)
        #         print('GOLD:', instance.fields['target_tokens'].tokens)
        #         print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
        #
        # self.val_outputs_fp.close()

        trainer.train()
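
In the CallbackTrainer setup above, the UpdateLearningRate callback steps the scheduler and the TensorboardWriter is configured with should_log_learning_rate=True. A rough sketch of the same effect with plain torch.utils.tensorboard (the log directory name and epoch count are illustrative):

import torch
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.tensorboard import SummaryWriter

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
writer = SummaryWriter(log_dir="tensorboard.seq2seq")

for epoch in range(20):
    # ... run one epoch of training (forward, backward, optimizer.step()) ...
    scheduler.step()
    writer.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], epoch)

writer.close()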
Example 4
def train():
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu',
                     default=-1,
                     type=int,
                     help='The id of the GPU to use; -1 for CPU.')
    cmd.add_argument('--format',
                     default='plain',
                     choices=('plain', 'pickle'),
                     help='the input format.')
    cmd.add_argument('--train_path',
                     required=True,
                     help='The path to the training file.')
    cmd.add_argument('--vocab_path',
                     required=True,
                     help='The path to the vocabulary.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')
    cmd.add_argument('--config_path',
                     required=True,
                     help='the path to the config file.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size",
                     "--batch",
                     type=int,
                     default=32,
                     help='the batch size.')
    cmd.add_argument("--max_epoch",
                     type=int,
                     default=100,
                     help='the maximum number of iteration.')
    cmd.add_argument('--max_sent_len',
                     type=int,
                     default=20,
                     help='maximum sentence length.')
    cmd.add_argument('--bucket',
                     action='store_true',
                     default=False,
                     help='do bucket batching.')
    cmd.add_argument('--save_classify_layer',
                     default=False,
                     action='store_true',
                     help="Whether to save the classification layer.")
    cmd.add_argument(
        '--always_save',
        default=False,
        action='store_true',
        help="Always save the model, appending the global epoch id.")
    cmd.add_argument('--valid_size',
                     type=int,
                     default=0,
                     help="Size of the validation split when no validation "
                          "file is given.")
    cmd.add_argument('--eval_steps',
                     type=int,
                     help='evaluate every xx batches.')
    cmd.add_argument('--report_steps',
                     type=int,
                     default=32,
                     help='report every xx batches.')

    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        conf = json.load(fin)

    # Dump configurations
    logger.info('hostname: {}'.format(socket.gethostname()))
    logger.info('cmd opt: {}'.format(opt))
    logger.info('model configuration: {}'.format(conf))

    # Set seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()
    use_fp16 = False
    if conf['optimizer'].get('fp16', False):
        use_fp16 = True
        if not use_cuda:
            logger.warning('fp16 requires --gpu; ignoring the fp16 option.')
            conf['optimizer']['fp16'] = False
            use_fp16 = False
        else:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except ImportError:
                logger.warning(
                    'apex is not installed; ignoring the fp16 option.')
                conf['optimizer']['fp16'] = False
                use_fp16 = False

    c = conf['token_embedder']
    token_embedder_max_chars = c.get('max_characters_per_token', None)

    # Load training data.
    if opt.format == 'plain':
        raw_training_data = read_corpus(opt.train_path, opt.max_sent_len,
                                        token_embedder_max_chars)
    else:
        raw_training_data = pickle.load(open(opt.train_path, 'rb'))
    logger.info('training instance: {}, training tokens: {}.'.format(
        len(raw_training_data), count_tokens(raw_training_data)))

    # Load valid data if path is provided, else use 10% of training data as valid data
    if opt.valid_path is not None:
        if opt.format == 'plain':
            raw_valid_data = read_corpus(opt.valid_path, opt.max_sent_len,
                                         token_embedder_max_chars)
        else:
            raw_valid_data = pickle.load(open(opt.valid_path, 'rb'))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    elif opt.valid_size > 0:
        raw_training_data, raw_valid_data = split_train_and_valid(
            raw_training_data, opt.valid_size)
        logger.info(
            'training instance: {}, training tokens after division: {}.'.
            format(len(raw_training_data), count_tokens(raw_training_data)))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    else:
        raw_valid_data = None

    # Load test data if path is provided.
    if opt.test_path is not None:
        if opt.format == 'plain':
            raw_test_data = read_corpus(opt.test_path, opt.max_sent_len,
                                        token_embedder_max_chars)
        else:
            raw_test_data = pickle.load(open(opt.test_path, 'rb'))
        logger.info('testing instance: {}, testing tokens: {}.'.format(
            len(raw_test_data), count_tokens(raw_test_data)))
    else:
        raw_test_data = None

    # Initialize the vocabulary batcher.
    vocab_batch = VocabBatch(
        lower=not conf['classifier'].get('vocab_cased', True),
        normalize_digits=conf['classifier'].get('vocab_digit_normalized',
                                                False),
        use_cuda=use_cuda)
    vocab_batch.create_dict_from_file(opt.vocab_path)

    # Word
    if c.get('word_dim', 0) > 0:
        word_batch = WordBatch(min_cut=c.get('word_min_cut', 0),
                               lower=not c.get('word_cased', True),
                               add_sentence_boundary=c.get(
                                   'add_sentence_boundary_ids', False),
                               use_cuda=use_cuda)
        word_batch.create_dict_from_dataset(raw_training_data)
    else:
        word_batch = None

    # SubToken
    if c.get('char_dim', 0) > 0 or c.get('wordpiece_dim', 0) > 0:
        min_char = max([w
                        for w, _ in c['filters']]) if c['name'] == 'cnn' else 1
        if c.get('char_dim', 0) > 0:
            char_batch = CharacterBatch(
                min_char=min_char,
                lower=not c.get('char_cased', True),
                add_sentence_boundary=c.get('add_sentence_boundary_ids',
                                            False),
                add_word_boundary=c.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        else:
            char_batch = WordPieceBatch(
                min_char=min_char,
                vocab_file=c['wordpiece_vocab'],
                lower=not c.get('word_cased', True),
                add_sentence_boundary=c.get('add_sentence_boundary_ids',
                                            False),
                add_word_boundary=c.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        char_batch.create_dict_from_dataset(raw_training_data)
    else:
        char_batch = None

    logger.info('vocab size: {0}'.format(len(vocab_batch.mapping)))
    n_classes = len(vocab_batch.mapping)

    model = Model(conf, word_batch, char_batch, n_classes)

    logger.info(str(model))
    if use_cuda:
        model = model.cuda()
        if use_fp16:
            model = model.half()

    # Create training batch
    if opt.bucket:
        training_batcher = BucketBatcher(raw_training_data, word_batch,
                                         char_batch, vocab_batch,
                                         opt.batch_size)
    else:
        training_batcher = Batcher(raw_training_data,
                                   word_batch,
                                   char_batch,
                                   vocab_batch,
                                   opt.batch_size,
                                   keep_full=False,
                                   sorting=True,
                                   shuffle=True)

    # Set up evaluation steps.
    if opt.eval_steps is None:
        opt.eval_steps = training_batcher.num_batches()
    logger.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    # If there is valid, create valid batch.
    if raw_valid_data is not None:
        valid_batcher = Batcher(raw_valid_data,
                                word_batch,
                                char_batch,
                                vocab_batch,
                                opt.batch_size,
                                keep_full=True,
                                sorting=True,
                                shuffle=False)
    else:
        valid_batcher = None

    # If there is test, create test batch.
    if raw_test_data is not None:
        test_batcher = Batcher(raw_test_data,
                               word_batch,
                               char_batch,
                               vocab_batch,
                               opt.batch_size,
                               keep_full=True,
                               sorting=True,
                               shuffle=False)
    else:
        test_batcher = None

    # Create the model directory and save metadata (dictionaries and config).
    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    if char_batch is not None:
        with codecs.open(os.path.join(opt.model, 'char.dic'),
                         'w',
                         encoding='utf-8') as fpo:
            for ch, i in char_batch.mapping.items():
                print('{0}\t{1}'.format(ch, i), file=fpo)

    if word_batch is not None:
        with codecs.open(os.path.join(opt.model, 'word.dic'),
                         'w',
                         encoding='utf-8') as fpo:
            for w, i in word_batch.mapping.items():
                print('{0}\t{1}'.format(w, i), file=fpo)

    with codecs.open(os.path.join(opt.model, 'vocab.dic'),
                     'w',
                     encoding='utf-8') as fpo:
        for w, i in vocab_batch.mapping.items():
            print('{0}\t{1}'.format(w, i), file=fpo)

    new_config_path = os.path.join(opt.model,
                                   os.path.basename(opt.config_path))
    shutil.copy(opt.config_path, new_config_path)
    opt.config_path = new_config_path
    with codecs.open(os.path.join(opt.model, 'config.json'),
                     'w',
                     encoding='utf-8') as fpo:
        json.dump(vars(opt), fpo)

    c = conf['optimizer']

    optimizer_name = c['type'].lower()
    params = filter(lambda param: param.requires_grad, model.parameters())
    if optimizer_name == 'adamax':
        optimizer = torch.optim.Adamax(params,
                                       lr=c.get('lr', 2e-3),
                                       betas=c.get('betas', (0.9, 0.999)),
                                       eps=c.get('eps', 1e-8))
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(params,
                                    lr=c.get('lr', 0.01),
                                    momentum=c.get('momentum', 0),
                                    nesterov=c.get('nesterov', False))
    elif optimizer_name == 'dense_sparse_adam' or optimizer_name == 'adam':
        optimizer = DenseSparseAdam(params,
                                    lr=c.get('lr', 1e-3),
                                    betas=c.get('betas', (0.9, 0.999)),
                                    eps=c.get('eps', 1e-8))
    else:
        raise ValueError('Unknown optimizer name: {0}'.format(optimizer_name))

    if use_fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=1.,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={'init_scale': 2**16})

    scheduler_name = c.get('scheduler', 'noam')
    if scheduler_name == 'cosine':
        scheduler = CosineWithRestarts(optimizer,
                                       c['max_step'],
                                       eta_min=c.get('eta_min', 0.0))
    elif scheduler_name == 'dev_perf':
        scheduler = LearningRateScheduler.by_name('reduce_on_plateau')(
            optimizer,
            factor=c.get('decay_rate', 0.5),
            patience=c.get('patience', 5),
            min_lr=c.get('lr_min', 1e-6))
    elif scheduler_name == 'noam':
        scheduler = NoamLR(optimizer,
                           model_size=c.get('model_size', 512),
                           warmup_steps=c.get('warmup_step', 6000))
    else:
        scheduler = None

    best_train, best_valid, test_result = 1e8, 1e8, 1e8
    for epoch in range(opt.max_epoch):
        best_train, best_valid, test_result, improved = train_model(
            epoch, conf, opt, model, optimizer, scheduler, training_batcher,
            valid_batcher, test_batcher, best_train, best_valid, test_result)

    if raw_valid_data is None:
        logger.info("best train ppl: {:.6f}.".format(best_train))
    elif raw_test_data is None:
        logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(
            best_train, best_valid))
    else:
        logger.info(
            "best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}."
            .format(best_train, best_valid, test_result))
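
Since train() parses sys.argv[2:], it is presumably invoked through a small entry point that treats sys.argv[1] as a subcommand. A hypothetical dispatcher (the script name, subcommand name, and example paths are assumptions, not part of the original source):

import sys

if __name__ == '__main__':
    # Hypothetical usage:
    #   python main.py train --train_path data/train.txt --vocab_path data/vocab.txt \
    #       --config_path configs/model.json --model checkpoints/my_model
    if len(sys.argv) > 1 and sys.argv[1] == 'train':
        train()
    else:
        print('usage: {0} train [options]'.format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)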