Example #1
    def test_trainer_saves_metrics_every_epoch(self):
        trainer = GradientDescentTrainer(
            model=self.model,
            optimizer=self.optimizer,
            data_loader=self.data_loader,
            validation_data_loader=self.validation_data_loader,
            num_epochs=5,
            serialization_dir=self.TEST_DIR,
            checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                      num_serialized_models_to_keep=3),
        )
        trainer.train()

        for epoch in range(5):
            epoch_file = self.TEST_DIR / f"metrics_epoch_{epoch}.json"
            assert epoch_file.exists()
            with open(epoch_file) as f:
                metrics = json.load(f)
            assert "validation_loss" in metrics
            assert "best_validation_loss" in metrics
            assert metrics.get("epoch") == epoch
Example #2
    def test_trainer_respects_num_serialized_models_to_keep(self):
        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            self.data_loader,
            num_epochs=5,
            serialization_dir=self.TEST_DIR,
            checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                      num_serialized_models_to_keep=3),
        )
        trainer.train()

        # Now check the serialized files
        for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
            file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
            epochs = [
                int(re.search(r"_([0-9])\.th", fname).group(1))
                for fname in file_names
            ]
            assert sorted(epochs) == [2, 3, 4]
Example #3
    def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
        # To test:
        #   Create a fake data loader that sleeps for 2.5 seconds per epoch, so the total
        #   training time for one epoch is slightly greater than 2.5 seconds.
        #   Run for 6 epochs, keeping the last 2 models, with models also kept every 5 seconds.
        #   Check the resulting checkpoints.  Should then have models at epochs
        #       2, 4, plus the last two at 5 and 6.

        class SlowDataLoader:
            data_loader = SimpleDataLoader(self.instances, batch_size=2)

            def __iter__(self):
                time.sleep(2.5)
                return iter(self.data_loader)

            def __len__(self):
                return len(self.data_loader)

            def set_target_device(self, _):
                pass

        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            SlowDataLoader(),
            num_epochs=6,
            serialization_dir=self.TEST_DIR,
            checkpointer=Checkpointer(
                serialization_dir=self.TEST_DIR,
                num_serialized_models_to_keep=2,
                keep_serialized_model_every_num_seconds=5,
            ),
        )
        trainer.train()

        # Now check the serialized files
        for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
            file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
            epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
            # epoch N has N-1 in file name
            assert sorted(epochs) == [1, 3, 4, 5]
Example #4
    def test_default_distributed_with_sharded_state(self):
        """
        Simulates using the Checkpointer during distributed training with a sharded model.
        """
        world_size = 2
        default_num_to_keep = 2
        num_epochs = 5
        target = [(e, 0)
                  for e in range(num_epochs - default_num_to_keep, num_epochs)]

        checkpointers = [
            Checkpointer(serialization_dir=self.TEST_DIR)
            for _ in range(world_size)
        ]
        for i, checkpointer in enumerate(checkpointers):
            checkpointer._rank = i
            checkpointer.state_is_sharded = True

        for epochs_completed in range(num_epochs):
            for batches_completed in [0, 5, 10]:
                for i, checkpointer in enumerate(checkpointers):
                    state = {
                        "epochs_completed": epochs_completed,
                        "batches_in_epoch_completed": batches_completed,
                        "rank": i,
                    }
                    checkpointer.maybe_save_checkpoint(
                        FakeTrainer(model_state=state, training_state=state),
                        epochs_completed,
                        batches_completed,
                    )

        for i, checkpointer in enumerate(checkpointers):
            checkpoint = checkpointer.load_checkpoint()
            assert checkpoint is not None
            model_state, training_state = checkpoint
            assert model_state["rank"] == i
            assert training_state["rank"] == i

            models, training = self.retrieve_and_delete_saved(shard=i)
            assert models == training == target
Example #5
    def test_with_time(self):
        num_epochs = 30
        pauses = [5, 18, 26]
        target = [(e, 0) for e in pauses]
        checkpointer = Checkpointer(
            serialization_dir=self.TEST_DIR,
            save_completed_epochs=False,
            save_every_num_seconds=1,
            keep_most_recent_by_count=3,
        )
        for e in range(num_epochs):
            if e in pauses:
                time.sleep(2)
            state = {"epochs_completed": e, "batches_in_epoch_completed": 0}
            checkpointer.maybe_save_checkpoint(
                trainer=FakeTrainer(model_state=state, training_state=state),
                num_epochs_completed=e,
                num_batches_in_epoch_completed=0,
            )
        models, training = self.retrieve_and_delete_saved()
        assert models == training == target
Example #6
    def test_default(self):
        """
        Tests that the default behavior keeps just the last 2 checkpoints.
        """
        default_num_to_keep = 2
        num_epochs = 5
        target = [(e, 0)
                  for e in range(num_epochs - default_num_to_keep, num_epochs)]

        checkpointer = Checkpointer(serialization_dir=self.TEST_DIR)
        for epochs_completed in range(num_epochs):
            for batches_completed in [0, 5, 10]:
                state = {
                    "epochs_completed": epochs_completed,
                    "batches_in_epoch_completed": batches_completed,
                }
                checkpointer.maybe_save_checkpoint(
                    FakeTrainer(model_state=state, training_state=state),
                    epochs_completed,
                    batches_completed,
                )
        models, training = self.retrieve_and_delete_saved()
        assert models == training == target
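Examples #4 through #6 rely on a FakeTrainer helper and a retrieve_and_delete_saved method that are not shown in this listing. As a hypothetical sketch only, assuming the Checkpointer merely asks the trainer for a (model_state, training_state) pair, such a stub could look like this:

# Hypothetical stub, not taken from the examples above: a trainer whose only
# job is to hand fixed state dicts back to the Checkpointer on request.
class FakeTrainer:
    def __init__(self, model_state, training_state):
        self._model_state = model_state
        self._training_state = training_state

    def get_checkpoint_state(self):
        # Assumes the Checkpointer retrieves state via get_checkpoint_state()
        # and expects the model state and the training state, in that order.
        return self._model_state, self._training_state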
Example #7
                        encoder=encoder,
                        decoder=decoder)

# Data loaders
train_loader = PyTorchDataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = PyTorchDataLoader(validation_dataset,
                                      batch_size=32,
                                      shuffle=False)

# Copy the model onto the GPU
if args.cuda:
    model = model.cuda()

# Create the optimizer
optimizer = AdamOptimizer(model.named_parameters())
checkpointer = Checkpointer(serialization_dir=args.serialization_dir,
                            num_serialized_models_to_keep=None)
learning_rate_scheduler = LinearWithWarmup(
    optimizer=optimizer,
    num_epochs=args.num_epochs,
    num_steps_per_epoch=len(train_loader),
    warmup_steps=4000)
# Create the trainer
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_loader,
    validation_data_loader=validation_loader,
    validation_metric="-loss",
    num_epochs=args.num_epochs,
    learning_rate_scheduler=learning_rate_scheduler,
    checkpointer=checkpointer)
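
The snippet above configures the trainer but stops short of starting the run. Assuming the same GradientDescentTrainer API as in the test examples earlier, training would presumably be launched with a final call like the following (the `metrics` name is only illustrative):

# Hypothetical continuation of the example above: start training.
# With num_serialized_models_to_keep=None, the Checkpointer keeps a
# checkpoint for every epoch instead of pruning older ones.
metrics = trainer.train()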
Example #8
    def run(  # type: ignore
        self,
        model: Lazy[Model],
        dataset: DatasetDict,
        data_loader: Lazy[TangoDataLoader],
        optimizer: Lazy[Optimizer],
        validation_data_loader: Optional[Lazy[TangoDataLoader]] = None,
        training_split: str = "train",
        validation_split: Optional[str] = None,
        patience: Optional[int] = None,
        validation_metric: Union[str, List[str]] = "-loss",
        num_epochs: int = 20,
        checkpointer: Optional[Lazy[Checkpointer]] = None,
        grad_norm: Union[float, bool] = False,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[Lazy[LearningRateScheduler]] = None,
        momentum_scheduler: Optional[Lazy[MomentumScheduler]] = None,
        moving_average: Optional[Lazy[MovingAverage]] = None,
        callbacks: Optional[List[Lazy[TrainerCallback]]] = None,
        num_gradient_accumulation_steps: int = 1,
        use_amp: bool = False,
        enable_default_callbacks: bool = True,
        run_confidence_checks: bool = True,
        no_grad: Optional[List[str]] = None,
        limit_batches_per_epoch: Optional[int] = None,
    ) -> Model:
        serialization_dir = self.work_dir()

        if validation_data_loader is None:
            validation_data_loader = data_loader
        if validation_split is None:
            validation_loader = None
        else:
            concrete_validation_data_loader = validation_data_loader.construct(
                instances=dataset.splits[validation_split])
            del validation_data_loader
            if limit_batches_per_epoch is not None:
                concrete_validation_data_loader = MaxBatchesDataLoader(
                    concrete_validation_data_loader, limit_batches_per_epoch)
            validation_loader = DataLoaderAdapter(
                tango_data_loader=concrete_validation_data_loader)

        concrete_data_loader = data_loader.construct(
            instances=dataset.splits[training_split])
        del data_loader
        if limit_batches_per_epoch is not None:
            concrete_data_loader = MaxBatchesDataLoader(
                concrete_data_loader, limit_batches_per_epoch)
        loader = DataLoaderAdapter(tango_data_loader=concrete_data_loader)

        if torch.cuda.device_count() > 0:
            cuda_device = torch.device(0)
        else:
            cuda_device = torch.device("cpu")
        check_for_gpu(cuda_device)
        loader.set_target_device(cuda_device)
        if validation_loader is not None:
            validation_loader.set_target_device(cuda_device)

        concrete_model = model.construct(vocab=dataset.vocab).to(cuda_device)
        del model
        if no_grad:
            for name, parameter in concrete_model.named_parameters():
                if any(re.search(regex, name) for regex in no_grad):
                    parameter.requires_grad_(False)
        parameters = [[n, p] for n, p in concrete_model.named_parameters()
                      if p.requires_grad]
        concrete_optimizer = optimizer.construct(model_parameters=parameters)
        del optimizer
        log_frozen_and_tunable_parameter_names(concrete_model)

        concrete_moving_average = (None if moving_average is None else
                                   moving_average.construct(
                                       parameters=parameters))
        del moving_average

        concrete_learning_rate_scheduler = (
            None if learning_rate_scheduler is None else
            learning_rate_scheduler.construct(
                optimizer=concrete_optimizer,
                num_epochs=num_epochs,
                num_steps_per_epoch=concrete_data_loader.num_batches_per_epoch(
                ),
            ))
        del learning_rate_scheduler

        concrete_momentum_scheduler = (None if momentum_scheduler is None else
                                       momentum_scheduler.construct(
                                           optimizer=concrete_optimizer))
        del momentum_scheduler

        if checkpointer is not None:
            concrete_checkpointer = checkpointer.construct(
                serialization_dir=serialization_dir)
        else:
            concrete_checkpointer = Checkpointer(serialization_dir)
        del checkpointer

        concrete_callbacks: List[TrainerCallback] = [
            cb.construct(serialization_dir=serialization_dir)
            for cb in callbacks or []
        ]
        del callbacks

        trainer = GradientDescentTrainer(
            concrete_model,
            optimizer=concrete_optimizer,
            data_loader=loader,
            patience=patience,
            validation_metric=validation_metric,
            validation_data_loader=validation_loader,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            checkpointer=concrete_checkpointer,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=concrete_learning_rate_scheduler,
            momentum_scheduler=concrete_momentum_scheduler,
            moving_average=concrete_moving_average,
            callbacks=concrete_callbacks,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
            use_amp=use_amp,
            enable_default_callbacks=enable_default_callbacks,
            run_confidence_checks=run_confidence_checks,
        )
        trainer.train()

        return trainer.model