Example #1
    def _average_gradients(self, parameters: Any, divisor: int) -> None:
        check.gt_eq(divisor, 1)
        if divisor == 1:
            return

        divisor_value = float(divisor)
        for p in filter(lambda param: param.grad is not None, parameters):
            p.grad.data.div_(divisor_value)
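A minimal, self-contained sketch of the same idea in plain PyTorch: gradients summed over several backward passes are divided by the number of passes to obtain their average. The model, loss, and aggregation_frequency below are illustrative and are not taken from the example above.

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
criterion = nn.MSELoss()
aggregation_frequency = 2  # hypothetical number of accumulated batches

for _ in range(aggregation_frequency):
    data = torch.randn(8, 4)
    target = torch.randn(8, 1)
    loss = criterion(model(data), target)
    loss.backward()  # .grad fields accumulate (sum) across calls

# Equivalent to _average_gradients(model.parameters(), aggregation_frequency):
for p in model.parameters():
    if p.grad is not None:
        p.grad.data.div_(float(aggregation_frequency))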
Example #2
    def wrap_optimizer(
        self,
        optimizer: torch.optim.Optimizer,  # type: ignore
        backward_passes_per_step: int = 1,
    ) -> torch.optim.Optimizer:  # type: ignore
        """Returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`wrap_model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.

        `backward_passes_per_step` can be used to specify how many gradient aggregation
        steps will be performed in a single `train_batch` call per optimizer step.
        In most cases, this will just be the default value 1.  However, this advanced functionality
        can be used to support training loops like the one shown below:

        .. code-block:: python

            def train_batch(
                self, batch: TorchData, epoch_idx: int, batch_idx: int
            ) -> Dict[str, torch.Tensor]:
                data, labels = batch
                output = self.model(data)
                loss1 = output['loss1']
                loss2 = output['loss2']
                self.context.backward(loss1)
                self.context.backward(loss2)
                self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
                return {"loss1": loss1, "loss2": loss2}

        """
        if self.env.managed_training:
            check.false(
                self._use_amp,
                "Must call wrap_optimizer() before configure_apex_amp.")
            check.gt_eq(
                backward_passes_per_step,
                1,
                "backwar_passes_per_step for local gradient aggregation must be >= 1",
            )

            if self.hvd_config.use:
                use_compression = self.hvd_config.fp16_compression
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=self._filter_named_parameters(optimizer),
                    backward_passes_per_step=backward_passes_per_step *
                    self.hvd_config.aggregation_frequency,
                    compression=hvd.Compression.fp16
                    if use_compression else hvd.Compression.none,
                )
                logging.debug(
                    "Initialized optimizer for distributed and optimized parallel training."
                )

        self.optimizers.append(optimizer)
        return optimizer
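A hedged usage sketch of the API described in the docstring above: wrap_model() is called first, and wrap_optimizer() is given an optimizer built from the wrapped model's parameters. The trial class, model, and learning rate are placeholders, not code from this project.

import torch
import torch.nn as nn

class MyTrial(PyTorchTrial):  # PyTorchTrial and the context come from the surrounding library
    def __init__(self, context) -> None:
        self.context = context
        # Wrap the model first; the optimizer must use the wrapped model's parameters.
        self.model = self.context.wrap_model(nn.Linear(10, 1))
        # wrap_optimizer() returns the optimizer, wrapped for distributed training when needed.
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )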
Example #3
    def load(self, path: pathlib.Path) -> None:
        self.chaos_failure(self.chaos_probability_checkpoint)
        time.sleep(self.load_secs)
        fpath = path.joinpath(self.CHECKPOINT_FILENAME)
        with fpath.open("r") as f:
            jbody = {int(k): v for k, v in json.load(f).items()}
            for k, v in jbody.items():
                check.gt_eq(k, 0)
                check.is_type(v, int)
                check.gt_eq(v, 0)
            self.trained_steps = collections.Counter(jbody)
            logging.info("Loaded checkpoint {}, steps_trained {}".format(
                fpath, self.steps_trained()))
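A sketch of the save() counterpart implied by load() above, assuming the same json and pathlib imports: it writes self.trained_steps to CHECKPOINT_FILENAME as JSON with string keys, which load() converts back to ints. The method name, signature, and body are inferred, not taken from the source.

    def save(self, path: pathlib.Path) -> None:
        path.mkdir(parents=True, exist_ok=True)
        fpath = path.joinpath(self.CHECKPOINT_FILENAME)
        with fpath.open("w") as f:
            # JSON object keys must be strings; load() converts them back to ints.
            json.dump({str(k): v for k, v in self.trained_steps.items()}, f)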
Example #4
    def __init__(
        self,
        length: int,
        shard_rank: int,
        num_shards: int,
        shuffle: bool,
        shuffle_seed: int,
        prior_batches_trained: int,
    ) -> None:
        self.indices = list(range(length))
        self.num_shards = num_shards
        self.shuffle = shuffle

        check.gt_eq(
            length,
            num_shards,
            "please provide a Sequence that has at least as many batches as the number of slots "
            "used for training",
        )

        # Each shard has a certain offset from which it yields data.  When the dataset length is
        # not evenly divisible by the number of shards, that offset will change every epoch.
        # Example:
        #   let length=10, shard_rank=0, and num_shards=3:
        #   epoch 1: 0, 3, 6, 9
        #   epoch 2: 2, 5, 8
        #   epoch 3: 1, 4, 7
        #   epoch 4: (same as epoch 1)
        # In this example, the offset in the first three epochs is 0, then 2, then 1.
        # The initial offset is always shard_rank, and the offset is recalculated in _end_epoch().
        self.offset = shard_rank

        if self.shuffle:
            assert shuffle_seed is not None
            self.rng = np.random.RandomState(shuffle_seed)
            self.rng.shuffle(self.indices)

        # Start in the correct epoch of shuffle.
        batches_to_skip = prior_batches_trained
        while len(self._this_epoch_indices()) <= batches_to_skip:
            batches_to_skip -= len(self._this_epoch_indices())
            self._end_epoch()

        self.offset += self.num_shards * batches_to_skip
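A standalone sketch that reproduces the offset rotation described in the comment above for length=10, num_shards=3, shard_rank=0. The update rule is an assumption chosen to match the documented sequence; it is not the class's actual _end_epoch() implementation.

length, num_shards, shard_rank = 10, 3, 0
offset = shard_rank
for epoch in range(1, 5):
    indices = list(range(offset, length, num_shards))
    print(f"epoch {epoch}: {indices}")
    # The next epoch starts where this one left off, wrapped around the dataset:
    # prints [0, 3, 6, 9], [2, 5, 8], [1, 4, 7], then [0, 3, 6, 9] again.
    offset = (offset + len(indices) * num_shards) % length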
Example #5
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs an PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(pytorch.PyTorchTrialContext, self.context)
        self.context.experimental._set_allgather_fn(self.allgather_metrics)
        self.callbacks = self.trial.build_callbacks()

        self._apply_backwards_compatibility()

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with wrap_model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your optimizer with wrap_optimizer()",
        )
        self._check_evaluate_implementation()

        # Validation loader will be undefined on process ranks > 0
        # when the user defines `validate_full_dataset()`.
        self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
        self._set_data_loaders()

        # We don't want the training_iterator shuffling values after we load state
        self.training_iterator = iter(self.training_loader)

        # If a load path is provided load weights and restore the data location.
        self._load()

        if self.hvd_config.use:
            hvd.broadcast_parameters(self.context._main_model.state_dict(),
                                     root_rank=0)
            for optimizer in self.context.optimizers:
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)
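For context, a hedged sketch of the standard Horovod startup sequence that the broadcast calls above mirror: after hvd.init(), rank 0's parameters and optimizer state are broadcast so every worker starts from identical state. The model and learning rate are illustrative stand-ins.

import torch
import torch.nn as nn
import horovod.torch as hvd

hvd.init()
model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters()
)

# Broadcast rank 0's weights and optimizer state to all other workers,
# as the controller does above after loading a checkpoint.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)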
Example #6
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs an PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(PyTorchTrialContext, self.context)
        self.callbacks = self.trial.build_callbacks()

        # TODO(DET-3262): remove this backward compatibility of old interface.
        if (util.is_overridden(self.trial.build_model, PyTorchTrial)
                or util.is_overridden(self.trial.optimizer, PyTorchTrial)
                or util.is_overridden(self.trial.create_lr_scheduler,
                                      PyTorchTrial)):
            check.true(
                util.is_overridden(self.trial.build_model, PyTorchTrial)
                and util.is_overridden(self.trial.optimizer, PyTorchTrial),
                "Both build_model() and optimizer() must be defined "
                "if any of build_model(), optimizer(), and create_lr_scheduler() are defined. "
                "If you want to use the new interface, you should instead instantiate your models, "
                "optimizers, and LR schedulers in __init__ and call context.backward(loss) "
                "and context.step_optimizer(optimizer) in train_batch.",
            )

            model = self.context._Model(self.trial.build_model())
            optim = self.context._Optimizer(self.trial.optimizer(model))

            lr_scheduler = self.trial.create_lr_scheduler(optim)
            if lr_scheduler is not None:
                self.context.lr_schedulers.append(lr_scheduler)

            if det.ExperimentConfig(self.context.get_experiment_config()
                                    ).mixed_precision_enabled():
                self.context._configure_apex_amp(
                    models=model,
                    optimizers=optim,
                    opt_level=self.context.get_experiment_config().get(
                        "optimizations", {}).get("mixed_precision", "O0"),
                )

            train_batch = self.trial.train_batch

            def new_train_batch(
                    batch: TorchData, model: nn.Module, epoch_idx: int,
                    batch_idx: int) -> Union[torch.Tensor, Dict[str, Any]]:
                tr_metrics = train_batch(batch, model, epoch_idx, batch_idx)
                if isinstance(tr_metrics, torch.Tensor):
                    tr_metrics = {"loss": tr_metrics}
                check.is_instance(
                    tr_metrics,
                    dict,
                    "train_batch() must return a dictionary "
                    f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
                )
                check.is_in("loss", tr_metrics.keys(),
                            'Please include "loss" in your training metrics.')

                def clip_grads(parameters: Iterator) -> None:
                    for callback in self.callbacks.values():
                        callback.on_before_optimizer_step(parameters)

                self.context._backward(tr_metrics["loss"])
                self.context._step_optimizer(self.context.optimizers[0],
                                             clip_grads=clip_grads)
                return tr_metrics

            self.trial.__setattr__("train_batch", new_train_batch)

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with Model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your model with Optimizer()",
        )
        self._check_evaluate_implementation()

        # Validation loader will be undefined on process ranks > 0
        # when the user defines `validate_full_dataset()`.
        self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
        self._set_data_loaders()

        # If a load path is provided load weights and restore the data location.
        self._load()

        if self.hvd_config.use:
            hvd.broadcast_parameters(self.context._main_model.state_dict(),
                                     root_rank=0)
            for optimizer in self.context.optimizers:
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        self.training_iterator = iter(self.training_loader)
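A hedged sketch of the legacy trial interface that the compatibility block above shims: build_model(), optimizer(), and the old train_batch(batch, model, epoch_idx, batch_idx) signature returning a loss tensor. The class body is illustrative, not taken from the source.

import torch
import torch.nn as nn

class OldStyleTrial(PyTorchTrial):
    def build_model(self) -> nn.Module:
        return nn.Linear(10, 1)

    def optimizer(self, model: nn.Module) -> torch.optim.Optimizer:
        return torch.optim.SGD(model.parameters(), lr=0.01)

    def train_batch(self, batch, model, epoch_idx, batch_idx):
        data, labels = batch
        # Returning a bare tensor is accepted; new_train_batch() above converts it
        # to {"loss": tensor} and performs the backward/optimizer step on the trial's behalf.
        return nn.functional.mse_loss(model(data), labels)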