Example #1
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs an PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(pytorch.PyTorchTrialContext, self.context)
        self.callbacks = self.trial.build_callbacks()

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with wrap_model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your optimizer with wrap_optimizer()",
        )
        self._check_evaluate_implementation()

        # Validation loader will be undefined on process ranks > 0
        # when the user defines `validate_full_dataset()`.
        self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
        self._set_data_loaders()
Example #2
    def _average_gradients(self, parameters: Any, divisor: int) -> None:
        check.gt_eq(divisor, 1)
        if divisor == 1:
            return

        divisor_value = float(divisor)
        for p in filter(lambda param: param.grad is not None, parameters):
            p.grad.data.div_(divisor_value)
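
As a hedged, standalone illustration of what the snippet above does, here is the same gradient-averaging logic restated outside the controller class; the toy model and the divisor of 4 are made up for the example.

import torch


def average_gradients(parameters, divisor: int) -> None:
    # Restates the snippet above: divide every existing gradient in place.
    assert divisor >= 1
    if divisor == 1:
        return
    divisor_value = float(divisor)
    for p in (param for param in parameters if param.grad is not None):
        p.grad.data.div_(divisor_value)


model = torch.nn.Linear(4, 2)
model(torch.randn(8, 4)).sum().backward()
# After an all-reduce that summed gradients across 4 workers, dividing by 4
# recovers the mean gradient.
average_gradients(model.parameters(), divisor=4)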
Example #3
    def wrap_optimizer(
        self,
        optimizer: torch.optim.Optimizer,
        backward_passes_per_step: int = 1,
    ) -> torch.optim.Optimizer:
        """Returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`wrap_model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.

        ``backward_passes_per_step`` can be used to specify how many gradient aggregation
        steps will be performed in a single ``train_batch`` call per optimizer step.
        In most cases, this will just be the default value 1. However, this advanced functionality
        can be used to support training loops like the one shown below:

        .. code-block:: python

            def train_batch(
                self, batch: TorchData, epoch_idx: int, batch_idx: int
            ) -> Dict[str, torch.Tensor]:
                data, labels = batch
                output = self.model(data)
                loss1 = output['loss1']
                loss2 = output['loss2']
                self.context.backward(loss1)
                self.context.backward(loss2)
                self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
                return {"loss1": loss1, "loss2": loss2}

        """
        if self.env.managed_training:
            check.false(
                self._use_apex,
                "Must call wrap_optimizer() before configure_apex_amp.")
            check.gt_eq(
                backward_passes_per_step,
                1,
                "backward_passes_per_step for local gradient aggregation must be >= 1",
            )

            if self.distributed.size > 1 and self._distributed_backend.use_horovod():
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=self._filter_named_parameters(optimizer),
                    backward_passes_per_step=backward_passes_per_step * self._aggregation_frequency,
                    compression=hvd.Compression.fp16 if self._fp16_compression else hvd.Compression.none,
                )
                logging.debug(
                    "Initialized optimizer for distributed and optimized parallel training."
                )

        self.optimizers.append(optimizer)
        return optimizer
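
For context, here is a minimal sketch of how wrap_model() and wrap_optimizer() are typically called from a Determined PyTorchTrial; the trial class name, the network, and the hyperparameters are hypothetical, and the other required trial methods are elided.

import torch
from determined import pytorch


class MyTrial(pytorch.PyTorchTrial):  # hypothetical trial class
    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
        self.context = context
        # Wrap the model first; the optimizer must be built from the wrapped model's parameters.
        self.model = self.context.wrap_model(torch.nn.Linear(10, 1))
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )

    # build_training_data_loader(), build_validation_data_loader(),
    # train_batch(), and evaluate_batch() are elided here.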
Example #4
    def load(self, path: pathlib.Path) -> None:
        self.chaos_failure(self.chaos_probability_checkpoint)
        time.sleep(self.load_secs)
        fpath = path.joinpath(self.CHECKPOINT_FILENAME)
        with fpath.open("r") as f:
            jbody = {int(k): v for k, v in json.load(f).items()}
            for k, v in jbody.items():
                check.gt_eq(k, 0)
                check.is_type(v, int)
                check.gt_eq(v, 0)
            self.trained_steps = collections.Counter(jbody)
            logging.info("Loaded checkpoint {}, steps_trained {}".format(
                fpath, self.steps_trained()))
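
A hypothetical save counterpart, shown standalone, that would produce the JSON this load() expects; the filename value and function name are assumptions, not part of the snippet above.

import collections
import json
import pathlib

CHECKPOINT_FILENAME = "trained_steps.json"  # assumed value for self.CHECKPOINT_FILENAME


def save_trained_steps(trained_steps: collections.Counter, path: pathlib.Path) -> None:
    # JSON object keys must be strings, which is why load() converts them back with int(k).
    with path.joinpath(CHECKPOINT_FILENAME).open("w") as f:
        json.dump({str(k): int(v) for k, v in trained_steps.items()}, f)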
Example #5
    def __init__(
        self,
        length: int,
        shard_rank: int,
        num_shards: int,
        shuffle: bool,
        shuffle_seed: int,
        prior_batches_trained: int,
    ) -> None:
        self.indices = list(range(length))
        self.num_shards = num_shards
        self.shuffle = shuffle

        check.gt_eq(
            length,
            num_shards,
            "please provide a Sequence that has at least as many batches as the number of slots "
            "used for training",
        )

        # Each shard has a certain offset from which it yields data.  When the dataset length is
        # not evenly divisible by the shard size, that offset will change every epoch.
        # Example:
        #   let length=10, shard_rank=0, and num_shards=3:
        #   epoch 1: 0, 3, 6, 9
        #   epoch 2: 2, 5, 8
        #   epoch 3: 1, 4, 7
        #   epoch 4: (same as epoch 1)
        # In this example, the offset in the first three epochs is 0, then 2, then 1.
        # The initial offset is always shard_rank, and the offset is recalculated in _end_epoch().
        self.offset = shard_rank

        if self.shuffle:
            assert shuffle_seed is not None
            self.rng = np.random.RandomState(shuffle_seed)
            self.rng.shuffle(self.indices)

        # Start in the correct epoch of the shuffle.
        batches_to_skip = prior_batches_trained
        while len(self._this_epoch_indices()) <= batches_to_skip:
            batches_to_skip -= len(self._this_epoch_indices())
            self._end_epoch()

        self.offset += self.num_shards * batches_to_skip
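
The constructor above relies on _this_epoch_indices() and _end_epoch(), which are not shown. Below is one plausible reading of them, written as standalone functions and consistent with the offset walk in the comment (0, then 2, then 1 for length=10 and num_shards=3); the real implementations may differ.

from typing import List


def this_epoch_indices(indices: List[int], offset: int, num_shards: int) -> List[int]:
    # Every num_shards-th index, starting at this shard's current offset.
    return indices[offset::num_shards]


def next_offset(offset: int, length: int, num_shards: int) -> int:
    # Advance past the end of the dataset and wrap around; with length=10 and
    # num_shards=3 this walks the offset 0 -> 2 -> 1 -> 0, matching the comment.
    return (offset - length) % num_shards


assert this_epoch_indices(list(range(10)), offset=0, num_shards=3) == [0, 3, 6, 9]
assert next_offset(0, length=10, num_shards=3) == 2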
Example #6
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs an PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(pytorch.PyTorchTrialContext, self.context)
        self.context._set_determined_profiler(self.prof)
        if torch.cuda.is_available():
            self.prof._set_sync_device(self._sync_device)
        self.callbacks = self.trial.build_callbacks()

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with wrap_model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your optimizer with wrap_optimizer()",
        )
        self._check_evaluate_implementation()

        self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core,
                self.env,
                self.context.get_global_batch_size(),
            )

        self.steps_completed = self.env.steps_completed

        # Currently only horovod and torch backends are supported for distributed training
        if self.context.distributed.size > 1:
            assert (
                self.use_horovod or self.use_torch
            ), "Must use horovod or torch for distributed training"
Example #7
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs an PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(pytorch.PyTorchTrialContext, self.context)
        self.context.experimental._set_allgather_fn(self.allgather_metrics)
        self.callbacks = self.trial.build_callbacks()

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with wrap_model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your optimizer with wrap_optimizer()",
        )
        self._check_evaluate_implementation()

        # Validation loader will be undefined on process ranks > 0
        # when the user defines `validate_full_dataset()`.
        self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
        self._set_data_loaders()

        # We don't want the training_iterator shuffling values after we load state
        self.training_iterator = iter(self.training_loader)

        # If a load path is provided load weights and restore the data location.
        self._load()

        if self.hvd_config.use:
            hvd.broadcast_parameters(self.context._main_model.state_dict(),
                                     root_rank=0)
            for optimizer in self.context.optimizers:
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)