def __init__(self, trial_inst: det.Trial, *args: Any, **kwargs: Any) -> None:
    """Build the controller around a user-provided ``PyTorchTrial``.

    ``*args`` and ``**kwargs`` are forwarded unchanged to the parent
    initializer. After that the trial instance is type-checked, its
    callbacks are built, the context is checked for at least one model and
    one optimizer, and ``_set_data_loaders()`` is called.

    Args:
        trial_inst: The trial to drive; must be a ``PyTorchTrial``.
    """
    super().__init__(*args, **kwargs)

    check.is_instance(
        trial_inst,
        PyTorchTrial,
        "PyTorchTrialController needs an PyTorchTrial",
    )
    self.trial = cast(PyTorchTrial, trial_inst)

    # cast() only narrows the static type; the context object is unchanged.
    ctx = cast(pytorch.PyTorchTrialContext, self.context)
    self.context = ctx

    self.callbacks = self.trial.build_callbacks()

    # The error messages point at the usual cause: forgetting the wrap_*()
    # calls on the context.
    model_count = len(ctx.models)
    check.gt_eq(
        model_count,
        1,
        "Must have at least one model. "
        "This might be caused by not wrapping your model with wrap_model()",
    )
    optimizer_count = len(ctx.optimizers)
    check.gt_eq(
        optimizer_count,
        1,
        "Must have at least one optimizer. "
        "This might be caused by not wrapping your optimizer with wrap_optimizer()",
    )

    self._check_evaluate_implementation()

    # On process ranks > 0 the validation loader stays undefined when the
    # user defines `validate_full_dataset()`.
    self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
    self._set_data_loaders()
def _average_gradients(self, parameters: Any, divisor: int) -> None:
    """Divide the gradient of each parameter by ``divisor``, in place.

    Parameters whose ``grad`` is ``None`` are skipped.

    Args:
        parameters: Iterable of objects exposing a ``grad`` attribute.
        divisor: Value to divide by; checked to be >= 1.
    """
    check.gt_eq(divisor, 1)

    # Dividing by one would change nothing, so skip the loop entirely.
    if divisor == 1:
        return

    scale = float(divisor)
    for param in parameters:
        if param.grad is None:
            continue
        param.grad.data.div_(scale)
def wrap_optimizer(
    self,
    optimizer: torch.optim.Optimizer,
    backward_passes_per_step: int = 1,
) -> torch.optim.Optimizer:
    """Register an optimizer with this context and return it, wrapped if needed.

    The optimizer must use the models wrapped by :meth:`wrap_model`. When
    training is distributed (more than one process) and the Horovod backend
    is in use, the optimizer is wrapped in a ``horovod.DistributedOptimizer``;
    otherwise it is returned as given. In both cases the returned optimizer
    is appended to ``self.optimizers``.

    `backward_passes_per_step` specifies how many gradient aggregation steps
    are performed in a single `train_batch` call per optimizer step. The
    default of 1 fits most cases. Larger values support advanced training
    loops such as the one below:

    .. code-block:: python

        def train_batch(
            self, batch: TorchData, epoch_idx: int, batch_idx: int
        ) -> Dict[str, torch.Tensor]:
            data, labels = batch
            output = self.model(data)
            loss1 = output['loss1']
            loss2 = output['loss2']
            self.context.backward(loss1)
            self.context.backward(loss2)
            self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
            return {"loss1": loss1, "loss2": loss2}

    """
    if self.env.managed_training:
        check.false(
            self._use_apex,
            "Must call wrap_optimizer() before configure_apex_amp.",
        )
        check.gt_eq(
            backward_passes_per_step,
            1,
            "backward_passes_per_step for local gradient aggregation must be >= 1",
        )

    horovod_active = (
        self.distributed.size > 1 and self._distributed_backend.use_horovod()
    )
    if horovod_active:
        # Arguments are computed in the order the wrapper receives them.
        named_params = self._filter_named_parameters(optimizer)
        # Horovod counts every backward pass, so the per-step count is
        # scaled by the aggregation frequency.
        total_passes = backward_passes_per_step * self._aggregation_frequency
        if self._fp16_compression:
            compression = hvd.Compression.fp16
        else:
            compression = hvd.Compression.none

        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=named_params,
            backward_passes_per_step=total_passes,
            compression=compression,
        )
        logging.debug(
            "Initialized optimizer for distributed and optimized parallel training."
        )

    self.optimizers.append(optimizer)
    return optimizer
def load(self, path: pathlib.Path) -> None:
    """Restore ``self.trained_steps`` from the checkpoint file under ``path``.

    The file ``path / CHECKPOINT_FILENAME`` is read as a JSON object. Its
    keys are converted to ``int`` and every key/value pair is validated:
    keys must be >= 0 and values must be ints >= 0. The result is stored as
    a ``collections.Counter``.

    Args:
        path: Directory containing the checkpoint file.
    """
    # NOTE(review): presumably injects a simulated failure with the given
    # probability -- confirm against chaos_failure().
    self.chaos_failure(self.chaos_probability_checkpoint)
    # Wait for the configured load duration before reading anything.
    time.sleep(self.load_secs)

    checkpoint_file = path.joinpath(self.CHECKPOINT_FILENAME)
    with checkpoint_file.open("r") as handle:
        # JSON object keys are always strings; turn them back into ints
        # before any validation runs.
        restored = {int(key): value for key, value in json.load(handle).items()}

        for key, value in restored.items():
            check.gt_eq(key, 0)
            check.is_type(value, int)
            check.gt_eq(value, 0)

        self.trained_steps = collections.Counter(restored)

        message = "Loaded checkpoint {}, steps_trained {}".format(
            checkpoint_file, self.steps_trained()
        )
        logging.info(message)
def __init__(
    self,
    length: int,
    shard_rank: int,
    num_shards: int,
    shuffle: bool,
    shuffle_seed: int,
    prior_batches_trained: int,
) -> None:
    """Set up index bookkeeping for one shard and fast-forward to a resume point.

    Args:
        length: Number of indices; the index list is ``0 .. length - 1``.
        shard_rank: Initial offset of this shard into the index list.
        num_shards: Total number of shards; must not exceed ``length``.
        shuffle: Whether to shuffle the index list.
        shuffle_seed: Seed for the shuffling RNG; must not be ``None`` when
            ``shuffle`` is true.
        prior_batches_trained: Number of batches already consumed by this
            shard, used to skip ahead to the right epoch and position.
    """
    self.indices = [*range(length)]
    self.num_shards = num_shards
    self.shuffle = shuffle

    check.gt_eq(
        length,
        num_shards,
        "please provide a Sequence that has at least as many batches as the number of slots "
        "used for training",
    )

    # Every shard yields data starting from its own offset. If the dataset
    # length is not evenly divisible by the number of shards, the offset
    # changes from epoch to epoch. For length=10, shard_rank=0, num_shards=3:
    #     epoch 1: 0, 3, 6, 9
    #     epoch 2: 2, 5, 8
    #     epoch 3: 1, 4, 7
    #     epoch 4: (same as epoch 1)
    # so the offsets of the first three epochs are 0, 2, 1. The first offset
    # is always shard_rank; _end_epoch() recalculates it afterwards.
    self.offset = shard_rank

    if self.shuffle:
        assert shuffle_seed is not None
        self.rng = np.random.RandomState(shuffle_seed)
        self.rng.shuffle(self.indices)

    # Consume whole epochs until the remaining skip count falls inside the
    # current epoch, so that shuffling resumes in the correct epoch.
    remaining = prior_batches_trained
    while remaining >= len(self._this_epoch_indices()):
        remaining = remaining - len(self._this_epoch_indices())
        self._end_epoch()

    # Each skipped batch moves this shard forward by one stride.
    self.offset = self.offset + self.num_shards * remaining
def __init__(self, trial_inst: det.Trial, *args: Any, **kwargs: Any) -> None:
    """Build the controller around a user-provided ``PyTorchTrial``.

    ``*args`` and ``**kwargs`` are forwarded unchanged to the parent
    initializer. After that the trial instance is type-checked, the profiler
    is attached to the context, callbacks are built, the context is checked
    for at least one model and one optimizer, compatibility workloads are
    created when none were supplied, and the distributed backend is checked.

    Args:
        trial_inst: The trial to drive; must be a ``PyTorchTrial``.
    """
    super().__init__(*args, **kwargs)

    check.is_instance(
        trial_inst,
        PyTorchTrial,
        "PyTorchTrialController needs an PyTorchTrial",
    )
    self.trial = cast(PyTorchTrial, trial_inst)

    # cast() only narrows the static type; the context object is unchanged.
    ctx = cast(pytorch.PyTorchTrialContext, self.context)
    self.context = ctx

    ctx._set_determined_profiler(self.prof)
    # A device sync hook is only registered when CUDA is available.
    if torch.cuda.is_available():
        self.prof._set_sync_device(self._sync_device)

    self.callbacks = self.trial.build_callbacks()

    # The error messages point at the usual cause: forgetting the wrap_*()
    # calls on the context.
    model_count = len(ctx.models)
    check.gt_eq(
        model_count,
        1,
        "Must have at least one model. "
        "This might be caused by not wrapping your model with wrap_model()",
    )
    optimizer_count = len(ctx.optimizers)
    check.gt_eq(
        optimizer_count,
        1,
        "Must have at least one optimizer. "
        "This might be caused by not wrapping your optimizer with wrap_optimizer()",
    )

    self._check_evaluate_implementation()

    self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
    if self.workloads is None:
        # No workload stream was supplied: build a compatibility one, which
        # also yields the sequencer stored in self.wlsq.
        compat = layers.make_compatibility_workloads(
            ctx._core,
            self.env,
            ctx.get_global_batch_size(),
        )
        self.workloads, self.wlsq = compat

    self.steps_completed = self.env.steps_completed

    # Distributed training currently supports the horovod and torch backends only.
    if ctx.distributed.size > 1:
        assert (
            self.use_horovod or self.use_torch
        ), "Must use horovod or torch for distributed training"
def __init__(self, trial_inst: det.Trial, *args: Any, **kwargs: Any) -> None:
    """Build the controller around a user-provided ``PyTorchTrial``.

    ``*args`` and ``**kwargs`` are forwarded unchanged to the parent
    initializer. After that the trial instance is type-checked, the
    allgather function is registered on the context, callbacks are built,
    the context is checked for at least one model and one optimizer, the
    data loaders and training iterator are created, state is loaded, and,
    when Horovod is in use, parameters and optimizer state are broadcast
    from rank 0.

    Args:
        trial_inst: The trial to drive; must be a ``PyTorchTrial``.
    """
    super().__init__(*args, **kwargs)

    check.is_instance(
        trial_inst,
        PyTorchTrial,
        "PyTorchTrialController needs an PyTorchTrial",
    )
    self.trial = cast(PyTorchTrial, trial_inst)

    # cast() only narrows the static type; the context object is unchanged.
    ctx = cast(pytorch.PyTorchTrialContext, self.context)
    self.context = ctx

    ctx.experimental._set_allgather_fn(self.allgather_metrics)
    self.callbacks = self.trial.build_callbacks()

    # The error messages point at the usual cause: forgetting the wrap_*()
    # calls on the context.
    model_count = len(ctx.models)
    check.gt_eq(
        model_count,
        1,
        "Must have at least one model. "
        "This might be caused by not wrapping your model with wrap_model()",
    )
    optimizer_count = len(ctx.optimizers)
    check.gt_eq(
        optimizer_count,
        1,
        "Must have at least one optimizer. "
        "This might be caused by not wrapping your optimizer with wrap_optimizer()",
    )

    self._check_evaluate_implementation()

    # On process ranks > 0 the validation loader stays undefined when the
    # user defines `validate_full_dataset()`.
    self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
    self._set_data_loaders()

    # Create the training iterator before loading state, so that it does not
    # shuffle values after the state has been restored.
    self.training_iterator = iter(self.training_loader)

    # When a load path is provided, this restores the weights and the
    # position in the data.
    self._load()

    if self.hvd_config.use:
        # Sync model parameters and optimizer state from rank 0.
        main_state = ctx._main_model.state_dict()
        hvd.broadcast_parameters(main_state, root_rank=0)
        for opt in ctx.optimizers:
            hvd.broadcast_optimizer_state(opt, root_rank=0)