def __init__(self, params, optimizer, optimizer_params=None,
             compression=Compression.none,
             gradient_predivide_factor=1.0, prefix=None,
             num_groups=0):
    self._compression = compression

    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn("DistributedTrainer does not take DistributedOptimizer "
                      "as its optimizer. We have unwrapped it for you.")

    # To ensure consistent parameter ordering across workers, sort params before
    # passing to base Trainer constructor. This logic is consistent with trainer.py
    # since v1.6 but we do it here for backwards compatibility
    if isinstance(params, dict):
        params = OrderedDict(params)
    elif isinstance(params, (list, tuple)):
        params = sorted(params)

    super(DistributedTrainer, self).__init__(
        params, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by Horovod size, which is equivalent to performing
    # average in allreduce, has better performance.
    self._scale *= (gradient_predivide_factor / size())
    self._gradient_predivide_factor = gradient_predivide_factor
    assert prefix is None or isinstance(prefix, str)
    self._prefix = prefix if prefix else ""
    self._num_groups = num_groups
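# A minimal usage sketch (illustrative, not part of this module), assuming a
# typical Horovod + Gluon training script; the network and optimizer choices
# below are assumptions for the example:
#
#   import mxnet as mx
#   import horovod.mxnet as hvd
#
#   hvd.init()
#   net = mx.gluon.nn.Dense(10)
#   net.initialize()
#   params = net.collect_params()
#   trainer = hvd.DistributedTrainer(
#       params, 'sgd', optimizer_params={'learning_rate': 0.01})
#   hvd.broadcast_parameters(params, root_rank=0)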
def __init__(self, params, optimizer, optimizer_params=None,
             gradient_predivide_factor=1.0, prefix=None):
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    super(DistributedTrainer, self).__init__(
        params, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by Horovod size, which is equivalent to performing
    # average in allreduce, has better performance.
    self._scale *= (gradient_predivide_factor / size())
    self._gradient_predivide_factor = gradient_predivide_factor
    assert prefix is None or isinstance(prefix, str)
    self._prefix = prefix if prefix else ""
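# Worked example of the scaling above (illustrative; assumes the allreduce
# step pre-divides each gradient by gradient_predivide_factor before the
# cross-worker sum): with size() == 4 and gradient_predivide_factor == 2.0,
# gradients are divided by 2.0 before the summing allreduce, and rescale_grad
# absorbs the remaining 2.0 / 4 == 0.5, so the net effect is sum / 4, i.e.
# an average across the 4 workers.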
def __init__(self, optimizer, gradient_predivide_factor=1.0):
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    self._optimizer = optimizer
    # Normalizing rescale_grad by Horovod size, which is equivalent to
    # performing average in allreduce, has better performance.
    self._optimizer.rescale_grad *= (gradient_predivide_factor / size())
    self._gradient_predivide_factor = gradient_predivide_factor
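# A minimal usage sketch (illustrative, not part of this module) for wrapping
# a plain MXNet optimizer; the 'sgd' choice and the learning-rate scaling by
# size() are assumptions for the example:
#
#   import mxnet as mx
#   import horovod.mxnet as hvd
#
#   hvd.init()
#   opt = mx.optimizer.create('sgd', learning_rate=0.01 * hvd.size())
#   opt = hvd.DistributedOptimizer(opt)
#   # opt can now be passed wherever a regular mx.optimizer.Optimizer is
#   # expected; gradients are allreduced across workers before each update.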