def __init__(self, params, optimizer, optimizer_params=None, root_rank=0):
    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    # Flatten a ParameterDict into a list with a deterministic (sorted) ordering.
    param_list = []
    if isinstance(params, mx.gluon.ParameterDict):
        for key in sorted(list(params.keys())):
            param_list.append(params[key])

    super(DistributedTrainer, self).__init__(
        param_list, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by BytePS size, which is equivalent to performing
    # average in push_pull, has better performance.
    self._scale /= size()
    self.root_rank = root_rank

    # Declare each parameter tensor to BytePS, and its gradient tensor when the
    # parameter actually produces a gradient.
    for i, param in enumerate(self._params):
        byteps_declare_tensor("parameter_" + str(i))
        if param.grad_req != 'null':
            byteps_declare_tensor("gradient_" + str(i))
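# A minimal usage sketch (an assumption for illustration, not part of this file):
# the trainer is built from a Gluon ParameterDict after BytePS is initialized,
# just like a regular mx.gluon.Trainer would be. The model, context, and
# optimizer settings below are placeholders.
import mxnet as mx
import byteps.mxnet as bps

bps.init()
net = mx.gluon.nn.Dense(10)
net.initialize(mx.init.Xavier(), ctx=mx.cpu())
trainer = bps.DistributedTrainer(net.collect_params(), "sgd",
                                 optimizer_params={"learning_rate": 0.01})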
def __init__(self, params, optimizer, optimizer_params=None):
    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    super(DistributedTrainer, self).__init__(
        params, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by BytePS size, which is equivalent to performing
    # average in push_pull, has better performance.
    self._scale /= size()
def __init__(self, params, optimizer, optimizer_params=None, root_rank=0,
             compression_params=None):
    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    # Flatten a ParameterDict into a list with a deterministic (sorted) ordering.
    param_list = []
    if isinstance(params, mx.gluon.ParameterDict):
        for key in sorted(list(params.keys())):
            param_list.append(params[key])

    # Build the intra-node compressor from the user-supplied compression settings.
    self._intra_compressor = self._register_compressor(
        params, optimizer_params, compression_params)

    super(DistributedTrainer, self).__init__(
        param_list, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # Only the first worker on each node keeps this 8-byte file ("lr.s"),
    # which is used to publish the current learning rate.
    if local_rank() == 0:
        self._f = open("lr.s", "wb")
        self._f.truncate(8)

    self._bps_size = size()
    self.root_rank = root_rank

    # Declare each parameter tensor to BytePS and give every parameter its own
    # copy of the intra-node compressor. Gradient tensors are declared together
    # with any "byteps_*" attributes attached to the parameter.
    self._intra_compressors = {}
    for i, param in enumerate(self._params):
        byteps_declare_tensor("parameter_" + str(i))
        self._intra_compressors[param.name] = copy.deepcopy(
            self._intra_compressor)
        if param.grad_req != 'null':
            byteps_params = dict(
                filter(lambda attr: attr[0].startswith("byteps_"),
                       param.__dict__.items()))
            byteps_declare_tensor("gradient_" + str(i), **byteps_params)
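# A hedged sketch of how the compression-enabled variant might be used
# (assumption, not from this file): compression_params is forwarded to
# _register_compressor, and the exact keys it accepts depend on the BytePS
# version. The dict below is purely illustrative.
import mxnet as mx
import byteps.mxnet as bps

bps.init()
net = mx.gluon.nn.Dense(10)
net.initialize(mx.init.Xavier(), ctx=mx.cpu())
compression_params = {"compressor": "onebit"}  # illustrative key/value only
trainer = bps.DistributedTrainer(net.collect_params(), "sgd",
                                 optimizer_params={"learning_rate": 0.01},
                                 compression_params=compression_params)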