def __init__(self, params, optimizer, optimizer_params=None, root_rank=0):
    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    param_list = []
    if isinstance(params, mx.gluon.ParameterDict):
        for key in sorted(list(params.keys())):
            param_list.append(params[key])

    super(DistributedTrainer, self).__init__(
        param_list, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for the optimizer in
    # Trainer.step(). Normalizing it by the BytePS size, which is equivalent
    # to averaging in push_pull, has better performance.
    self._scale /= size()
    self.root_rank = root_rank
    for i, param in enumerate(self._params):
        byteps_declare_tensor("parameter_" + str(i))
        if param.grad_req != 'null':
            byteps_declare_tensor("gradient_" + str(i))
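# Usage sketch (illustrative only; assumes the usual BytePS entry points
# bps.init() and bps.local_rank() are exposed alongside DistributedTrainer
# in byteps.mxnet):
#
#   import mxnet as mx
#   import byteps.mxnet as bps
#
#   bps.init()
#   net = mx.gluon.nn.Dense(10)
#   net.initialize(mx.init.Xavier(), ctx=mx.gpu(bps.local_rank()))
#   trainer = bps.DistributedTrainer(net.collect_params(), "sgd",
#                                    {"learning_rate": 0.01})
#
# Because _scale is pre-divided by size(), trainer.step(batch_size) applies
# gradients averaged across workers without an extra division step.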
def _do_push_pull(self, index, grad):
    # priority=-index gives lower-indexed parameters higher communication
    # priority in the BytePS scheduler.
    if isinstance(index, (tuple, list)):
        for i in range(len(index)):
            byteps_declare_tensor(grad[i], "gradient_" + str(index[i]))
            byteps_push_pull(grad[i], version=0, priority=-index[i],
                             name="gradient_" + str(index[i]), is_average=True)
    else:
        byteps_declare_tensor(grad, "gradient_" + str(index))
        byteps_push_pull(grad, version=0, priority=-index,
                         name="gradient_" + str(index), is_average=True)
def _allreduce_grads(self):
    # is_average=False here: __init__ already divides self._scale by size(),
    # so the summed gradients are effectively averaged when the optimizer
    # rescales them.
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            byteps_declare_tensor(param.list_grad()[0], "gradient_" + str(i))
            byteps_push_pull(param.list_grad()[0], is_average=False,
                             name="gradient_" + str(i), priority=-i)
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.
    """
    global parameter_index
    tensors = []
    if isinstance(params, dict):
        tensors = [p for _, p in sorted(params.items())]
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        for _, p in sorted(params.items()):
            try:
                tensors.append(p.data())
            except mx.gluon.parameter.DeferredInitializationError:
                # Inject a wrapper method with a post-initialization broadcast
                # to handle parameters with deferred initialization. p.data()
                # is not available yet here, so declaring and broadcasting the
                # tensor is left to the injected initializer, which receives
                # the reserved parameter_index.
                new_init = _append_broadcast_init(p, root_rank, parameter_index)
                parameter_index += 1
                p._init_impl = types.MethodType(new_init, p)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run tensor initialization
    for i in range(len(tensors)):
        byteps_declare_tensor(tensors[i], "parameter_" + str(parameter_index))
        # Broadcast is implemented as push + pull in BytePS.
        # To broadcast: zero out all non-root tensors and disable push_pull
        # averaging, so the summed result equals the root's tensor everywhere.
        if rank() != root_rank:
            tensors[i].__imul__(0)
        byteps_push_pull(tensors[i], version=0, priority=0,
                         name="parameter_" + str(parameter_index),
                         is_average=False)
        parameter_index += 1

    # Make sure tensors pushed to the MXNet engine get processed so that all
    # workers are synced before starting training.
    for tensor in tensors:
        tensor.wait_to_read()
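# Why zeroing non-root tensors plus a summed push_pull equals a broadcast
# (toy sketch; plain Python lists stand in for one tensor on three workers):
#
#   root = [1.0, 2.0]   # rank 0 keeps its values
#   w1 = [0.0, 0.0]     # rank 1 zeroed out
#   w2 = [0.0, 0.0]     # rank 2 zeroed out
#   summed = [a + b + c for a, b, c in zip(root, w1, w2)]
#   assert summed == [1.0, 2.0]  # every rank ends up with the root's tensor
#
# With is_average=False the reduction is a plain sum, so the zeroed non-root
# contributions leave only the root's values in place everywhere.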
def _do_push_pull_param(self, index, delta_weight):
    if isinstance(index, (tuple, list)):
        for i in range(len(index)):
            byteps_declare_tensor("weight_" + str(index[i]))
            byteps_push_pull(delta_weight[i], version=0, priority=-index[i],
                             name="weight_" + str(index[i]), is_average=False)
    else:
        byteps_declare_tensor("weight_" + str(index))
        byteps_push_pull(delta_weight, version=0, priority=-index,
                         name="weight_" + str(index), is_average=False)
def __init__(self, params, optimizer, optimizer_params=None, root_rank=0,
             compression_params=None):
    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    param_list = []
    if isinstance(params, mx.gluon.ParameterDict):
        for key in sorted(list(params.keys())):
            param_list.append(params[key])

    self._intra_compressor = self._register_compressor(
        params, optimizer_params, compression_params)

    super(DistributedTrainer, self).__init__(
        param_list, optimizer, optimizer_params=optimizer_params, kvstore=None)

    if local_rank() == 0:
        # 8-byte scratch file; appears to share a scalar (presumably the
        # learning rate, given the name) with the compression backend.
        self._f = open("lr.s", "wb")
        self._f.truncate(8)

    self._bps_size = size()
    self.root_rank = root_rank
    self._intra_compressors = {}
    for i, param in enumerate(self._params):
        byteps_declare_tensor("parameter_" + str(i))
        self._intra_compressors[param.name] = copy.deepcopy(
            self._intra_compressor)
        if param.grad_req != 'null':
            byteps_params = dict(
                filter(lambda attr: attr[0].startswith("byteps_"),
                       param.__dict__.items()))
            byteps_declare_tensor("gradient_" + str(i), **byteps_params)
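# Usage sketch for the compression-enabled trainer (illustrative; the
# compression_params keys below are assumptions, not a confirmed schema):
#
#   compression_params = {"compressor": "onebit"}  # hypothetical settings
#   trainer = bps.DistributedTrainer(net.collect_params(), "sgd",
#                                    {"learning_rate": 0.01},
#                                    compression_params=compression_params)
#
# Per-parameter attributes whose names start with "byteps_" are forwarded to
# byteps_declare_tensor for the matching gradient, which is how per-tensor
# compression settings reach the backend.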
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()`.

    Arguments:
        params: dict of parameters to broadcast
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.
    """
    global parameter_index

    if isinstance(params, dict):
        tensors = [p for _, p in sorted(params.items())]

        # Run tensor initialization
        for i in range(len(tensors)):
            byteps_declare_tensor(tensors[i],
                                  "parameter_" + str(parameter_index))
            # Broadcast is implemented as push + pull in BytePS.
            # To broadcast: zero out all non-root tensors and disable
            # push_pull averaging.
            if rank() != root_rank:
                tensors[i].__imul__(0)
            byteps_push_pull(tensors[i], version=0, priority=0,
                             name="parameter_" + str(parameter_index),
                             is_average=False)
            parameter_index += 1

        # Make sure tensors pushed to the MXNet engine get processed so that
        # all workers are synced before starting training.
        for tensor in tensors:
            tensor.wait_to_read()
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        raise TypeError("For gluon users, you should not call this function. "
                        "DistributedTrainer will broadcast all parameters at "
                        "the first training step.")
    else:
        raise ValueError('Invalid params of type: %s' % type(params))
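# Usage sketch for the Module API (illustrative; assumes a bound and
# initialized mx.mod.Module named `mod`):
#
#   arg_params, aux_params = mod.get_params()
#   bps.broadcast_parameters(arg_params, root_rank=0)
#   bps.broadcast_parameters(aux_params, root_rank=0)
#
# The NDArrays are updated in place, so the module sees the broadcast values.
# Gluon users should not call this function: DistributedTrainer broadcasts
# all parameters lazily at the first training step (see _init_params below).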
def _init_params(self):
    # Parameters still pending deferred initialization are kept for a later
    # pass; the rest are broadcast from the root rank using the same
    # zero-out + summed push_pull trick as broadcast_parameters.
    tensors = []
    for param in self._params_to_init:
        if param._deferred_init:
            tensors.append(param)
        else:
            param_arrays = param._check_and_get(param._data, list)
            idx = self._param2idx[param.name]
            byteps_declare_tensor(param_arrays[0], "parameter_" + str(idx))
            if rank() != self.root_rank:
                param_arrays[0].__imul__(0)
            byteps_push_pull(param_arrays[0], version=0, priority=0,
                             name="parameter_" + str(idx), is_average=False)
            param_arrays[0].wait_to_read()
    self._params_to_init = tensors