def _allreduce_grads(self):
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            if param.list_grad()[0].stype == 'default':
                allreduce_(param.list_grad()[0], average=True,
                           name=str(i), priority=-i)
            else:
                # row_sparse gradients are allreduced through a dense buffer:
                # cast to dense, allreduce, cast back to row_sparse.
                if i not in self._hvd_param_buf:
                    self._hvd_param_buf[i] = mx.nd.zeros(
                        param.list_grad()[0].shape,
                        param.list_grad()[0].context,
                        dtype=param.list_grad()[0].dtype)
                param_dense = self._hvd_param_buf[i]
                mx.nd.sparse.cast_storage(param.list_grad()[0], 'default',
                                          out=param_dense)
                allreduce_(param_dense, average=True, name=str(i), priority=-i)
                mx.nd.sparse.cast_storage(param_dense, 'row_sparse',
                                          out=param.list_grad()[0])
def _do_allreduce(self, index, grad):
    if isinstance(index, (tuple, list)):
        for i in range(len(index)):
            allreduce_(grad[i], average=False,
                       name=str(index[i]), priority=-i)
    else:
        allreduce_(grad, average=False, name=str(index))
def _do_allreduce(self, index, grad):
    if size() == 1:
        return

    if isinstance(index, (tuple, list)):
        if self._num_groups > 0:
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            offset = 0
            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                tensors_compressed, ctxs = zip(
                    *[self._compression.compress(g) for g in grads])
                grouped_allreduce_(tensors=list(tensors_compressed), average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)
                # Write the decompressed results back into the caller's gradient list.
                for j, (t, ctx) in enumerate(zip(tensors_compressed, ctxs)):
                    grad[offset + j] = self._compression.decompress(t, ctx)
                offset += len(grads)
        else:
            for i in range(len(index)):
                tensor_compressed, ctx = self._compression.compress(grad[i])
                allreduce_(tensor_compressed, average=False, name=str(index[i]),
                           priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
                grad[i] = self._compression.decompress(tensor_compressed, ctx)
    else:
        tensor_compressed, ctx = self._compression.compress(grad)
        allreduce_(tensor_compressed, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor)
        grad = self._compression.decompress(tensor_compressed, ctx)
def _allreduce_grads(self):
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            allreduce_(param.list_grad()[0], average=False,
                       name=str(i), priority=-i)
def _allreduce_grads(self):
    if size() == 1:
        return

    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            allreduce_(param.list_grad()[0], average=False,
                       name=param.name, priority=-i)
def _allreduce_grads(self):
    if size() == 1:
        return

    if self._num_groups > 0:
        grads = []
        names = []
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                grads.append(param.list_grad()[0])
                names.append(self._prefix + str(i))

        grads_split = split_list(grads, self._num_groups)
        names_split = split_list(names, self._num_groups)

        for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
            # For better performance, enqueue groups in separate grouped_allreduce
            # calls by dtype.
            entries_by_dtype = defaultdict(list)
            for grad, name in zip(group_grads, group_names):
                entries_by_dtype[grad.dtype].append((grad, name))

            for entries in entries_by_dtype.values():
                grads, names = zip(*entries)
                grouped_allreduce_(tensors=grads, average=False,
                                   name="{}:{}".format(names[0], names[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)
    else:
        # In MXNet 2.0, param.name is no longer unique.
        # Meanwhile, since horovod requires Python 3.6, there is no need to sort
        # self._params as enumerating a python dict is always deterministic.
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0], average=False,
                           name=self._prefix + str(i), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
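# The grouped paths above assume a split_list(items, num_groups) helper that
# partitions a flat list into contiguous chunks. The helper below is a minimal,
# hypothetical reimplementation for illustration only; its name and exact
# chunking behavior are assumptions, not necessarily the source's helper.
import math


def split_list(items, num_groups):
    """Split `items` into `num_groups` contiguous chunks of near-equal size."""
    chunk = math.ceil(len(items) / num_groups)
    return [items[i:i + chunk] for i in range(0, len(items), chunk)]


# Example: 5 gradients split into 2 groups -> [[0, 1, 2], [3, 4]]
assert split_list(list(range(5)), 2) == [[0, 1, 2], [3, 4]]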
def _do_allreduce(self, index, grad):
    if self._process_set.size() == 1:
        return

    if isinstance(index, (tuple, list)):
        if self._num_groups > 0:
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                grouped_allreduce_(tensors=grads, average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor,
                                   process_set=self._process_set)
        else:
            for i in range(len(index)):
                allreduce_(grad[i], average=False, name=str(index[i]), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor,
                           process_set=self._process_set)
    else:
        allreduce_(grad, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor,
                   process_set=self._process_set)
def _allreduce_grads(self):
    # Sort needed because dict ordering is not guaranteed for Python < 3.6.
    for i, param in enumerate(sorted(self._params, key=lambda p: p.name)):
        if param.grad_req != 'null':
            allreduce_(param.list_grad()[0], average=False,
                       name=str(i), priority=-i)
def _allreduce_grads(self):
    if size() == 1:
        return

    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            allreduce_(param.list_grad()[0], average=False,
                       name=param.name, priority=-i,
                       prescale_factor=1.0 / self._gradient_predivide_factor)
def _allreduce_grads(self):
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            allreduce_(param.list_grad()[0], average=True,
                       name=param.name, priority=-i)
            # communication counter
            self._comm_counter += param.list_grad()[0].size * 2
def _allreduce_grads(self):
    if size() == 1:
        return

    # In MXNet 2.0, param.name is no longer unique.
    # Meanwhile, since horovod requires Python 3.6, there is no need to sort
    # self._params as enumerating a python dict is always deterministic.
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            allreduce_(param.list_grad()[0], average=False,
                       name=self._prefix + str(i), priority=-i,
                       prescale_factor=1.0 / self._gradient_predivide_factor)
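# For context, a minimal sketch of how a trainer exposing the _allreduce_grads
# hook above is typically driven from a Gluon training loop. It assumes the
# public Horovod MXNet API (hvd.init, hvd.DistributedTrainer); the tiny model
# and random data are placeholders, not part of the source.
import mxnet as mx
from mxnet import autograd, gluon
import horovod.mxnet as hvd

hvd.init()
net = gluon.nn.Dense(10)
net.initialize(ctx=mx.cpu())
trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd',
                                 {'learning_rate': 0.01})

data = mx.nd.random.uniform(shape=(32, 64))
label = mx.nd.random.uniform(shape=(32, 10))
loss_fn = gluon.loss.L2Loss()

with autograd.record():
    loss = loss_fn(net(data), label)
loss.backward()
# step() invokes _allreduce_grads() before applying the optimizer update.
trainer.step(batch_size=32)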
def _do_allreduce(self, index, grad):
    if size() == 1:
        return

    if isinstance(index, (tuple, list)):
        for i in range(len(index)):
            allreduce_(grad[i], average=False, name=str(index[i]), priority=-i,
                       prescale_factor=1.0 / self._gradient_predivide_factor)
    else:
        allreduce_(grad, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor)
def _allreduce_params(self):
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            if param.list_grad()[0].stype == 'default':
                # Partial-local-SGD
                x = param.list_data()[0]
                if self._multi_precision and x.dtype == np.float16:
                    _, x_32 = self._updaters[0].states[i]
                if random.uniform(0, 1) <= self._layer_sparse_ratio:
                    # compress: pick a random contiguous block of rows
                    # (and of columns for parameters with more than one axis)
                    input_size = x.shape[0]
                    k1 = max(1, round(input_size * self._input_sparse_ratio))
                    sparse_input_begin = random.choice(
                        range(math.ceil(input_size / k1))) * k1
                    sparse_input_end = min(sparse_input_begin + k1, input_size)
                    if len(x.shape) > 1:
                        output_size = x.shape[1]
                        k2 = max(1, round(output_size * self._output_sparse_ratio))
                        sparse_output_begin = random.choice(
                            range(math.ceil(output_size / k2))) * k2
                        sparse_output_end = min(sparse_output_begin + k2, output_size)
                        x_sync = x[sparse_input_begin:sparse_input_end,
                                   sparse_output_begin:sparse_output_end]
                        # partial sync
                        allreduce_(x_sync, average=True, name=str(i), priority=-i)
                        x[sparse_input_begin:sparse_input_end,
                          sparse_output_begin:sparse_output_end] = x_sync
                        if self._multi_precision and x.dtype == np.float16:
                            x_32[sparse_input_begin:sparse_input_end,
                                 sparse_output_begin:sparse_output_end] = x_sync
                    else:
                        x_sync = x[sparse_input_begin:sparse_input_end]
                        # partial sync
                        allreduce_(x_sync, average=True, name=str(i), priority=-i)
                        x[sparse_input_begin:sparse_input_end] = x_sync
                        if self._multi_precision and x.dtype == np.float16:
                            x_32[sparse_input_begin:sparse_input_end] = x_sync
                    # communication counter (fp16 tensors move half as many bytes)
                    if x.dtype == np.float16:
                        sync_factor = 0.5
                    else:
                        sync_factor = 1.0
                    self._comm_counter += x_sync.size * 2 * sync_factor
            else:
                raise ValueError("Cannot pull row_sparse parameters for local SGD")
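# The partial-sync variants above repeatedly compute a random contiguous block
# [begin, end) covering roughly ratio * size elements along one axis. The
# standalone helper below isolates just that arithmetic for clarity; the
# function name is hypothetical and the rounding mirrors the code above.
import math
import random


def random_block(size, ratio):
    """Pick a randomly chosen aligned block of about `ratio * size` elements."""
    k = max(1, round(size * ratio))                        # block length
    begin = random.choice(range(math.ceil(size / k))) * k  # aligned start
    end = min(begin + k, size)                             # clip the last block
    return begin, end


# Example: for 100 rows and a 10% ratio, this yields one of the ten 10-row
# blocks [0, 10), [10, 20), ..., [90, 100).
begin, end = random_block(100, 0.1)
assert end - begin == 10 and begin % 10 == 0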
def _allreduce_grads(self):
    # super(DistributedRspTrainer, self)._allreduce_grads()
    # print(self._kvstore)
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            if param.list_grad()[0].stype == 'default':
                allreduce_(param.list_grad()[0], average=True,
                           name=str(i), priority=-i)
            else:
                if i not in self._hvd_param_buf:
                    self._hvd_param_buf[i] = mx.nd.zeros(
                        param.list_grad()[0].shape,
                        param.list_grad()[0].context,
                        dtype=self._sdtype)
                param_dense = self._hvd_param_buf[i]
                if self._dtype_mismatch:
                    param_dense[:] = param.list_grad()[0].tostype('default')
                else:
                    mx.nd.sparse.cast_storage(param.list_grad()[0], 'default',
                                              out=param_dense)
                allreduce_(param_dense, average=True, name=str(i), priority=-i)
                # mx.nd.sparse.cast_storage(param_dense, 'row_sparse',
                #                           out=param.list_grad()[0])
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            if param.list_grad()[0].stype != 'default':
                if i in self._hvd_param_buf:
                    if self._dtype_mismatch:
                        param.list_grad()[0][:] = \
                            self._hvd_param_buf[i].tostype('row_sparse')
                    else:
                        mx.nd.sparse.cast_storage(self._hvd_param_buf[i],
                                                  'row_sparse',
                                                  out=param.list_grad()[0])
def allreduce_states(self):
    n_params = len(self._params)
    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            if param.list_grad()[0].stype == 'default':
                # Partial-local-SGD
                x = param.list_data()[0]
                if self._multi_precision and x.dtype == np.float16:
                    m, _ = self._updaters[0].states[i]
                    # allreduce_(x_32, average=True, name=str(i), priority=-i)
                    allreduce_(m, average=True, name=str(i + n_params), priority=-i)
                    # x[:] = x_32
                else:
                    m = self._updaters[0].states[i]
                    # allreduce_(x, average=True, name=str(i), priority=-i)
                    allreduce_(m, average=True, name=str(i + n_params), priority=-i)
def _allreduce_grads(self):
    # local sgd
    self._local_sgd_counter += 1
    if self._local_sgd_counter == self._local_sgd_interval:
        # reset local error
        self._local_sgd_counter = 0
        input_sparse_ratio = self._input_sparse_ratio_2
        output_sparse_ratio = self._output_sparse_ratio_2
        layer_sparse_ratio = self._layer_sparse_ratio_2
    else:
        input_sparse_ratio = self._input_sparse_ratio_1
        output_sparse_ratio = self._output_sparse_ratio_1
        layer_sparse_ratio = self._layer_sparse_ratio_1

    for i, param in enumerate(self._params):
        if param.grad_req != 'null':
            if param.list_grad()[0].stype == 'default':
                # ER-SGD
                r, _, _ = self._updaters[0].states[i]
                if random.uniform(0, 1) <= layer_sparse_ratio:
                    # compress
                    input_size = r.shape[0]
                    k1 = max(1, round(input_size * input_sparse_ratio))
                    sparse_input_begin = random.choice(
                        range(math.ceil(input_size / k1))) * k1
                    sparse_input_end = min(sparse_input_begin + k1, input_size)
                    if len(r.shape) > 1:
                        output_size = r.shape[1]
                        k2 = max(1, round(output_size * output_sparse_ratio))
                        sparse_output_begin = random.choice(
                            range(math.ceil(output_size / k2))) * k2
                        sparse_output_end = min(sparse_output_begin + k2, output_size)
                        r_sync = r[sparse_input_begin:sparse_input_end,
                                   sparse_output_begin:sparse_output_end]
                        param.list_data()[0][sparse_input_begin:sparse_input_end,
                                             sparse_output_begin:sparse_output_end] += r_sync
                        # partial sync
                        allreduce_(r_sync, average=True, name=str(i), priority=-i)
                        param.list_data()[0][sparse_input_begin:sparse_input_end,
                                             sparse_output_begin:sparse_output_end] -= r_sync
                        r[sparse_input_begin:sparse_input_end,
                          sparse_output_begin:sparse_output_end] = 0
                    else:
                        r_sync = r[sparse_input_begin:sparse_input_end]
                        param.list_data()[0][sparse_input_begin:sparse_input_end] += r_sync
                        # partial sync
                        allreduce_(r_sync, average=True, name=str(i), priority=-i)
                        param.list_data()[0][sparse_input_begin:sparse_input_end] -= r_sync
                        r[sparse_input_begin:sparse_input_end] = 0
                    # communication counter
                    self._comm_counter += r_sync.size * 2
            else:
                raise ValueError(
                    "Cannot pull row_sparse parameters for local SGD")
def update_multi_precision(self, index, weight, grad, state):
    allreduce_(grad, average=True, name=str(index))
    return self._optimizer.update_multi_precision(index, weight, grad, state)
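# A short usage sketch for the allreduce-then-update optimizer wrapper above.
# The wrapper's class name is not shown in the snippet, so hvd.DistributedOptimizer
# is used here as an assumption; the weight/grad tensors are placeholders.
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
base_opt = mx.optimizer.create('sgd', learning_rate=0.01)
opt = hvd.DistributedOptimizer(base_opt)

# One manual update: the wrapper first averages `grad` across workers via
# allreduce, then delegates to the wrapped optimizer's update rule.
weight = mx.nd.ones((4,))
grad = mx.nd.full((4,), 0.5)
state = base_opt.create_state_multi_precision(0, weight)
opt.update_multi_precision(0, weight, grad, state)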