def _do_allreduce(self, index, grad):
    if size() == 1: return

    if isinstance(index, (tuple, list)):
        if (self._num_groups > 0):
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            offset = 0
            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                tensors_compressed, ctxs = zip(
                    *[self._compression.compress(g) for g in grads])
                grouped_allreduce_(tensors=tensors_compressed, average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)
                # Write the decompressed gradients back into the caller's list,
                # mirroring the per-tensor branch below.
                for j, (t, ctx) in enumerate(zip(tensors_compressed, ctxs)):
                    grad[offset + j] = self._compression.decompress(t, ctx)
                offset += len(grads)
        else:
            for i in range(len(index)):
                tensor_compressed, ctx = self._compression.compress(grad[i])
                allreduce_(tensor_compressed, average=False, name=str(index[i]),
                           priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
                grad[i] = self._compression.decompress(tensor_compressed, ctx)
    else:
        tensor_compressed, ctx = self._compression.compress(grad)
        allreduce_(tensor_compressed, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor)
        # Copy the decompressed values into the original array in place so the
        # caller sees the allreduced gradient.
        grad[:] = self._compression.decompress(tensor_compressed, ctx)
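The compression-aware variant above is driven by whatever compressor and group count the optimizer wrapper was constructed with. The following is a minimal, hypothetical usage sketch; it assumes the MXNet DistributedOptimizer wrapper exposes compression and num_groups arguments, as the self._compression and self._num_groups references above suggest.

import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
opt = mx.optimizer.create('sgd', learning_rate=0.01)
# Hypothetical usage: compress gradients to fp16 on the wire and fuse them into
# four grouped allreduce calls. Assumes this wrapper accepts `compression` and
# `num_groups`, matching self._compression / self._num_groups above.
dist_opt = hvd.DistributedOptimizer(opt,
                                    compression=hvd.Compression.fp16,
                                    num_groups=4)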
def _allreduce_grads(self):
    if size() == 1: return

    if (self._num_groups > 0):
        grads = []
        names = []
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                grads.append(param.list_grad()[0])
                names.append(self._prefix + str(i))

        grads_split = split_list(grads, self._num_groups)
        names_split = split_list(names, self._num_groups)

        for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
            # For better performance, enqueue groups in separate grouped_allreduce
            # calls by dtype.
            entries_by_dtype = defaultdict(list)
            for grad, name in zip(group_grads, group_names):
                entries_by_dtype[grad.dtype].append((grad, name))

            for entries in entries_by_dtype.values():
                grads, names = zip(*entries)
                grouped_allreduce_(tensors=grads, average=False,
                                   name="{}:{}".format(names[0], names[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)
    else:
        # In MXNet 2.0, param.name is no longer unique.
        # Meanwhile, since horovod requires Python 3.6, there is no need to sort
        # self._params as enumerating a python dict is always deterministic.
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0], average=False,
                           name=self._prefix + str(i), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
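Both grouped code paths above rely on a split_list helper to partition tensors into a fixed number of roughly equal, contiguous groups. A minimal sketch of such a helper is shown below; the exact implementation in Horovod's common utilities may differ.

def split_list(items, num_chunks):
    """Split `items` into `num_chunks` contiguous sublists of near-equal size.

    Minimal sketch of the helper assumed by the grouped code paths above;
    Horovod's actual implementation may differ in detail (e.g. error handling
    when there are fewer items than chunks).
    """
    base, remainder = divmod(len(items), num_chunks)
    splits = []
    start = 0
    for i in range(num_chunks):
        chunk_size = base + (1 if i < remainder else 0)
        splits.append(items[start:start + chunk_size])
        start += chunk_size
    return splits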
def _do_allreduce(self, index, grad):
    if self._process_set.size() == 1: return

    if isinstance(index, (tuple, list)):
        if (self._num_groups > 0):
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                grouped_allreduce_(tensors=grads, average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor,
                                   process_set=self._process_set)
        else:
            for i in range(len(index)):
                allreduce_(grad[i], average=False, name=str(index[i]), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor,
                           process_set=self._process_set)
    else:
        allreduce_(grad, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor,
                   process_set=self._process_set)
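The process-set variant gates the allreduce on self._process_set.size(), so only the ranks in that set participate. Below is a hypothetical sketch of wiring a process set through from user code; it assumes this Horovod build supports dynamic process sets (hvd.add_process_set) and that the MXNet DistributedOptimizer wrapper forwards a process_set argument, as the self._process_set references above suggest.

import mxnet as mx
import horovod.mxnet as hvd

# Hypothetical usage: average gradients only across ranks 0 and 1.
# Assumes dynamic process sets are enabled and that DistributedOptimizer
# accepts a process_set argument.
hvd.init(process_sets="dynamic")
subset = hvd.add_process_set([0, 1])
opt = mx.optimizer.create('sgd', learning_rate=0.01)
dist_opt = hvd.DistributedOptimizer(opt, process_set=subset)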
def allreduce_grads(grads):
    with tf.name_scope(name + "_Allreduce"):
        if sparse_as_dense:
            grads = [tf.convert_to_tensor(grad)
                     if grad is not None and isinstance(grad, tf.IndexedSlices)
                     else grad for grad in grads]

        if num_groups > 0:
            grads_clean = [grad for grad in grads if grad is not None]
            grads_split = split_list(grads_clean, num_groups)
            reduce_ops = []
            for group in grads_split:
                reduce_ops += _grouped_allreduce_cond(group,
                                                      device_dense=device_dense,
                                                      device_sparse=device_sparse,
                                                      compression=compression,
                                                      op=op,
                                                      prescale_factor=prescale_factor,
                                                      postscale_factor=postscale_factor)
            return reduce_ops

        return [_allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad
                for grad in grads]
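A minimal sketch of how the num_groups path above might be exercised from user code, assuming the Keras DistributedOptimizer wrapper of this Horovod version still exposes a num_groups argument (later releases replaced it with groups):

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
opt = tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size())
# Hypothetical usage: partition all gradients into two fused allreduce groups.
# Assumes this Horovod version accepts num_groups here; newer versions use `groups`.
opt = hvd.DistributedOptimizer(opt, num_groups=2)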
def _register_hooks(self):
    if self._num_groups > 0:
        p_list = []
        # Get list of parameters with grads
        for param_group in self.param_groups:
            for p in param_group['params']:
                if p.requires_grad:
                    p_list.append(p)

        # To ensure parameter order and group formation is consistent, broadcast
        # p_list order from rank 0 and use for every worker
        p_list_names = [self._parameter_names.get(p) for p in p_list]
        p_list_names = broadcast_object(p_list_names, root_rank=0)
        p_list = sorted(p_list,
                        key=lambda p: p_list_names.index(self._parameter_names.get(p)))

        # Form groups
        p_groups = split_list(p_list, self._num_groups)
        p_groups = [tuple(p) for p in p_groups]
        for group in p_groups:
            for p in group:
                self._p_to_group[p] = group
            self._group_counts[group] = 0

    for param_group in self.param_groups:
        for p in param_group['params']:
            if p.requires_grad:
                p.grad = p.data.new(p.size()).zero_()
                self._requires_update.add(p)
                p_tmp = p.expand_as(p)
                grad_acc = p_tmp.grad_fn.next_functions[0][0]
                grad_acc.register_hook(self._make_hook(p))
                self._grad_accs.append(grad_acc)
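For the PyTorch hook registration above, the grouping is configured when the optimizer is wrapped. The following sketch is hypothetical and assumes this Horovod version exposes num_groups on horovod.torch.DistributedOptimizer:

import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(10, 10)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
# Hypothetical usage: split the model's gradients into two fixed groups so each
# backward pass issues two fused allreduce operations.
# Assumes this Horovod version accepts num_groups; later versions use `groups`.
opt = hvd.DistributedOptimizer(opt,
                               named_parameters=model.named_parameters(),
                               num_groups=2)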
def allreduce_grads(grads, vars=None):
    with tf.name_scope(name + "_Allreduce"):
        if sparse_as_dense:
            grads = [tf.convert_to_tensor(grad)
                     if grad is not None and isinstance(grad, tf.IndexedSlices)
                     else grad for grad in grads]

        if groups is not None:
            if isinstance(groups, list):
                var_name2grad = {}
                for i in range(len(vars)):
                    var = vars[i]
                    grad = grads[i]
                    if grad is not None:
                        var_name2grad[var.name] = (i, grad)
                grads_split = []
                for group in groups:
                    grad_group = []
                    for var in group:
                        if var.name in var_name2grad:
                            grad_group.append(var_name2grad[var.name])
                            del var_name2grad[var.name]
                    grads_split.append(grad_group)
                for _, grad in var_name2grad.items():
                    grads_split.append([grad])
            elif groups > 0:
                grads_clean = [(i, grad) for i, grad in enumerate(grads)
                               if grad is not None]
                grads_split = split_list(grads_clean, groups)

            reduce_ops = [None] * len(vars)
            for group in grads_split:
                index_group, grad_group = [list(t) for t in zip(*group)]
                reduce_ops_group = _grouped_allreduce_cond(grad_group,
                                                           device_dense=device_dense,
                                                           device_sparse=device_sparse,
                                                           compression=compression,
                                                           op=op,
                                                           prescale_factor=prescale_factor,
                                                           postscale_factor=postscale_factor,
                                                           process_set=process_set)
                for i in range(len(index_group)):
                    reduce_ops[index_group[i]] = reduce_ops_group[i]
            return reduce_ops

        return [_allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor,
                                process_set=process_set)
                if grad is not None else grad
                for grad in grads]
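The later TensorFlow variant accepts groups either as an int or as an explicit list of variable lists; any variable not covered by a group falls back to its own singleton group, as the loop over the leftover var_name2grad entries shows. A hypothetical sketch, assuming the Keras DistributedOptimizer wrapper forwards a groups argument in this form:

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, input_shape=(32,)),
    tf.keras.layers.Dense(10),
])
opt = tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size())
# Hypothetical usage: fuse each layer's variables into its own allreduce group;
# any variable not listed would fall back to a singleton group, as in the code
# above. Assumes `groups` accepts a list of variable lists in this version.
groups = [layer.trainable_variables for layer in model.layers]
opt = hvd.DistributedOptimizer(opt, groups=groups)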
def _register_hooks(self):
    if self._groups is not None:
        p_list = []
        # Get list of parameters with grads
        for param_group in self.param_groups:
            for p in param_group['params']:
                if p.requires_grad:
                    p_list.append(p)

        # To ensure parameter order and group formation is consistent, broadcast
        # p_list order from rank 0 and use for every worker
        p_list_names = [self._parameter_names.get(p) for p in p_list]
        p_list_names = broadcast_object(p_list_names, root_rank=0,
                                        process_set=self.process_set)
        p_list = sorted(p_list,
                        key=lambda p: p_list_names.index(self._parameter_names.get(p)))

        # Form groups
        if isinstance(self._groups, list):
            p_groups = []
            grouped_id = set()
            p_list_ids = [id(p) for p in p_list]
            for group in self._groups:
                p_groups.append([p for p in group if id(p) in p_list_ids])
                for p in p_groups[-1]:
                    grouped_id.add(id(p))
            for p in p_list:
                if id(p) not in grouped_id:
                    p_groups.append([p])
        else:
            p_groups = split_list(p_list, self._groups)

        p_groups = [tuple(p) for p in p_groups]
        for group in p_groups:
            for p in group:
                self._p_to_group[p] = group
            self._group_counts[group] = 0

    for param_group in self.param_groups:
        for p in param_group['params']:
            if p.requires_grad:
                self._requires_update.add(p)
                p_tmp = p.expand_as(p)
                grad_acc = p_tmp.grad_fn.next_functions[0][0]
                grad_acc.register_hook(self._make_hook(p))
                self._grad_accs.append(grad_acc)
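Likewise for PyTorch, the later variant lets the user pass explicit parameter groups, and any parameter not covered ends up in its own group. The sketch below is hypothetical and assumes horovod.torch.DistributedOptimizer accepts groups as either an int or a list of parameter lists, matching the self._groups handling above:

import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.Linear(64, 10))
opt = torch.optim.SGD(model.parameters(), lr=0.01)
# Hypothetical usage: allreduce each submodule's gradients as one fused group;
# parameters left out of `groups` would each form their own group, mirroring
# the fallback in _register_hooks above.
groups = [list(m.parameters()) for m in model]
opt = hvd.DistributedOptimizer(opt,
                               named_parameters=model.named_parameters(),
                               groups=groups)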