Example #1
An `_allreduce_grads` override from Horovod's MXNet `DistributedTrainer`: with `num_groups > 0`, gradients are chunked via `split_list` and fused into `grouped_allreduce_` calls (bucketed by dtype); otherwise each gradient is allreduced individually.
    # Module-level imports these snippets rely on (assumed paths; adjust to
    # your Horovod version):
    #   from collections import defaultdict
    #   from horovod.mxnet.mpi_ops import allreduce_, grouped_allreduce_, size
    #   from horovod.common.util import split_list
    def _allreduce_grads(self):
        if size() == 1: return

        if self._num_groups > 0:
            grads = []
            names = []

            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    grads.append(param.list_grad()[0])
                    names.append(self._prefix + str(i))

            grads_split = split_list(grads, self._num_groups)
            names_split = split_list(names, self._num_groups)

            for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
                # For better performance, enqueue groups in separate grouped_allreduce calls by dtype.
                entries_by_dtype = defaultdict(list)
                for grad, name in zip(group_grads, group_names):
                    entries_by_dtype[grad.dtype].append((grad, name))

                for entries in entries_by_dtype.values():
                    # Avoid shadowing the outer `grads`/`names` lists.
                    dtype_grads, dtype_names = zip(*entries)
                    grouped_allreduce_(tensors=list(dtype_grads), average=False,
                                       name="{}:{}".format(dtype_names[0], dtype_names[-1]),
                                       priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
        else:
            # In MXNet 2.0, param.name is no longer unique. Since Horovod
            # requires Python >= 3.6, dict iteration order is deterministic,
            # so there is no need to sort self._params before enumerating.
            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    allreduce_(param.list_grad()[0], average=False,
                               name=self._prefix + str(i), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
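
Both branches rely on a split_list helper to partition the gradients into num_groups contiguous chunks. The following is a minimal sketch of such a helper, not Horovod's actual implementation (which lives in horovod.common.util); it assumes chunks should be contiguous and near-equal in size:

def split_list(lst, n):
    """Split lst into n contiguous chunks whose sizes differ by at most one."""
    k, m = divmod(len(lst), n)
    return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(n)]

# 7 gradients split into 3 groups -> chunk sizes 3, 2, 2:
# split_list(list(range(7)), 3) == [[0, 1, 2], [3, 4], [5, 6]]

Fusing each chunk into one grouped_allreduce_ call amortizes per-tensor launch overhead; the snippet's own comment notes that splitting each group by dtype into separate grouped_allreduce_ calls performs better.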
Example #2
A `_do_allreduce` variant that adds gradient compression: each gradient is compressed before the (grouped) allreduce and decompressed afterwards.
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            if self._num_groups > 0:
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                offset = 0
                for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                    tensors_compressed, ctxs = zip(*[self._compression.compress(g) for g in grads])
                    grouped_allreduce_(tensors=list(tensors_compressed), average=False,
                                       name="{}:{}".format(indices[0], indices[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
                    # Write the decompressed tensors back into the caller's
                    # gradient list; rebinding a local name here would discard
                    # the results.
                    for j, (t, ctx) in enumerate(zip(tensors_compressed, ctxs)):
                        grad[offset + j] = self._compression.decompress(t, ctx)
                    offset += len(grads)
            else:
                for i in range(len(index)):
                    tensor_compressed, ctx = self._compression.compress(grad[i])
                    allreduce_(tensor_compressed, average=False,
                               name=str(index[i]), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
                    grad[i] = self._compression.decompress(tensor_compressed, ctx)
        else:
            tensor_compressed, ctx = self._compression.compress(grad)
            allreduce_(tensor_compressed, average=False, name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
            # Copy back in place: `grad` is a single NDArray here, so plain
            # reassignment would not be visible to the caller.
            grad[:] = self._compression.decompress(tensor_compressed, ctx)
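
The compress/decompress pair follows a simple contract: compress returns a wire-format tensor plus an opaque context, and decompress uses that context to restore the original representation. Below is a minimal NumPy-based sketch of an fp16-style compressor (hypothetical; Horovod's real Compression classes operate on framework tensors):

import numpy as np

class FP16Compressor:
    @staticmethod
    def compress(tensor):
        # Halve the wire size by casting to fp16; keep the original
        # dtype as the decompression context.
        return tensor.astype(np.float16), tensor.dtype

    @staticmethod
    def decompress(tensor, ctx):
        # Restore the dtype recorded at compress time.
        return tensor.astype(ctx)

Because decompress returns a new tensor, the result must be written back into the caller's gradient buffer; that is why the grouped branch above assigns into grad[offset + j] and the single-tensor branch copies with grad[:] = ... instead of rebinding a local name.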
Example #3
A `_do_allreduce` variant with process-set support: every collective call is restricted to self._process_set, and the early-out checks that set's size rather than the global size().
    def _do_allreduce(self, index, grad):
        if self._process_set.size() == 1: return

        if isinstance(index, (tuple, list)):
            if self._num_groups > 0:
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                    grouped_allreduce_(tensors=grads, average=False,
                                       name="{}:{}".format(indices[0], indices[-1]),
                                       priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor,
                                       process_set=self._process_set)
            else:
                for i in range(len(index)):
                    allreduce_(grad[i], average=False,
                               name=str(index[i]), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor,
                               process_set=self._process_set)
        else:
            allreduce_(grad, average=False, name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor,
                       process_set=self._process_set)
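
A hedged usage sketch for the process-set variant. It assumes hvd.ProcessSet and the process_sets argument to hvd.init() (part of Horovod's documented process-set API), and that DistributedOptimizer forwards a process_set keyword into self._process_set, as the snippet above suggests:

import mxnet as mx
import horovod.mxnet as hvd

# Register a subset of ranks at init time; collectives issued through
# this optimizer then involve only ranks 0 and 2.
even_set = hvd.ProcessSet([0, 2])
hvd.init(process_sets=[even_set])

opt = mx.optimizer.create('sgd', learning_rate=0.01)
dist_opt = hvd.DistributedOptimizer(opt, process_set=even_set)

If the set contains a single member, _do_allreduce returns immediately via the self._process_set.size() check at the top, skipping communication entirely.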