    def _allreduce_grads(self):
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                if param.list_grad()[0].stype == 'default':
                    allreduce_(param.list_grad()[0],
                               average=True,
                               name=str(i),
                               priority=-i)
                else:
                    if i not in self._hvd_param_buf:
                        self._hvd_param_buf[i] = mx.nd.zeros(
                            param.list_grad()[0].shape,
                            param.list_grad()[0].context,
                            dtype=param.list_grad()[0].dtype)
                    param_dense = self._hvd_param_buf[i]
                    mx.nd.sparse.cast_storage(param.list_grad()[0],
                                              'default',
                                              out=param_dense)
                    allreduce_(param_dense,
                               average=True,
                               name=str(i),
                               priority=-i)
                    mx.nd.sparse.cast_storage(param_dense,
                                              'row_sparse',
                                              out=param.list_grad()[0])
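The row_sparse branch above round-trips the gradient through a reusable dense buffer. Below is a minimal single-process sketch of that cast_storage round trip; the shape and values are made up for illustration, and the allreduce_ call itself would of course need a Horovod launch with multiple workers.

import mxnet as mx

grad_rsp = mx.nd.ones((4, 3)).tostype('row_sparse')   # stand-in for param.list_grad()[0]
buf = mx.nd.zeros(grad_rsp.shape, grad_rsp.context, dtype=grad_rsp.dtype)

mx.nd.sparse.cast_storage(grad_rsp, 'default', out=buf)      # densify into the buffer
# allreduce_(buf, average=True, name='0') would run here across workers
mx.nd.sparse.cast_storage(buf, 'row_sparse', out=grad_rsp)   # copy the reduced result back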
Example #2
    def _do_allreduce(self, index, grad):
        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                allreduce_(grad[i], average=False,
                           name=str(index[i]), priority=-i)
        else:
            allreduce_(grad, average=False, name=str(index))
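For reference, a standalone usage sketch of allreduce_ itself, outside any trainer or optimizer wrapper. It assumes a working Horovod-with-MXNet install and a multi-process launch such as `horovodrun -np 2 python script.py`; only keyword arguments already exercised by the examples on this page are used.

import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
x = mx.nd.ones((2, 2)) * hvd.rank()
hvd.allreduce_(x, average=True, name='x', priority=0)   # in-place allreduce across workers
print(hvd.rank(), x.asnumpy())                          # every rank prints the averaged tensor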
Example #3
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            if (self._num_groups > 0):
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                    tensors_compressed, ctxs = zip(*[self._compression.compress(g) for g in grads])
                    grouped_allreduce_(tensors=tensors_compressed, average=False,
                                       name="{}:{}".format(indices[0], indices[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
                    # write the decompressed results back into the original gradients in place
                    for g, tensor_compressed, ctx in zip(grads, tensors_compressed, ctxs):
                        g[:] = self._compression.decompress(tensor_compressed, ctx)
            else:
                for i in range(len(index)):
                    tensor_compressed, ctx = self._compression.compress(grad[i])
                    allreduce_(tensor_compressed, average=False,
                               name=str(index[i]), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
                    grad[i] = self._compression.decompress(tensor_compressed, ctx)
        else:
            tensor_compressed, ctx = self._compression.compress(grad)
            allreduce_(tensor_compressed, average=False, name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
            grad = self._compression.decompress(tensor_compressed, ctx)
Example #4
    def _allreduce_grads(self):
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=str(i),
                           priority=-i)
Example #5
    def _allreduce_grads(self):
        if size() == 1: return

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0], average=False,
                           name=param.name, priority=-i)
Example #6
    def _allreduce_grads(self):
        if size() == 1: return

        if (self._num_groups > 0):
            grads = []
            names = []

            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    grads.append(param.list_grad()[0])
                    names.append(self._prefix + str(i))

            grads_split = split_list(grads, self._num_groups)
            names_split = split_list(names, self._num_groups)

            for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
                # For better performance, enqueue groups in separate grouped_allreduce calls by dtype.
                entries_by_dtype = defaultdict(list)
                for grad, name in zip(group_grads, group_names):
                    entries_by_dtype[grad.dtype].append((grad, name))

                for entries in entries_by_dtype.values():
                    grads, names = zip(*entries)
                    grouped_allreduce_(tensors=grads, average=False, name="{}:{}".format(names[0], names[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
        else:
            # In MXNet 2.0, param.name is no longer unique.
            # Meanwhile, since horovod requires Python 3.6, there is no need to sort
            # self._params as enumerating a python dict is always deterministic.
            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    allreduce_(param.list_grad()[0], average=False,
                               name=self._prefix + str(i), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
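The grouping in the example above happens in two stages: the parameter list is first cut into num_groups chunks, and each chunk is then bucketed by dtype before being handed to grouped_allreduce_. The plain-Python sketch below illustrates that bucketing with made-up parameter names and dtypes and a hypothetical stand-in for Horovod's split_list helper.

from collections import defaultdict

def split_list(xs, n):
    # stand-in for Horovod's split_list: n roughly equal contiguous chunks
    k, m = divmod(len(xs), n)
    return [xs[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

names = ['dense0_weight', 'dense0_bias', 'dense1_weight', 'dense1_bias', 'output_weight']
dtypes = ['float32', 'float16', 'float32', 'float16', 'float32']   # made-up dtypes

for group in split_list(list(zip(names, dtypes)), 2):
    by_dtype = defaultdict(list)
    for name, dtype in group:
        by_dtype[dtype].append(name)
    print(dict(by_dtype))
# first group  -> {'float32': ['dense0_weight', 'dense1_weight'], 'float16': ['dense0_bias']}
# second group -> {'float16': ['dense1_bias'], 'float32': ['output_weight']}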
Example #7
    def _do_allreduce(self, index, grad):
        if self._process_set.size() == 1: return

        if isinstance(index, (tuple, list)):
            if (self._num_groups > 0):
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                for i, (grads,
                        indices) in enumerate(zip(grad_split, index_split)):
                    grouped_allreduce_(
                        tensors=grads,
                        average=False,
                        name="{}:{}".format(indices[0], indices[-1]),
                        priority=-i,
                        prescale_factor=1.0 / self._gradient_predivide_factor,
                        process_set=self._process_set)
            else:
                for i in range(len(index)):
                    allreduce_(grad[i],
                               average=False,
                               name=str(index[i]),
                               priority=-i,
                               prescale_factor=1.0 /
                               self._gradient_predivide_factor,
                               process_set=self._process_set)
        else:
            allreduce_(grad,
                       average=False,
                       name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor,
                       process_set=self._process_set)
Example #8
    def _allreduce_grads(self):
        # sort is needed for Python < 3.6, where ordering is not guaranteed
        for i, param in enumerate(sorted(self._params, key=lambda p: p.name)):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=str(i),
                           priority=-i)
Example #9
    def _allreduce_grads(self):
        if size() == 1: return

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=param.name,
                           priority=-i,
                           prescale_factor=1.0 /
                           self._gradient_predivide_factor)
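A quick numeric sketch of the prescale convention used by the examples that pass prescale_factor, with hypothetical values: with average=False and prescale_factor = 1 / predivide, the reduction returns sum(grads) / predivide, and the remaining size / predivide division is assumed to be folded into the optimizer's rescale_grad elsewhere.

size, predivide = 4, 2.0
grads = [1.0, 2.0, 3.0, 4.0]                            # one (scalar) gradient per worker
reduced = sum(g * (1.0 / predivide) for g in grads)     # what the allreduce would return: 5.0
mean = reduced / (size / predivide)                     # leftover post-division: 2.5
assert mean == sum(grads) / size                        # equals the plain average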
Example #10
    def _allreduce_grads(self):

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=True,
                           name=param.name,
                           priority=-i)

                # communication counter
                self._comm_counter += param.list_grad()[0].size * 2
Example #11
    def _allreduce_grads(self):
        if size() == 1: return

        # In MXNet 2.0, param.name is no longer unique.
        # Meanwhile, since horovod requires Python 3.6, there is no need to sort
        # self._params as enumerating a python dict is always deterministic.
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=self._prefix + str(i),
                           priority=-i,
                           prescale_factor=1.0 /
                           self._gradient_predivide_factor)
Example #12
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                allreduce_(grad[i],
                           average=False,
                           name=str(index[i]),
                           priority=-i,
                           prescale_factor=1.0 /
                           self._gradient_predivide_factor)
        else:
            allreduce_(grad,
                       average=False,
                       name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
Example #13
    def _allreduce_params(self):
        
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                if param.list_grad()[0].stype == 'default':
                    # Partial-local-SGD
                    x = param.list_data()[0]

                    if self._multi_precision and x.dtype == np.float16:
                        _, x_32 = self._updaters[0].states[i]

                    if random.uniform(0,1) <= self._layer_sparse_ratio:
                        # compress
                        input_size = x.shape[0]
                        k1 = max(1, round(input_size*self._input_sparse_ratio))
                        sparse_input_begin = random.choice(range(math.ceil(input_size/k1))) * k1
                        sparse_input_end = min(sparse_input_begin + k1, input_size)

                        if len(x.shape) > 1:
                            output_size = x.shape[1]
                            k2 = max(1, round(output_size*self._output_sparse_ratio))
                            sparse_output_begin = random.choice(range(math.ceil(output_size/k2))) * k2
                            sparse_output_end = min(sparse_output_begin + k2, output_size)
                            x_sync = x[sparse_input_begin:sparse_input_end,sparse_output_begin:sparse_output_end]
                            # partial sync
                            allreduce_(x_sync, average=True,
                                        name=str(i), priority=-i)
                            x[sparse_input_begin:sparse_input_end,sparse_output_begin:sparse_output_end] = x_sync
                            if self._multi_precision and x.dtype == np.float16:
                                x_32[sparse_input_begin:sparse_input_end,sparse_output_begin:sparse_output_end] = x_sync
                        else:
                            x_sync = x[sparse_input_begin:sparse_input_end]
                            # partial sync
                            allreduce_(x_sync, average=True,
                                       name=str(i), priority=-i)
                            x[sparse_input_begin:sparse_input_end] = x_sync
                            if self._multi_precision and x.dtype == np.float16:
                                x_32[sparse_input_begin:sparse_input_end] = x_sync

                        if x.dtype == np.float16:
                            sync_factor = 0.5
                        else:
                            sync_factor = 1.0
                        self._comm_counter += x_sync.size * 2 * sync_factor
                else:
                    raise ValueError("Cannot pull row_sparse parameters for local SGD")
Example #14
    def _allreduce_grads(self):
        # super(DistributedRspTrainer, self)._allreduce_grads()
        # print(self._kvstore)
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                if param.list_grad()[0].stype == 'default':
                    allreduce_(param.list_grad()[0],
                               average=True,
                               name=str(i),
                               priority=-i)
                else:
                    if i not in self._hvd_param_buf:
                        self._hvd_param_buf[i] = mx.nd.zeros(
                            param.list_grad()[0].shape,
                            param.list_grad()[0].context,
                            dtype=self._sdtype)
                    param_dense = self._hvd_param_buf[i]
                    if self._dtype_mismatch:
                        param_dense[:] = param.list_grad()[0].tostype(
                            'default')
                    else:
                        mx.nd.sparse.cast_storage(param.list_grad()[0],
                                                  'default',
                                                  out=param_dense)
                    allreduce_(param_dense,
                               average=True,
                               name=str(i),
                               priority=-i)
                    # mx.nd.sparse.cast_storage(param_dense, 'row_sparse', out=param.list_grad()[0])

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                if param.list_grad()[0].stype != 'default':
                    if i in self._hvd_param_buf:
                        if self._dtype_mismatch:
                            param.list_grad()[0][:] = self._hvd_param_buf[
                                i].tostype('row_sparse')
                        else:
                            mx.nd.sparse.cast_storage(self._hvd_param_buf[i],
                                                      'row_sparse',
                                                      out=param.list_grad()[0])
Example #15
    def allreduce_states(self):
        n_params = len(self._params)
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                if param.list_grad()[0].stype == 'default':
                    # Partial-local-SGD
                    x = param.list_data()[0]

                    if self._multi_precision and x.dtype == np.float16:
                        m, _ = self._updaters[0].states[i]
                        # allreduce_(x_32, average=True, name=str(i), priority=-i)
                        allreduce_(m,
                                   average=True,
                                   name=str(i + n_params),
                                   priority=-i)
                        # x[:] = x_32
                    else:
                        m = self._updaters[0].states[i]
                        # allreduce_(x, average=True, name=str(i), priority=-i)
                        allreduce_(m,
                                   average=True,
                                   name=str(i + n_params),
                                   priority=-i)
Example #16
    def _allreduce_grads(self):

        # local sgd
        self._local_sgd_counter += 1
        if self._local_sgd_counter == self._local_sgd_interval:
            # reset local error
            self._local_sgd_counter = 0
            input_sparse_ratio = self._input_sparse_ratio_2
            output_sparse_ratio = self._output_sparse_ratio_2
            layer_sparse_ratio = self._layer_sparse_ratio_2
        else:
            input_sparse_ratio = self._input_sparse_ratio_1
            output_sparse_ratio = self._output_sparse_ratio_1
            layer_sparse_ratio = self._layer_sparse_ratio_1

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                if param.list_grad()[0].stype == 'default':
                    # ER-SGD
                    r, _, _ = self._updaters[0].states[i]

                    if random.uniform(0, 1) <= layer_sparse_ratio:
                        # compress
                        input_size = r.shape[0]
                        k1 = max(1, round(input_size * input_sparse_ratio))
                        sparse_input_begin = random.choice(
                            range(math.ceil(input_size / k1))) * k1
                        sparse_input_end = min(sparse_input_begin + k1,
                                               input_size)
                        if len(r.shape) > 1:
                            output_size = r.shape[1]
                            k2 = max(1,
                                     round(output_size * output_sparse_ratio))
                            sparse_output_begin = random.choice(
                                range(math.ceil(output_size / k2))) * k2
                            sparse_output_end = min(sparse_output_begin + k2,
                                                    output_size)

                            r_sync = r[sparse_input_begin:sparse_input_end,
                                       sparse_output_begin:sparse_output_end]
                            param.list_data()[0][
                                sparse_input_begin:sparse_input_end,
                                sparse_output_begin:
                                sparse_output_end] += r_sync
                            # partial sync
                            allreduce_(r_sync,
                                       average=True,
                                       name=str(i),
                                       priority=-i)

                            param.list_data()[0][
                                sparse_input_begin:sparse_input_end,
                                sparse_output_begin:
                                sparse_output_end] -= r_sync
                            r[sparse_input_begin:sparse_input_end,
                              sparse_output_begin:sparse_output_end] = 0
                        else:

                            r_sync = r[sparse_input_begin:sparse_input_end]
                            param.list_data(
                            )[0][sparse_input_begin:sparse_input_end] += r_sync
                            # partial sync
                            allreduce_(r_sync,
                                       average=True,
                                       name=str(i),
                                       priority=-i)

                            param.list_data(
                            )[0][sparse_input_begin:sparse_input_end] -= r_sync
                            r[sparse_input_begin:sparse_input_end] = 0

                        # communication counter
                        self._comm_counter += r_sync.size * 2
                else:
                    raise ValueError(
                        "Cannot pull row_sparse parameters for local SGD")
Example #17
    def update_multi_precision(self, index, weight, grad, state):
        allreduce_(grad, average=True, name=str(index))
        return self._optimizer.update_multi_precision(index, weight, grad,
                                                      state)