Example no. 1
    def _allreduce_grads(self):
        if size() == 1: return

        if (self._num_groups > 0):
            grads = []
            names = []

            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    grads.append(param.list_grad()[0])
                    names.append(self._prefix + str(i))

            grads_split = split_list(grads, self._num_groups)
            names_split = split_list(names, self._num_groups)

            for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
                # For better performance, enqueue groups in separate grouped_allreduce calls by dtype.
                entries_by_dtype = defaultdict(list)
                for grad, name in zip(group_grads, group_names):
                    entries_by_dtype[grad.dtype].append((grad, name))

                for entries in entries_by_dtype.values():
                    grads, names = zip(*entries)
                    grouped_allreduce_(tensors=grads, average=False, name="{}:{}".format(names[0], names[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
        else:
            # In MXNet 2.0, param.name is no longer unique.
            # Meanwhile, since horovod requires Python 3.6, there is no need to sort
            # self._params as enumerating a python dict is always deterministic.
            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    allreduce_(param.list_grad()[0], average=False,
                               name=self._prefix + str(i), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
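
The comment above relies on two steps that are easy to see in isolation: gradients are split into num_groups contiguous chunks, and each chunk is bucketed by dtype so every grouped_allreduce_ call receives tensors of a single dtype. The sketch below reproduces just that bookkeeping with plain Python data; split_list here is a local stand-in for horovod.common.util.split_list, and the (name, dtype) pairs are hypothetical.

from collections import defaultdict

def split_list(xs, n):
    # Stand-in for horovod.common.util.split_list: n roughly even contiguous chunks.
    k, r = divmod(len(xs), n)
    chunks, start = [], 0
    for i in range(n):
        end = start + k + (1 if i < r else 0)
        if end > start:
            chunks.append(xs[start:end])
        start = end
    return chunks

# Hypothetical (gradient name, dtype) pairs standing in for param.list_grad()[0].
entries = [("grad0", "float32"), ("grad1", "float16"), ("grad2", "float32"),
           ("grad3", "float16"), ("grad4", "float32")]

for group in split_list(entries, 2):
    buckets = defaultdict(list)
    for name, dtype in group:
        buckets[dtype].append(name)
    # Each bucket below would be passed to one grouped_allreduce_ call.
    for dtype, names in buckets.items():
        print(dtype, "{}:{}".format(names[0], names[-1]))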
Example no. 2
    def __init__(self, params, optimizer, optimizer_params=None,
                 compression=Compression.none,
                 gradient_predivide_factor=1.0, prefix=None,
                 num_groups=0):
        self._compression = compression

        if gradient_predivide_factor != 1.0 and rocm_built():
            raise ValueError('gradient_predivide_factor not supported yet with ROCm')
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn("DistributedTrainer does not take DistributedOptimizer "
                          "as its optimizer. We have unwrapped it for you.")

        # To ensure consistent parameter ordering across workers, sort params before
        # passing to base Trainer constructor. This logic is consistent with trainer.py
        # since v1.6 but we do it here for backward compatibility.
        if isinstance(params, dict):
            params = OrderedDict(params)
        elif isinstance(params, (list, tuple)):
            params = sorted(params)

        super(DistributedTrainer, self).__init__(
            params, optimizer, optimizer_params=optimizer_params, kvstore=None)

        # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
        # function. Normalizing it by Horovod size, which is equivalent to performing
        # average in allreduce, has better performance. 
        self._scale *= (gradient_predivide_factor / size())
        self._gradient_predivide_factor = gradient_predivide_factor
        assert prefix is None or isinstance(prefix, str)
        self._prefix = prefix if prefix else ""
        self._num_groups = num_groups
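
A minimal numeric check of the scaling described in the comment above, using made-up values: each gradient is pre-scaled by 1 / gradient_predivide_factor inside the (summing) allreduce, and the trainer multiplies by gradient_predivide_factor / size(), so the net effect is a plain average across workers.

f = 2.0                               # gradient_predivide_factor (hypothetical value)
N = 4                                 # number of workers, i.e. what size() would return
worker_grads = [1.0, 2.0, 3.0, 4.0]   # the same gradient slot on each worker

summed = sum(g * (1.0 / f) for g in worker_grads)    # prescale_factor=1/f, average=False
applied = summed * (f / N)                           # _scale *= f / N in the trainer
assert abs(applied - sum(worker_grads) / N) < 1e-12  # identical to averaging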
Example no. 3
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            if (self._num_groups > 0):
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                # Track each group's position within the original grad list so the
                # decompressed results can be written back to the caller's tensors.
                offset = 0
                for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                    tensors_compressed, ctxs = zip(*[self._compression.compress(g) for g in grads])
                    grouped_allreduce_(tensors=tensors_compressed, average=False,
                                       name="{}:{}".format(indices[0], indices[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
                    for j, (t, ctx) in enumerate(zip(tensors_compressed, ctxs)):
                        grad[offset + j] = self._compression.decompress(t, ctx)
                    offset += len(grads)
            else:
                for i in range(len(index)):
                    tensor_compressed, ctx = self._compression.compress(grad[i])
                    allreduce_(tensor_compressed, average=False,
                               name=str(index[i]), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
                    grad[i] = self._compression.decompress(tensor_compressed, ctx)
        else:
            tensor_compressed, ctx = self._compression.compress(grad)
            allreduce_(tensor_compressed, average=False, name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
            # Copy the decompressed result back into the caller's gradient in place;
            # rebinding the local name `grad` would not be visible to the caller.
            grad[:] = self._compression.decompress(tensor_compressed, ctx)
Example no. 4
    def _allreduce_grads(self):
        if size() == 1: return

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0], average=False,
                           name=param.name, priority=-i)
Example no. 5
def allgather_object(obj, name=None):
    """
    Serializes and allgathers an object from all other processes.

    Arguments:
        obj: An object capable of being serialized without losing any context.
        name: Optional name to use during allgather, will default to the class
              type.

    Returns:
        The list of objects that were allgathered across all ranks.
    """
    if name is None:
        name = type(obj).__name__

    def load(byte_array):
        buf = io.BytesIO(byte_array.tobytes())
        return cloudpickle.load(buf)

    b = io.BytesIO()
    cloudpickle.dump(obj, b)

    t = mx.nd.array(bytearray(b.getvalue()), dtype='byte')
    sz = mx.nd.array([t.size], dtype='int')

    sizes = allgather(sz, name=name + '.sz').asnumpy()
    gathered = allgather(t, name=name + '.t').asnumpy()

    def select(i):
        # Offsets into the concatenated buffer are prefix sums of the per-rank sizes.
        start = sum(sizes[:i])
        end = start + sizes[i]
        return gathered[start:end]

    return [load(select(i)) for i in range(size())]
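
A usage sketch, assuming a Horovod MXNet job in which hvd.init() has been called and allgather_object is exposed from horovod.mxnet as above; the gathered object here is a hypothetical per-rank stats dict.

import horovod.mxnet as hvd

hvd.init()
local_stats = {'rank': hvd.rank(), 'samples_seen': 1000 + hvd.rank()}
all_stats = hvd.allgather_object(local_stats, name='epoch_stats')
# Every rank now holds the same list of size() dicts, ordered by rank.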
Example no. 6
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            if (self._num_groups > 0):
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                for i, (grads,
                        indices) in enumerate(zip(grad_split, index_split)):
                    grouped_allreduce_(
                        tensors=grads,
                        average=False,
                        name="{}:{}".format(indices[0], indices[-1]),
                        priority=-i,
                        prescale_factor=1.0 / self._gradient_predivide_factor)
            else:
                for i in range(len(index)):
                    allreduce_(grad[i],
                               average=False,
                               name=str(index[i]),
                               priority=-i,
                               prescale_factor=1.0 /
                               self._gradient_predivide_factor)
        else:
            allreduce_(grad,
                       average=False,
                       name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
Example no. 7
    def __init__(self,
                 params,
                 optimizer,
                 optimizer_params=None,
                 gradient_predivide_factor=1.0,
                 prefix=None):
        if gradient_predivide_factor != 1.0 and rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn(
                "DistributedTrainer does not take DistributedOptimizer "
                "as its optimizer. We have unwrapped it for you.")

        super(DistributedTrainer,
              self).__init__(params,
                             optimizer,
                             optimizer_params=optimizer_params,
                             kvstore=None)

        # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
        # function. Normalizing it by Horovod size, which is equivalent to performing
        # average in allreduce, has better performance.
        self._scale *= (gradient_predivide_factor / size())
        self._gradient_predivide_factor = gradient_predivide_factor
        assert prefix is None or isinstance(prefix, str)
        self._prefix = prefix if prefix else ""
Example no. 8
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
    """
    if size() == 1: return

    tensors = []
    names = []
    if isinstance(params, dict):
        names, tensors = zip(*params.items())
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        for name, p in sorted(params.items()):
            try:
                tensors.append(p.data())
                names.append(name)
            except mx.gluon.parameter.DeferredInitializationError:
                # Inject wrapper method with post-initialization broadcast to
                # handle parameters with deferred initialization
                new_init = _append_broadcast_init(p, root_rank)
                p._init_impl = types.MethodType(new_init, p)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run broadcasts.
    for tensor, name in zip(tensors, names):
        broadcast_(tensor, root_rank, name=str(name))
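
A usage sketch, assuming hvd.init() has been called and using a small Gluon block as a stand-in model: rank 0's freshly initialized weights are broadcast so every worker starts from identical parameters.

import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
net = mx.gluon.nn.Dense(10, in_units=20)   # in_units set to avoid deferred init
net.initialize(mx.init.Xavier())
hvd.broadcast_parameters(net.collect_params(), root_rank=0)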
Example no. 9
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                allreduce_(grad[i], average=False,
                           name=str(index[i]), priority=-i)
        else:
            allreduce_(grad, average=False, name=str(index))
Example no. 10
    def _allreduce_grads(self):
        if size() == 1: return

        # Sort is needed because parameter iteration order is not guaranteed for Python < 3.6.
        for i, param in enumerate(sorted(self._params, key=lambda p: p.name)):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=str(i),
                           priority=-i)
Example no. 11
    def __init__(self, optimizer, gradient_predivide_factor=1.0):
        if gradient_predivide_factor != 1.0 and rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')

        self._optimizer = optimizer
        # Normalizing rescale_grad by Horovod size, which is equivalent to
        # performing average in allreduce, has better performance.
        self._optimizer.rescale_grad *= (gradient_predivide_factor / size())
        self._gradient_predivide_factor = gradient_predivide_factor
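
A usage sketch of the wrapper above, assuming hvd.init() has been called; scaling the learning rate by hvd.size() is the usual Horovod convention rather than something required by this class.

import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
opt = mx.optimizer.SGD(learning_rate=0.01 * hvd.size(), momentum=0.9)
opt = hvd.DistributedOptimizer(opt)
# Pass `opt` to Module.init_optimizer() / model.fit() in place of a plain optimizer,
# with the kvstore disabled, since Horovod takes over gradient aggregation.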
Example no. 12
    def _allreduce_grads(self):
        if size() == 1: return

        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=param.name,
                           priority=-i,
                           prescale_factor=1.0 /
                           self._gradient_predivide_factor)
Example no. 13
    def __init__(self, params, optimizer, optimizer_params=None):
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn("DistributedTrainer does not take DistributedOptimizer "
                          "as its optimizer. We have unwrapped it for you.")

        super(DistributedTrainer, self).__init__(
            params, optimizer, optimizer_params=optimizer_params, kvstore=None)

        # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
        # function. Normalizing it by Horovod size, which is equivalent to performing
        # average in allreduce, has better performance. 
        self._scale /= size()
Example no. 14
    def _allreduce_grads(self):
        if size() == 1: return

        # In MXNet 2.0, param.name is no longer unique.
        # Meanwhile, since horovod requires Python 3.6, there is no need to sort
        # self._params as enumerating a python dict is always deterministic.
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0],
                           average=False,
                           name=self._prefix + str(i),
                           priority=-i,
                           prescale_factor=1.0 /
                           self._gradient_predivide_factor)
Example no. 15
def broadcast_parameters(params, root_rank=0, prefix=None):
    """Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
        prefix: The prefix of the parameters to broadcast.
              If multiple `broadcast_parameters` are called in the same program,
              they must be specified by different prefixes to avoid tensor name collision.
    """
    if size() == 1: return

    tensors = []
    names = []
    assert prefix is None or isinstance(prefix, str)
    prefix = prefix if prefix else ""
    try:
        from mxnet.gluon.parameter import ParameterDict
        valid_types = (dict, ParameterDict)
    except ImportError:
        valid_types = (dict, )
    if isinstance(params, valid_types):
        for name, p in sorted(params.items()):
            try:
                if isinstance(p, mx.gluon.parameter.Parameter):
                    tensors.append(p.data())
                else:
                    tensors.append(p)
                names.append(prefix + str(name))
            except mx.gluon.parameter.DeferredInitializationError:
                # Inject wrapper method with post-initialization broadcast to
                # handle parameters with deferred initialization
                # we use the key of params instead of param.name, since
                # param.name is no longer unique in MXNet 2.0
                new_init = _append_broadcast_init(p, root_rank,
                                                  prefix + str(name))
                p._init_impl = types.MethodType(new_init, p)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run broadcasts.
    for tensor, name in zip(tensors, names):
        broadcast_(tensor, root_rank, name=name)
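
A usage sketch of the prefix argument described in the docstring, with two hypothetical Gluon blocks: when broadcast_parameters is called more than once in the same program, distinct prefixes keep the underlying tensor names from colliding.

import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
student = mx.gluon.nn.Dense(10, in_units=20)
teacher = mx.gluon.nn.Dense(10, in_units=20)
student.initialize()
teacher.initialize()

hvd.broadcast_parameters(student.collect_params(), root_rank=0, prefix="student.")
hvd.broadcast_parameters(teacher.collect_params(), root_rank=0, prefix="teacher.")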
Example no. 16
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                allreduce_(grad[i],
                           average=False,
                           name=str(index[i]),
                           priority=-i,
                           prescale_factor=1.0 /
                           self._gradient_predivide_factor)
        else:
            allreduce_(grad,
                       average=False,
                       name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
Example no. 17
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
    """
    if size() == 1: return

    tensors = []
    if isinstance(params, dict):
        tensors = [p for _, p in sorted(params.items())]
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        for _, p in sorted(params.items()):
            try:
                tensors.append(p.data())
            except mx.gluon.parameter.DeferredInitializationError:
                # Inject wrapper method with post-initialization broadcast to
                # handle parameters with deferred initialization
                new_init = _append_broadcast_init(p, root_rank)
                p._init_impl = types.MethodType(new_init, p)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run broadcasts.
    for i, tensor in enumerate(tensors):
        broadcast_(tensor, root_rank, str(i))

    # Make sure tensors pushed to MXNet engine get processed such that all
    # workers are synced before starting training.
    for tensor in tensors:
        tensor.wait_to_read()
Example no. 18
    def __init__(self, optimizer):
        self._optimizer = optimizer
        # Normalizing rescale_grad by Horovod size, which is equivalent to
        # performing average in allreduce, has better performance.
        self._optimizer.rescale_grad /= size()
Example no. 19
    def __init__(self, *args, **kwargs):
        self._size = size()
        self._rank = rank()
        super().__init__(*args, **kwargs)