Example #1
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = []  # shape (num_tensors, num_gpus)
    output = []
    for tensor_at_gpus in zip(*inputs):
        if tensor_at_gpus[0].is_sparse:
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
        else:
            dense_tensors.append(tensor_at_gpus)
    itrs = [_take_tensors(tensors, buffer_size) for tensors in zip(*dense_tensors)]
    for chunks in zip(*itrs):
        tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        result = reduce_add(tensors, destination)
        output.extend(_unflatten_dense_tensors(result, chunks[0]))
    return tuple(_reorder_tensors_as(output, inputs[0]))
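
A minimal usage sketch for the routine above, via the public torch.cuda.comm wrapper it mirrors; it assumes at least two visible CUDA devices, and the shapes and values are illustrative only.

import torch
import torch.cuda.comm as comm

if torch.cuda.device_count() >= 2:
    # One list of tensors per GPU; entries at the same index must have equal shapes.
    on_gpu0 = [torch.ones(4, device="cuda:0"), torch.full((2, 2), 2.0, device="cuda:0")]
    on_gpu1 = [torch.ones(4, device="cuda:1"), torch.full((2, 2), 3.0, device="cuda:1")]

    # Element-wise sums of each group, all placed on device 0.
    summed = comm.reduce_add_coalesced([on_gpu0, on_gpu1], destination=0)
    print(summed[0])  # tensor([2., 2., 2., 2.], device='cuda:0')
    print(summed[1])  # tensor([[5., 5.], [5., 5.]], device='cuda:0')
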
Example #2
        def reduction_fn_nccl():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = [[] for _ in range(len(self._module_copies))]
            all_grads_buckets_iters = []

            # Bucketing all the gradients
            for dev_idx, module in enumerate(self._module_copies):
                for param in module.parameters():
                    if not param.requires_grad or param.grad is None:
                        continue
                    if param.grad.requires_grad:
                        raise RuntimeError(
                            "DistributedDataParallel only works "
                            "with gradients that don't require "
                            "grad")
                    # Adding the gradients for reduction
                    all_grads[dev_idx].append(param.grad.data)

                # Now bucketing the parameters
                dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                                  self.nccl_reduce_bucket_size)

                all_grads_buckets_iters.append(dev_grads_buckets)

            # Now reduce each bucket one after another
            for grads_batch in zip(*all_grads_buckets_iters):
                grads_batch_coalesced = []
                # Coalesce each bucket
                for dev_idx, dev_grads_batch in enumerate(grads_batch):
                    dev_id = self.device_ids[dev_idx]
                    with torch.cuda.device(dev_id):
                        dev_grads_batch_coalesced = _flatten_dense_tensors(
                            dev_grads_batch)
                        grads_batch_coalesced.append(dev_grads_batch_coalesced)

                # We will only use device 0's results, but this single op should be
                # faster than doing the following two operations sequentially:
                # (1) intra-node reduce to lead GPU, followed by
                # (2) inter-node allreduce for all the first lead GPUs in all nodes
                dist.all_reduce_multigpu(grads_batch_coalesced,
                                         group=self.nccl_reduction_group_id)

                # Now only work on the first device of self.device_ids, uncoalesce
                # the gradients for each bucket
                grads_batch_coalesced[0] /= dist.get_world_size()
                grads_batch_reduced = _unflatten_dense_tensors(
                    grads_batch_coalesced[0], grads_batch[0])
                for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                    grad.copy_(reduced)

            # clear the gradients and save memory for replicas
            for module in self._module_copies[1:]:
                for param in module.parameters():
                    if param.requires_grad:
                        param.grad = None
                        param.data.set_()
Example #3
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensors``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')
    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        results = broadcast(_flatten_tensors(chunk), devices)
        # use the broadcasted tensors for the remaining devices
        for dst, res in zip(outputs[1:], results[1:]):
            dst.extend(_unflatten_tensors(res, chunk))
    return tuple(outputs)
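
A usage sketch for the broadcast helper above (the public torch.cuda.comm.broadcast_coalesced has the same signature); it assumes two visible CUDA devices, and the source tensors must already live on devices[0].

import torch
import torch.cuda.comm as comm

if torch.cuda.device_count() >= 2:
    # Source tensors must already sit on devices[0] (GPU 0 here), per the check above.
    tensors = [torch.arange(4.0, device="cuda:0"), torch.eye(3, device="cuda:0")]

    # One tuple of copies per device, in the order given by `devices`.
    per_device = comm.broadcast_coalesced(tensors, [0, 1])
    print(per_device[0][0].device, per_device[1][0].device)  # cuda:0 cuda:1
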
Example #4
 def _dist_broadcast_coalesced(self, tensors, buffer_size):
     for tensors in _take_tensors(tensors, buffer_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.broadcast(flat_tensors, 0)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #5
def nccl_allreduce_by_buckets(nc, kn, all_grads):
    # Now bucketing the parameters
    dev_grads_buckets = _take_tensors(all_grads, nccl_reduce_bucket_size)


    for grads_batch in dev_grads_buckets:
        grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

        # NOTE:
        torch.cuda.synchronize()

        # NOTE:
        #nbutils.cuda_current_context().synchronize()
        # or,
        nc.stream_sync()

        sz = np.prod(grads_batch_coalesced.size())
        nc.do_all_reduce(grads_batch_coalesced.data_ptr(),
                         grads_batch_coalesced.data_ptr(),
                         sz)

        nc.stream_sync()

        grads_batch_coalesced[:] = grads_batch_coalesced / float(kn)

        grads_batch_reduced = _unflatten_dense_tensors(
            grads_batch_coalesced, grads_batch)

        for grad, reduced in zip(grads_batch, grads_batch_reduced):
            grad.copy_(reduced)
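
Every snippet in this collection relies on the same take/flatten/unflatten round trip. The sketch below exercises that pattern on CPU using the private torch._utils helpers; the import path is an assumption that holds for recent PyTorch releases, and the 16-byte cap is deliberately tiny just to force several buckets.

import torch
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors)

grads = [torch.randn(3), torch.randn(5), torch.randn(2, 2)]

for bucket in _take_tensors(grads, 16):
    flat = _flatten_dense_tensors(bucket)   # one contiguous 1-D copy of the bucket
    flat.mul_(2.0)                          # stand-in for the collective operation
    # Unflatten into views of `flat` and write the results back into the originals.
    for orig, updated in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
        orig.copy_(updated)
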
Example #6
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensors``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')
    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        results = broadcast(_flatten_tensors(chunk), devices)
        # use the broadcasted tensors for the remaining devices
        for dst, res in zip(outputs[1:], results[1:]):
            dst.extend(_unflatten_tensors(res, chunk))
    return tuple(outputs)
Example #7
def all_reduce_coalesced(tensors, divisor=1, op=ReduceOp.SUM, buffer_size=256 * MB):
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.all_reduce(flat_tensors, op)
        if divisor != 1:
            flat_tensors.div_(divisor)
        for old_t, new_t in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t
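
A sketch of a typical call site for all_reduce_coalesced above: averaging gradients after the backward pass. It assumes the helper and its imports (MB, ReduceOp, the torch._utils functions) are in scope; the single-process gloo group exists only so the snippet runs end to end.

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = torch.nn.Linear(8, 4)
model(torch.randn(2, 8)).sum().backward()

grads = [p.grad for p in model.parameters()]
# Sum across ranks, then divide by the world size to leave averaged gradients behind.
all_reduce_coalesced(grads, divisor=dist.get_world_size())

dist.destroy_process_group()
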
Example #8
 def sync_params_bucket(self):
     params = [p.data for p in list(self.model.parameters())]
     for tensors in _take_tensors(params, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.broadcast(flat_tensors, src=0)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #9
 def sync_grads_bucket(self):
     grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
     for tensors in _take_tensors(grads, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         new_all_reduce(flat_tensors, cuda=self.cuda)
         flat_tensors.div_(self.world_size)
         for tensor, synced in zip(tensors,_unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #10
 def sync_grads_bucket(self):
     grads = [
         p.grad.data for p in list(self.model.parameters())
         if p.requires_grad
     ]
     for tensors in _take_tensors(grads, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
Example #11
        def reduction_fn_nccl():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = [[] for _ in range(len(self._module_copies))]
            all_grads_buckets_iters = []

            # Bucketing all the gradients
            for dev_idx, module in enumerate(self._module_copies):
                for param in module.parameters():
                    if not param.requires_grad or param.grad is None:
                        continue
                    if param.grad.requires_grad:
                        raise RuntimeError("DistributedDataParallel only works "
                                           "with gradients that don't require "
                                           "grad")
                    # Adding the gradients for reduction
                    all_grads[dev_idx].append(param.grad.data)

                # Now bucketing the parameters
                dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                                  self.nccl_reduce_bucket_size)

                all_grads_buckets_iters.append(dev_grads_buckets)

            # Now reduce each bucket one after another
            for grads_batch in zip(*all_grads_buckets_iters):
                grads_batch_coalesced = []
                # Coalesce each bucket
                for dev_idx, dev_grads_batch in enumerate(grads_batch):
                    dev_id = self.device_ids[dev_idx]
                    with torch.cuda.device(dev_id):
                        dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                        grads_batch_coalesced.append(dev_grads_batch_coalesced)

                # We will only use device 0's results, but this single op should be
                # faster than doing the following two operations sequentially:
                # (1) intra-node reduce to lead GPU, followed by
                # (2) inter-node allreduce for all the first lead GPUs in all nodes
                dist.all_reduce_multigpu(grads_batch_coalesced,
                                         group=self.nccl_reduction_group_id)

                # Now only work on the first device of self.device_ids, uncoalesce
                # the gradients for each bucket
                grads_batch_coalesced[0] /= dist.get_world_size()
                grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0])
                for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                    grad.copy_(reduced)

            # clear the gradients and save memory for replicas
            for module in self._module_copies[1:]:
                for param in module.parameters():
                    if param.requires_grad:
                        param.grad = None
                        param.data.set_()
Example #12
 def sync_buffers_bucket(self):
     buffers = [p.data for p in list(self.model._all_buffers())]
     for tensors in _take_tensors(buffers, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         flat_tensors.zero_()
         dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
         flat_tensors.div_(self.num_workers)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #13
 def sync_grads_bucket(self):
     grads = [
         p.grad.data for p in list(self.model.parameters())
         if p.requires_grad
     ]
     for tensors in _take_tensors(grads, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
         flat_tensors.div_(self.num_workers)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #14
def all_gather_coalesced(tensors, buffer_size=256 * MB):
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    rcv_lsts = [[] for _ in range(world_size)]
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
        dist.all_gather(tmp_rcv_lst, flat_tensors)
        for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
            for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                rcv_lsts[i].append(rcv_t)
    return rcv_lsts
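
A usage sketch for all_gather_coalesced above, assuming an NCCL process group already initialized under torchrun with one GPU per rank (the assert above requires NCCL) and the helper's own imports in scope.

import torch
import torch.distributed as dist

rank = dist.get_rank()
local = [torch.full((3,), float(rank), device="cuda"),
         torch.full((2, 2), float(rank), device="cuda")]

per_rank = all_gather_coalesced(local)        # indexed by source rank
assert len(per_rank) == dist.get_world_size()
assert per_rank[rank][0].allclose(local[0])   # our own contribution round-trips
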
Example #15
def _get_coalesced_bucket(tensors, buffer_size_mb=-1):
    if buffer_size_mb > 0:
        buffer_size = buffer_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, buffer_size)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()
    return buckets
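
When buffer_size_mb is not positive, the helper above skips _take_tensors and simply groups tensors by type. A CPU-only sketch of that fallback, assuming the function and its OrderedDict/_take_tensors imports are available:

import torch

mixed = [torch.randn(4),                       # torch.FloatTensor
         torch.randn(4, dtype=torch.float64),  # torch.DoubleTensor
         torch.randn(2)]                       # torch.FloatTensor again

buckets = list(_get_coalesced_bucket(mixed, buffer_size_mb=-1))
print([len(b) for b in buckets])  # [2, 1]: both float32 tensors share one bucket
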
Example #16
    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """
        Broadcast a sequence of tensors to the default group from rank 0.
        Small tensors are first coalesced into a buffer to reduce the number of
        broadcasts.

        tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                            same GPU.
        buffer_size (int): maximum size of the buffer for coalescing
        """
        for tensors in _take_tensors(tensors, buffer_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.broadcast(flat_tensors, 0)
            for tensor, synced in zip(tensors,
                                      _unflatten_dense_tensors(flat_tensors, tensors)):
                tensor.copy_(synced)
Example #17
    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """
        Broadcast a sequence of tensors to the default group from rank 0.
        Small tensors are first coalesced into a buffer to reduce the number of
        broadcasts.

        tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                            same GPU.
        buffer_size (int): maximum size of the buffer for coalescing
        """
        for tensors in _take_tensors(tensors, buffer_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.broadcast(flat_tensors, 0)
            for tensor, synced in zip(
                    tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
                tensor.copy_(synced)
Example #18
def broadcast_coalesced(tensors, src=0, buffer_size=10 * MB):
    r"""
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number of
    broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                        same GPU.
    src (int): src rank. Default: 0.
    buffer_size (int): maximum size of the buffer for coalescing. Default: 10MB.
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, src)
        for old_t, new_t in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t
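
A sketch of calling the src-based broadcast above inside an already-initialized gloo group on CPU; the helper and its MB constant are assumed to be in scope. Because the helper rebinds old_t.data, the list passed in ends up holding rank 0's values on every rank.

import torch
import torch.distributed as dist

# Assumes dist.init_process_group("gloo", ...) has already run (e.g. via torchrun).
rank = dist.get_rank()
tensors = [torch.full((4,), float(rank)), torch.full((2, 3), float(rank))]

broadcast_coalesced(tensors, src=0)

# Every rank now sees rank 0's (all-zero) values.
assert all(bool(t.eq(0).all()) for t in tensors)
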
Example #19
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    # TODO: When `len(inputs) == 1` and all inputs are on `destination`, just
    #       return `inputs`.
    dense_tensors: List[List] = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus,
                                destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk)
                        for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # The unflattened tensors do not share storage, and we don't expose
            # base flat tensor anyways, so give them different version counters.
            # See NOTE [ Version Counter in comm.*_coalesced ]
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))
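
Compared with the older variants, this version reduces all-sparse groups one by one and densifies mixed groups before coalescing. A sketch with one dense and one sparse group, assuming at least two CUDA devices; the shapes and values are illustrative.

import torch
import torch.cuda.comm as comm

if torch.cuda.device_count() >= 2:
    idx = [[0, 2]]
    dense0 = torch.ones(3, device="cuda:0")
    dense1 = torch.ones(3, device="cuda:1")
    sparse0 = torch.sparse_coo_tensor(idx, [1.0, 1.0], (4,), device="cuda:0")
    sparse1 = torch.sparse_coo_tensor(idx, [2.0, 2.0], (4,), device="cuda:1")

    # The all-sparse group is reduced on its own and stays sparse; the dense group
    # is flattened, summed and unflattened. Output order follows the input order.
    dense_sum, sparse_sum = comm.reduce_add_coalesced(
        [[dense0, sparse0], [dense1, sparse1]], destination=0)
    print(sparse_sum.is_sparse)   # True
    print(sparse_sum.to_dense())  # tensor([3., 0., 3., 0.], device='cuda:0')
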
Example #20
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=(- 1)):
    if (bucket_size_mb > 0):
        bucket_size_bytes = ((bucket_size_mb * 1024) * 1024)
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if (tp not in buckets):
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()
    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for (tensor, synced) in zip(bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #21
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        # all_reduce performs SUM by default, so we divide by the world size here.
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #22
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    """Allreduce parameters as a whole."""
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
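
A typical call site for _allreduce_coalesced above, right after the backward pass; it assumes a process group already initialized by the launcher and the helper's OrderedDict/torch._utils imports in scope, with bucket_size_mb chosen arbitrarily here.

import torch
import torch.distributed as dist

model = torch.nn.Linear(32, 10)
model(torch.randn(4, 32)).sum().backward()

grads = [p.grad for p in model.parameters() if p.grad is not None]
# Sum each bucket across ranks, then divide by world size in place.
_allreduce_coalesced(grads, dist.get_world_size(), bucket_size_mb=2)
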
Example #23
        def reduction_fn():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = []

            # Bucketing all the gradients
            for param in self.module.parameters():
                if not param.requires_grad:
                    continue
                if param.grad is not None and param.grad.requires_grad:
                    raise RuntimeError("DistributedDataParallel only works "
                                       "with gradients that don't require "
                                       "grad")
                if param.grad is not None:
                    # Adding the gradients for reduction
                    all_grads.append(param.grad.data)
                else:
                    all_grads.append(torch.zeros_like(param))

            # Now bucketing the parameters
            dev_grads_buckets = _take_tensors(all_grads,
                                              self.reduce_bucket_size)

            # Now reduce each bucket one after another
            for grads_batch in dev_grads_buckets:
                grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

                grads_batch_coalesced /= self.world_size

                distributed_utils.all_reduce(grads_batch_coalesced,
                                             self.process_group)

                grads_batch_reduced = _unflatten_dense_tensors(
                    grads_batch_coalesced, grads_batch)
                for grad, reduced in zip(grads_batch, grads_batch_reduced):
                    grad.copy_(reduced)
Example #24
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
Example #25
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
Example #26
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    output = []
    itrs = [_take_tensors(tensors, buffer_size) for tensors in inputs]
    for chunks in zip(*itrs):
        flattened = [_flatten_tensors(chunk) for chunk in chunks]
        result = reduce_add(flattened, destination)
        output.extend(_unflatten_tensors(result, chunks[0]))
    return tuple(output)
Example #27
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            # collect gradients first
            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                all_grads.append(d_p)
            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for dev_grads in dev_grads_buckets:
                d_p_new = _flatten_dense_tensors(dev_grads)
                if self.all_reduce:
                    self.all_reduce_time.set()
                    dist.all_reduce(d_p_new, group=0)
                    self.all_reduce_time.record()

                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                if grad.is_sparse:
                    raise RuntimeError(
                        'Adam_distribute does not support sparse gradients, please consider SparseAdam_distribute instead'
                    )
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1**state['step']
                bias_correction2 = 1 - beta2**state['step']
                step_size = group['lr'] * math.sqrt(
                    bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

        return loss
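
The Adam arithmetic at the end of the step can be checked on a scalar without any distributed calls. A small sketch; the comparison with torch.optim.Adam is approximate because eps enters the denominator at a slightly different point in the two formulations.

import math
import torch

lr, beta1, beta2, eps = 1e-3, 0.9, 0.999, 1e-8
p = torch.tensor([1.0], requires_grad=True)
p.grad = torch.tensor([0.5])

# First step of the update shown above, computed by hand.
exp_avg = torch.zeros_like(p).mul_(beta1).add_(p.grad, alpha=1 - beta1)
exp_avg_sq = torch.zeros_like(p).mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2)
step_size = lr * math.sqrt(1 - beta2 ** 1) / (1 - beta1 ** 1)
expected = p.detach() - step_size * exp_avg / (exp_avg_sq.sqrt() + eps)

# torch.optim.Adam performs the same arithmetic (no weight decay, no amsgrad).
torch.optim.Adam([p], lr=lr, betas=(beta1, beta2), eps=eps).step()
print(torch.allclose(p.detach(), expected, atol=1e-6))  # True
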
Example #28
    def __init__(self, module, device_ids=None, output_device=None, dim=0,
                 broadcast_buffers=True):
        super(DistributedDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        self.broadcast_buffers = broadcast_buffers

        # Flag used by the NCCL backend to make sure we only reduce gradients
        # one time in the execution engine
        self.need_reduction = False

        MB = 1024 * 1024
        # used for intra-node param sync and inter-node sync as well
        self.broadcast_bucket_size = 10 * MB
        self.nccl_reduce_bucket_size = 256 * MB

        # Sync params and buffers
        module_states = list(self.module.state_dict().values())
        if len(module_states) > 0:
            self._dist_broadcast_coalesced(module_states,
                                           self.broadcast_bucket_size)

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module, self.device_ids, detach=True)
            self._module_copies[0] = self.module

            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(), module_copy.parameters()):
                    copy_param.requires_grad = param.requires_grad

        else:
            self._module_copies = [self.module]

        # For the NCCL backend, since every single NCCL call is asynchronous, we
        # directly enqueue all the NCCL reduction calls to the default CUDA
        # stream without spawning other reduction threads.
        # This achieves the best performance.
        if dist._backend == dist.dist_backend.NCCL:
            self._register_nccl_grad_hook()
            return

        bucket_bytes_cap = 1 * MB

        # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
        param_buckets = []
        # Split the parameters into buckets and by types as well
        for dev_idx, module in enumerate(self._module_copies):
            param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap)))

        self.bucket_sizes = []
        self.bucket_map = {}

        # We transpose param_buckets, so the loop is over buckets.
        # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
        for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
            self.bucket_sizes.append(0)
            # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
            # of params from each device.
            for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
                if idx == 0:
                    # Bucket parameter type tracking
                    bucket_param_type = param_tuple[0].type()
                    # Only gloo and nccl support half-precision
                    if bucket_param_type == torch.cuda.HalfTensor and \
                            dist._backend != dist.dist_backend.GLOO:
                        raise RuntimeError("DistributedDataParallel currently only "
                                           "supports half precision parameters "
                                           "with Nccl and Gloo backend")
                if not param_tuple[0].requires_grad:
                    continue
                for p in param_tuple:
                    self.bucket_map[p] = bucket_idx
                self.bucket_sizes[bucket_idx] += 1

        self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))]
        self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))]
        self.reduced = [False] * len(self.bucket_sizes)

        self._register_grad_hooks()

        self.dispatch_lock = threading.Lock()
        self._start_reduction_threads()
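
A construction sketch for the legacy DistributedDataParallel whose __init__ is shown above (only __init__ appears here, so the forward/backward lines assume the rest of the class); the init_method and single-device setup are placeholder assumptions that would normally come from the launcher.

import torch
import torch.distributed as dist

# Legacy-style initialization; exact arguments depend on the (old) PyTorch version.
dist.init_process_group(backend="nccl", init_method="env://")

model = torch.nn.Linear(128, 10).cuda()
ddp_model = DistributedDataParallel(model, device_ids=[torch.cuda.current_device()])

# Gradients are bucketed and reduced by the hooks registered in __init__
# (NCCL path) or by the reduction threads (other backends).
ddp_model(torch.randn(16, 128, device="cuda")).sum().backward()
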
Example #29
    def __init__(self,
                 module,
                 device_ids=None,
                 output_device=None,
                 dim=0,
                 broadcast_buffers=True):
        super(DistributedDataParallel, self).__init__()
        if dist._backend not in (dist.dist_backend.NCCL,
                                 dist.dist_backend.GLOO):
            raise ValueError(
                'Invalid backend, only NCCL and GLOO backends are supported by DistributedDataParallel'
            )

        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        self.broadcast_buffers = broadcast_buffers

        # Flag used by the NCCL backend to make sure we only reduce gradients
        # one time in the execution engine
        self.need_reduction = False

        MB = 1024 * 1024
        # used for intra-node param sync and inter-node sync as well
        self.broadcast_bucket_size = 10 * MB
        self.nccl_reduce_bucket_size = 256 * MB

        # Sync params and buffers
        module_states = list(self.module.state_dict().values())
        if len(module_states) > 0:
            self._dist_broadcast_coalesced(module_states,
                                           self.broadcast_bucket_size)

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module,
                                            self.device_ids,
                                            detach=True)
            self._module_copies[0] = self.module

            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(),
                                             module_copy.parameters()):
                    copy_param.requires_grad = param.requires_grad

        else:
            self._module_copies = [self.module]

        # For the NCCL backend, since every single NCCL call is asynchronous, we
        # directly enqueue all the NCCL reduction calls to the default CUDA
        # stream without spawning other reduction threads.
        # This achieves the best performance.
        if dist._backend == dist.dist_backend.NCCL:
            self._register_nccl_grad_hook()
            return

        bucket_bytes_cap = 1 * MB

        # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
        param_buckets = []
        # Split the parameters into buckets and by types as well
        for dev_idx, module in enumerate(self._module_copies):
            param_buckets.append(
                list(_take_tensors(module.parameters(), bucket_bytes_cap)))

        self.bucket_sizes = []
        self.bucket_map = {}

        # We transpose param_buckets, so the loop is over buckets.
        # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
        for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
            self.bucket_sizes.append(0)
            # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
            # of params from each device.
            for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
                if idx == 0:
                    # Bucket parameter type tracking
                    bucket_param_type = param_tuple[0].type()
                    # Only gloo and nccl support half-precision
                    if bucket_param_type == torch.cuda.HalfTensor and \
                            dist._backend != dist.dist_backend.GLOO:
                        raise RuntimeError(
                            "DistributedDataParallel currently only "
                            "supports half precision parameters "
                            "with Nccl and Gloo backend")
                if not param_tuple[0].requires_grad:
                    continue
                for p in param_tuple:
                    self.bucket_map[p] = bucket_idx
                self.bucket_sizes[bucket_idx] += 1

        self.buckets = [[[] for _ in range(len(self.device_ids))]
                        for _ in range(len(self.bucket_sizes))]
        self.bucket_events = [[None] * len(self.device_ids)
                              for _ in range(len(self.bucket_sizes))]
        self.reduced = [False] * len(self.bucket_sizes)

        self._register_grad_hooks()

        self.dispatch_lock = threading.Lock()
        self._start_reduction_threads()
Example #30
    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if self.compression_buffer == False:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
                all_grads.append(d_p)

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for dev_grads in dev_grads_buckets:
                d_p_new = _flatten_dense_tensors(dev_grads)

                if self.all_reduce:
                    dist.all_reduce(d_p_new, group=0)  #self.all_gpu
                else:
                    if self.nodes > 1:
                        if self.compression_buffer:
                            coded, data_time = QSGD_gpu.encode(d_p_new)
                            #specific coded dic just on CPU
                            tensor_signs = coded['signs'].float().to(
                                self.device)
                            tensor_selected = coded['selected'].float().to(
                                self.device)
                            tensor_norm = coded['norm']
                            #size
                            tensor_signs_size = self.pack_len_tensor_into_tensor(
                                tensor_signs)
                            tensor_selected_size = self.pack_len_tensor_into_tensor(
                                tensor_selected)
                            #tensor_norm_size = self.pack_len_tensor_into_tensor(tensor_norm) norm doesn't need size

                            #custom
                            '''
                            print(tensor_signs.type())
                            print(tensor_selected.type())
                            print(tensor_norm.type())
                            '''

                        else:
                            d_p_new = torch.sign(d_p_new)

                        if self.local_rank == 0:

                            if self.all_gather_commu:
                                #This version only for instances each with one GPU
                                for node_index in self.inter_node_list:
                                    if node_index != self.nodes_rank:

                                        d.set()
                                        f.set()
                                        coded_temp = coded.copy()
                                        f.record()
                                        b.set()
                                        tensor_signs_size_temp = tensor_signs_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_signs_size_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        b.record()
                                        c.set()
                                        tensor_signs_temp = torch.zeros(
                                            [int(tensor_signs_size_temp[0])],
                                            device=self.device,
                                            dtype=torch.float)
                                        c.record()
                                        a.set()
                                        dist.broadcast(
                                            tensor_signs_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        a.record()
                                        d.record()

                                        e.set()
                                        tensor_selected_size_temp = tensor_selected_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_selected_size_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        tensor_selected_temp = torch.zeros(
                                            [
                                                int(tensor_selected_size_temp[
                                                    0])
                                            ],
                                            device=self.device,
                                            dtype=torch.float)
                                        dist.broadcast(
                                            tensor_selected_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        e.record()

                                        tensor_norm_temp = tensor_norm.clone()
                                        dist.broadcast(
                                            tensor_norm_temp,
                                            node_index,
                                            group=self.all_inter_node_group)

                                        coded_temp[
                                            'signs'] = tensor_signs_temp.int()
                                        coded_temp[
                                            'selected'] = tensor_selected_temp.long(
                                            )
                                        coded_temp['norm'] = tensor_norm_temp

                                        tensor_decoded = QSGD_gpu.decode(
                                            coded_temp, cuda=True)
                                        d_p_new = d_p_new + tensor_decoded
                                        '''
                                        print('a', a.get_time())
                                        print('b', b.get_time())
                                        print('c', c.get_time())
                                        print('d', d.get_time())
                                        print('e', e.get_time())
                                        print('f', f.get_time())
                                        '''

                                    else:
                                        dist.broadcast(
                                            tensor_signs_size,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_signs,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_selected_size,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_selected,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_norm,
                                            node_index,
                                            group=self.all_inter_node_group)
                                d_p_new = d_p_new / dist.get_world_size()

                            else:
                                if dist.get_rank() == 0:
                                    for index, inter_node_group in enumerate(
                                            self.inter_node_group_list):
                                        coded_temp = coded.copy()

                                        tensor_signs_size_temp = tensor_signs_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_signs_size_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)
                                        tensor_signs_temp = torch.zeros(
                                            [int(tensor_signs_size_temp[0])],
                                            device=self.device,
                                            dtype=torch.float)
                                        dist.broadcast(
                                            tensor_signs_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)

                                        tensor_selected_size_temp = tensor_selected_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_selected_size_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)
                                        tensor_selected_temp = torch.zeros(
                                            [
                                                int(tensor_selected_size_temp[
                                                    0])
                                            ],
                                            device=self.device,
                                            dtype=torch.float)
                                        dist.broadcast(
                                            tensor_selected_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)

                                        tensor_norm_temp = tensor_norm.clone()
                                        dist.broadcast(
                                            tensor_norm_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)

                                        coded_temp[
                                            'signs'] = tensor_signs_temp.int()
                                        coded_temp[
                                            'selected'] = tensor_selected_temp.long(
                                            )
                                        coded_temp['norm'] = tensor_norm_temp

                                        tensor_decoded = QSGD_gpu.decode(
                                            coded_temp, cuda=True)
                                        d_p_new = d_p_new + tensor_decoded
                                        '''
                                        #temp
                                        print(tensor_decoded)
                                        tensor_decoded_temp = tensor_decoded.clone()
                                        dist.broadcast(tensor_decoded_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                        if tensor_decoded == tensor_decoded_temp:
                                            print('success')
                                        print(tensor_signs_size_temp)
                                        print(tensor_selected_size_temp)
                                        '''

                                    d_p_new = d_p_new / dist.get_world_size()

                                else:
                                    dist.broadcast(
                                        tensor_signs_size,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_signs,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_selected_size,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_selected,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_norm,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    '''
                                    #temp
                                    tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                                    print(tensor_decoded)
                                    dist.broadcast(tensor_decoded, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) 
                                    print(tensor_signs_size)
                                    print(tensor_selected_size)
                                    '''

                                    dist.barrier(
                                        group=self.all_inter_node_group)

                                #os._exit()

                                if self.bidirection_compress:
                                    if dist.get_rank() == 0:

                                        coded, data_time = QSGD_gpu.encode(
                                            d_p_new)
                                        tensor_signs = coded['signs']
                                        tensor_selected = coded['selected']
                                        tensor_norm = coded['norm']

                                        tensor_signs_size = self.pack_len_tensor_into_tensor(
                                            tensor_signs)
                                        tensor_selected_size = self.pack_len_tensor_into_tensor(
                                            tensor_selected)

                                        dist.barrier(
                                            group=self.all_inter_node_group)

                                    dist.broadcast(
                                        tensor_signs_size,
                                        0,
                                        group=self.all_inter_node_group)
                                    dist.broadcast(
                                        tensor_selected_size,
                                        0,
                                        group=self.all_inter_node_group)
                                    if dist.get_rank() != 0:
                                        tensor_signs = torch.randn([
                                            int(tensor_signs_size[0])
                                        ]).type_as(tensor_signs)
                                        tensor_selected = torch.randn([
                                            int(tensor_selected_size[0])
                                        ]).type_as(tensor_selected)

                                    dist.barrier(
                                        group=self.all_inter_node_group)

                                    dist.broadcast(
                                        tensor_signs,
                                        0,
                                        group=self.all_inter_node_group)
                                    dist.broadcast(
                                        tensor_selected,
                                        0,
                                        group=self.all_inter_node_group)
                                    dist.broadcast(
                                        tensor_norm,
                                        0,
                                        group=self.all_inter_node_group)

                                    coded['signs'] = tensor_signs
                                    coded['selected'] = tensor_selected
                                    coded['norm'] = tensor_norm

                                    tensor_decoded = QSGD_gpu.decode(coded,
                                                                     cuda=True)
                                    d_p_new = tensor_decoded

                                else:
                                    if dist.get_rank() == 0:
                                        dist.barrier(
                                            group=self.all_inter_node_group)
                                    dist.broadcast(
                                        d_p_new,
                                        0,
                                        group=self.all_inter_node_group)

                    else:
                        # single-node case: just round-trip the gradient through QSGD encode/decode
                        coded, data_time = QSGD_gpu.encode(d_p_new)
                        tensor_decoded = QSGD_gpu.decode(coded, cuda=True)
                        d_p_new = tensor_decoded

                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)
            for p in group['params']:
                if self.compression_buffer:
                    if weight_decay != 0:
                        p.grad.data.add_(weight_decay, p.data)
                p.data.add_(-group['lr'], p.grad.data)

        return loss
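
For reference, here is a minimal, self-contained sketch of the flatten → reduce → unflatten pattern that this step (and the examples below) is built on, using the same private torch._utils helpers; the division stands in for the actual dist.all_reduce, so no process group is needed:

import torch
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors)

# Gradients of different shapes, standing in for p.grad.data above.
grads = [torch.randn(3, 3), torch.randn(5), torch.randn(2, 4)]

bucket_bytes = 1024 * 1024  # analogous to self.bucket_size
for bucket in _take_tensors(grads, bucket_bytes):
    flat = _flatten_dense_tensors(bucket)  # one contiguous 1-D tensor per bucket
    flat = flat / 2.0                      # stand-in for dist.all_reduce + averaging
    # Write the "reduced" values back into the original gradient tensors.
    for grad, reduced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
        grad.copy_(reduced)
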
Example No. 31
    def __init__(self,
                 module,
                 device_ids=None,
                 output_device=None,
                 dim=0,
                 broadcast_buffers=True,
                 process_group=None,
                 bucket_cap_mb=25):

        super(_DistributedDataParallelC10d, self).__init__()

        # Use all devices by default
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))

        if output_device is None:
            output_device = device_ids[0]

        if process_group is None:
            self.process_group = c10d.get_default_group()
        else:
            self.process_group = process_group

        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        self.broadcast_buffers = broadcast_buffers

        self.allreduce_opts = c10d.AllreduceOptions()

        MB = 1024 * 1024

        # used for intra-node param sync and inter-node sync as well
        self.broadcast_bucket_size = 25 * MB

        # Sync params and buffers
        module_states = list(self.module.state_dict().values())
        if len(module_states) > 0:
            self._dist_broadcast_coalesced(module_states,
                                           self.broadcast_bucket_size)

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params here. They're always going to be
            # broadcast using larger blocks in broadcast_coalesced, so it might be
            # better not to pollute the caches with these small blocks.
            self._module_copies = replicate(self.module,
                                            self.device_ids,
                                            detach=True)
            self._module_copies[0] = self.module

            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(),
                                             module_copy.parameters()):
                    copy_param.requires_grad = param.requires_grad

        else:
            self._module_copies = [self.module]

        # .data() of each parameter for each model replica
        self.modules_params_data = [[] for _ in range(len(self.device_ids))]
        # .data() of each buffer for each model replica
        self.modules_buffers_data = [[] for _ in range(len(self.device_ids))]

        for dev_idx, module in enumerate(self._module_copies):
            self.modules_params_data[dev_idx] = [
                p.data for p in module.parameters()
            ]
            self.modules_buffers_data[dev_idx] = [
                b.data for b in module.buffers()
            ]

        bucket_bytes_cap = bucket_cap_mb * MB

        # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
        # Split the parameters into buckets, grouping by type as well
        param_buckets = [
            list(_take_tensors(m.parameters(), bucket_bytes_cap))
            for m in self._module_copies
        ]

        self.bucket_sizes = []
        self.bucket_map = {}

        # We transpose param_buckets, so the loop is over buckets.
        # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
        for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
            self.bucket_sizes.append(0)
            # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
            # of params from each device.
            for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
                if not param_tuple[0].requires_grad:
                    continue
                for p in param_tuple:
                    self.bucket_map[p] = (bucket_idx, idx)
                self.bucket_sizes[bucket_idx] += 1

        self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                         for _ in range(len(self.device_ids))]
                        for i in range(len(self.bucket_sizes))]
        # The number of params ready in each bucket
        self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                                   for i in range(len(self.bucket_sizes))]

        # coalesced bucket for only device 0
        self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
        # We always reduce the buckets in reverse order,
        # i.e. following the order n - 1, n - 2, ..., 0
        self.next_bucket = len(self.bucket_sizes) - 1
        self.ready_buckets_not_reduced = set()
        self.reduction_works = [None for _ in range(len(self.bucket_sizes))]

        self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

        # default stream tracking to launch nccl reduce kernels
        self.default_streams = []
        for dev_id in self.device_ids:
            with torch.cuda.device(dev_id):
                self.default_streams.append(torch.cuda.current_stream())

        self._register_grad_hooks()
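
A small, hedged sketch of the bucketing bookkeeping above, run on a toy model instead of the replicated modules (the tiny byte cap is only there to force several buckets; the names are illustrative):

import torch.nn as nn
from torch._utils import _take_tensors

model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 10))
bucket_bytes_cap = 256  # deliberately tiny so more than one bucket is produced

# One "device": a list of buckets, each bucket a list of parameters.
param_buckets = [list(_take_tensors(model.parameters(), bucket_bytes_cap))]

bucket_sizes, bucket_map = [], {}
for bucket_idx, bucket_tuple in enumerate(zip(*param_buckets)):
    bucket_sizes.append(0)
    for idx, param_tuple in enumerate(zip(*bucket_tuple)):
        if not param_tuple[0].requires_grad:
            continue
        for p in param_tuple:
            bucket_map[p] = (bucket_idx, idx)  # which bucket / slot this param fills
        bucket_sizes[bucket_idx] += 1

print(bucket_sizes)  # number of parameters assigned to each bucket
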
Example No. 32
    def step(self, closure=None):

        args = self.args

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue

                d_p = p.grad.data
                if self.compression_buffer == False:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
                all_grads.append(d_p)

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for dev_grads in dev_grads_buckets:
                d_p_new = _flatten_dense_tensors(dev_grads)

                if self.all_reduce:
                    dist.all_reduce(d_p_new)  #self.all_gpu, group = 0
                    if self.signum:
                        d_p_new = torch.sign(d_p_new)
                elif self.signum:
                    if self.nodes > 1:
                        if self.compression_buffer:
                            d_p_new, tensor_size = self.compressor.compress(
                                d_p_new)
                        else:
                            d_p_new = torch.sign(d_p_new)

                        if self.local_rank == 0:
                            if dist.get_rank() == 0:
                                d_p_new_list = []
                                for index, inter_node_group in enumerate(
                                        self.inter_node_group_list):
                                    d_p_temp = d_p_new.clone()
                                    dist.broadcast(d_p_temp,
                                                   self.inter_node_list[index +
                                                                        1],
                                                   group=inter_node_group)
                                    d_p_new_list.append(d_p_temp)
                            else:
                                dist.broadcast(
                                    d_p_new,
                                    dist.get_rank(),
                                    group=self.inter_node_group_list[
                                        self.nodes_rank - 1])
                                dist.barrier(group=self.all_inter_node_group)

                            if dist.get_rank() == 0:
                                if self.compression_buffer:
                                    d_p_new_list.append(d_p_new)  #count itself
                                    d_p_new = self.compressor.majority_vote(
                                        d_p_new_list)
                                else:
                                    for d_p_temp in d_p_new_list:
                                        d_p_new.add_(d_p_temp)
                                    d_p_new = d_p_new / self.nodes
                                dist.barrier(group=self.all_inter_node_group)
                            dist.broadcast(d_p_new,
                                           0,
                                           group=self.all_inter_node_group)

                        if self.compression_buffer:
                            d_p_new = self.compressor.uncompress(
                                d_p_new, tensor_size)
                else:
                    print('You cannot run without signum or all_reduce')

                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)
            # LARC: record the per-layer adaptive learning rates for selected layers
            self.layer_adaptive_lr = []
            layer_index = 0
            # Saved layers: conv1.weight (no bias), bn1.weight, layer1.1.conv1.weight,
            # layer2.1.conv1.weight, layer3.1.conv1.weight, layer4.1.conv1.weight
            layer_saving = [1, 2, 3, 23, 49, 87]
            ###
            for p in group['params']:
                layer_index += 1
                ###
                '''
                LARC
                This part of the code was originally forked from https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py
                '''
                if args.larc_enable:
                    trust_coefficient = args.larc_trust_coefficient
                    clip = args.larc_clip
                    eps = args.larc_eps
                    param_norm = torch.norm(p.data)
                    grad_norm = torch.norm(p.grad.data)
                    if param_norm != 0 and grad_norm != 0:
                        # calculate adaptive lr + weight decay
                        adaptive_lr = trust_coefficient * (param_norm) / (
                            grad_norm + param_norm * weight_decay + eps)

                        #add adaptive lr saving
                        if layer_index in layer_saving:
                            self.layer_adaptive_lr.append(adaptive_lr)

                        # clip learning rate for LARC
                        if clip:
                            # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
                            adaptive_lr = min(adaptive_lr / group['lr'], 1)

                        else:
                            adaptive_lr = adaptive_lr / group['lr']

                        p.grad.data *= adaptive_lr
                ###

                if self.compression_buffer:  # this part of the code is temporary
                    if weight_decay != 0:
                        p.grad.data.add_(weight_decay, p.data)
                p.data.add_(-group['lr'], p.grad.data)

        return loss
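
The majority_vote call above belongs to the compressor, which is not shown here; conceptually it amounts to summing the workers' sign vectors and taking the sign of the sum. A minimal sketch on plain ±1 tensors, not the bit-packed format the real compressor presumably uses:

import torch

def majority_vote(sign_tensors):
    # Element-wise majority over a list of ±1 tensors; ties come out as 0.
    return torch.sign(torch.stack(sign_tensors).sum(dim=0))

votes = [torch.sign(torch.randn(8)) for _ in range(5)]
print(majority_vote(votes))
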
Example No. 33
    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                '''
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                '''
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
                all_grads.append(d_p)

            #torch.cuda.init()
            #torch.cuda.empty_cache()

            if not self.single_worker:

                self.all_time.set()
                self.bucketing_time.set()
                #start bucketing
                dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
                self.bucketing_time.record()
                for dev_grads in dev_grads_buckets:

                    self.bucketing_time.set()
                    d_p_new = _flatten_dense_tensors(dev_grads)
                    #print('the size of each bucket',d_p_new.size())
                    #os._exit(0)
                    self.bucketing_time.record()

                    #d_p_new = d_p.clone()
                    if self.all_reduce:
                        self.all_reduce_time.set()
                        #torch.cuda.synchronize()
                        #d_p_new = torch.sign(d_p_new)
                        dist.all_reduce(d_p_new, group=0)  #self.all_gpu

                        # alternative: use broadcast/gather instead of all_reduce

                        #take the sign to test
                        #d_p_new = torch.sign(d_p_new)
                        #torch.cuda.synchronize()
                        self.all_reduce_time.record()
                    else:
                        #print('once')
                        self.compress_all_time.set()
                        self.all_reduce_time.set()
                        #torch.cuda.synchronize()

                        if self.gpus_per_machine > 1:
                            dist.all_reduce(d_p_new,
                                            group=self.intra_node_group_list[
                                                self.nodes_rank])
                            dist.barrier(group=self.all_gpu)
                        self.all_reduce_time.record()

                        # compression before inter-node communication
                        if self.nodes > 1:

                            self.compression_time.set()
                            ##torch.cuda.synchronize()
                            if self.compression_buffer:
                                d_p_new, tensor_size = self.compressor.compress(
                                    d_p_new)
                            else:
                                d_p_new = torch.sign(d_p_new)

                            ##torch.cuda.synchronize()
                            self.compression_time.record()
                            self.compress_all_time.record()
                            self.gather_all_time.set()
                            if self.local_rank == 0:
                                if dist.get_rank() == 0:
                                    d_p_new_list = []
                                    for index, inter_node_group in enumerate(
                                            self.inter_node_group_list):
                                        #print('gather', inter_node_list[index + 1])
                                        d_p_temp = d_p_new.clone()
                                        self.broadcast_time.set()
                                        dist.broadcast(
                                            d_p_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)
                                        self.broadcast_time.record()
                                        d_p_new_list.append(d_p_temp)
                                else:
                                    self.broadcast_time.set()
                                    dist.broadcast(
                                        d_p_new,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    self.broadcast_time.record()
                                    #print(dist.get_rank(), 'finish broadcast')

                                    dist.barrier(
                                        group=self.all_inter_node_group)
                                self.gather_all_time.record()
                                self.calculate_all_time.set()

                                if dist.get_rank() == 0:

                                    self.majority_vote_time.set()
                                    if self.compression_buffer:
                                        d_p_new_list.append(
                                            d_p_new)  #count itself
                                        d_p_new = self.compressor.majority_vote(
                                            d_p_new_list)
                                    else:
                                        for d_p_temp in d_p_new_list:
                                            d_p_new.add_(d_p_temp)
                                        d_p_new = d_p_new / self.nodes

                                    ##torch.cuda.synchronize()
                                    self.majority_vote_time.record()
                                    dist.barrier(
                                        group=self.all_inter_node_group)
                                self.calculate_all_time.record()
                                self.broadcast_all_time.set()
                                self.broadcast_time.set()
                                dist.broadcast(d_p_new,
                                               0,
                                               group=self.all_inter_node_group)
                                self.broadcast_time.record()

                                #dist.barrier(group = self.all_inter_node_group)

                            #broadcast to all
                            #print('start broadcast')
                            #self.broadcast_time.set()
                            dist.broadcast(d_p_new,
                                           self.local_dst_in_global,
                                           group=self.intra_node_group_list[
                                               self.nodes_rank])
                            #self.broadcast_time.record()
                            self.uncompression_time.set()
                            ##torch.cuda.synchronize()
                            if self.compression_buffer:
                                d_p_new = self.compressor.uncompress(
                                    d_p_new, tensor_size)

                            #torch.cuda.synchronize()
                            self.uncompression_time.record()
                            self.broadcast_all_time.record()
                            #os._exit(0)

                    self.debucketing_time.set()
                    #unflatten
                    dev_grads_new = _unflatten_dense_tensors(
                        d_p_new, dev_grads)
                    for grad, reduced in zip(dev_grads, dev_grads_new):
                        grad.copy_(reduced)
                    self.debucketing_time.record()

                self.all_time.record()

            self.update_para_time.set()
            #torch.cuda.synchronize()
            for p in group['params']:
                if weight_decay != 0:
                    p.grad.data.add_(weight_decay, p.data)
                if self.single_worker and self.compression_buffer:
                    p.data.add_(-group['lr'], torch.sign(p.grad.data))
                else:
                    p.data.add_(-group['lr'], p.grad.data)

            #torch.cuda.synchronize()
            self.update_para_time.record()

        return loss
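
The .set()/.record() timer objects used throughout this step are not defined in the snippet; a plausible stand-in (a hypothetical CumulativeTimer, purely illustrative) would accumulate wall-clock time between the two calls:

import time

class CumulativeTimer:
    # Hypothetical accumulator matching the .set()/.record() usage above.
    def __init__(self):
        self.total = 0.0
        self._start = None

    def set(self):
        self._start = time.time()

    def record(self):
        if self._start is not None:
            self.total += time.time() - self._start
            self._start = None

all_reduce_time = CumulativeTimer()
all_reduce_time.set()
# ... timed region ...
all_reduce_time.record()
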
Example No. 34
    def __init__(self, module, device_ids=None, output_device=None, dim=0,
                 broadcast_buffers=True):
        super(DistributedDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        self.broadcast_buffers = broadcast_buffers

        MB = 1024 * 1024
        # used for intra-node param sync and inter-node sync as well
        self.broadcast_bucket_size = 10 * MB

        # Sync params and buffers
        module_states = list(self.module.state_dict().values())
        if len(module_states) > 0:
            self._dist_broadcast_coalesced(module_states,
                                           self.broadcast_bucket_size)

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params here. They're always going to be
            # broadcast using larger blocks in broadcast_coalesced, so it might be
            # better not to pollute the caches with these small blocks.
            self._module_copies = replicate(self.module, self.device_ids)
            self._module_copies[0] = self.module
            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(), module_copy.parameters()):
                    copy_param.detach_()
                    copy_param.requires_grad = param.requires_grad
        else:
            self._module_copies = [self.module]

        # Currently NCCL backend only supports single reduction thread/bucket
        if dist._backend == dist.dist_backend.NCCL:
            bucket_bytes_cap = float('inf')
        else:
            bucket_bytes_cap = 1 * MB

        # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
        param_buckets = []
        # Split the parameters into buckets and by types as well
        for dev_idx, module in enumerate(self._module_copies):
            param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap)))

        self.bucket_sizes = []
        self.bucket_map = {}
        param_types = set()

        # We transpose param_buckets, so the loop is over buckets.
        # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
        for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
            self.bucket_sizes.append(0)
            # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
            # of params from each device.
            for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
                if idx == 0:
                    # Bucket parameter type tracking
                    bucket_param_type = param_tuple[0].type()
                    param_types.add(bucket_param_type)
                    # Only gloo and nccl support half-precision
                    if bucket_param_type == torch.cuda.HalfTensor and \
                            dist._backend != dist.dist_backend.NCCL and \
                            dist._backend != dist.dist_backend.GLOO:
                        raise RuntimeError("DistributedDataParallel currently only "
                                           "supports half precision parameters "
                                           "with Nccl and Gloo backend")
                if not param_tuple[0].requires_grad:
                    continue
                for p in param_tuple:
                    self.bucket_map[p] = bucket_idx
                self.bucket_sizes[bucket_idx] += 1

        # TODO: add mixed precision support in the NCCL reduction code path.
        # This is because the NCCL backend doesn't support multiple reduction
        # buckets.
        if len(param_types) > 1 and dist._backend == dist.dist_backend.NCCL:
            raise RuntimeError("DistributedDataParallel currently doesn't "
                               "support mixed precision type for NCCL backend")

        self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))]
        self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))]
        self.reduced = [False] * len(self.bucket_sizes)

        self._register_grad_hooks()

        self.dispatch_lock = threading.Lock()
        self._start_reduction_threads()
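
self._dist_broadcast_coalesced is called but not shown; a hedged sketch of what such a helper typically does (flatten each bucket, broadcast from rank 0, copy the result back), assuming an already initialized torch.distributed process group:

import torch.distributed as dist
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors)

def dist_broadcast_coalesced(tensors, buffer_size, src=0):
    # Broadcast tensors from `src` in coalesced buckets (illustrative only).
    for bucket in _take_tensors(tensors, buffer_size):
        flat = _flatten_dense_tensors(bucket)
        dist.broadcast(flat, src)
        for old, new in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
            old.copy_(new)
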
Example No. 35
    def step(self, closure=None):

        args = self.args

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            cur_lr = group['lr']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue

                d_p = p.grad.data
                if self.compression_buffer == False:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_(d_p)
                    d_p.add_(momentum, buf)

                all_grads.append(d_p)

            length = 0
            for _ in _take_tensors(all_grads, self.bucket_size):
                length += 1

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for i, dev_grads in enumerate(dev_grads_buckets):
                d_p_new = _flatten_dense_tensors(dev_grads)

                if len(self.err_buf) < length:
                    self.err_buf.append(torch.zeros_like(d_p_new))
                    self.server_err_buf.append(torch.zeros_like(d_p_new))

                err_buf = self.err_buf[i]
                server_err_buf = self.server_err_buf[i]

                d_p_new.add_(self.prev_lr / cur_lr, err_buf)

                p_buf = d_p_new

                if self.all_reduce:
                    dist.all_reduce(d_p_new)  #self.all_gpu, group = 0
                    if self.signum:
                        d_p_new = torch.sign(d_p_new)
                elif self.signum:
                    if self.nodes > 1:
                        if self.compression_buffer:
                            d_p_new_scale = torch.ones(1)
                            d_p_new_scale[0] = (d_p_new.abs().sum().cpu().item() /
                                                d_p_new.numel())
                            d_p_new, tensor_size = self.compressor.compress(
                                d_p_new)

                            tmp = self.compressor.uncompress(
                                d_p_new.clone(), tensor_size)
                            tmp.mul_(d_p_new_scale.item())

                            err_buf.copy_(p_buf).sub_(tmp)
                        else:
                            d_p_new = torch.sign(d_p_new)

                        if dist.get_rank() == 0:
                            d_p_new_list = []
                            d_p_new_scale_list = []
                            for index, inter_node_group in enumerate(
                                    self.inter_node_group_list):
                                d_p_temp = d_p_new.clone()
                                d_p_scale_temp = d_p_new_scale.clone()
                                dist.broadcast(d_p_scale_temp,
                                               self.inter_node_list[index + 1],
                                               group=inter_node_group)
                                dist.broadcast(d_p_temp,
                                               self.inter_node_list[index + 1],
                                               group=inter_node_group)
                                d_p_new_list.append(d_p_temp)
                                d_p_new_scale_list.append(d_p_scale_temp)
                        else:
                            dist.broadcast(d_p_new_scale,
                                           dist.get_rank(),
                                           group=self.inter_node_group_list[
                                               self.nodes_rank - 1])
                            dist.broadcast(d_p_new,
                                           dist.get_rank(),
                                           group=self.inter_node_group_list[
                                               self.nodes_rank - 1])
                            dist.barrier(group=self.all_inter_node_group)

                        if dist.get_rank() == 0:
                            if self.compression_buffer:
                                d_p_new_list.append(d_p_new)  #count itself
                                d_p_new_scale_list.append(
                                    d_p_new_scale)  #count itself
                                #d_p_new = self.compressor.majority_vote(d_p_new_list)
                                d_p_new = torch.zeros(tensor_size).cuda()
                                for d_p, d_p_scale in zip(
                                        d_p_new_list, d_p_new_scale_list):
                                    tmp = self.compressor.uncompress(
                                        d_p, tensor_size)
                                    d_p_new.add_(d_p_scale.item(), tmp)
                                d_p_new /= self.nodes

                                d_p_new.add_(self.prev_lr / cur_lr,
                                             server_err_buf)

                                un_compr = d_p_new

                                d_p_new_scale = torch.ones(1)
                                d_p_new_scale[0] = (d_p_new.abs().sum().cpu().item() /
                                                    d_p_new.numel())

                                d_p_new, _ = self.compressor.compress(d_p_new)

                                tmp = self.compressor.uncompress(
                                    d_p_new.clone(), tensor_size)
                                tmp.mul_(d_p_new_scale.item())

                                server_err_buf.copy_(un_compr).sub_(tmp)
                            else:
                                for d_p_temp in d_p_new_list:
                                    d_p_new.add_(d_p_temp)
                                d_p_new = d_p_new / self.nodes

                            dist.barrier(group=self.all_inter_node_group)

                        dist.broadcast(d_p_new,
                                       0,
                                       group=self.all_inter_node_group)
                        if self.compression_buffer:
                            dist.broadcast(d_p_new_scale,
                                           0,
                                           group=self.all_inter_node_group)

                        if self.compression_buffer:
                            d_p_new = self.compressor.uncompress(
                                d_p_new, tensor_size)
                            d_p_new.mul_(d_p_new_scale.item())
                else:
                    print('You cannot run without signum or all_reduce')

                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)

            for p in group['params']:
                if self.compression_buffer:  # this part of the code is temporary
                    if weight_decay != 0:
                        if momentum != 0:
                            param_state = self.state[p]
                            if 'wd_mom' not in param_state:
                                buf = param_state['wd_mom'] = torch.zeros_like(
                                    p.data)
                            else:
                                buf = param_state['wd_mom']

                            buf.mul_(momentum).add_(weight_decay, p.data)
                            p.grad.data.add_(momentum, buf)

                        p.grad.data.add_(weight_decay, p.data)

                p.data.add_(-group['lr'], p.grad.data)

            self.prev_lr = group['lr']

        return loss
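
Stripped of the distributed plumbing, the error-feedback logic of this step boils down to the following single-worker sketch (scale = mean absolute value; a hedged reconstruction, not the exact compressor):

import torch

def ef_sign_step(grad, err_buf, lr_ratio=1.0):
    # One error-feedback sign-compression step (illustrative).
    corrected = grad + lr_ratio * err_buf       # add the carried-over error
    scale = corrected.abs().mean()              # per-bucket scale factor
    compressed = scale * torch.sign(corrected)  # what would actually be sent
    err_buf.copy_(corrected - compressed)       # keep the residual for next step
    return compressed

g = torch.randn(10)
e = torch.zeros_like(g)
print(ef_sign_step(g, e))
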
Example No. 36
    def sync_buffers_bucket(self):
        buffers = [p.data for p in list(self.model._all_buffers())]
        for tensors in _take_tensors(buffers, self.mpi_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)