Example #1
        def allreduce_params():
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print(
                            "WARNING: gloo dist backend for half parameters may be extremely slow."
                            +
                            " It is recommended to use the NCCL backend in this case."
                        )
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    torch.cuda.synchronize()
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
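Every example in this listing follows the same round trip: flatten a bucket of same-typed tensors into one contiguous buffer with _flatten_dense_tensors, run a single collective on that buffer, rescale by the world size, then copy the _unflatten_dense_tensors views back into the originals. A minimal, self-contained sketch of that round trip (the single-process gloo group and the toy grads list are assumptions made only so the snippet runs):

import os
import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

# Assumption: a single-process gloo group, only so the collective can run.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

# Hypothetical gradients to average (in the examples these are p.grad.data).
grads = [torch.randn(3, 3), torch.randn(5)]

coalesced = _flatten_dense_tensors(grads)   # one contiguous 1-D buffer
dist.all_reduce(coalesced)                  # one collective instead of one per tensor
coalesced /= dist.get_world_size()          # turn the sum into an average
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)                       # write the averaged values back in place

dist.destroy_process_group()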
Example #2
    def _sync_params(self):
        if len(self.device_ids) > 1:
            # intra-node parameter sync
            params = [p.data for p in self.module.parameters()]
            result = broadcast_coalesced(params, self.device_ids,
                                         self.broadcast_bucket_size)
            for tensors, module in zip(result[1:], self._module_copies[1:]):
                for tensor, param in zip(tensors, module.parameters()):
                    param.data.set_(tensor)

        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(
                    buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)

            if len(self.device_ids) > 1:
                # intra-node buffer sync
                result = broadcast_coalesced(buffers, self.device_ids,
                                             self.broadcast_bucket_size)
                for tensors, module in zip(result[1:],
                                           self._module_copies[1:]):
                    for tensor, buf in zip(tensors, module._all_buffers()):
                        buf.set_(tensor)
Example #3
def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)

    if flat_dist_call.warn_on_half:
        if 'torch.cuda.HalfTensor' in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
Example #4
 def _sync_params(self):
     groups = dict()
     for p in self.module.parameters():
         if not p.requires_grad or p.grad is None:
             continue
         if hasattr(p, "dp_comm"):
             dp_comm = p.dp_comm
         else:
             dp_comm = "dp"
         group_key = (dp_comm, p.dtype)
         if group_key not in groups:
             groups[group_key] = [p]
         else:
             groups[group_key].append(p)
     for (dp_comm, _), group in groups.items():
         if dp_comm not in self.comms:
             continue
         comm = self.comms[dp_comm]
         datas = [p.data for p in group]
         coalesced = _flatten_dense_tensors(datas)
         torch.distributed.broadcast(coalesced, 0, group=comm)
         torch.cuda.synchronize()
         synced = _unflatten_dense_tensors(coalesced, datas)
         for d, s in zip(datas, synced):
             d.copy_(s)
Example #5
def nccl_allreduce_by_buckets(nc, kn, all_grads):
    # Now bucketing the parameters
    dev_grads_buckets = _take_tensors(all_grads, nccl_reduce_bucket_size)


    for grads_batch in dev_grads_buckets:
        grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

        # Make sure outstanding CUDA work is finished before the NCCL call.
        torch.cuda.synchronize()

        # Alternatively: nbutils.cuda_current_context().synchronize()
        nc.stream_sync()

        sz = np.prod(grads_batch_coalesced.size())
        nc.do_all_reduce(grads_batch_coalesced.data_ptr(),
                         grads_batch_coalesced.data_ptr(),
                         sz)

        nc.stream_sync()

        grads_batch_coalesced[:] = grads_batch_coalesced / float(kn)

        grads_batch_reduced = _unflatten_dense_tensors(
            grads_batch_coalesced, grads_batch)

        for grad, reduced in zip(grads_batch, grads_batch_reduced):
            grad.copy_(reduced)
Example #6
 def allreduce_params(no_scale=False,
                      reduce_after=False,
                      fp32_allreduce=False):
     groups = dict()
     for p in self.module.parameters():
         if not p.requires_grad or p.grad is None:
             continue
         if hasattr(p, "dp_comm"):
             dp_comm = p.dp_comm
         else:
             dp_comm = "dp"
         group_key = (dp_comm, p.dtype)
         if group_key not in groups:
             groups[group_key] = [p]
         else:
             groups[group_key].append(p)
     for (dp_comm, dtype), group in groups.items():
         if dp_comm not in self.comms:
             continue
         comm = self.comms[dp_comm]
         grads = [p.grad.data for p in group]
         coalesced = _flatten_dense_tensors(grads)
         if fp32_allreduce and dtype != torch.float32:
             coalesced = coalesced.float()
         if not no_scale and not reduce_after:
             coalesced /= comm.size()
         torch.distributed.all_reduce(coalesced, group=comm)
         torch.cuda.synchronize()
         if not no_scale and reduce_after:
             coalesced /= comm.size()
         synced = _unflatten_dense_tensors(coalesced, grads)
         for g, s in zip(grads, synced):
             g.copy_(s)
Example #7
        def _process_batch():
            dev_grad_batch, dev_events, job_event = queue.get()
            dev_coalesced = []
            # Coalesce the tensors on all devices and start a local reduction
            for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
                with torch.cuda.device(dev_id), torch.cuda.stream(stream):
                    stream.wait_event(event)
                    coalesced = _flatten_dense_tensors(grad_batch)
                    dev_coalesced.append(coalesced)
            # Wait for all copies to complete before starting the NCCL kernel
            for stream in reduction_streams:
                stream.synchronize()
            nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

            # From now on we're only going to work on the first device (from device_ids)
            grad_batch = dev_grad_batch[0]
            coalesced = dev_coalesced[0]
            reduce_stream = reduction_streams[0]
            with torch.cuda.stream(reduce_stream):
                reduce_stream.wait_stream(nccl_streams[0])
                coalesced /= dist.get_world_size()
                dist.all_reduce(coalesced, group=group_id)
                for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)):
                    grad.copy_(reduced)
            job_event.set()
Example #8
    def _sync_reduction_works(self):
        # Now only work on the first GPU of self.device_ids, uncoalesce
        # the gradients for each bucket
        for bucket_idx, grads_batch in enumerate(self.buckets):
            # wait will let current stream wait on the c10d reduction stream
            self.reduction_works[bucket_idx].wait()

            self.buckets_coalesced[bucket_idx] /= self.process_group.size()
            grads_batch_reduced = _unflatten_dense_tensors(
                self.buckets_coalesced[bucket_idx], grads_batch[0])

            for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                grad.copy_(reduced)

        # Reset the module states
        self.next_bucket = len(self.bucket_sizes) - 1
        self.ready_buckets_not_reduced = set()
        self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
        self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

        self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                         for _ in range(len(self.device_ids))]
                        for i in range(len(self.bucket_sizes))]
        self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
        self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                                   for i in range(len(self.bucket_sizes))]
Example #9
    def _sync_reduction_works(self):
        # Now only work on the first GPU of self.device_ids
        # _sync_reduction will use a separate CUDA stream to uncoalesce
        # the coalesced tensors to achieve more parallelism
        temp = [None for _ in range(self.parameter_length)]
        for p in self.module.parameters():
            if p.requires_grad:
                bucket_idx = self.bucket_map[p]
                temp[bucket_idx] = p.grad.data
        flatten_tensor = _flatten_dense_tensors(temp)
        self.buckets = flatten_tensor[self.mask[0]] / self.process_group.size()
        dist.all_reduce(self.buckets, async_op=False)
        temp_zero = torch.zeros(self.flat_parameter.shape,
                                device=self.device_id)
        temp_zero[self.mask[0]] = self.buckets
        dense_tensor = _unflatten_dense_tensors(temp_zero, temp)

        for p in self.module.parameters():
            if p.requires_grad:
                bucket_idx = self.bucket_map[p]
                p.grad.data.copy_(dense_tensor[bucket_idx])

        # Reset the module states
        self.next_bucket = len(self.bucket_sizes) - 1
        self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
        self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

        self.buckets = [None]
        self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
        self.buckets_ready_size = [0 for i in range(len(self.bucket_sizes))]
Example #10
def master2model(model_params:Sequence[Tensor], master_params:Sequence[Tensor], flat_master:bool=False):
    "Copy master parameters to model parameters"
    if flat_master:
        for model, master in zip(model_params, _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else: 
        for model, master in zip(model_params, master_params): model.data.copy_(master.data)
Example #11
    def allreduce_gradients(self):
        """Reduce gradients across data parallel ranks."""
        # If we have buffers, simply reduce the data in the buffer.
        if self._grad_buffers is not None:
            for _, buffer_ in self._grad_buffers.items():
                buffer_.data /= mpu.get_data_parallel_world_size()
                torch.distributed.all_reduce(
                    buffer_.data, group=mpu.get_data_parallel_group())
        else:
            # Otherwise, bucketize and all-reduce
            buckets = {}
            # Pack the buckets.
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
                    param.main_grad = param.grad

            # For each bucket, all-reduce and copy all-reduced grads.
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                coalesced /= mpu.get_data_parallel_world_size()
                torch.distributed.all_reduce(
                    coalesced, group=mpu.get_data_parallel_group())
                for buf, synced in zip(
                        grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)
Example #12
 def _dist_broadcast_coalesced(self, tensors, buffer_size):
     for tensors in _take_tensors(tensors, buffer_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.broadcast(flat_tensors, 0)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
 def allreduce_params(self,
                      reduce_after=True,
                      no_scale=False,
                      fp32_allreduce=False):
      # adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/distributed.py
     buckets = {}
     for param in self.all_parameters:
         if param.requires_grad and param.grad is not None:
             tp = (param.data.type())
             if tp not in buckets:
                 buckets[tp] = []
             buckets[tp].append(param)
     for tp in buckets:
         bucket = buckets[tp]
         grads = [param.grad.data for param in bucket]
         coalesced = _flatten_dense_tensors(grads)
         if fp32_allreduce:
             coalesced = coalesced.float()
         if not no_scale and not reduce_after:
             coalesced /= dist.get_world_size(
                 group=self.data_parallel_group)
         dist.all_reduce(coalesced, group=self.data_parallel_group)
         torch.cuda.synchronize()
         if not no_scale and reduce_after:
             coalesced /= dist.get_world_size(
                 group=self.data_parallel_group)
         for buf, synced in zip(grads,
                                _unflatten_dense_tensors(coalesced, grads)):
             buf.copy_(synced)
Example #14
 def _copy_params_fp32_to_fp16(self):
     for fp16_group, fp32_group in zip(self.fp16_param_groups,
                                       self.fp32_param_groups):
         for fp16_param, fp32_param in zip(
                 fp16_group,
                 _unflatten_dense_tensors(fp32_group, fp16_group)):
             fp16_param.data.copy_(fp32_param.data)
Example #15
    def step(self, closure=None):
        """
        Not supporting closure.
        """
        # First compute norm for all group so we know if there is overflow
        grads_groups_flat = []
        norm_groups = []
        skip = False
        for i, group in enumerate(self.fp16_groups):
            grads_groups_flat.append(
                _flatten_dense_tensors([p.grad for p in group]))
            norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
            if norm_groups[i] == -1:  #TODO: early break
                skip = True

        if skip:
            self._update_scale(skip)
            return

        # norm is in fact norm*cur_scale
        self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                            output_params=[[p] for p in self.fp16_groups_flat],
                            scale=self.cur_scale,
                            grad_norms=norm_groups)

        # TODO: we probably don't need this? just to be safe
        for i in range(len(norm_groups)):
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data = q.data

        self._update_scale(False)
        return
Example #16
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = []  # shape (num_tensors, num_gpus)
    output = []
    for tensor_at_gpus in zip(*inputs):
        if tensor_at_gpus[0].is_sparse:
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
        else:
            dense_tensors.append(tensor_at_gpus)
    itrs = [_take_tensors(tensors, buffer_size) for tensors in zip(*dense_tensors)]
    for chunks in zip(*itrs):
        tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        result = reduce_add(tensors, destination)
        output.extend(_unflatten_dense_tensors(result, chunks[0]))
    return tuple(_reorder_tensors_as(output, inputs[0]))
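For comparison, the public torch.cuda.comm.reduce_add_coalesced entry point can be called directly; a hedged usage sketch, assuming at least two visible CUDA devices and toy per-device tensor lists:

import torch
import torch.cuda.comm as comm

# Assumption: at least two CUDA devices are available.
grads_gpu0 = [torch.ones(4, device="cuda:0"), torch.ones(2, 2, device="cuda:0")]
grads_gpu1 = [torch.full((4,), 2.0, device="cuda:1"),
              torch.full((2, 2), 2.0, device="cuda:1")]

# Each inner list holds the corresponding tensors from one device; the result
# is their elementwise sum, placed on the destination device (index 0 here).
summed = comm.reduce_add_coalesced([grads_gpu0, grads_gpu1], destination=0)
print([t.device for t in summed])   # both results live on cuda:0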
Example #18
        def reduction_fn_nccl():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = [[] for _ in range(len(self._module_copies))]
            all_grads_buckets_iters = []

            # Bucketing all the gradients
            for dev_idx, module in enumerate(self._module_copies):
                for param in module.parameters():
                    if not param.requires_grad or param.grad is None:
                        continue
                    if param.grad.requires_grad:
                        raise RuntimeError(
                            "DistributedDataParallel only works "
                            "with gradients that don't require "
                            "grad")
                    # Adding the gradients for reduction
                    all_grads[dev_idx].append(param.grad.data)

                # Now bucketing the parameters
                dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                                  self.nccl_reduce_bucket_size)

                all_grads_buckets_iters.append(dev_grads_buckets)

            # Now reduce each bucket one after another
            for grads_batch in zip(*all_grads_buckets_iters):
                grads_batch_coalesced = []
                # Coalesce each bucket
                for dev_idx, dev_grads_batch in enumerate(grads_batch):
                    dev_id = self.device_ids[dev_idx]
                    with torch.cuda.device(dev_id):
                        dev_grads_batch_coalesced = _flatten_dense_tensors(
                            dev_grads_batch)
                        grads_batch_coalesced.append(dev_grads_batch_coalesced)

                # We will only use device 0's results, but this single op should be
                # faster than doing the following two operation sequentially:
                # (1) intra-node reduce to lead GPU, followed by
                # (2) inter-node allreduce for all the first lead GPUs in all nodes
                dist.all_reduce_multigpu(grads_batch_coalesced,
                                         group=self.nccl_reduction_group_id)

                # Now only work on the first device of self.device_ids, uncoalesce
                # the gradients for each bucket
                grads_batch_coalesced[0] /= dist.get_world_size()
                grads_batch_reduced = _unflatten_dense_tensors(
                    grads_batch_coalesced[0], grads_batch[0])
                for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                    grad.copy_(reduced)

            # clear the gradients and save memory for replicas
            for module in self._module_copies[1:]:
                for param in module.parameters():
                    if param.requires_grad:
                        param.grad = None
                        param.data.set_()
Example #20
 def sync_params_bucket(self):
     params = [p.data for p in list(self.model.parameters())]
     for tensors in _take_tensors(params, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.broadcast(flat_tensors, src=0)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #21
def all_reduce_coalesced(tensors, divisor=1, op=ReduceOp.SUM, buffer_size=256 * MB):
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.all_reduce(flat_tensors, op)
        if divisor != 1:
            flat_tensors.div_(divisor)
        for old_t, new_t in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t
Example #22
def _broadcast_coalesced(tensors, bucket_size_mb=-1):
    buckets = _get_coalesced_bucket(tensors, bucket_size_mb)
    for tensors in buckets:
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(
                tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
Example #23
 def _copy_params_fp32_to_fp16(self):
     for fp16_group, fp32_group in zip(self.fp16_param_groups,
                                       self.fp32_flattened_groups):
         if len(fp16_group) > 0:
             for fp16_param, fp32_data in zip(
                     fp16_group,
                     _unflatten_dense_tensors(fp32_group.data, fp16_group)):
                 fp16_param.data.copy_(fp32_data)
Example #24
 def sync_grads_bucket(self):
     grads = [p.grad.data for p in list(self.model.parameters()) if p.requires_grad]
     for tensors in _take_tensors(grads, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         new_all_reduce(flat_tensors, cuda=self.cuda)
         flat_tensors.div_(self.world_size)
         for tensor, synced in zip(tensors,_unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #26
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    buckets = _get_coalesced_bucket(tensors, bucket_size_mb)
    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #27
 def sync_buffers_bucket(self):
     buffers = [p.data for p in list(self.model._all_buffers())]
     for tensors in _take_tensors(buffers, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         flat_tensors.zero_()
         dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
         flat_tensors.div_(self.num_workers)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #28
def master2model(model_params:Sequence[Tensor], master_params:Sequence[Tensor], flat_master:bool=False)->None:
    "Copy `master_params` to `model_params`."
    if flat_master:
        for model_group,master_group in zip(model_params,master_params):
            if len(model_group) != 0:
                for model, master in zip(model_group, _unflatten_dense_tensors(master_group[0].data, model_group)):
                    model.data.copy_(master)
    else:
        for model_group,master_group in zip(model_params,master_params):
            for model, master in zip(model_group, master_group): model.data.copy_(master.data)
Example #29
    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=True):

        # The fused optimizer does all the work. We need this layer for two reasons:
        # 1. maintain same user API from apex.fp16_utils
        # 2. keep common stuff here in case we need to add new fused optimizer later

        # differences from apex.fp16_utils:
        # - assume all model params in fp16
        # - assume all params requires grad
        # - flat by groups, not keeping state. TODO: remove state explicitly?
        # - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
        if not torch.cuda.is_available():
            raise SystemError("Cannot use fp16 without CUDA.")
        self.optimizer = init_optimizer

        # param flattened by groups
        self.fp16_groups = []
        self.fp16_groups_flat = []
        self.fp32_groups_flat = []

        # loop to deal with groups
        for i, param_group in enumerate(self.optimizer.param_groups):
            # push this group to list before modify
            self.fp16_groups.append(param_group['params'])
            # init fp16 weight buffer, flattened
            self.fp16_groups_flat.append(_flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]]))
            # set model fp16 weight to slices of flattened buffer
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
            for p,q in zip(self.fp16_groups[i], updated_params):
                p.data = q.data
            # init master weight, flattened
            self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach())
            # modify optimizer to have flat master weight
            self.fp32_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it
            param_group['params'] = [self.fp32_groups_flat[i]]

        # we may have a way of fusing dynamic scale. Do not support for now
        if dynamic_loss_scale:
            if dynamic_loss_args is not None:
                raise SystemError("Do not support dynamic loss scale args for now.")
            self.dynamic_loss_scale = True
            self.cur_scale = 2**32
            self.cur_iter = 0
            self.last_overflow_iter = -1
            self.scale_factor = 2
            self.scale_window = 1000
        else:
            self.dynamic_loss_scale = False
            self.cur_iter = 0
            self.cur_scale = static_loss_scale
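The comment block above describes the flat-master-weight scheme: flatten each group of fp16 parameters, re-bind the model parameters to views of that flat buffer, and keep an fp32 clone as the master copy the wrapped optimizer actually updates. A minimal sketch of the idea for a single group (the toy parameter list is an assumption; the optimizer step itself is omitted):

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

# Illustrative fp16 parameters for one group.
fp16_params = [torch.randn(4, 4).half(), torch.randn(8).half()]

# Flatten the fp16 weights and point each parameter at a view of the flat buffer.
fp16_flat = _flatten_dense_tensors([p.clone().detach() for p in fp16_params])
for p, q in zip(fp16_params, _unflatten_dense_tensors(fp16_flat, fp16_params)):
    p.data = q.data

# fp32 master copy; a wrapped optimizer would step on this tensor.
fp32_flat = fp16_flat.clone().float().detach().requires_grad_(True)

# After the optimizer step, copying the master back into the flat fp16 buffer
# updates every parameter, because they are views into fp16_flat.
fp16_flat.copy_(fp32_flat.data)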
Example #30
def to_model_params(model_params,
                    master_params,
                    flat_master: bool = False) -> None:
    if flat_master:
        for model, master in zip(
                model_params,
                _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
Example #31
def all_gather_multigpu(output_tensor_lists,
                        input_tensor_list,
                        group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in tensor_list should reside on a separate GPU.

    Only the nccl backend is currently supported; tensors should only be
    GPU tensors.

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. It should
            contain correctly-sized tensors on each GPU to be used for output of
            the collective.

            e.g. output_tensor_lists[i] contains the all_gather
            result that resides on the GPU of input_tensor_list[i].

            Note that each element of output_tensor_lists[i] has the size of
            world_size * len(input_tensor_list), since the function all gathers
            the result from every single GPU in the group. To interpret each
            element of output_tensor_list[i], note that input_tensor_list[j] of
            rank k will appear in
            output_tensor_list[i][k * world_size + j]

            Also note that len(output_tensor_lists), and the size of each
            element in output_tensor_lists (each element is a list,
            therefore len(output_tensor_lists[i])), need to be the same
            for all the distributed processes calling this function.

        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from current process.
            Note that len(input_tensor_list) needs to be the same for
            all the distributed processes calling this function.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor,
                                                          output_tensor_list)):
            tensor.copy_(value)

    return ret
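The same flatten/gather/unflatten round trip can also be written against the modern single-tensor torch.distributed.all_gather API. A sketch assuming an already-initialized process group and tensor lists with matching shapes and dtype on every rank; the all_gather_flat name is illustrative:

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def all_gather_flat(tensors):
    # Gather this rank's tensor list from every rank in one collective.
    # Returns per_rank[i], the reconstructed list contributed by rank i.
    world_size = dist.get_world_size()
    flat = _flatten_dense_tensors(tensors)
    gathered = [torch.empty_like(flat) for _ in range(world_size)]
    dist.all_gather(gathered, flat)
    return [_unflatten_dense_tensors(g, tensors) for g in gathered]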
Example #33
 def sync_grads_bucket(self):
     grads = [
         p.grad.data for p in list(self.model.parameters())
         if p.requires_grad
     ]
     for tensors in _take_tensors(grads, self.mpi_size):
         flat_tensors = _flatten_dense_tensors(tensors)
         dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
         flat_tensors.div_(self.num_workers)
         for tensor, synced in zip(
                 tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
             tensor.copy_(synced)
Example #34
def all_gather_coalesced(tensors, buffer_size=256 * MB):
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    rcv_lsts = [[] for _ in range(world_size)]
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
        dist.all_gather(tmp_rcv_lst, flat_tensors)
        for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
            for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                rcv_lsts[i].append(rcv_t)
    return rcv_lsts
Example #35
    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """
        Broadcast a sequence of tensors to the default group from rank 0.
        Small tensors are first coalesced into a buffer to reduce the number of
        broadcasts.

        tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                            same GPU.
        buffer_size (int): maximum size of the buffer for coalescing
        """
        for tensors in _take_tensors(tensors, buffer_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.broadcast(flat_tensors, 0)
            for tensor, synced in zip(tensors,
                                      _unflatten_dense_tensors(flat_tensors, tensors)):
                tensor.copy_(synced)
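A free-standing variant of the helper described in the docstring above, usable for example to push a module's parameters from rank 0 to all other ranks at start-up; the broadcast_params_from_rank0 name, the 10 MB bucket size, and the module argument are illustrative assumptions:

import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)

def broadcast_params_from_rank0(module, buffer_size=10 * 1024 * 1024):
    # Coalesce parameters into roughly buffer_size buckets so that one
    # broadcast covers many small tensors, then copy the bytes back in place.
    params = [p.data for p in module.parameters()]
    for bucket in _take_tensors(params, buffer_size):
        flat = _flatten_dense_tensors(bucket)
        dist.broadcast(flat, src=0)
        for tensor, synced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
            tensor.copy_(synced)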
Example #36
        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = defaultdict(list)
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        buckets[tp].append(param)

                for bucket in buckets.values():
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
Example #37
def all_gather_multigpu(output_tensor_lists,
                        input_tensor_list,
                        group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in tensor_list should reside on a separate GPU.

    Only the nccl backend is currently supported; tensors should only be
    GPU tensors.

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. It should
            contain correctly-sized tensors on each GPU to be used for output of
            the collective.
        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from current process.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    warnings.warn("""
    ================================================================================
                                        WARNING
    ================================================================================
    all_gather_multigpu is still experimental. The API will change without
    notice and we can't guarantee full correctness and expected performance yet.
    We'll announce it once it's ready.
    """)

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor,
                                                          output_tensor_list)):
            tensor.copy_(value)

    return ret
Example #38
    def _sync_params(self):
        params = [p.data for p in self.module.parameters()]
        result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, param in zip(tensors, module.parameters()):
                param.data.set_(tensor)

        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)

            # intra-node buffer sync
            result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
            for tensors, module in zip(result[1:], self._module_copies[1:]):
                for tensor, buf in zip(tensors, module._all_buffers()):
                    buf.set_(tensor)
Example #39
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensor``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')
    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        if chunk[0].is_sparse:
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            result_indices = broadcast(flat_indices, devices)
            result_values = broadcast(flat_values, devices)
            unflat_results = tuple(_unflatten_sparse_tensors(iv, chunk) for iv in zip(result_indices, result_values))
        else:
            flat = _flatten_dense_tensors(chunk)
            results = broadcast(flat, devices)
            unflat_results = tuple(_unflatten_dense_tensors(tensor, chunk) for tensor in results)
        # use the broadcasted tensors for the remaining devices
        for dst, unflat_res in zip(outputs[1:], unflat_results[1:]):
            dst.extend(unflat_res)
    for i, output in enumerate(outputs):
        outputs[i] = _reorder_tensors_as(output, tensors)
    return tuple(outputs)
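A hedged usage sketch of the public entry point, torch.cuda.comm.broadcast_coalesced; it assumes at least two CUDA devices and that the input tensors already live on the source device:

import torch
import torch.cuda.comm as comm

# Assumption: at least two CUDA devices; inputs must live on the source device.
tensors = [torch.randn(3, device="cuda:0"), torch.randn(2, 2, device="cuda:0")]

# copies[d] is the list of these tensors replicated onto devices[d].
copies = comm.broadcast_coalesced(tensors, devices=[0, 1])
print([t.device for t in copies[1]])   # both replicas live on cuda:1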
Example #40
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
Example #41
        def allreduce_params():
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)