Code example #1
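A test helper that exercises dist.all_reduce_multigpu: for each source rank, every rank places one tensor per local GPU (master_value on the source rank, worker_value elsewhere), runs the collective, and checks that every local tensor ends up equal to the expected reduced value.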
    def _test_all_reduce_multigpu_helper(
        self,
        group,
        group_id,
        rank,
        rank_to_GPU,
        op,
        master_value,
        worker_value,
        expected_value,
    ):
        for src in group:
            # The source rank seeds its local GPUs with master_value;
            # every other rank uses worker_value.
            if rank == src:
                tensors = [
                    _build_tensor(src + 1, master_value).cuda(device=i)
                    for i in rank_to_GPU[rank]
                ]
            else:
                tensors = [
                    _build_tensor(src + 1, worker_value).cuda(device=i)
                    for i in rank_to_GPU[rank]
                ]

            # One collective spans all local GPUs of every rank in the group.
            dist.all_reduce_multigpu(tensors, op, group_id)
            # Every local tensor must now hold the reduced result.
            expected_tensor = _build_tensor(src + 1, expected_value)
            for tensor in tensors:
                self.assertEqual(tensor, expected_tensor)

        self._barrier()
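For reference, a minimal standalone sketch of the call pattern this helper exercises. The NCCL backend, env:// rendezvous, tensor shape, and GPU mapping below are illustrative assumptions rather than the test harness's actual setup, and every rank is assumed to own the same number of local GPUs.

import torch
import torch.distributed as dist

def run(rank, world_size, local_gpus):
    # Assumed setup: one process per node, NCCL backend, env:// rendezvous.
    dist.init_process_group(backend="nccl", init_method="env://",
                            rank=rank, world_size=world_size)
    # One tensor per local GPU; all of them take part in a single collective.
    tensors = [torch.full((10,), float(rank)).cuda(device=i) for i in local_gpus]
    dist.all_reduce_multigpu(tensors, op=dist.ReduceOp.SUM)
    # With SUM, every tensor on every GPU now holds the global sum: each rank
    # contributed its rank value once per local GPU.
    expected = float(sum(range(world_size)) * len(local_gpus))
    for t in tensors:
        assert torch.allclose(t, torch.full_like(t, expected))
    dist.destroy_process_group()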
Code example #2
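The NCCL reduction path of a multi-GPU data-parallel wrapper: gradients from each module replica are bucketed, flattened, reduced across processes with one dist.all_reduce_multigpu call per bucket, averaged by the world size, and copied back into the first replica's gradients; the other replicas' gradients are then freed.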
        def reduction_fn_nccl():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = [[] for _ in range(len(self._module_copies))]
            all_grads_buckets_iters = []

            # Bucketing all the gradients
            for dev_idx, module in enumerate(self._module_copies):
                for param in module.parameters():
                    if not param.requires_grad or param.grad is None:
                        continue
                    if param.grad.requires_grad:
                        raise RuntimeError(
                            "DistributedDataParallel only works "
                            "with gradients that don't require "
                            "grad")
                    # Adding the gradients for reduction
                    all_grads[dev_idx].append(param.grad.data)

                # Now bucket this device's gradients
                dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                                  self.nccl_reduce_bucket_size)

                all_grads_buckets_iters.append(dev_grads_buckets)

            # Now reduce each bucket one after another
            for grads_batch in zip(*all_grads_buckets_iters):
                grads_batch_coalesced = []
                # Coalesce each bucket
                for dev_idx, dev_grads_batch in enumerate(grads_batch):
                    dev_id = self.device_ids[dev_idx]
                    with torch.cuda.device(dev_id):
                        dev_grads_batch_coalesced = _flatten_dense_tensors(
                            dev_grads_batch)
                        grads_batch_coalesced.append(dev_grads_batch_coalesced)

                # We will only use device 0's results, but this single op should be
                # faster than doing the following two operations sequentially:
                # (1) intra-node reduce to lead GPU, followed by
                # (2) inter-node allreduce for all the first lead GPUs in all nodes
                dist.all_reduce_multigpu(grads_batch_coalesced,
                                         group=self.nccl_reduction_group_id)

                # Now only work on the first device of self.device_ids: uncoalesce
                # the gradients for each bucket
                grads_batch_coalesced[0] /= dist.get_world_size()
                grads_batch_reduced = _unflatten_dense_tensors(
                    grads_batch_coalesced[0], grads_batch[0])
                for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                    grad.copy_(reduced)

            # clear the gradients and save memory for replicas
            for module in self._module_copies[1:]:
                for param in module.parameters():
                    if param.requires_grad:
                        param.grad = None
                        param.data.set_()
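The bucketing path above relies on three private helpers from torch._utils. As a rough, self-contained sketch of that flatten → allreduce → unflatten round trip in isolation (the bucket size and the use of the plain dist.all_reduce collective here are illustrative assumptions, not the wrapper's actual configuration):

import torch
import torch.distributed as dist
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors)

def allreduce_coalesced(grads, bucket_size=256 * 1024 * 1024):
    # Group gradients into buckets of roughly bucket_size bytes each.
    for bucket in _take_tensors(grads, bucket_size):
        # Flatten the bucket so a single collective covers many small tensors.
        flat = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat)
        flat /= dist.get_world_size()
        # Scatter the averaged values back into the original gradient tensors.
        for grad, reduced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
            grad.copy_(reduced)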