@functools.lru_cache()  # cache the group so it is only created once (requires `import functools`)
def _get_global_gloo_group():
    """
    Return a process group based on the gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
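A short usage sketch (not part of the example above): because the returned group uses gloo, CPU-resident Python objects can be gathered even when the default backend is NCCL. The helper `gather_objects` is hypothetical and assumes `dist.init_process_group` has already been called.

def gather_objects(obj):
    # Gather one picklable object per rank over the cached gloo group.
    group = _get_global_gloo_group()
    output = [None] * dist.get_world_size()
    dist.all_gather_object(output, obj, group=group)
    return output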
Example #2
    def all_reduce(self, tensor, name):
        if dist.get_backend() == "nccl":
            assert tensor.is_cuda, f"Bad tensor - NCCL backend only supports cuda tensors: {name}"

        if len(self.ranks) == 1:
            return tensor

        dist.all_reduce(tensor, group=self._group, async_op=False)
        return tensor
Example #3
    def __init__(self, indices, batch_size, drop_last, ddp_seed):
        if isinstance(indices, Mapping):
            self._mapping_keys = list(indices.keys())
        else:
            self._mapping_keys = None

        self.rank = dist.get_rank()
        self.num_replicas = dist.get_world_size()
        self.seed = ddp_seed
        self.epoch = 0
        self.batch_size = batch_size
        self.drop_last = drop_last

        if self.drop_last and len(indices) % self.num_replicas != 0:
            self.num_samples = math.ceil((len(indices) - self.num_replicas) / self.num_replicas)
        else:
            self.num_samples = math.ceil(len(indices) / self.num_replicas)
        self.total_size = self.num_samples * self.num_replicas
        # If drop_last is False, we create a shared memory array larger than the
        # number of indices since we will need to pad it after shuffling to make
        # it evenly divisible before every epoch.  If drop_last is True, we create
        # an array with the same size as the indices so we can trim it later.
        self.shared_mem_size = self.total_size if not self.drop_last else len(indices)
        self.num_indices = len(indices)

        if self.rank == 0:
            name, id_ = _generate_shared_mem_name_id()
            if isinstance(indices, Mapping):
                device = next(iter(indices.values())).device
                id_tensor = _get_id_tensor_from_mapping(indices, device, self._mapping_keys)
                self._tensor_dataset = create_shared_mem_array(
                    name, (self.shared_mem_size, 2), torch.int64)
                self._tensor_dataset[:id_tensor.shape[0], :] = id_tensor
            else:
                self._tensor_dataset = create_shared_mem_array(
                    name, (self.shared_mem_size,), torch.int64)
                self._tensor_dataset[:len(indices)] = indices
            self._device = self._tensor_dataset.device
            meta_info = torch.LongTensor([id_, self._tensor_dataset.shape[0]])
        else:
            meta_info = torch.LongTensor([0, 0])

        if dist.get_backend() == 'nccl':
            # Use the default CUDA device; PyTorch DDP requires users to set the CUDA
            # device for each process themselves, so calling .cuda() should be safe.
            meta_info = meta_info.cuda()
        dist.broadcast(meta_info, src=0)

        if self.rank != 0:
            id_, num_samples = meta_info.tolist()
            name = _get_shared_mem_name(id_)
            if isinstance(indices, Mapping):
                indices_shared = get_shared_mem_array(name, (num_samples, 2), torch.int64)
            else:
                indices_shared = get_shared_mem_array(name, (num_samples,), torch.int64)
            self._tensor_dataset = indices_shared
            self._device = indices_shared.device
Example #4
    def all_gather_no_backward(self, tensor, name):
        if dist.get_backend() == "nccl":
            assert tensor.is_cuda, f"Bad tensor - NCCL backend only supports cuda tensors: {name}"

        tensor_list = [
            torch.zeros(tensor.size(), dtype=tensor.dtype, device=tensor.device)
            for _ in range(self.size)
        ]
        dist.all_gather(tensor_list, tensor, group=self._group)
        return tensor_list
Example #5
    def _get_distributed_info(self):
        import torch.distributed as dist
        if not dist.is_available() or not dist.is_initialized():
            return None

        return {
            "backend": dist.get_backend(),
            "rank": dist.get_rank(),
            "world_size": dist.get_world_size()
        }
Example #6
def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False):
    """Config pytorch packages.

    Fix random number for packages and initialize distributed environment for pytorch.
    Setup cuda environment for pytorch.

    :param config: A global object containing specified config.
    :type config: argparse.Namespace
    """
    # Setting `cudnn.deterministic = True` turns on the cuDNN deterministic
    # setting, which can slow down training considerably.  Unexpected behavior
    # may also be observed when restarting from checkpoints.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if cudnn_deterministic:
        torch.backends.cudnn.deterministic = True
        print("You have chosen to seed training. "
              "This will turn on the CUDNN deterministic setting, "
              "which can slow down your training considerably! "
              "You may see unexpected behavior when restarting "
              "from checkpoints.")

    if seed:
        random.seed(seed)
        torch.manual_seed(seed)

    # define the graph for the computation.
    if use_cuda:
        assert torch.cuda.is_available()

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    backend = dist.get_backend() if dist.is_initialized() else None
    graph = FCGraph(rank, world_size, use_cuda)

    # enable cudnn accelerator if we are using cuda.
    if use_cuda:
        graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        if torch.backends.cudnn.version() is None:
            print("CUDNN not found on device.")

        print(
            "World size={}, Rank={}, hostname={}, backend={}, cuda_available={}, cuda_device={}"
            .format(
                world_size,
                rank,
                socket.gethostname(),
                backend,
                torch.cuda.is_available(),
                torch.cuda.current_device(),
            ))

    return rank, world_size, graph
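A hedged call-site sketch for the function above. The process group is assumed to be initialized beforehand, and `FCGraph` (used internally) is assumed to come from the same project; the backend and seed values are illustrative only.

dist.init_process_group(backend="nccl", init_method="env://")
rank, world_size, graph = config_pytorch(use_cuda=True, seed=42,
                                         cudnn_deterministic=False)
print("initialized rank {} of {}".format(rank, world_size))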
Example #7
    def reduce_tensors(self, tensors, dst, group, bufs=None):
        r"""Perform reduce on a list of tensors.

        Args:
            tensors:
                The list of tensors to reduce.

            dst:
                The destination rank.

            group:
                The desired communication group.

            bufs (optional):
                The buffers to store reduced parameters. If not provided,
                in-place operations will be performed on tensors.

        Returns:
            list: Returns a list of request handlers.
        """
        reqs = []

        if bufs is None:
            if self.device.type == 'cpu':
                for tensor in tensors:
                    # Hack for Gloo on CPU. It may change the sender's tensor.
                    if dist.get_backend() == 'gloo':
                        tensor = tensor.clone().detach()
                    reqs.append(
                        dist.reduce(tensor, dst, group=group, async_op=True))
            else:
                for i, tensor in enumerate(tensors):
                    with torch.cuda.stream(self.streams[i % self.num_streams]):
                        reqs.append(
                            dist.reduce(tensor,
                                        dst,
                                        group=group,
                                        async_op=True))
            # fi
        else:
            if self.device.type == 'cpu':
                for tensor, buf in zip(tensors, bufs):
                    buf[:] = tensor[:]
                    reqs.append(
                        dist.reduce(buf, dst, group=group, async_op=True))
            else:
                for i, tensor in enumerate(tensors):
                    with torch.cuda.stream(self.streams[i % self.num_streams]):
                        buf = bufs[i]
                        buf[:] = tensor[:]
                        reqs.append(
                            dist.reduce(buf, dst, group=group, async_op=True))
            # fi
        # fi
        return reqs
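A possible call pattern for the asynchronous handles returned above; `comm` (an instance of the surrounding class) and `model` are placeholders, and rank 0 is assumed to be the destination.

reqs = comm.reduce_tensors([p.grad for p in model.parameters()],
                           dst=0, group=dist.group.WORLD)
for req in reqs:
    req.wait()  # block until every asynchronous reduce has completed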
Example #8
def all_gather_coalesced(tensors, buffer_size=256 * MB):
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    rcv_lsts = [[] for _ in range(world_size)]
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
        dist.all_gather(tmp_rcv_lst, flat_tensors)
        for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
            for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                rcv_lsts[i].append(rcv_t)
    return rcv_lsts
Example #9
 def backward(ctx, *grad_outputs):
     if dist.get_backend(group=ctx.group) is dist.Backend.NCCL:
         rank = dist.get_rank()
         gx = torch.empty_like(grad_outputs[rank])
         _Reduce_Scatter.apply(ReduceOp.SUM, ctx.group, gx, *grad_outputs)
     else:
         # Since many backends don't support ReduceScatter, we use AlltoAll with
         # .sum() to emulate the ReduceScatter behavior.
         tensor_list = [torch.empty_like(tensor) for tensor in grad_outputs]
         gxs = _AlltoAll.apply(ctx.group, tensor_list, *grad_outputs)
         gx = torch.sum(torch.stack(gxs), dim=0)
     return (None, gx)
Example #10
def _test_dist_spawn_fn(local_rank, backend, world_size, device):
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model,
                      _NativeDistModel), f"{type(_model)} vs _NativeDistModel"

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size
    assert _model.device().type == torch.device(device).type
Example #11
def dist_get_info():
    if not dist_is_on():
        return {'distributed': False}

    return {
        'distributed': True,
        'distributed backend': tdist.get_backend(),
        'world size': tdist.get_world_size(),
        'master addr': environ.get('MASTER_ADDR'),
        'master port': environ.get('MASTER_PORT'),
        'rank': tdist.get_rank(),
        'local rank': environ.get('LOCAL_RANK'),
    }
Example #12
def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        print("Rank {} trying to all-gather {:.2f} GB of data on device {}".
              format(dist.get_rank(),
                     len(buffer) / (1024**3), device))
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
Example #13
def _test__native_dist_model_create_from_backend_dist(init_method, local_rank,
                                                      rank, world_size,
                                                      backend, true_device):

    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)
    os.environ["RANK"] = f"{rank}"

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ

    model = _NativeDistModel.create_from_backend(backend=backend,
                                                 timeout=timeout,
                                                 init_method=init_method)

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    with pytest.raises(
            RuntimeError,
            match=
            r"Can not create new distributed process group if default one is"):
        _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": local_rank,
            "rank": rank,
            "world_size": world_size,
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": world_size,
        },
    )

    if init_method is None:
        assert model._init_method == "env://"
    else:
        assert model._init_method == init_method

    model.finalize()

    del os.environ["RANK"]

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ
    assert "RANK" not in os.environ
Example #14
    def _broadcast(self, tensor, src, name, async_op=False):
        if dist.get_backend() == "nccl":
            assert (
                tensor.is_cuda
            ), f"Bad tensor - NCCL backend only supports cuda tensors: {name}; {tensor}"

        if len(self.ranks) == 1:
            # Conform to the comm.broadcast and comm.all_reduce API, but do no work
            if async_op:
                return NoopPromise()
            else:
                return tensor

        return dist.broadcast(tensor, src, group=self._group, async_op=async_op)
Example #15
def _test_dist_spawn_fn(local_rank, backend, world_size, device):
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model, _NativeDistModel), "{} vs _NativeDistModel".format(type(_model))

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size
    if backend == "nccl":
        assert _model.device() == torch.device("{}:{}".format(device, local_rank))
    elif backend == "gloo":
        assert _model.device() == torch.device(device)
Example #16
 def __call__(self, value: torch.Tensor) -> None:
     _total_value = list(util.unwrap_to_tensors(value))[0]
     _count = 1
     if util.dist_available():
         device = util.int_to_device(-1 if dist.get_backend() != "nccl" else
                                     torch.cuda.current_device())
         count = torch.tensor(_count, device=device)
         total_value = torch.tensor(_total_value, device=device)
         # Reduce from all processes
         dist.all_reduce(count, op=dist.ReduceOp.SUM)
         dist.all_reduce(total_value, op=dist.ReduceOp.SUM)
         _count = count.item()
         _total_value = total_value.item()
     self._count += _count
     self._total_value += _total_value
Example #17
def get_dist_device():
    """
    Get the expected target device in the distributed data parallel.
    For NCCL backend, return GPU device of current process.
    For GLOO backend, return CPU.
    For any other backends, return None as the default, tensor.to(None) will not change the device.

    """
    if dist.is_initialized():
        backend = dist.get_backend()
        if backend == "nccl" and torch.cuda.is_available():
            return torch.device(f"cuda:{torch.cuda.current_device()}")
        elif backend == "gloo":
            return torch.device("cpu")
    return None
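A minimal usage sketch, assuming the process group is already initialized: move a tensor to the backend-appropriate device before issuing a collective.

device = get_dist_device()
flag = torch.ones(1).to(device)  # stays on CPU for gloo, moves to the current GPU for NCCL
dist.all_reduce(flag)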
Example #18
 def backward(ctx, grad_output):
     if dist.get_backend(group=ctx.group) is dist.Backend.NCCL:
         rank = dist.get_rank(group=ctx.group)
         world_size = dist.get_world_size(group=ctx.group)
         out_size = list(grad_output.size())
         if out_size[0] % world_size != 0:
             raise RuntimeError(
                 f'Tensor with dimensions: {out_size} does '
                 f'not have first dimension divisible by world_size: {world_size}'
             )
         out_size[0] = out_size[0] // dist.get_world_size(group=ctx.group)
         gx = torch.empty(out_size, device=grad_output.device, dtype=grad_output.dtype)
         dist._reduce_scatter_base(gx, grad_output, ReduceOp.SUM, ctx.group)
     else:
         raise RuntimeError("Backend not supported!")
     return (None, gx, None)
Example #19
def _broadcast_object_list(object_list, src=0, group=None):
    if _rank_not_in_group(group):
        return

    my_rank = get_rank()
    # Serialize object_list elements to tensors on src rank.
    if my_rank == src:
        tensor_list, size_list = zip(
            *[_object_to_tensor(obj) for obj in object_list])
        object_sizes_tensor = torch.cat(size_list)
    else:
        object_sizes_tensor = torch.LongTensor(len(object_list))

    group_backend = get_backend(group)
    is_nccl_backend = group_backend == Backend.NCCL
    current_device = torch.device("cpu")
    if is_nccl_backend:
        # See note about using torch.cuda.current_device() here in docstring.
        # We cannot simply use my_rank since rank == device is not necessarily
        # true.
        current_device = torch.device('cuda', torch.cuda.current_device())
        object_sizes_tensor = object_sizes_tensor.to(current_device)

    # Broadcast object sizes
    broadcast(object_sizes_tensor, src=src, group=group)

    # Concatenate and broadcast serialized object tensors
    if my_rank == src:
        object_tensor = torch.cat(tensor_list)
    else:
        object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item())

    if is_nccl_backend:
        object_tensor = object_tensor.to(current_device)

    broadcast(object_tensor, src=src, group=group)

    # Deserialize objects using their stored sizes.
    offset = 0
    if my_rank != src:
        for i, obj_size in enumerate(object_sizes_tensor):
            obj_view = object_tensor[offset:offset + obj_size]
            obj_view = obj_view.type(
                torch.ByteTensor)  # type: ignore[call-overload]
            offset += obj_size
            object_list[i] = _tensor_to_object(obj_view, obj_size)
Example #20
    def reduce(self, op):
        """
        Reduces average value over all workers.

        :param op: 'sum' or 'mean', reduction operator
        """
        if op not in ('sum', 'mean'):
            raise NotImplementedError

        distributed = (get_world_size() > 1)
        if distributed:
            # Backward/forward compatibility around
            # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
            # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
            # To accommodate changes in PyTorch's distributed API
            if hasattr(dist, "get_backend"):
                _backend = dist.get_backend()
                if hasattr(dist, "DistBackend"):
                    backend_enum_holder = dist.DistBackend
                else:
                    backend_enum_holder = dist.Backend
            else:
                _backend = dist._backend
                backend_enum_holder = dist.dist_backend

            cuda = _backend == backend_enum_holder.NCCL

            if cuda:
                avg = torch.cuda.FloatTensor([self.avg])
                _sum = torch.cuda.FloatTensor([self.sum])
            else:
                avg = torch.FloatTensor([self.avg])
                _sum = torch.FloatTensor([self.sum])

            try:
                _reduce_op = dist.ReduceOp
            except AttributeError:
                _reduce_op = dist.reduce_op

            dist.all_reduce(avg, op=_reduce_op.SUM)
            dist.all_reduce(_sum, op=_reduce_op.SUM)
            self.avg = avg.item()
            self.sum = _sum.item()

            if op == 'mean':
                self.avg /= get_world_size()
                self.sum /= get_world_size()
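A hypothetical training-loop snippet for the meter above; `meter` is a placeholder instance of the class this method belongs to, assumed to already hold the local `avg` and `sum` statistics.

meter.reduce('mean')                 # average local statistics over all workers
print('global average:', meter.avg)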
Example #21
 def forward(ctx, group, *tensors):
     ctx.group = group
     out_tensor_list = [
         torch.empty_like(tensors[i]) for i in range(dist.get_world_size(group=group))
     ]
     reqs = [None] * dist.get_world_size(group=group)
     my_rank = dist.get_rank(group=group)
     # Implemented by means of scatter/gather; send/recv async operations have issues.
     if dist.get_backend(group=group) is dist.Backend.GLOO:
         for i in range(dist.get_world_size(group=group)):
             to_send = None
             if i == my_rank:
                 to_send = list(tensors)
             dist.scatter(out_tensor_list[i], to_send, i, group=group)
     else:
         dist.all_to_all(out_tensor_list, list(tensors), group=group)
     return tuple(out_tensor_list)
Example #22
def _test_dist_spawn_fn(local_rank, backend, world_size, device, **kwargs):
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model,
                      _NativeDistModel), f"{type(_model)} vs _NativeDistModel"

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size
    assert _model.device().type == torch.device(device).type

    if "master_addr" in kwargs:
        assert os.environ["MASTER_ADDR"] == kwargs["master_addr"]
    if "master_port" in kwargs:
        assert os.environ["MASTER_PORT"] == str(kwargs["master_port"])
Example #23
    def __init__(
        self,
        params: _params_t,
        optim: Type[Optimizer] = SGD,
        group: Optional[Any] = None,
        broadcast_buffer_size: int = -1,
        broadcast_fp16: bool = False,
        **default: Any,
    ):

        # Hold all the model params in the root .param_groups
        self.in_super_constructor = True
        super().__init__(params, default)
        self.in_super_constructor = False

        # Partition information. lazy evaluation, computed when requested
        self.__per_device_params: Dict[
            torch.device,
            List[List[Parameter]]] = OrderedDict()  # device, rank, params
        self.__param_rank: Dict[torch.Tensor, int] = {}
        self._partition_parameters: List[List[dict]] = []
        self.__param_to_index: Dict[int, int] = {}
        self.__local_params: Optional[List[torch.Tensor]] = None

        # Default empty values + immutables
        self._optim_defaults = default
        self._optim_constructor = optim

        self.group = group if group is not None else dist.group.WORLD
        self.world_size = dist.get_world_size(self.group)
        self.backend = dist.get_backend(self.group)
        self.rank = dist.get_rank(self.group)
        self.global_rank = get_global_rank(self.group, self.rank)
        self._local_to_global_rank = [
            get_global_rank(self.group, i) for i in range(self.world_size)
        ]

        self.broadcast_fp16 = broadcast_fp16
        self.buckets: Dict[torch.device, Dict[int, ParamBucket]] = {}
        self._all_states: List[Dict[str, Any]] = []  # Optional consolidated optimizer state
        self._default_device = torch.device("cpu")

        # Setup everything which is related to the parameters to be trained
        # (partition and optimizer for the shard)
        self.refresh_trainable()
Example #24
    def distributed(self):
        if not dist.is_initialized():
            raise KeyError('Could not set distributed mode for the compression algorithm '
                           'because the default process group has not been initialized.')

        if next(self.model.parameters()).is_cuda:
            state = torch.cuda.get_rng_state()
            if dist.get_backend() == dist.Backend.NCCL:
                state = state.cuda()
            torch.distributed.broadcast(state, src=0)
            torch.cuda.set_rng_state(state.cpu())
        else:
            state = torch.get_rng_state()
            torch.distributed.broadcast(state, src=0)
            torch.set_rng_state(state)

        self._distributed = True
Example #25
    def __init__(self,
                 module,
                 message_size=10000000,
                 delay_allreduce=False,
                 shared_param=None):
        super(DistributedDataParallel, self).__init__()

        # Backward/forward compatibility around
        # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36
        if (hasattr(dist, "get_backend")):
            self._backend = dist.get_backend()
            self.backend_enum_holder = dist.DistBackend
        else:
            self._backend = dist._backend
            self.backend_enum_holder = dist.dist_backend

        self.warn_on_half = True if self._backend == self.backend_enum_holder.GLOO else False

        if shared_param is not None:
            raise ValueError(
                "shared_param is no longer supported as an option. It was "
                "misleadingly named from the start. It turns out overlapping "
                "communication with computation should work fine with shared "
                "parameters. If you still wish to delay communication to the "
                "end of the backward pass, use delay_allreduce=True|False instead."
            )

        self.delay_allreduce = delay_allreduce
        self.message_size = message_size

        self.reduction_stream = torch.cuda.Stream()

        self.module = module

        if self._backend == self.backend_enum_holder.NCCL:
            for param in self.module.parameters():
                assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."

        self.active_params = []

        self.param_type_to_tmp_i = {
            "torch.cuda.HalfTensor": 0,
            "torch.cuda.FloatTensor": 1,
            "torch.cuda.DoubleTensor": 2
        }

        self.create_hooks()

        flat_dist_call([param.data for param in self.module.parameters()],
                       dist.broadcast, (0, ))
Example #26
    def __call__(
        self,
        best_span_strings: Union[str, List[str]],
        answer_strings: Union[List[str], List[List[str]]],
    ):
        if not isinstance(best_span_strings, list):
            best_span_strings = [best_span_strings]
            answer_strings = [answer_strings]  # type: ignore

        cast(List[str], best_span_strings)
        cast(List[List[str]], answer_strings)

        assert len(best_span_strings) == len(answer_strings)

        count = len(best_span_strings)
        exact_match = 0
        f1_score = 0.0

        for prediction, gold_answers in zip(best_span_strings, answer_strings):
            exact_match += squad.metric_max_over_ground_truths(
                squad.compute_exact, prediction, gold_answers
            )
            f1_score += squad.metric_max_over_ground_truths(
                squad.compute_f1, prediction, gold_answers
            )

        if is_distributed():
            if dist.get_backend() == "nccl":
                device = torch.cuda.current_device()
            else:
                device = torch.device("cpu")
            # Converting bool to int here, since we want to count the number of exact matches.
            _exact_match = torch.tensor(exact_match, dtype=torch.int).to(device)
            _f1_score = torch.tensor(f1_score, dtype=torch.double).to(device)
            _count = torch.tensor(count).to(device)
            dist.all_reduce(_exact_match, op=dist.ReduceOp.SUM)
            dist.all_reduce(_f1_score, op=dist.ReduceOp.SUM)
            dist.all_reduce(_count, op=dist.ReduceOp.SUM)
            exact_match = _exact_match.item()
            f1_score = _f1_score.item()
            count = _count.item()

        self._total_em += exact_match
        self._total_f1 += f1_score
        self._count += count
Example #27
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    if dist.get_backend() == dist.Backend.NCCL:
        # This argument is needed to avoid warnings.
        # It's valid only for NCCL backend.
        dist.barrier(device_ids=[torch.cuda.current_device()])
    else:
        dist.barrier()
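A typical (hypothetical) call site: guard a rank-0-only side effect with the barrier so the other ranks do not race ahead; `state_dict` is a placeholder.

if dist.get_rank() == 0:
    torch.save(state_dict, "checkpoint.pt")  # only rank 0 writes the file
synchronize()  # every rank waits here until the write has finished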
Example #28
    def synchronize_between_processes(self):
        if dist.is_initialized():
            # Bypass NCCL (which forces CUDA-only sync)
            if dist.get_backend() == "nccl":
                group = dist.new_group(backend="gloo")
            else:
                group = dist.group.WORLD

            my_rank = dist.get_rank()
            output = [None for _ in range(dist.get_world_size())]
            dist.gather_object(self.predictions,
                               output if my_rank == 0 else None,
                               dst=0,
                               group=group)

            if my_rank == 0:
                return list(itertools.chain.from_iterable(output)), True
            # Ranks other than 0 did not receive the gathered objects, so they
            # return only their local predictions and are not the main process.
            return self.predictions, False
        else:
            return self.predictions, True
Example #29
def peak_memory_mb() -> Dict[int, float]:
    """
    Get peak memory usage for each worker, as measured by max-resident-set size:

    https://unix.stackexchange.com/questions/30940/getrusage-system-call-what-is-maximum-resident-set-size

    Only works on OSX and Linux, otherwise the result will be 0.0 for every worker.
    """
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_mb = 0.0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == "darwin":
            # On OSX the result is in bytes.
            peak_mb = peak / 1_000_000
        else:
            # On Linux the result is in kilobytes.
            peak_mb = peak / 1_000

    if is_distributed():
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        peak_mb_tensor = torch.tensor([float(global_rank), peak_mb])
        # All of these tensors will be gathered into this list.
        gather_results = [torch.tensor([0.0, 0.0]) for _ in range(world_size)]

        # If the backend is 'nccl', this means we're training on GPUs, so these tensors
        # need to be on GPU.
        if dist.get_backend() == "nccl":
            peak_mb_tensor = peak_mb_tensor.cuda()
            gather_results = [x.cuda() for x in gather_results]

        dist.all_gather(gather_results, peak_mb_tensor)

        results_dict: Dict[int, float] = {}
        for peak_mb_tensor in gather_results:
            worker = int(peak_mb_tensor[0])
            peak_mb = round(float(peak_mb_tensor[1]), 3)
            results_dict[worker] = peak_mb

        return results_dict
    else:
        return {0: peak_mb}
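A small illustrative call, not from the original source, that logs the per-worker peaks returned above:

for worker, mb in peak_memory_mb().items():
    print(f"worker {worker}: peak memory {mb:.1f} MB")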
Example #30
def _serialize_to_tensor(data, group):
    global _USE_HVD
    if _USE_HVD:
        backend = "nccl"
    else:
        backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".
            format(get_rank(),
                   len(buffer) / (1024**3), device))
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor