def _get_global_gloo_group():
    """Return a gloo-backed process group containing all ranks.

    When the default process group uses NCCL, a separate gloo group is
    created; otherwise the default ``WORLD`` group is returned unchanged.

    The result is cached on the function object after the first call, so
    ``dist.new_group`` runs at most once per process instead of once per
    call (it previously created a new group on every invocation even though
    the contract promised caching).

    Returns:
        The cached process group.
    """
    try:
        return _get_global_gloo_group._cached_group
    except AttributeError:
        # First call in this process: fall through and build the group.
        pass

    if dist.get_backend() == "nccl":
        group = dist.new_group(backend="gloo")
    else:
        group = dist.group.WORLD

    _get_global_gloo_group._cached_group = group
    return group
def all_reduce(self, tensor, name):
    """All-reduce ``tensor`` in place over this object's process group.

    Args:
        tensor: Tensor to reduce. Must be a CUDA tensor when the default
            backend is NCCL.
        name: Label for the tensor; only used in the assertion message.

    Returns:
        The same ``tensor`` object. When ``self.ranks`` holds a single rank
        no communication is issued.
    """
    using_nccl = dist.get_backend() == "nccl"
    if using_nccl:
        assert tensor.is_cuda, f"Bad tensor - NCCL backend only supports cuda tensors: {name}"

    if len(self.ranks) != 1:
        # Synchronous call: the tensor holds the reduced value on return.
        dist.all_reduce(tensor, group=self._group, async_op=False)
    return tensor
def __init__(self, indices, batch_size, drop_last, ddp_seed):
    """Place ``indices`` in a shared-memory array visible to every rank.

    Rank 0 creates the shared array and copies the indices into it, then
    broadcasts the array's id and length; every other rank attaches to the
    existing array using that broadcast metadata.

    Args:
        indices: Either a flat collection of IDs, or a ``Mapping`` whose
            values are ID tensors (their ``.device`` is read on rank 0).
        batch_size: Stored as ``self.batch_size``.
        drop_last: Whether the per-replica sample count is rounded so that
            no padding is required.
        ddp_seed: Stored as ``self.seed``.
    """
    if isinstance(indices, Mapping):
        self._mapping_keys = list(indices.keys())
        # len() of a mapping counts its keys, not the IDs it holds. The
        # shared array stores one row per ID, so size it from the values.
        # (Presumably `_get_id_tensor_from_mapping` emits one row per ID;
        # verify against that helper.)
        len_indices = sum(len(ids) for ids in indices.values())
    else:
        self._mapping_keys = None
        len_indices = len(indices)

    self.rank = dist.get_rank()
    self.num_replicas = dist.get_world_size()
    self.seed = ddp_seed
    self.epoch = 0
    self.batch_size = batch_size
    self.drop_last = drop_last

    if self.drop_last and len_indices % self.num_replicas != 0:
        self.num_samples = math.ceil(
            (len_indices - self.num_replicas) / self.num_replicas)
    else:
        self.num_samples = math.ceil(len_indices / self.num_replicas)
    self.total_size = self.num_samples * self.num_replicas

    # If drop_last is False, total_size is rounded up to a multiple of the
    # replica count, so the shared array may be larger than the number of
    # indices (room for padding after shuffling). If drop_last is True, the
    # array has exactly one slot per index so it can be trimmed later.
    self.shared_mem_size = self.total_size if not self.drop_last else len_indices
    self.num_indices = len_indices

    if self.rank == 0:
        name, id_ = _generate_shared_mem_name_id()
        if isinstance(indices, Mapping):
            device = next(iter(indices.values())).device
            id_tensor = _get_id_tensor_from_mapping(indices, device, self._mapping_keys)
            self._tensor_dataset = create_shared_mem_array(
                name, (self.shared_mem_size, 2), torch.int64)
            self._tensor_dataset[:id_tensor.shape[0], :] = id_tensor
        else:
            self._tensor_dataset = create_shared_mem_array(
                name, (self.shared_mem_size,), torch.int64)
            self._tensor_dataset[:len_indices] = indices
        self._device = self._tensor_dataset.device
        meta_info = torch.LongTensor([id_, self._tensor_dataset.shape[0]])
    else:
        # Placeholder that the broadcast below overwrites.
        meta_info = torch.LongTensor([0, 0])

    if dist.get_backend() == 'nccl':
        # Use default CUDA device; PyTorch DDP required the users to set the CUDA
        # device for each process themselves so calling .cuda() should be safe.
        meta_info = meta_info.cuda()
    dist.broadcast(meta_info, src=0)

    if self.rank != 0:
        id_, num_samples = meta_info.tolist()
        name = _get_shared_mem_name(id_)
        if isinstance(indices, Mapping):
            indices_shared = get_shared_mem_array(name, (num_samples, 2), torch.int64)
        else:
            indices_shared = get_shared_mem_array(name, (num_samples,), torch.int64)
        self._tensor_dataset = indices_shared
        self._device = indices_shared.device
def all_gather_no_backward(self, tensor, name):
    """Gather ``tensor`` from every rank in this object's process group.

    Args:
        tensor: Tensor contributed by this rank. Must be a CUDA tensor when
            the default backend is NCCL.
        name: Label for the tensor; only used in the assertion message.

    Returns:
        A list of ``self.size`` tensors filled by ``dist.all_gather``.
    """
    if dist.get_backend() == "nccl":
        assert tensor.is_cuda, f"Bad tensor - NCCL backend only supports cuda tensors: {name}"

    # One zero-filled receive buffer per participant, matching the input's
    # shape, dtype and device.
    gathered = []
    for _ in range(self.size):
        gathered.append(
            torch.zeros(tensor.size(), dtype=tensor.dtype, device=tensor.device))

    dist.all_gather(gathered, tensor, group=self._group)
    return gathered
def _get_distributed_info(self):
    """Describe the default process group.

    Returns:
        ``None`` when ``torch.distributed`` is unavailable or no process
        group has been initialized; otherwise a dict with the keys
        ``"backend"``, ``"rank"`` and ``"world_size"``.
    """
    import torch.distributed as dist

    if dist.is_available() and dist.is_initialized():
        info = {}
        info["backend"] = dist.get_backend()
        info["rank"] = dist.get_rank()
        info["world_size"] = dist.get_world_size()
        return info
    return None
def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False):
    """Configure PyTorch: seed the RNGs, build the graph and set up cuDNN.

    :param use_cuda: Require CUDA, assign a GPU through the graph object and
        enable the cuDNN accelerator.
    :type use_cuda: bool
    :param seed: Seed applied to ``random`` and ``torch``. ``None`` leaves the
        generators untouched; ``0`` is a valid seed.
    :type seed: int or None
    :param cudnn_deterministic: When true, a warning about deterministic
        cuDNN is printed.
    :type cudnn_deterministic: bool
    :returns: ``(rank, world_size, graph)``.
    """
    # Setting `cudnn.deterministic = True` will turn on
    # CUDNN deterministic setting which can slow down training considerably.
    # Unexpected behavior may also be observed from checkpoint.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if cudnn_deterministic:
        # NOTE(review): the assignment that would actually enable the
        # deterministic mode is disabled here, so only the warning is shown.
        # cudnn.deterministic = True
        print("You have chosen to seed training. "
              "This will turn on the CUDNN deterministic setting, "
              "which can slow down your training considerably! "
              "You may see unexpected behavior when restarting "
              "from checkpoints.")

    # `if seed:` would silently skip seed=0, which is a legitimate seed.
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)

    # define the graph for the computation.
    if use_cuda:
        assert torch.cuda.is_available()

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    backend = dist.get_backend() if dist.is_initialized() else None
    graph = FCGraph(rank, world_size, use_cuda)

    # enable cudnn accelerator if we are using cuda.
    if use_cuda:
        graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        if torch.backends.cudnn.version() is None:
            print("CUDNN not found on device.")

    # Only query the current device when CUDA exists; the unconditional call
    # raises on CPU-only hosts even though use_cuda is False.
    cuda_available = torch.cuda.is_available()
    cuda_device = torch.cuda.current_device() if cuda_available else None

    print(
        "World size={}, Rank={}, hostname={}, backend={}, cuda_available={}, cuda_device={}"
        .format(
            world_size,
            rank,
            socket.gethostname(),
            backend,
            cuda_available,
            cuda_device,
        ))

    return rank, world_size, graph
def reduce_tensors(self, tensors, dst, group, bufs=None):
    r"""Launch an asynchronous reduce for every tensor in a list.

    Args:
        tensors: The list of tensors to reduce.
        dst: The destination rank.
        group: The desired communication group.
        bufs (optional): Buffers that receive a copy of each tensor and are
            reduced in its place. If not provided, the tensors themselves
            are passed to the reduce.

    Returns:
        list: One request handle per tensor, in input order.
    """
    handles = []
    on_cpu_without_bufs = bufs is None and self.device.type == 'cpu'

    if on_cpu_without_bufs:
        for item in tensors:
            # Hack for Gloo on CPU. It may change the sender's tensor.
            if dist.get_backend() == 'gloo':
                item = item.clone().detach()
            handles.append(dist.reduce(item, dst, group=group, async_op=True))
    elif bufs is None:
        # Non-CPU device: spread the launches round-robin over the streams.
        for idx, item in enumerate(tensors):
            with torch.cuda.stream(self.streams[idx % self.num_streams]):
                handles.append(
                    dist.reduce(item, dst, group=group, async_op=True))
    elif self.device.type == 'cpu':
        for item, staging in zip(tensors, bufs):
            staging[:] = item[:]
            handles.append(dist.reduce(staging, dst, group=group, async_op=True))
    else:
        for idx, item in enumerate(tensors):
            with torch.cuda.stream(self.streams[idx % self.num_streams]):
                staging = bufs[idx]
                staging[:] = item[:]
                handles.append(
                    dist.reduce(staging, dst, group=group, async_op=True))

    return handles
def all_gather_coalesced(tensors, buffer_size=256 * MB):
    """All-gather a collection of tensors in flattened buckets.

    The input is split into buckets by ``_take_tensors``; each bucket is
    flattened, gathered from every rank with a single ``dist.all_gather``,
    and unflattened back into per-tensor views.

    Args:
        tensors: Tensors to gather.
        buffer_size: Bucket size handed to ``_take_tensors``.

    Returns:
        A list with one entry per rank; each entry is the list of tensors
        received from that rank.
    """
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    per_rank = [[] for _ in range(world_size)]

    for bucket in _take_tensors(tensors, buffer_size):
        flat = _flatten_dense_tensors(bucket)
        received = [torch.empty_like(flat) for _ in range(world_size)]
        dist.all_gather(received, flat)
        for rank_out, rank_flat in zip(per_rank, received):
            rank_out.extend(_unflatten_dense_tensors(rank_flat, bucket))

    return per_rank
def backward(ctx, *grad_outputs):
    """Compute the input gradient by reduce-scattering the output gradients.

    Args:
        ctx: Autograd context; ``ctx.group`` is the process group to use.
        *grad_outputs: Gradients of the outputs, indexed by rank within
            ``ctx.group``.

    Returns:
        ``(None, gx)`` where ``gx`` is the summed gradient for this rank.
    """
    if dist.get_backend(group=ctx.group) == dist.Backend.NCCL:
        # Index with the rank *inside ctx.group*. The global rank addresses
        # the wrong entry (or runs past the end) when the group is a strict
        # subset of the world.
        rank = dist.get_rank(group=ctx.group)
        gx = torch.empty_like(grad_outputs[rank])
        _Reduce_Scatter.apply(ReduceOp.SUM, ctx.group, gx, *grad_outputs)
    else:
        # As many backends doesn't support ReduceScatter, we use AlltoAll with .sum()
        # to emulate the ReduceScatter behavior
        tensor_list = [torch.empty_like(tensor) for tensor in grad_outputs]
        gxs = _AlltoAll.apply(ctx.group, tensor_list, *grad_outputs)
        gx = torch.sum(torch.stack(gxs), dim=0)
    return (None, gx)
def _test_dist_spawn_fn(local_rank, backend, world_size, device):
    """Check the distributed state seen by one spawned worker.

    Asserts that the process group is initialized with ``backend`` and that
    ignite's ``_model`` reports the expected local rank, world size and
    device type.
    """
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model, _NativeDistModel), f"{type(_model)} vs _NativeDistModel"

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size

    actual_type = _model.device().type
    expected_type = torch.device(device).type
    assert actual_type == expected_type
def dist_get_info():
    """Summarize the distributed setup as a dict.

    Returns:
        ``{'distributed': False}`` when ``dist_is_on()`` is false; otherwise
        a dict with the backend, world size, rank, and the ``MASTER_ADDR``,
        ``MASTER_PORT`` and ``LOCAL_RANK`` environment values (``None`` for
        any variable that is unset).
    """
    if dist_is_on():
        info = {'distributed': True}
        info['distributed backend'] = tdist.get_backend()
        info['world size'] = tdist.get_world_size()
        info['master addr'] = environ.get('MASTER_ADDR')
        info['master port'] = environ.get('MASTER_PORT')
        info['rank'] = tdist.get_rank()
        info['local rank'] = environ.get('LOCAL_RANK')
        return info

    return {'distributed': False}
def _serialize_to_tensor(data, group):
    """Pickle ``data`` into a byte tensor on the device the backend needs.

    Args:
        data: Any picklable object.
        group: Process group whose backend selects the device: CPU for
            gloo, CUDA for nccl. Other backends fail the assertion.

    Returns:
        A ``torch.ByteTensor`` holding the pickled bytes.
    """
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    if backend == "gloo":
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")

    payload = pickle.dumps(data)
    payload_len = len(payload)
    if payload_len > 1024**3:
        # Payloads above 1 GiB are reported but still sent.
        print("Rank {} trying to all-gather {:.2f} GB of data on device {}".
              format(dist.get_rank(), payload_len / (1024**3), device))

    storage = torch.ByteStorage.from_buffer(payload)
    return torch.ByteTensor(storage).to(device=device)
def _test__native_dist_model_create_from_backend_dist(init_method, local_rank, rank, world_size, backend,
                                                      true_device):
    """Exercise ``_NativeDistModel.create_from_backend`` in one process.

    Creates the model with only ``RANK`` set in the environment, checks the
    resulting process group and model attributes, verifies that a second
    creation is rejected, then finalizes and confirms the environment is
    clean again.
    """
    import os
    from datetime import timedelta

    def _assert_master_env_unset():
        assert "MASTER_ADDR" not in os.environ
        assert "MASTER_PORT" not in os.environ

    timeout = timedelta(seconds=20)
    os.environ["RANK"] = f"{rank}"
    _assert_master_env_unset()

    model = _NativeDistModel.create_from_backend(backend=backend, timeout=timeout, init_method=init_method)

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    # A second default group must not be creatable while one exists.
    with pytest.raises(RuntimeError, match=r"Can not create new distributed process group if default one is"):
        _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

    expected_attrs = {
        "device": true_device,
        "local_rank": local_rank,
        "rank": rank,
        "world_size": world_size,
        "node_index": 0,
        "nnodes": 1,
        "nproc_per_node": world_size,
    }
    _assert_model(model, expected_attrs)

    expected_init_method = "env://" if init_method is None else init_method
    assert model._init_method == expected_init_method

    model.finalize()
    del os.environ["RANK"]

    _assert_master_env_unset()
    assert "RANK" not in os.environ
def _broadcast(self, tensor, src, name, async_op=False):
    """Broadcast ``tensor`` from ``src`` over this object's process group.

    Args:
        tensor: Tensor to broadcast. Must be a CUDA tensor when the default
            backend is NCCL.
        src: Source rank passed to ``dist.broadcast``.
        name: Label for the tensor; only used in the assertion message.
        async_op: Forwarded to ``dist.broadcast``.

    Returns:
        With a single rank: a ``NoopPromise`` if ``async_op`` else the
        tensor itself. Otherwise whatever ``dist.broadcast`` returns.
    """
    if dist.get_backend() == "nccl":
        assert (
            tensor.is_cuda
        ), f"Bad tensor - NCCL backend only supports cuda tensors: {name}; {tensor}"

    if len(self.ranks) != 1:
        return dist.broadcast(tensor, src, group=self._group, async_op=async_op)

    # Conform to the comm.broadcast and comm.all_reduce API, but do no work
    return NoopPromise() if async_op else tensor
def _test_dist_spawn_fn(local_rank, backend, world_size, device):
    """Check the distributed state seen by one spawned worker.

    Besides the backend, local rank and world size, the model's device must
    equal ``<device>:<local_rank>`` under nccl and ``<device>`` under gloo.
    Other backends get no device check.
    """
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model, _NativeDistModel), "{} vs _NativeDistModel".format(type(_model))

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size

    if backend == "nccl":
        expected_device = torch.device("{}:{}".format(device, local_rank))
    elif backend == "gloo":
        expected_device = torch.device(device)
    else:
        return
    assert _model.device() == expected_device
def __call__(self, value: torch.Tensor) -> None:
    """Accumulate one value into the running total and count.

    When ``util.dist_available()`` is true, the value and a count of 1 are
    summed across all processes before being added to ``self._total_value``
    and ``self._count``.

    Args:
        value: The value to record; the first element produced by
            ``util.unwrap_to_tensors`` is used.
    """
    local_total = list(util.unwrap_to_tensors(value))[0]
    local_count = 1

    if util.dist_available():
        # -1 is handed to util.int_to_device for non-NCCL backends
        # (presumably meaning CPU — confirm against that helper).
        if dist.get_backend() != "nccl":
            device_index = -1
        else:
            device_index = torch.cuda.current_device()
        device = util.int_to_device(device_index)

        count_tensor = torch.tensor(local_count, device=device)
        total_tensor = torch.tensor(local_total, device=device)

        # Reduce from all processes
        dist.all_reduce(count_tensor, op=dist.ReduceOp.SUM)
        dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)

        local_count = count_tensor.item()
        local_total = total_tensor.item()

    self._count += local_count
    self._total_value += local_total
def get_dist_device():
    """Return the device tensors should live on for the active backend.

    Returns:
        * the current CUDA device when the backend is NCCL and CUDA is
          available,
        * the CPU device when the backend is GLOO,
        * ``None`` otherwise (no initialized process group, or any other
          backend), so that ``tensor.to(None)`` leaves the device unchanged.
    """
    if not dist.is_initialized():
        return None

    backend = dist.get_backend()
    if backend == "nccl" and torch.cuda.is_available():
        return torch.device(f"cuda:{torch.cuda.current_device()}")
    if backend == "gloo":
        return torch.device("cpu")
    return None
def backward(ctx, grad_output):
    """Reduce-scatter ``grad_output`` to produce this rank's input gradient.

    Args:
        ctx: Autograd context; ``ctx.group`` is the process group to use.
        grad_output: Gradient of the gathered output. Its first dimension
            must be divisible by the group's world size.

    Returns:
        ``(None, gx, None)`` where ``gx`` has the first dimension divided by
        the world size.

    Raises:
        RuntimeError: If the backend is not NCCL, or the first dimension is
            not divisible by the world size.
    """
    if dist.get_backend(group=ctx.group) != dist.Backend.NCCL:
        raise RuntimeError("Backend not supported!")

    # The world size is queried once and reused; the previously computed
    # rank was never read and has been dropped.
    world_size = dist.get_world_size(group=ctx.group)
    out_size = list(grad_output.size())
    if out_size[0] % world_size != 0:
        raise RuntimeError(
            f'Tensor with dimensions: {out_size} does '
            f'not have first dimension divisible by world_size: {world_size}'
        )
    out_size[0] = out_size[0] // world_size

    gx = torch.empty(out_size, device=grad_output.device, dtype=grad_output.dtype)
    dist._reduce_scatter_base(gx, grad_output, ReduceOp.SUM, ctx.group)
    return (None, gx, None)
def _broadcast_object_list(object_list, src=0, group=None):
    """Broadcast the objects in ``object_list`` from ``src`` to the group.

    The source rank serializes each object to a byte tensor. Two broadcasts
    follow: the per-object sizes, then the concatenated payload. Every other
    rank overwrites the entries of ``object_list`` in place with the
    deserialized objects.

    With the NCCL backend the tensors are staged on
    ``torch.cuda.current_device()``, so the caller must have selected the
    right device for this process beforehand.

    Args:
        object_list: List of objects. Entries are replaced in place on
            non-source ranks.
        src: Rank that owns the objects to send.
        group: Process group; ranks outside it return immediately.
    """
    if _rank_not_in_group(group):
        return

    my_rank = get_rank()
    # Serialize object_list elements to tensors on src rank.
    if my_rank == src:
        tensor_list, size_list = zip(
            *[_object_to_tensor(obj) for obj in object_list])
        object_sizes_tensor = torch.cat(size_list)
    else:
        object_sizes_tensor = torch.LongTensor(len(object_list))

    group_backend = get_backend(group)
    is_nccl_backend = group_backend == Backend.NCCL
    current_device = torch.device("cpu")
    if is_nccl_backend:
        # We cannot simply use my_rank since rank == device is not
        # necessarily true.
        current_device = torch.device('cuda', torch.cuda.current_device())
    # A single transfer is enough; it is a no-op when the device is the CPU.
    object_sizes_tensor = object_sizes_tensor.to(current_device)

    # Broadcast object sizes
    broadcast(object_sizes_tensor, src=src, group=group)

    # Concatenate and broadcast serialized object tensors
    if my_rank == src:
        object_tensor = torch.cat(tensor_list)
    else:
        object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item())

    if is_nccl_backend:
        object_tensor = object_tensor.to(current_device)
    broadcast(object_tensor, src=src, group=group)

    # Deserialize objects using their stored sizes.
    offset = 0
    if my_rank != src:
        for i, obj_size in enumerate(object_sizes_tensor):
            obj_view = object_tensor[offset:offset + obj_size]
            obj_view = obj_view.type(
                torch.ByteTensor)  # type: ignore[call-overload]
            offset += obj_size
            object_list[i] = _tensor_to_object(obj_view, obj_size)
def reduce(self, op):
    """
    Combine this meter's ``avg`` and ``sum`` across all workers.

    Does nothing when the world size is 1. Otherwise both values are
    summed over all workers, and additionally divided by the world size
    when ``op`` is ``'mean'``.

    :param op: 'sum' or 'mean', reduction operator
    :raises NotImplementedError: for any other ``op``
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    if not get_world_size() > 1:
        return

    # Backward/forward compatibility around
    # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
    # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
    # to accommodate changes in PyTorch's distributed API
    if hasattr(dist, "get_backend"):
        _backend = dist.get_backend()
        backend_enum_holder = dist.DistBackend if hasattr(dist, "DistBackend") else dist.Backend
    else:
        _backend = dist._backend
        backend_enum_holder = dist.dist_backend

    # NCCL needs the values on the GPU; any other backend uses CPU tensors.
    if _backend == backend_enum_holder.NCCL:
        avg = torch.cuda.FloatTensor([self.avg])
        _sum = torch.cuda.FloatTensor([self.sum])
    else:
        avg = torch.FloatTensor([self.avg])
        _sum = torch.FloatTensor([self.sum])

    _reduce_op = dist.ReduceOp if hasattr(dist, "ReduceOp") else dist.reduce_op

    dist.all_reduce(avg, op=_reduce_op.SUM)
    dist.all_reduce(_sum, op=_reduce_op.SUM)
    self.avg = avg.item()
    self.sum = _sum.item()

    if op == 'mean':
        self.avg /= get_world_size()
        self.sum /= get_world_size()
def forward(ctx, group, *tensors):
    """Exchange one tensor with every rank of ``group`` (all-to-all).

    Args:
        ctx: Autograd context; ``group`` is stored on it as ``ctx.group``.
        group: Process group to communicate over.
        *tensors: One tensor per rank in the group.

    Returns:
        A tuple with one received tensor per rank.
    """
    ctx.group = group
    # Query the group size once. The unused `reqs` placeholder list that
    # was allocated here has been removed.
    world_size = dist.get_world_size(group=group)
    out_tensor_list = [torch.empty_like(tensors[i]) for i in range(world_size)]
    my_rank = dist.get_rank(group=group)

    # Implement it on means of scatter/gather, send/recv async operations have issues
    if dist.get_backend(group=group) == dist.Backend.GLOO:
        for i in range(world_size):
            # Only the rank acting as the source supplies the scatter list.
            to_send = list(tensors) if i == my_rank else None
            dist.scatter(out_tensor_list[i], to_send, i, group=group)
    else:
        dist.all_to_all(out_tensor_list, list(tensors), group=group)
    return tuple(out_tensor_list)
def _test_dist_spawn_fn(local_rank, backend, world_size, device, **kwargs):
    """Check the distributed state seen by one spawned worker.

    Verifies the backend, local rank, world size and device type and, when
    ``master_addr`` / ``master_port`` are passed as keyword arguments, that
    the matching environment variables carry those values.
    """
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model, _NativeDistModel), f"{type(_model)} vs _NativeDistModel"

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size

    actual_type = _model.device().type
    assert actual_type == torch.device(device).type

    if "master_addr" in kwargs:
        expected_addr = kwargs["master_addr"]
        assert os.environ["MASTER_ADDR"] == expected_addr
    if "master_port" in kwargs:
        # Environment values are strings, so compare against str(port).
        expected_port = str(kwargs["master_port"])
        assert os.environ["MASTER_PORT"] == expected_port
def __init__(
    self,
    params: _params_t,
    optim: Type[Optimizer] = SGD,
    group: Optional[Any] = None,
    broadcast_buffer_size: int = -1,
    broadcast_fp16: bool = False,
    **default: Any,
):
    """Set up the bookkeeping state and the process-group information.

    Args:
        params: Parameters handed to the base optimizer constructor.
        optim: Optimizer class stored as ``self._optim_constructor``.
        group: Process group to use; ``dist.group.WORLD`` when ``None``.
        broadcast_buffer_size: Accepted but not read by this constructor.
        broadcast_fp16: Stored as ``self.broadcast_fp16``.
        **default: Optimizer defaults, passed to the base constructor and
            kept as ``self._optim_defaults``.
    """
    # The base constructor receives every param; the flag marks that window.
    self.in_super_constructor = True
    super().__init__(params, default)
    self.in_super_constructor = False

    # Partition bookkeeping, all empty at this point.
    self.__per_device_params: Dict[
        torch.device, List[List[Parameter]]] = OrderedDict()  # device, rank, params
    self.__param_rank: Dict[torch.Tensor, int] = {}
    self._partition_parameters: List[List[dict]] = []
    self.__param_to_index: Dict[int, int] = {}
    self.__local_params: Optional[List[torch.Tensor]] = None

    # Immutable construction inputs.
    self._optim_defaults = default
    self._optim_constructor = optim

    # Process-group facts.
    if group is not None:
        self.group = group
    else:
        self.group = dist.group.WORLD
    self.world_size = dist.get_world_size(self.group)
    self.backend = dist.get_backend(self.group)
    self.rank = dist.get_rank(self.group)
    self.global_rank = get_global_rank(self.group, self.rank)
    self._local_to_global_rank = [
        get_global_rank(self.group, local) for local in range(self.world_size)
    ]

    self.broadcast_fp16 = broadcast_fp16
    self.buckets: Dict[torch.device, Dict[int, ParamBucket]] = {}
    self._all_states: List[Dict[str, Any]] = []  # Optional consolidated optimizer state
    self._default_device = torch.device("cpu")

    # Everything tied to the trainable parameters (partition and the
    # optimizer for this shard) is set up last.
    self.refresh_trainable()
def distributed(self):
    """Switch to distributed mode and sync the RNG state from rank 0.

    The RNG state matching the device of the model's first parameter (CUDA
    or CPU) is broadcast from rank 0 and installed on this process.

    Raises:
        KeyError: If the default process group has not been initialized.
    """
    if not dist.is_initialized():
        raise KeyError('Could not set distributed mode for the compression algorithm '
                       'because the default process group has not been initialized.')

    model_on_cuda = next(self.model.parameters()).is_cuda
    if not model_on_cuda:
        rng_state = torch.get_rng_state()
        torch.distributed.broadcast(rng_state, src=0)
        torch.set_rng_state(rng_state)
    else:
        rng_state = torch.cuda.get_rng_state()
        if dist.get_backend() == dist.Backend.NCCL:
            # The state is moved to the GPU for the NCCL broadcast and
            # brought back to the CPU before it is installed.
            rng_state = rng_state.cuda()
        torch.distributed.broadcast(rng_state, src=0)
        torch.cuda.set_rng_state(rng_state.cpu())

    self._distributed = True
def __init__(self, module, message_size=10000000, delay_allreduce=False, shared_param=None):
    """Wrap ``module`` and broadcast its parameters from rank 0.

    Args:
        module: The module to wrap.
        message_size: Stored as ``self.message_size``.
        delay_allreduce: Stored as ``self.delay_allreduce``.
        shared_param: Deprecated; any value other than ``None`` raises.

    Raises:
        ValueError: If ``shared_param`` is given.
    """
    super(DistributedDataParallel, self).__init__()

    # Backward/forward compatibility around
    # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36
    if hasattr(dist, "get_backend"):
        self._backend = dist.get_backend()
        # Some PyTorch versions expose the enum as `DistBackend`, others as
        # `Backend`; probing avoids an AttributeError on the latter.
        if hasattr(dist, "DistBackend"):
            self.backend_enum_holder = dist.DistBackend
        else:
            self.backend_enum_holder = dist.Backend
    else:
        self._backend = dist._backend
        self.backend_enum_holder = dist.dist_backend

    self.warn_on_half = self._backend == self.backend_enum_holder.GLOO

    if shared_param is not None:
        raise ValueError(
            "shared_param is no longer supported as an option. "
            "It was misleadingly named from the start. "
            "It turns out overlapping communication with computation should work fine with shared parameters. "
            "If you still wish to delay communication to the end of the backward pass, "
            "use delay_allreduce=True|False instead."
        )

    self.delay_allreduce = delay_allreduce
    self.message_size = message_size

    self.reduction_stream = torch.cuda.Stream()

    self.module = module

    if self._backend == self.backend_enum_holder.NCCL:
        for param in self.module.parameters():
            assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."

    self.active_params = []

    self.param_type_to_tmp_i = {
        "torch.cuda.HalfTensor": 0,
        "torch.cuda.FloatTensor": 1,
        "torch.cuda.DoubleTensor": 2
    }

    self.create_hooks()

    # Every rank starts from rank 0's parameter values.
    flat_dist_call([param.data for param in self.module.parameters()],
                   dist.broadcast, (0, ))
def __call__(
    self,
    best_span_strings: Union[str, List[str]],
    answer_strings: Union[List[str], List[List[str]]],
):
    """Accumulate exact-match and F1 totals for a batch of predictions.

    Args:
        best_span_strings: One predicted string or a list of them.
        answer_strings: The gold answers for the single prediction, or one
            list of gold answers per prediction.

    In distributed runs the three batch totals are summed across all
    processes before being added to the running totals.
    """
    single_example = not isinstance(best_span_strings, list)
    if single_example:
        best_span_strings = [best_span_strings]
        answer_strings = [answer_strings]  # type: ignore

    cast(List[str], best_span_strings)
    cast(List[List[str]], answer_strings)

    assert len(best_span_strings) == len(answer_strings)

    count = len(best_span_strings)
    exact_match = 0
    f1_score = 0.0

    for predicted, golds in zip(best_span_strings, answer_strings):
        exact_match += squad.metric_max_over_ground_truths(
            squad.compute_exact, predicted, golds
        )
        f1_score += squad.metric_max_over_ground_truths(
            squad.compute_f1, predicted, golds
        )

    if is_distributed():
        device = torch.cuda.current_device() if dist.get_backend() == "nccl" else torch.device("cpu")

        # Converting bool to int here, since we want to count the number of exact matches.
        em_tensor = torch.tensor(exact_match, dtype=torch.int).to(device)
        f1_tensor = torch.tensor(f1_score, dtype=torch.double).to(device)
        count_tensor = torch.tensor(count).to(device)

        for shared in (em_tensor, f1_tensor, count_tensor):
            dist.all_reduce(shared, op=dist.ReduceOp.SUM)

        exact_match = em_tensor.item()
        f1_score = f1_tensor.item()
        count = count_tensor.item()

    self._total_em += exact_match
    self._total_f1 += f1_score
    self._count += count
def synchronize():
    """
    Barrier across all processes in distributed training.

    Returns immediately when ``torch.distributed`` is unavailable, not
    initialized, or running with a single process.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return
    if dist.get_world_size() == 1:
        return

    if dist.get_backend() != dist.Backend.NCCL:
        dist.barrier()
        return

    # This argument is needed to avoid warnings.
    # It's valid only for NCCL backend.
    dist.barrier(device_ids=[torch.cuda.current_device()])
def synchronize_between_processes(self):
    """Gather every process's predictions onto rank 0.

    Returns:
        A ``(predictions, is_main)`` tuple.

        * Not distributed: ``(self.predictions, True)``.
        * Rank 0: the concatenation of all ranks' predictions, and ``True``.
        * Any other rank: ``([], False)``. These ranks receive nothing from
          the gather, so chaining the placeholder list would raise
          ``TypeError``.
    """
    if not dist.is_initialized():
        return self.predictions, True

    # Bypass NCCL (which forces CUDA-only sync)
    if dist.get_backend() == "nccl":
        group = dist.new_group(backend="gloo")
    else:
        group = dist.group.WORLD

    is_main = dist.get_rank() == 0
    # Only the destination rank supplies a receive list.
    gathered = [None for _ in range(dist.get_world_size())] if is_main else None
    dist.gather_object(self.predictions, gathered, dst=0, group=group)

    if not is_main:
        return [], False
    return list(itertools.chain.from_iterable(gathered)), True
def peak_memory_mb() -> Dict[int, float]:
    """
    Report the peak memory usage of every worker in megabytes, measured as
    max-resident-set size:
    https://unix.stackexchange.com/questions/30940/getrusage-system-call-what-is-maximum-resident-set-size

    Only works on OSX and Linux, otherwise the result will be 0.0 for every worker.

    Returns a dict keyed by global rank; a non-distributed run yields a
    single entry under key 0.
    """
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_mb = 0.0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # On OSX the result is in bytes; on Linux it is in kilobytes.
        divisor = 1_000_000 if sys.platform == "darwin" else 1_000
        peak_mb = peak / divisor

    if not is_distributed():
        return {0: peak_mb}

    global_rank = dist.get_rank()
    world_size = dist.get_world_size()

    local_report = torch.tensor([float(global_rank), peak_mb])
    # All of these tensors will be gathered into this list.
    reports = [torch.tensor([0.0, 0.0]) for _ in range(world_size)]

    # If the backend is 'nccl', this means we're training on GPUs, so these tensors
    # need to be on GPU.
    if dist.get_backend() == "nccl":
        local_report = local_report.cuda()
        reports = [slot.cuda() for slot in reports]

    dist.all_gather(reports, local_report)

    # Each gathered entry is (rank, peak MB); the peak is rounded to 3 decimals.
    return {int(report[0]): round(float(report[1]), 3) for report in reports}
def _serialize_to_tensor(data, group):
    """Pickle ``data`` into a byte tensor on the device the backend needs.

    The backend is taken to be ``"nccl"`` when the module flag ``_USE_HVD``
    is set; otherwise it is read from ``group``. Gloo selects the CPU, nccl
    selects CUDA, and any other backend fails the assertion.

    Args:
        data: Any picklable object.
        group: Process group queried for its backend when ``_USE_HVD`` is
            false.

    Returns:
        A ``torch.ByteTensor`` holding the pickled bytes.
    """
    backend = "nccl" if _USE_HVD else dist.get_backend(group)
    assert backend in ["gloo", "nccl"]

    if backend == "gloo":
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")

    payload = pickle.dumps(data)
    payload_len = len(payload)
    if payload_len > 1024**3:
        # Payloads above 1 GiB are logged as a warning but still sent.
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".
            format(get_rank(), payload_len / (1024**3), device))

    storage = torch.ByteStorage.from_buffer(payload)
    return torch.ByteTensor(storage).to(device=device)