import math
import warnings
from typing import List, Optional

import torch
import torch.distributed as dist
from torch.utils.data import Dataset
from transformers import BatchEncoding


def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
    warnings.warn(
        "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
        FutureWarning,
    )
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    num_samples = len(self.dataset)
    # Add extra samples to make num_samples a multiple of batch_size if passed
    if batch_size is not None:
        self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
    else:
        self.num_samples = int(math.ceil(num_samples / num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.batch_size = batch_size
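# A minimal sketch (not from the source) of how the padding arithmetic above
# plays out. With 103 samples, 4 replicas, and batch_size=8, each rank is
# assigned ceil(103 / (8 * 4)) * 8 = 32 samples, so total_size = 128 and
# 25 extra index positions must be padded to make the split even.
num_samples, num_replicas, batch_size = 103, 4, 8
per_rank = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
assert per_rank == 32
assert per_rank * num_replicas == 128  # 25 more than the 103 real samples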
def __init__(
    self,
    batch_size: int,
    dataset: Optional[Dataset] = None,
    num_replicas: Optional[int] = None,
    rank: Optional[int] = None,
    seed: int = 0,
    drop_last: bool = False,
    lengths: Optional[List[int]] = None,
    model_input_name: Optional[str] = None,
):
    if dataset is None and lengths is None:
        raise ValueError("One of dataset and lengths must be provided.")
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.batch_size = batch_size
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.drop_last = drop_last

    if lengths is None:
        model_input_name = model_input_name if model_input_name is not None else "input_ids"
        if (
            not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
            or model_input_name not in dataset[0]
        ):
            raise ValueError(
                "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                f"'{model_input_name}' key."
            )
        lengths = [len(feature[model_input_name]) for feature in dataset]
    self.lengths = lengths

    # If the dataset length is evenly divisible by the number of replicas, there is
    # no need to drop any data, since the dataset will be split equally.
    if self.drop_last and len(self.lengths) % self.num_replicas != 0:
        # Split to the nearest length that is evenly divisible, so that each rank
        # receives the same amount of data when using this sampler.
        self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
    else:
        self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
    self.total_size = self.num_samples * self.num_replicas
    self.seed = seed
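# Hedged usage sketch (the toy data below is illustrative, not from the
# source): the length-inference branch above expects dataset items that are
# dicts (or BatchEncodings) carrying the model input under model_input_name.
toy_dataset = [
    {"input_ids": [101, 2023, 2003, 102]},  # length 4
    {"input_ids": [101, 2460, 102]},        # length 3
]
inferred = [len(feature["input_ids"]) for feature in toy_dataset]
assert inferred == [4, 3]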
def __init__(self, dataset, num_replicas=None, rank=None):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def all_reduce_item(value, op='sum'):
    """
    All-reduces a single scalar value if distributed is in use.
    """
    if dist.is_available() and dist.is_initialized():
        if op == 'sum' or op == 'mean':
            dop = dist.ReduceOp.SUM
        elif op == 'min':
            dop = dist.ReduceOp.MIN
        elif op == 'max':
            dop = dist.ReduceOp.MAX
        elif op == 'product':
            dop = dist.ReduceOp.PRODUCT
        else:
            raise RuntimeError('Unsupported reduce op')

        # backend = dist.get_backend()
        # if backend == dist.Backend.NCCL:
        #     device = torch.device('cuda')
        # elif backend == dist.Backend.GLOO:
        #     device = torch.device('cpu')
        # else:
        #     raise RuntimeError('Unsupported distributed backend')

        device = torch.device('cuda')
        tensor = torch.tensor(value, device=device)
        dist.all_reduce(tensor, dop)
        if op == 'mean':
            tensor /= get_world_size()
        ret = tensor.item()
    else:
        ret = value
    return ret
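# Hedged usage sketch: averaging a per-rank scalar, such as a step loss,
# across all workers. `loss_value` is an illustrative name, not from the
# source; the call simply returns the input unchanged when torch.distributed
# is not initialized.
loss_value = 0.5  # per-rank scalar
avg_loss = all_reduce_item(loss_value, op='mean')
max_loss = all_reduce_item(loss_value, op='max')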
def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    else:
        rank = 0
    return rank
def get_world_size():
    """
    Gets total number of distributed workers or returns one if distributed is
    not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
    else:
        world_size = 1
    return world_size
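# Hedged sketch of the pattern these helpers enable: restricting side effects
# such as logging to rank 0, while staying safe in single-process runs, where
# get_rank() returns 0 and get_world_size() returns 1.
if get_rank() == 0:
    print(f"running with {get_world_size()} worker(s)")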
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    num_samples = len(self.dataset)
    # Add extra samples to make num_samples a multiple of batch_size if passed
    if batch_size is not None:
        self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
    else:
        self.num_samples = int(math.ceil(num_samples / num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.batch_size = batch_size
def barrier():
    """
    Call dist.barrier() if distributed is in use
    """
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
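# Hedged sketch (the file path is illustrative, not from the source): have
# rank 0 write a checkpoint, then synchronize so no rank proceeds to read the
# file before it exists. barrier() is a no-op outside distributed runs, so the
# same code works in a single process.
if get_rank() == 0:
    torch.save({"step": 0}, "checkpoint.pt")
barrier()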