def _test_all_gather_helper(self, group, group_id, rank):
    for dest in group:
        tensor = _build_tensor(dest + 1, rank)
        tensors = [_build_tensor(dest + 1, -1) for i in group]
        dist.all_gather(tensors, tensor, group_id)

        expected_tensors = [_build_tensor(dest + 1, i) for i in group]
        for t1, t2 in zip(tensors, expected_tensors):
            self.assertEqual(t1, t2)

    self._barrier()

def _test_all_gather_helper(self, group, group_id, rank, cuda=False, rank_to_GPU=None):
    for dest in group:
        tensor = _build_tensor(dest + 1, rank)
        tensors = [_build_tensor(dest + 1, -1) for i in group]
        if cuda:
            tensor = tensor.cuda(rank_to_GPU[rank][0])
            tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors]
        dist.all_gather(tensors, tensor, group_id)

        expected_tensors = [_build_tensor(dest + 1, i) for i in group]
        for t1, t2 in zip(tensors, expected_tensors):
            self.assertEqual(t1, t2)

    self._barrier()

def _ranks_on_same_node(rank, world_size):
    hostname = socket.gethostname()
    hostname_length = torch.IntTensor([len(hostname)])
    dist.all_reduce(hostname_length, op=dist.reduce_op.MAX)
    max_hostname_length = hostname_length.item()

    encoding = [ord(c) for c in hostname]
    encoding += [-1 for c in range(max_hostname_length - len(hostname))]
    encoding = torch.IntTensor(encoding)
    all_encodings = [torch.IntTensor([0] * max_hostname_length) for _ in range(world_size)]
    dist.all_gather(all_encodings, encoding)

    all_encodings = [ec.numpy().tolist() for ec in all_encodings]
    counter = 0
    for i in range(rank):
        if all_encodings[rank] == all_encodings[i]:
            counter += 1
    return counter

def peak_cpu_memory() -> Dict[int, int]:
    """
    Get peak memory usage for each worker, as measured by max-resident-set size:

    https://unix.stackexchange.com/questions/30940/getrusage-system-call-what-is-maximum-resident-set-size

    Only works on OSX and Linux, otherwise the result will be 0.0 for every worker.
    """
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_bytes = 0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == "darwin":
            # On OSX the result is in bytes.
            peak_bytes = peak
        else:
            # On Linux the result is in kilobytes.
            peak_bytes = peak * 1_024

    if is_distributed():
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        peak_bytes_tensor = torch.tensor([global_rank, peak_bytes])
        # All of these tensors will be gathered into this list.
        gather_results = [torch.tensor([0, 0]) for _ in range(world_size)]

        # If the backend is 'nccl', this means we're training on GPUs, so these tensors
        # need to be on GPU.
        if dist.get_backend() == "nccl":
            peak_bytes_tensor = peak_bytes_tensor.cuda()
            gather_results = [x.cuda() for x in gather_results]

        dist.all_gather(gather_results, peak_bytes_tensor)

        results_dict: Dict[int, int] = {}
        for peak_bytes_tensor in gather_results:
            results_dict[int(peak_bytes_tensor[0])] = int(peak_bytes_tensor[1])

        return results_dict
    else:
        return {0: peak_bytes}

def main(length):
    """Set up an array of specified length and gather it back to the root process."""
    rank = dist.get_rank()
    comm_size = dist.get_world_size()
    print(f'Starting rank {rank} of {comm_size}')

    x = torch.ones(length) * rank  # Default type is float, which is a good choice.
    buf = [torch.empty(length) for i in range(comm_size)]
    dist.all_gather(buf, x)  # Synchronous collective: all processes block until complete.

    if rank == 0:
        rslt = torch.stack(buf)
        print(f'rank: {rank}:\n{rslt}')
    else:
        print(f'rank: {rank}: done.\n')

def all_gather(self, collectiveArgs, retFlag=False):
    retObj = dist.all_gather(
        collectiveArgs.tensorList,
        collectiveArgs.ipTensor,
        group=collectiveArgs.group,
        async_op=collectiveArgs.asyncOp,
    )  # synchronicity is maintained in runColl
    if retFlag:
        return retObj
    else:
        return

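# A minimal sketch (not part of the wrapper above) of how the work handle returned
# when async_op=True is typically consumed. It assumes an already-initialized process
# group; the function and variable names here are illustrative assumptions.
import torch
import torch.distributed as dist


def async_all_gather_example(local_tensor):
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(local_tensor) for _ in range(world_size)]
    # Launch the collective without blocking the caller.
    work = dist.all_gather(gathered, local_tensor, async_op=True)
    # ... other computation could overlap with the communication here ...
    work.wait()  # block until the gather has completed on this rank
    return gathered
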
def check_distributed_masks(self):
    if not self._distributed or dist.get_world_size() == 1:
        return 1

    nvalues = 0
    ncor_values = 0
    eps = 1e-4
    for minfo in self.sparsified_module_info:
        mask = minfo.operand.mask

        mask_list = [torch.empty_like(mask) for _ in range(dist.get_world_size())]
        # nccl does not support gather, send, recv operations
        dist.all_gather(mask_list, mask)

        for i in range(1, len(mask_list)):
            rel_error = (mask_list[0] - mask_list[i]) / mask_list[0]
            ncor_values = ncor_values + (rel_error.abs() < eps).sum(dtype=mask.dtype)
            nvalues = nvalues + mask_list[i].numel()

    return ncor_values / nvalues

def all_gather(tensors):
    """
    All gathers the provided tensors from all processes across machines.
    Args:
        tensors (list): tensors to perform all gather across all processes in
        all machines.
    """
    gather_list = []
    output_tensor = []
    world_size = dist.get_world_size()
    for tensor in tensors:
        tensor_placeholder = [
            torch.ones_like(tensor) for _ in range(world_size)
        ]
        dist.all_gather(tensor_placeholder, tensor, async_op=False)
        gather_list.append(tensor_placeholder)
    for gathered_tensor in gather_list:
        output_tensor.append(torch.cat(gathered_tensor, dim=0))
    return output_tensor

def pad_to_largest_tensor(tensor, group):
    world_size = dist.get_world_size(group=group)
    assert (
        world_size >= 1
    ), "comm.gather/all_gather must be called from ranks within the given group!"
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [
        torch.zeros([1], dtype=torch.int64, device=tensor.device)
        for _ in range(world_size)
    ]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]

    max_size = max(size_list)

    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor

def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list

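# Hedged usage sketch for the pickle-based all_gather above; it assumes an
# initialized process group with CUDA available on every rank. `local_metrics`
# and the dictionary keys are illustrative, not taken from the original code.
def gather_metrics_example():
    local_metrics = {"rank": dist.get_rank(), "num_samples": 128}
    all_metrics = all_gather(local_metrics)  # one dict per rank, in rank order
    total_samples = sum(m["num_samples"] for m in all_metrics)
    return total_samples
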
def all_reduce_check(self, rank, data, architecture, args):
    logger = logging.getLogger('test_logger_rank{}'.format(rank))
    trainer = AllReduceTrainer(rank=rank, data=data, architecture=architecture,
                               args=args, logger=logger)

    # Compute the forward and backward passes
    inputs, target = next(iter(trainer.train_loader))
    inputs, target = Variable(inputs), Variable(target)
    out = trainer.model(inputs)
    loss = trainer.loss_fn(out, target)
    loss.backward()

    before = [[
        torch.zeros(param.data.shape) for _ in range(args.num_workers)
    ] for param in trainer.model.parameters()]
    after = deepcopy(before)

    # collect gradients from each worker and ensure they're unequal
    for before_list, param in zip(before, trainer.model.parameters()):
        dist.all_gather(before_list, param.grad.data)
        dist.barrier()

        # ensure grads are unequal
        assert len(set(repr(b.tolist()) for b in before_list)) \
            == args.num_workers

    # run on-forward hook which should perform all-reduce on gradients
    trainer.on_forward_fn()

    # collect gradients again and ensure they're equal,
    # and are the average of gradients collected earlier
    for after_list, before_list, param \
            in zip(after, before, trainer.model.parameters()):
        dist.all_gather(after_list, param.grad.data)
        dist.barrier()

        # ensure params grads are equal
        assert len(set(repr(a.tolist()) for a in after_list)) == 1

        # ensure params grads are averaged
        exp_avg = np \
            .vstack([b.numpy().ravel() for b in before_list]) \
            .mean(axis=0)
        act_avg = after_list[0].numpy().ravel()
        assert np.allclose(exp_avg, act_avg)
    return

def forward(self, input, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size):
    input = input.contiguous()

    count = torch.empty(1,
                        dtype=running_mean.dtype,
                        device=input.device).fill_(input.numel() // input.size(1))

    # calculate mean/invstd for input.
    mean, invstd = torch.batch_norm_stats(input, eps)

    num_channels = input.shape[1]
    # C, C, 1 -> (2C + 1)
    combined = torch.cat([mean, invstd, count], dim=0)
    # world_size * (2C + 1)
    combined_list = [torch.empty_like(combined) for k in range(world_size)]
    # Use allgather instead of allreduce since I don't trust in-place operations ..
    dist.all_gather(combined_list, combined, async_op=False)
    combined = torch.stack(combined_list, dim=0)
    # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
    mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)

    size = count_all.view(-1).long().sum()
    if size == 1:
        raise ValueError(
            'Expected more than 1 value per channel when training, got input size {}'
            .format(size))

    # calculate global mean & invstd
    mean, invstd = torch.batch_norm_gather_stats_with_counts(
        input,
        mean_all,
        invstd_all,
        running_mean,
        running_var,
        momentum,
        eps,
        count_all.view(-1)
    )

    self.save_for_backward(input, weight, mean, invstd, count_all)
    self.process_group = process_group

    # apply element-wise normalization
    out = torch.batch_norm_elemt(input, weight, bias, mean, invstd, eps)
    return out

def test_evaluation(model, val_set):
    """Test trained network

    Args:
        model (nn.Model): Trained model to be evaluated
        val_set (DataLoader): Validation set to perform the evaluation
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    device = torch.device('cuda', rank)

    # Setup counter of images predicted to 0.
    predicted_ok = 0
    total_images = 0

    # make list to collect test accuracies for each gpu
    acc_list = [
        torch.zeros(1, dtype=torch.float).to(device) for _ in range(size)
    ]

    model.eval()
    for images, labels in val_set:
        # Predict image.
        images = images.to(device)
        labels = labels.to(device)

        images = images.view(images.shape[0], -1)
        pred = model(images)

        _, predicted = torch.max(pred.data, 1)
        total_images += labels.size(0)
        predicted_ok += (predicted == labels).sum().item()

    # gather the per-rank accuracy after the validation loop finishes
    dist.all_gather(acc_list, torch.tensor(predicted_ok / total_images).to(device))

    if rank == 0:
        acc = torch.mean(torch.cat(acc_list, 0))
        print('\nNumber Of Images Tested = {}'.format(total_images))
        print('Model Accuracy = {}'.format(acc))

def check_params_distributed(net, n_gpus, rank):
    param = next(net.parameters())
    tensor_list = [param.new_empty(param.shape) for i in range(n_gpus)]
    dist.all_gather(tensor_list, param)
    if rank == 0:
        for i in range(n_gpus):
            if not torch.isnan(tensor_list[0]).any() and \
                    not torch.isnan(tensor_list[1]).any() and \
                    not torch.allclose(tensor_list[0], tensor_list[i]):
                print('WARNING!!!! PARAMS NOT EQUAL')
                # from pdb import set_trace; set_trace()

    if param.grad is not None:
        tensor_list = [param.new_empty(param.shape) for i in range(n_gpus)]
        dist.all_gather(tensor_list, param.grad)
        if rank == 0:
            for i in range(n_gpus):
                if not torch.isnan(tensor_list[0]).any() and \
                        not torch.isnan(tensor_list[1]).any() and \
                        not torch.allclose(tensor_list[0], tensor_list[i]):
                    print('WARNING!!!! GRADS NOT EQUAL')

def all_gather(data, group=None):
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]

    tensor = serialize_to_tensor(data, group)
    size_list, tensor = pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)

    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
        for _ in size_list
    ]
    dist.all_gather(tensor_list, tensor, group=group)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list

def dist_collect_other(x, return_before_cat=False):
    """ collect all tensor from all GPUs except current one
    args:
        x: shape (mini_batch, ...)
    returns:
        shape (mini_batch * (num_gpu - 1), ...)
    """
    x = x.contiguous()
    out_list = [
        torch.zeros_like(x, device=x.device, dtype=x.dtype)
        for _ in range(dist.get_world_size())
    ]
    dist.all_gather(out_list, x)
    # keep only the non-local entries.
    out_list = [
        out_list[rank] for rank in range(dist.get_world_size())
        if rank != dist.get_rank()
    ]
    if return_before_cat:
        return out_list
    return torch.cat(out_list, dim=0)

def all_gather(tensor, group, return_tensor=False):
    """Perform an all-gather operation."""
    if use_xla():
        result = xm.all_gather(tensor, groups=group[1])
        world_size = get_world_size(group=group)
        result = result.view(world_size, *tensor.size())
        if return_tensor:
            return result
        else:
            return [result[i] for i in range(world_size)]
    else:
        world_size = get_world_size(group=group)
        rank = get_rank(group=group)
        tensor_list = [
            tensor if i == rank else torch.empty_like(tensor)
            for i in range(world_size)
        ]
        dist.all_gather(tensor_list, tensor, group=group)
        if return_tensor:
            return torch.stack(tensor_list, dim=0)
        else:
            return tensor_list

def diag(self, distribute=True):
    """
    get diagonal.
    distribute: True to get the diagonal as a distributed matrix.
                False to get the diagonal as a broadcasted vector via all_gather.
    """
    assert self.shape[0] == self.shape[1]
    rank = dist.get_rank()
    partition = torch.cumsum(torch.LongTensor([0] + self.sizes), 0)
    chunk = torch.diag(self.chunk, partition[rank].item()).view(-1, 1)
    if distribute:
        shape = [self.shape[0], 1]
        sizes = self.sizes
        byrow = True
        return THDistMat(shape, sizes, chunk, byrow)
    else:
        out = self.chunk.new(partition[-1].item(), 1)
        out_split = list(torch.split(out, self.sizes, 0))
        synchronize()
        dist.all_gather(out_split, chunk)
        return out

def save_classifier(num_classes, world_size, classifier, path, logger, do_save):
    tensor = classifier.weight
    split_size = num_classes // world_size + int(num_classes % world_size > 0)
    # outsizes = [min(split_size, num_classes - split_size*rank) for rank in range(world_size)]
    results_list = [
        torch.zeros(split_size, tensor.shape[1]).to(tensor.device)
        for i in range(world_size)
    ]
    dist.all_gather(results_list, tensor)
    result = torch.cat(results_list, dim=0)
    if do_save:
        result = result.detach().cpu()
        directory = os.path.dirname(path)
        if not os.path.exists(directory):
            os.makedirs(directory)
        torch.save(result, path)
        ckpt = open(directory + '/checkpoint', 'w')
        ckpt.write(os.path.basename(path))
        ckpt.close()
        logger.info('Classifier saved to: %s' % path)

def _ranks_on_same_node(rank, world_size):
    hostname = socket.gethostname()
    hostname_length = torch.IntTensor([len(hostname)])
    dist.all_reduce(hostname_length, op=dist.reduce_op.MAX)
    max_hostname_length = hostname_length.item()

    encoding = [ord(c) for c in hostname]
    encoding += [-1 for c in range(max_hostname_length - len(hostname))]
    encoding = torch.IntTensor(encoding)
    all_encodings = [
        torch.IntTensor([0] * max_hostname_length) for _ in range(world_size)
    ]
    dist.all_gather(all_encodings, encoding)

    all_encodings = [ec.numpy().tolist() for ec in all_encodings]
    ranks = []
    for i in range(world_size):
        if all_encodings[rank] == all_encodings[i]:
            ranks.append(i)
    return ranks

def all_gather(data):
    world_size = get_world_size()
    if world_size == 1:
        return data

    batch = torch.tensor(data.shape[0], dtype=torch.long, device=data.device)
    batches = [torch.tensor(0, dtype=torch.long, device=data.device) for _ in range(world_size)]
    dist.all_gather(batches, batch)
    max_batch = max(batches).item()

    max_shape = list(data.shape)
    max_shape[0] = max_batch
    datas = [torch.zeros(max_shape, dtype=data.dtype, device=data.device) for _ in range(world_size)]
    if batch != max_batch:
        pad_shape = max_shape
        pad_shape[0] = max_batch - batch
        data = torch.cat([data, torch.zeros(pad_shape, dtype=data.dtype, device=data.device)])
    dist.all_gather(datas, data)

    datas = [data[:batch] for batch, data in zip(batches, datas)]
    return torch.cat(datas)

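# Illustrative sketch of calling the padded all_gather above when each rank holds
# a different number of rows; the function and variable names below are assumptions
# for demonstration, not part of the original snippet.
def gather_predictions_example(predictions):
    # predictions: shape (local_batch, num_classes); local_batch may differ per rank.
    # The helper pads to the largest batch, gathers, then strips the padding again,
    # so the result has one row per sample across all ranks.
    all_predictions = all_gather(predictions)
    return all_predictions
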
def distmm_thinthin_outer(matA, matB, tmpout=None, out=None):
    '''
    A ((p) x r), B (r x (q)) => AB((p) x q), out row-major
    tmpout: r x q, to all_gather
    out: (p) x q
    B is all_gathered. in NMF: to compute objective.
    '''
    rank = dist.get_rank()
    assert matA.byrow and (not matB.byrow)
    p = matA.shape[0]
    q = matB.shape[1]
    r = matA.shape[1]
    assert r == matB.shape[0]

    shape = [p, q]
    sizes = matA.sizes
    byrow = True

    # all_gather
    if tmpout is None:
        tmpout = torch.t(matB.chunk.new(q, r))
    else:
        assert tmpout.size() == torch.Size([r, q])
        torch.t(tmpout).view(-1)
    split_tmpout_pre = torch.split(tmpout, matB.sizes, dim=1)
    split_tmpout = [x.t() for x in split_tmpout_pre]
    # print(split_tmpout)
    synchronize()
    dist.all_gather(split_tmpout, matB.chunk.t())

    # compute
    if out is None:
        out = matA.chunk.new(matA.sizes[rank], q)
    else:
        assert out.size() == torch.Size([matA.sizes[rank], q])
        out.view(-1)
    chunk = torch.mm(matA.chunk, tmpout, out=out)

    return THDistMat(shape, sizes, chunk, byrow)

def prepare_self_train_data(self, rank, model, idx):
    target_num = min(self.world_size * self.train_batch_size * self.update_interval * self.accum_steps,
                     len(self.train_data["input_ids"]))
    if idx + target_num >= len(self.train_data["input_ids"]):
        select_idx = torch.cat((torch.arange(idx, len(self.train_data["input_ids"])),
                                torch.arange(idx + target_num - len(self.train_data["input_ids"]))))
    else:
        select_idx = torch.arange(idx, idx + target_num)
    assert len(select_idx) == target_num
    idx = (idx + len(select_idx)) % len(self.train_data["input_ids"])
    select_dataset = {"input_ids": self.train_data["input_ids"][select_idx],
                      "attention_masks": self.train_data["attention_masks"][select_idx]}
    dataset_loader = self.make_dataloader(rank, select_dataset, self.eval_batch_size)
    input_ids, input_mask, preds = self.inference(model, dataset_loader, rank, return_type="data")

    gather_input_ids = [torch.ones_like(input_ids) for _ in range(self.world_size)]
    gather_input_mask = [torch.ones_like(input_mask) for _ in range(self.world_size)]
    gather_preds = [torch.ones_like(preds) for _ in range(self.world_size)]
    dist.all_gather(gather_input_ids, input_ids)
    dist.all_gather(gather_input_mask, input_mask)
    dist.all_gather(gather_preds, preds)

    input_ids = torch.cat(gather_input_ids, dim=0).cpu()
    input_mask = torch.cat(gather_input_mask, dim=0).cpu()
    all_preds = torch.cat(gather_preds, dim=0).cpu()

    weight = all_preds ** 2 / torch.sum(all_preds, dim=0)
    target_dist = (weight.t() / torch.sum(weight, dim=1)).t()
    all_target_pred = target_dist.argmax(dim=-1)
    agree = (all_preds.argmax(dim=-1) == all_target_pred).int().sum().item() / len(all_target_pred)
    self_train_dict = {"input_ids": input_ids, "attention_masks": input_mask,
                       "labels": target_dist, "all_target_pred": all_target_pred}
    return self_train_dict, idx, agree

def __call__(
    self,
    predictions: torch.Tensor,
    gold_labels: torch.Tensor,
    mask: Optional[torch.BoolTensor] = None,
):
    """
    # Parameters

    predictions : `torch.Tensor`, required.
        A tensor of predictions of shape (batch_size, ...).
    gold_labels : `torch.Tensor`, required.
        A tensor of the same shape as `predictions`.
    mask : `torch.BoolTensor`, optional (default = `None`).
        A tensor of the same shape as `predictions`.
    """
    predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)

    # Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between
    # the vectors, since each element in the predictions and gold_labels tensor is assumed
    # to be a separate observation.
    predictions = predictions.reshape(-1)
    gold_labels = gold_labels.reshape(-1)

    self.total_predictions = self.total_predictions.to(predictions.device)
    self.total_gold_labels = self.total_gold_labels.to(gold_labels.device)

    if mask is not None:
        mask = mask.reshape(-1)
        self.total_predictions = torch.cat((self.total_predictions, predictions * mask), 0)
        self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels * mask), 0)
    else:
        self.total_predictions = torch.cat((self.total_predictions, predictions), 0)
        self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels), 0)

    if is_distributed():
        world_size = dist.get_world_size()
        device = gold_labels.device

        # Check if batch lengths are equal.
        _all_batch_lengths = [torch.tensor(0) for i in range(world_size)]
        dist.all_gather(
            _all_batch_lengths, torch.tensor(self.total_predictions.shape[0], device=device)
        )
        _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths]

        if len(set(_all_batch_lengths)) > 1:
            # Subsequent dist.all_gather() calls currently do not handle tensors of different length.
            raise RuntimeError(
                "Distributed aggregation for SpearmanCorrelation is currently not supported "
                "for batches of unequal length."
            )

        _total_predictions = [
            torch.zeros(self.total_predictions.shape, device=device) for i in range(world_size)
        ]
        _total_gold_labels = [
            torch.zeros(self.total_gold_labels.shape, device=device) for i in range(world_size)
        ]
        dist.all_gather(_total_predictions, self.total_predictions)
        dist.all_gather(_total_gold_labels, self.total_gold_labels)
        self.total_predictions = torch.cat(_total_predictions, dim=0)
        self.total_gold_labels = torch.cat(_total_gold_labels, dim=0)

def visualize_pseudo_proj(self, logger, iteration):
    def log_grid_image(label, im, iteration=iteration):
        nrow = int(math.ceil(im.size(0) ** 0.5))
        im_grid = torchvision.utils.make_grid(im, nrow=nrow)
        logger.add_image(label, im_grid, iteration)

    if self.distributed:
        if hasattr(self, 'pseudo_im') and self.pseudo_im is not None:
            pseudo_imgs = [self.pseudo_im.clone().zero_() for i in range(dist.get_world_size())]
            dist.all_gather(pseudo_imgs, self.pseudo_im)
            pseudo_imgs = torch.cat(pseudo_imgs, dim=0)

            proj_imgs = [self.proj_im.clone().zero_() for i in range(dist.get_world_size())]
            masks = [self.mask.clone().zero_() for i in range(dist.get_world_size())]
            dist.all_gather(proj_imgs, self.proj_im)
            dist.all_gather(masks, self.mask)
            proj_imgs = torch.cat(proj_imgs, dim=0)
            masks = torch.cat(masks, dim=0)
    else:
        if hasattr(self, 'pseudo_im') and self.pseudo_im is not None:
            pseudo_imgs = self.pseudo_im
            proj_imgs = self.proj_im
            masks = self.mask

    ## write summary
    if self.rank == 0:
        if self.mode == 'step2':
            log_grid_image('Image/pseudo_images', pseudo_imgs / 2 + 0.5, iteration)
            log_grid_image('Image/proj_images', proj_imgs / 2 + 0.5, iteration)
            log_grid_image('Image/mask', masks, iteration)

def collect_results_gpu(result_part, size):
    """Collect results in gpu mode.

    It encodes results to gpu tensors and use gpu communication for results
    collection.

    Args:
        result_part (list): Results to be collected
        size (int): Result size.

    Returns:
        list: Ordered results.
    """
    rank, world_size = get_dist_info()
    # dump result part to tensor with pickle
    part_tensor = torch.tensor(
        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
    # gather all result part tensor shape
    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
    shape_list = [shape_tensor.clone() for _ in range(world_size)]
    dist.all_gather(shape_list, shape_tensor)
    # padding result part tensor to max length
    shape_max = torch.tensor(shape_list).max()
    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
    part_send[:shape_tensor[0]] = part_tensor
    part_recv_list = [
        part_tensor.new_zeros(shape_max) for _ in range(world_size)
    ]
    # gather all result part
    dist.all_gather(part_recv_list, part_send)

    if rank == 0:
        part_list = []
        for recv, shape in zip(part_recv_list, shape_list):
            part_list.append(
                pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        return ordered_results

def gather_partitioned_activations(tensors, device=None):
    global mp_rank, mp_size, mp_group
    assert len(tensors) % 2 == 0, \
        f'Expected even count of tensors, instead got {len(tensors)}'
    inputs = []
    num_args = int(len(tensors) / 2)
    for i in range(num_args):
        item = tensors[2 * i]
        size = tensors[2 * i + 1]

        if not is_activation_to_checkpoint(item):
            inputs.append(item)
            continue

        partition_size = item.numel()
        tensor_size = partition_size * mp_size
        if device is not None:
            flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=device)
        else:
            flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device)
        partitions = []
        for part_idx in range(mp_size):
            part_i = flat_tensor.narrow(0, partition_size * part_idx, partition_size)
            if part_idx == mp_rank:
                part_i.copy_(item)
            partitions.append(part_i)

        if mp_group is not None:
            dist.all_gather(partitions, partitions[mp_rank], group=mp_group)

        input_tensor = flat_tensor.view(list(size.numpy()))
        item.data = input_tensor.data

        inputs.append(item)

    return tuple(inputs)

def report_epoch_stats(self):
    if self.epoch_stats['prefix'] == 'train':
        statistics = [
            self.epoch_stats['num_correct'], self.epoch_stats['num_total'],
            self.epoch_stats['loss']
        ]
    else:
        # aggregate the results from all nodes
        group = dist.new_group(range(self.args.world_size))
        statistics = th.tensor([
            self.epoch_stats['num_correct'], self.epoch_stats['num_total'],
            self.epoch_stats['loss']
        ], dtype=th.float32).cuda()
        if self.args.dist_method == 'reduce':
            dist.reduce(tensor=statistics, dst=0, op=dist.ReduceOp.SUM, group=group)
        elif self.args.dist_method == 'all_gather':
            all_statistics = [
                th.zeros((1, 3)).cuda() for _ in range(self.args.world_size)
            ]
            dist.all_gather(tensor=statistics, tensor_list=all_statistics, group=group)
            statistics = th.sum(th.cat(all_statistics, dim=0), dim=0).cpu().numpy()

    accuracy = float(statistics[0]) / statistics[1]
    loss = statistics[2] / statistics[1]
    if self.epoch_stats['prefix'] != 'test':
        self.logger.info(
            "rank %d, %s phase of epoch %d: accuracy %.6f, loss %.6f, num_correct %d, total %d"
            % (self.args.distributed_rank, self.epoch_stats['prefix'],
               self.epoch_stats['epoch'], accuracy, loss, statistics[0], statistics[1]))
    return accuracy, loss

def peak_gpu_memory() -> Dict[int, int]:
    """
    Get the peak GPU memory usage in bytes by device.

    # Returns

    `Dict[int, int]`
        Keys are device ids as integers.
        Values are memory usage as integers in bytes.

        Returns an empty `dict` if GPUs are not available.
    """
    if not torch.cuda.is_available():
        return {}

    device = torch.cuda.current_device()

    results_dict: Dict[int, int] = {}
    if is_distributed():
        # If the backend is not 'nccl', we're training on CPU.
        if dist.get_backend() != "nccl":
            return {}

        global_rank = dist.get_rank()
        world_size = dist.get_world_size()
        peak_bytes = torch.cuda.max_memory_allocated(device)
        peak_bytes_tensor = torch.tensor([global_rank, peak_bytes], device=device)
        # All of these tensors will be gathered into this list.
        gather_results = [torch.tensor([0, 0], device=device) for _ in range(world_size)]

        dist.all_gather(gather_results, peak_bytes_tensor)

        for peak_bytes_tensor in gather_results:
            results_dict[int(peak_bytes_tensor[0])] = int(peak_bytes_tensor[1])
    else:
        results_dict = {0: torch.cuda.max_memory_allocated()}

    # Reset peak stats.
    torch.cuda.reset_max_memory_allocated(device)

    return results_dict

def get_full_inputs(tensors):
    inputs = []
    for i in range(int(len(tensors) / 2) - 1):
        item = tensors[2 * i]
        size = tensors[2 * i + 1]

        partition_size = item.numel()
        tensor_size = partition_size * mp_size
        flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device)
        partitions = []
        for part_idx in range(mp_size):
            part_i = flat_tensor.narrow(0, partition_size * part_idx, partition_size)
            if part_idx == mp_rank:
                part_i.copy_(item)
            partitions.append(part_i)
        dist.all_gather(partitions, partitions[mp_rank], group=mp_group)
        input_tensor = flat_tensor.view(list(size.numpy()))
        item.data = input_tensor.data
        inputs.append(item)
    inputs.append(tensors[-2])
    return tuple(inputs)

def feed_op(self, batch, mode):
    """Feed data to the metric.

    Args:
        batch (Tensor): Input tensor.
        mode (str): The mode of current data batch. 'reals' or 'fakes'.
    """
    if mode == 'reals':
        pass
    elif mode == 'fakes':
        if self.bgr2rgb:
            batch = batch[:, [2, 1, 0], ...]
        if self.resize:
            if self.use_pil_resize:
                batch = self.pil_resize(batch)
            else:
                batch = F.interpolate(batch, size=(299, 299), mode='bilinear')
        if self.use_tero_script:
            batch = (batch * 127.5 + 128).clamp(0, 255).to(torch.uint8)
        batch = batch.to(self.device)

        # get prediction
        pred = self.get_pred(batch)

        if dist.is_initialized():
            ws = dist.get_world_size()
            placeholder = [torch.zeros_like(pred) for _ in range(ws)]
            dist.all_gather(placeholder, pred)
            pred = torch.cat(placeholder, dim=0)

        # in distributed training, we only collect features at rank-0.
        if (dist.is_initialized()
                and dist.get_rank() == 0) or not dist.is_initialized():
            self.preds.append(pred.cpu().numpy())
    else:
        raise ValueError(f'{mode} is not an implemented feed mode.')

def prepare(self, label, optimizer):
    """
    Get sampled class centers for calculating softmax.

    label: tensor
        Label tensor on each rank.
    optimizer: opt
        Optimizer for partial fc, which needs access to the weight momentum.
    """
    with torch.cuda.stream(self.stream):
        total_label = torch.zeros(
            size=[self.batch_size * self.world_size],
            device=self.device,
            dtype=torch.long)
        dist.all_gather(list(total_label.chunk(self.world_size, dim=0)), label)
        self.sample(total_label)
        optimizer.state.pop(optimizer.param_groups[-1]['params'][0], None)
        optimizer.param_groups[-1]['params'][0] = self.sub_weight
        optimizer.state[self.sub_weight]['momentum_buffer'] = self.sub_weight_mom
        norm_weight = normalize(self.sub_weight)
        return total_label, norm_weight

def _pad_to_largest_tensor(tensor, group):
    world_size = dist.get_world_size(group=group)
    assert world_size >= 1, \
        "comm.gather/all_gather must be called from ranks within the given group!"
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [
        torch.zeros([1], dtype=torch.int64, device=tensor.device)
        for _ in range(world_size)
    ]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    if local_size != max_size:
        padding = torch.zeros((max_size - local_size, ),
                              dtype=torch.uint8,
                              device=tensor.device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor

            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.gather(tensor, dst=0)

dist.barrier()

if rank == 0:
    print_header("all gather")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        tensors = [tensor for n in range(0, dist.get_world_size())]
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.all_gather(tensors, tensor)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        tensors = [tensor for n in range(0, dist.get_world_size())]
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.all_gather(tensors, tensor)

dist.barrier()