def gather_detections(all_detections, samples_per_rank):
    """Gather the detection tensors of all ranks into a single tensor.

    The local tensor is moved to the current CUDA device, zero-padded along
    dim 0 up to ``samples_per_rank`` rows, exchanged with
    ``dist.all_gather`` and the received pieces are concatenated along dim 0.

    Args:
        all_detections: Tensor whose first dimension indexes the local
            samples. Any number of trailing dimensions is accepted.
        samples_per_rank: Number of rows each rank contributes after padding.
            NOTE(review): presumably identical on every rank and not smaller
            than ``all_detections.size(0)`` -- confirm against callers.

    Returns:
        A CUDA tensor of shape
        ``(world_size * samples_per_rank, *all_detections.shape[1:])`` with
        the same dtype as ``all_detections``.
    """
    world_size = get_world_size()
    local = all_detections.cuda()

    # F.pad lists (left, right) pairs starting from the LAST dimension, so
    # the pair that applies to dim 0 has to come at the very end of the spec.
    rows_missing = samples_per_rank - local.size(0)
    pad_spec = [0, 0] * (local.dim() - 1) + [0, rows_missing]
    local = F.pad(local, pad_spec)

    # Allocate the receive buffers directly on the device and with the dtype
    # of the payload; all_gather needs them to match the tensor being sent.
    gathered = [
        torch.zeros(local.shape, dtype=local.dtype, device=local.device)
        for _ in range(world_size)
    ]
    dist.all_gather(gathered, local)
    return torch.cat(gathered)
def check_distributed_masks(self):
    """Return the fraction of mask entries that agree with rank 0's copy.

    For every entry of ``self.sparsified_module_info`` the mask is gathered
    from all ranks and each non-zero rank's copy is compared element-wise
    against rank 0's copy with a relative tolerance of ``1e-4``.

    Returns:
        ``1`` when not running distributed, when the world size is 1, or
        when there is nothing to compare. Otherwise a tensor holding
        ``matching_entries / compared_entries``.
    """
    if not self._distributed:
        return 1
    world_size = get_world_size()
    if world_size == 1:
        return 1

    eps = 1e-4
    n_compared = 0
    n_matching = 0
    for minfo in self.sparsified_module_info:
        mask = minfo.operand.mask
        gathered = [torch.empty_like(mask) for _ in range(world_size)]
        # nccl does not support gather, send, recv operations
        dist.all_gather(gathered, mask)

        reference = gathered[0]
        tolerance = eps * reference.abs()
        for other in gathered[1:]:
            # Division-free relative comparison: dividing by the reference
            # yields NaN where both entries are 0, which would count
            # identical zero entries as mismatches.
            agrees = (reference - other).abs() <= tolerance
            n_matching = n_matching + agrees.sum(dtype=mask.dtype)
            n_compared = n_compared + other.numel()

    if n_compared == 0:
        # No masks (or only empty masks): vacuously consistent.
        return 1
    return n_matching / n_compared