Example #1
0
    def _test_all_gather_helper(self, group, group_id, rank):
        """Exercise ``dist.all_gather`` once per entry of ``group``.

        For every ``dest`` in ``group`` this rank contributes
        ``_build_tensor(dest + 1, rank)``; the gathered list is then compared,
        position by position, with ``_build_tensor(dest + 1, member)`` for
        each ``member`` of ``group``.  ``self._barrier()`` is called once
        after all iterations.
        """
        for dest in group:
            size = dest + 1
            local = _build_tensor(size, rank)
            # Output slots are built with -1 and handed to all_gather to fill.
            gathered = [_build_tensor(size, -1) for _ in group]
            dist.all_gather(gathered, local, group_id)

            for member, actual in zip(group, gathered):
                self.assertEqual(actual, _build_tensor(size, member))

        self._barrier()
Example #2
0
    def _test_all_gather_helper(self, group, group_id, rank, cuda=False, rank_to_GPU=None):
        """Exercise ``dist.all_gather`` once per entry of ``group``.

        For every ``dest`` in ``group`` this rank contributes
        ``_build_tensor(dest + 1, rank)``.  When ``cuda`` is true, the input
        and every output slot are first moved to ``rank_to_GPU[rank][0]``.
        The gathered list is compared, position by position, with
        ``_build_tensor(dest + 1, member)`` for each ``member`` of ``group``.
        ``self._barrier()`` is called once after all iterations.
        """
        for dest in group:
            size = dest + 1
            local = _build_tensor(size, rank)
            gathered = [_build_tensor(size, -1) for _ in group]
            if cuda:
                gpu = rank_to_GPU[rank][0]
                local = local.cuda(gpu)
                gathered = [slot.cuda(gpu) for slot in gathered]
            dist.all_gather(gathered, local, group_id)

            for member, actual in zip(group, gathered):
                self.assertEqual(actual, _build_tensor(size, member))

        self._barrier()
Example #3
0
def _ranks_on_same_node(rank, world_size):
    """Count the ranks below ``rank`` that report the same hostname.

    Each process encodes its hostname as a list of character codes, padded
    with -1 to the longest hostname length (found via an all-reduce MAX),
    and the encodings are all-gathered so they can be compared.

    Returns:
        int: number of ``i < rank`` whose gathered encoding equals the
        encoding gathered for ``rank``.
    """
    host = socket.gethostname()
    longest = torch.IntTensor([len(host)])
    dist.all_reduce(longest, op=dist.reduce_op.MAX)
    width = longest.item()

    # Pad with -1 so every rank contributes a tensor of identical length.
    codes = [ord(ch) for ch in host]
    codes.extend([-1] * (width - len(host)))
    local_codes = torch.IntTensor(codes)

    slots = [torch.IntTensor([0] * width) for _ in range(world_size)]
    dist.all_gather(slots, local_codes)

    hosts = [slot.numpy().tolist() for slot in slots]
    return sum(1 for i in range(rank) if hosts[rank] == hosts[i])
def peak_cpu_memory() -> Dict[int, int]:
    """
    Report peak CPU memory per worker, taken from the max-resident-set size
    (`ru_maxrss`) returned by `resource.getrusage`:

    https://unix.stackexchange.com/questions/30940/getrusage-system-call-what-is-maximum-resident-set-size

    The value is 0 when `resource` is unavailable or the platform is neither
    Linux nor OSX.

    # Returns

    `Dict[int, int]`
        Maps global rank to peak bytes. When not running distributed, the
        single entry is keyed by 0.
    """
    peak_bytes = 0
    if resource is not None and sys.platform in ("linux", "darwin"):
        max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # OSX reports bytes, Linux reports kilobytes.
        peak_bytes = max_rss if sys.platform == "darwin" else max_rss * 1_024

    if not is_distributed():
        return {0: peak_bytes}

    global_rank = dist.get_rank()
    world_size = dist.get_world_size()

    local_entry = torch.tensor([global_rank, peak_bytes])
    # One (rank, bytes) slot per worker, filled by all_gather.
    slots = [torch.tensor([0, 0]) for _ in range(world_size)]

    # With the 'nccl' backend we're training on GPUs, so the tensors have to
    # live on GPU as well.
    if dist.get_backend() == "nccl":
        local_entry = local_entry.cuda()
        slots = [slot.cuda() for slot in slots]

    dist.all_gather(slots, local_entry)

    return {int(entry[0]): int(entry[1]) for entry in slots}
Example #5
0
def main(length):
    """Fill a tensor of ``length`` elements with this rank and all-gather it.

    Every rank prints a start message; rank 0 prints the stacked result of
    the gather and all other ranks print a completion message.
    """
    rank = dist.get_rank()
    comm_size = dist.get_world_size()

    print(f'Starting rank {rank} of {comm_size}')

    # torch.ones defaults to a float tensor.
    x = torch.ones(length) * rank
    buf = [torch.empty(length) for _ in range(comm_size)]

    # Called synchronously (no async_op), so this blocks until complete.
    dist.all_gather(buf, x)

    if rank != 0:
        print(f'rank: {rank}:  done.\n')
        return

    rslt = torch.stack(buf)
    print(f'rank: {rank}:\n{rslt}')
Example #6
0
 def all_gather(self, collectiveArgs, retFlag=False):
     """Run ``dist.all_gather`` with the settings held in ``collectiveArgs``.

     Reads ``tensorList``, ``ipTensor``, ``group`` and ``asyncOp`` from
     ``collectiveArgs``.  The value returned by ``dist.all_gather`` is
     handed back only when ``retFlag`` is true; otherwise ``None``.
     """
     # synchronicity is maintained in runColl
     handle = dist.all_gather(
         collectiveArgs.tensorList,
         collectiveArgs.ipTensor,
         group=collectiveArgs.group,
         async_op=collectiveArgs.asyncOp,
     )
     return handle if retFlag else None
    def check_distributed_masks(self):
        """Measure how closely the masks on other ranks match rank 0's.

        Returns 1 immediately when not running distributed or when the world
        size is 1.  Otherwise, for every entry of
        ``self.sparsified_module_info`` the mask is all-gathered and each
        non-zero rank's copy is compared with rank 0's copy using an
        element-wise relative error against ``eps = 1e-4``.

        Returns:
            1, or the number of elements within tolerance divided by the
            number of elements compared.
        """
        if not self._distributed or dist.get_world_size() == 1:
            return 1

        eps = 1e-4
        total = 0
        matching = 0
        for module_info in self.sparsified_module_info:
            local_mask = module_info.operand.mask

            # nccl does not support gather, send, recv operations
            gathered = [torch.empty_like(local_mask) for _ in range(dist.get_world_size())]
            dist.all_gather(gathered, local_mask)

            reference = gathered[0]
            for other in gathered[1:]:
                rel_error = (reference - other) / reference
                matching = matching + (rel_error.abs() < eps).sum(dtype=local_mask.dtype)
                total = total + other.numel()

        return matching / total
Example #8
0
def all_gather(tensors):
    """
    All-gather each tensor in ``tensors`` across every process and
    concatenate the gathered copies along dimension 0.
    Args:
        tensors (list): tensors to all-gather; one collective call is issued
        per tensor, in order.
    Returns:
        list: one concatenated tensor per input tensor.
    """
    world_size = dist.get_world_size()

    gathered_groups = []
    for tensor in tensors:
        slots = [torch.ones_like(tensor) for _ in range(world_size)]
        dist.all_gather(slots, tensor, async_op=False)
        gathered_groups.append(slots)

    return [torch.cat(slots, dim=0) for slots in gathered_groups]
Example #9
0
def pad_to_largest_tensor(tensor, group):
    """Pad ``tensor`` with zeros up to the largest element count in ``group``.

    The element count of ``tensor`` is all-gathered over ``group``; if this
    rank's count is not the maximum, uint8 zeros are concatenated along
    dimension 0 (this presumes ``tensor`` is a 1-D uint8 tensor - TODO
    confirm against callers).

    Returns:
        tuple: ``(size_list, tensor)`` where ``size_list`` holds the element
        count gathered from every rank as ints and ``tensor`` is the
        possibly padded input.
    """
    world_size = dist.get_world_size(group=group)
    assert (
        world_size >= 1
    ), "comm.gather/all_gather must be called from ranks within the given group!"

    device = tensor.device
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=device)
    gathered_sizes = [
        torch.zeros([1], dtype=torch.int64, device=device) for _ in range(world_size)
    ]
    dist.all_gather(gathered_sizes, local_size, group=group)
    size_list = [int(entry.item()) for entry in gathered_sizes]

    max_size = max(size_list)

    # torch all_gather cannot gather tensors of different shapes, hence
    # every rank pads up to the common maximum.
    if local_size != max_size:
        filler = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=device)
        tensor = torch.cat((tensor, filler), dim=0)
    return size_list, tensor
Example #10
0
def all_gather(data):
    """
    All-gather arbitrary picklable objects (not only tensors).

    The object is pickled into a CUDA byte tensor, padded to the largest
    payload size among ranks, gathered, and unpickled per rank.
    Args:
        data: any picklable object
    Returns:
        list[data]: the objects gathered from each rank; ``[data]`` when the
        world size is 1.
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # Serialize into a byte tensor on the GPU.
    storage = torch.ByteStorage.from_buffer(pickle.dumps(data))
    payload = torch.ByteTensor(storage).to("cuda")

    # Exchange payload sizes so every rank knows the maximum.
    local_size = torch.tensor([payload.numel()], device="cuda")
    size_slots = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_slots, local_size)
    size_list = [int(entry.item()) for entry in size_slots]
    max_size = max(size_list)

    # torch all_gather cannot gather tensors of different shapes, so the
    # receive buffers and the local payload all use the maximum size.
    recv_slots = [
        torch.empty((max_size,), dtype=torch.uint8, device="cuda") for _ in size_list
    ]
    if local_size != max_size:
        filler = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        payload = torch.cat((payload, filler), dim=0)
    dist.all_gather(recv_slots, payload)

    # Drop the padding before unpickling each rank's bytes.
    return [
        pickle.loads(chunk.cpu().numpy().tobytes()[:size])
        for size, chunk in zip(size_list, recv_slots)
    ]
Example #11
0
 def all_reduce_check(self, rank, data, architecture, args):
     """Assert that ``trainer.on_forward_fn()`` averages gradients across workers.

     Builds an ``AllReduceTrainer``, runs one forward/backward pass on the
     first batch of its train loader, all-gathers every parameter's gradient
     and asserts the workers disagree.  After calling
     ``trainer.on_forward_fn()`` the gradients are gathered again and
     asserted to be identical on every worker and equal to the mean of the
     gradients gathered before.
     """
     trainer = AllReduceTrainer(
         rank=rank,
         data=data,
         architecture=architecture,
         args=args,
         logger=logging.getLogger('test_logger_rank{}'.format(rank)),
     )

     # One forward and backward pass on the first batch.
     inputs, target = next(iter(trainer.train_loader))
     inputs, target = Variable(inputs), Variable(target)
     loss = trainer.loss_fn(trainer.model(inputs), target)
     loss.backward()

     # Per parameter: one zero-filled slot per worker.
     before = [
         [torch.zeros(param.data.shape) for _ in range(args.num_workers)]
         for param in trainer.model.parameters()
     ]
     after = deepcopy(before)

     # Gradients gathered before the hook must differ on every worker.
     for grads_before, param in zip(before, trainer.model.parameters()):
         dist.all_gather(grads_before, param.grad.data)
         dist.barrier()
         distinct = set(repr(grad.tolist()) for grad in grads_before)
         assert len(distinct) == args.num_workers

     # The on-forward hook should all-reduce the gradients.
     trainer.on_forward_fn()

     # Gradients gathered afterwards must agree and equal the earlier mean.
     for grads_after, grads_before, param in zip(
             after, before, trainer.model.parameters()):
         dist.all_gather(grads_after, param.grad.data)
         dist.barrier()
         assert len(set(repr(grad.tolist()) for grad in grads_after)) == 1
         stacked = np.vstack([grad.numpy().ravel() for grad in grads_before])
         exp_avg = stacked.mean(axis=0)
         act_avg = grads_after[0].numpy().ravel()
         assert np.allclose(exp_avg, act_avg)
     return
Example #12
0
    def forward(self, input, weight, bias, running_mean, running_var, eps,
                momentum, process_group, world_size):
        """Batch-normalize ``input`` with statistics gathered from all ranks.

        Per-channel mean and inverse std are computed locally, packed with
        the local per-channel element count, all-gathered over
        ``process_group`` and merged into global statistics that are then
        used for element-wise normalization.

        Args:
            input: tensor whose dimension 1 is the channel dimension.
            weight, bias: forwarded to ``torch.batch_norm_elemt``.
            running_mean, running_var, momentum: forwarded to
                ``torch.batch_norm_gather_stats_with_counts``.
            eps: forwarded to the statistics and normalization kernels.
            process_group: group over which statistics are exchanged; also
                stored on ``self``.
            world_size: number of gather slots to allocate; expected to be
                the size of ``process_group``.

        Returns:
            The tensor produced by ``torch.batch_norm_elemt``.

        Raises:
            ValueError: if the total element count per channel over all
                ranks equals 1.
        """
        input = input.contiguous()

        # Elements per channel on this rank.
        count = torch.empty(1, dtype=running_mean.dtype,
                            device=input.device).fill_(input.numel() //
                                                       input.size(1))

        # calculate mean/invstd for input.
        mean, invstd = torch.batch_norm_stats(input, eps)

        num_channels = input.shape[1]
        # C, C, 1 -> (2C + 1)
        combined = torch.cat([mean, invstd, count], dim=0)
        # world_size * (2C + 1)
        combined_list = [torch.empty_like(combined) for _ in range(world_size)]
        # Use allgather instead of allreduce since I don't trust in-place operations ..
        # The gather must run on the caller's process group: the slots are
        # sized by ``world_size`` of that group, not of the default group.
        dist.all_gather(combined_list, combined, group=process_group,
                        async_op=False)
        combined = torch.stack(combined_list, dim=0)
        # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
        mean_all, invstd_all, count_all = torch.split(combined,
                                                      num_channels,
                                                      dim=1)

        size = count_all.view(-1).long().sum()
        if size == 1:
            raise ValueError(
                'Expected more than 1 value per channel when training, got input size {}'
                .format(size))

        # calculate global mean & invstd
        mean, invstd = torch.batch_norm_gather_stats_with_counts(
            input, mean_all, invstd_all, running_mean, running_var, momentum,
            eps, count_all.view(-1))

        self.save_for_backward(input, weight, mean, invstd, count_all)
        self.process_group = process_group

        # apply element-wise normalization
        out = torch.batch_norm_elemt(input, weight, bias, mean, invstd, eps)
        return out
Example #13
0
def test_evaluation(model, val_set):
    """Test trained network

    Each rank evaluates ``val_set`` on its own CUDA device (device index =
    rank), the per-rank accuracies are all-gathered, and rank 0 prints their
    mean together with its own local image count.

    Args:
        model (nn.Model): Trained model to be evaluated
        val_set (DataLoader): Validation set to perform the evaluation
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    device = torch.device('cuda', rank)

    # Counters for correctly predicted and total images on this rank.
    predicted_ok = 0
    total_images = 0

    # One single-element slot per rank to receive the gathered accuracies.
    acc_list = [
        torch.zeros(1, dtype=torch.float, device=device) for _ in range(size)
    ]

    model.eval()

    # Pure inference: no autograd graph is needed for any of these values.
    with torch.no_grad():
        for images, labels in val_set:
            images = images.to(device)
            labels = labels.to(device)

            # Flatten every sample to a vector before the forward pass.
            images = images.view(images.shape[0], -1)
            pred = model(images)

            _, predicted = torch.max(pred, 1)
            total_images += labels.size(0)
            predicted_ok += (predicted == labels).sum().item()

    # Shape (1,) so the input matches the shape of every output slot.
    local_acc = torch.tensor([predicted_ok / total_images],
                             dtype=torch.float,
                             device=device)
    dist.all_gather(acc_list, local_acc)

    if rank == 0:
        acc = torch.mean(torch.cat(acc_list, 0))
        print('\nNumber Of Images Tested = {}'.format(total_images))
        print('Model Accuracy = {}'.format(acc))
def _warn_if_replicas_differ(tensor, n_gpus, rank, message):
    """All-gather ``tensor`` and, on rank 0, print ``message`` once for every
    NaN-free replica that is not close to replica 0."""
    tensor_list = [tensor.new_empty(tensor.shape) for _ in range(n_gpus)]
    dist.all_gather(tensor_list, tensor)
    if rank != 0:
        return

    reference = tensor_list[0]
    # A NaN in the reference makes every comparison meaningless.
    if torch.isnan(reference).any():
        return
    for replica in tensor_list[1:]:
        if not torch.isnan(replica).any() and \
                not torch.allclose(reference, replica):
            print(message)


def check_params_distributed(net, n_gpus, rank):
    """Check that the first parameter of ``net`` agrees across replicas.

    The first parameter (and its gradient, when present) is all-gathered
    from ``n_gpus`` processes.  Rank 0 prints a warning for every replica
    that differs from replica 0; replicas containing NaN are skipped, and
    nothing is reported if replica 0 itself contains NaN.

    Must be called by every rank, since it issues collectives.

    Args:
        net: module whose first parameter is inspected.
        n_gpus: number of participating processes (gather slots).
        rank: rank of the calling process; only rank 0 prints.
    """
    param = next(net.parameters())
    _warn_if_replicas_differ(param, n_gpus, rank,
                             'WARNING!!!! PARAMS NOT EQUAL')
    # from pdb import set_trace; set_trace()

    if param.grad is not None:
        _warn_if_replicas_differ(param.grad, n_gpus, rank,
                                 'WARNING!!!! GRADS NOT EQUAL')
def all_gather(data, group=None):
    """All-gather arbitrary picklable objects over ``group``.

    When ``group`` is None the group returned by ``get_global_gloo_group()``
    is used.  The object is serialized with ``serialize_to_tensor``, padded
    with ``pad_to_largest_tensor``, gathered, and each rank's bytes are
    unpickled after trimming to that rank's reported size.

    Returns:
        list: the objects from every rank, or ``[data]`` when either the
        global world size or the size of ``group`` is 1.
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]

    payload = serialize_to_tensor(data, group)
    size_list, payload = pad_to_largest_tensor(payload, group)
    max_size = max(size_list)

    # One receive buffer per rank, all of the common padded size.
    recv_slots = [
        torch.empty((max_size, ), dtype=torch.uint8, device=payload.device)
        for _ in size_list
    ]
    dist.all_gather(recv_slots, payload, group=group)

    return [
        pickle.loads(chunk.cpu().numpy().tobytes()[:size])
        for size, chunk in zip(size_list, recv_slots)
    ]
Example #16
0
def dist_collect_other(x, return_before_cat=False):
    """ all-gather ``x`` and keep only the copies from the other ranks
    args:
        x: shape (mini_batch, ...)
        return_before_cat: when true, return the list of per-rank tensors
            instead of concatenating them
    returns:
        the other ranks' tensors concatenated along dim 0, or the list of
        them when ``return_before_cat`` is set
    """
    x = x.contiguous()
    world_size = dist.get_world_size()
    gathered = [
        torch.zeros_like(x, device=x.device, dtype=x.dtype)
        for _ in range(world_size)
    ]
    dist.all_gather(gathered, x)

    # Drop the slot that belongs to the calling rank.
    me = dist.get_rank()
    others = [gathered[rank] for rank in range(world_size) if rank != me]

    return others if return_before_cat else torch.cat(others, dim=0)
Example #17
0
def all_gather(tensor, group, return_tensor=False):
    """All-gather ``tensor`` over ``group`` via XLA or torch.distributed.

    Returns a tensor with a leading world-size dimension when
    ``return_tensor`` is true, otherwise a list with one entry per rank.
    """
    if not use_xla():
        world_size = get_world_size(group=group)
        rank = get_rank(group=group)
        # The local slot reuses ``tensor`` itself; the others are fresh buffers.
        tensor_list = [
            tensor if i == rank else torch.empty_like(tensor) for i in range(world_size)
        ]
        dist.all_gather(tensor_list, tensor, group=group)
        return torch.stack(tensor_list, dim=0) if return_tensor else tensor_list

    result = xm.all_gather(tensor, groups=group[1])
    world_size = get_world_size(group=group)
    # Reshape the XLA result to (world_size, *tensor.size()).
    result = result.view(world_size, *tensor.size())
    if return_tensor:
        return result
    return [result[i] for i in range(world_size)]
Example #18
0
 def diag(self, distribute=True):
     """
     Get the diagonal of this (square) distributed matrix.

     distribute: True to get the diagonal as a distributed matrix
           (a THDistMat of shape [n, 1] built from the local piece).
           False to get the full diagonal as a tensor assembled on every
           rank via all_gather.
     """
     assert self.shape[0]==self.shape[1]
     rank = dist.get_rank()
     # Cumulative start offsets of each rank's chunk, with a leading 0.
     partition = torch.cumsum(torch.LongTensor([0] + self.sizes), 0)
     # Local piece of the diagonal: the diagonal of this rank's chunk taken
     # at the rank's start offset, reshaped to a column.
     chunk = torch.diag(self.chunk, partition[rank].item()).view(-1, 1)
     if distribute:
         shape = [self.shape[0], 1]
         sizes = self.sizes
         byrow = True
         return THDistMat(shape, sizes, chunk, byrow)
     else:
         # Buffer with one row per entry of the full diagonal.
         out       = self.chunk.new(partition[-1].item(), 1)
         # torch.split yields views of ``out``; all_gather is given those
         # views as output slots, so ``out`` is expected to be filled in place.
         out_split = list(torch.split(out, self.sizes, 0))
         synchronize()
         dist.all_gather(out_split, chunk)
         return out
Example #19
0
def save_classifier(num_classes, world_size, classifier, path, logger,
                    do_save):
    """All-gather the classifier weight shards and optionally save them.

    Every rank contributes ``classifier.weight``, which must have
    ``ceil(num_classes / world_size)`` rows to match the gather slots.  The
    gathered shards are concatenated along dimension 0.  When ``do_save`` is
    true the result is written to ``path`` with ``torch.save`` and a file
    named ``checkpoint`` holding the base name of ``path`` is written next
    to it.

    Must be called by every rank, since it issues a collective, even when
    only some ranks pass ``do_save=True``.

    Args:
        num_classes: total number of classes over all shards.
        world_size: number of participating processes.
        classifier: module whose ``weight`` is the local shard.
        path: destination file; a bare file name saves into the current
            directory.
        logger: logger used to report the saved path.
        do_save: whether this rank writes the files.
    """
    tensor = classifier.weight
    # Rows per shard: ceil(num_classes / world_size).
    split_size = num_classes // world_size + int(num_classes % world_size > 0)
    # new_zeros keeps the weight's dtype and device for the gather slots.
    results_list = [
        tensor.new_zeros(split_size, tensor.shape[1])
        for _ in range(world_size)
    ]
    dist.all_gather(results_list, tensor)
    result = torch.cat(results_list, dim=0)
    if do_save:
        result = result.detach().cpu()
        # dirname is '' for a bare file name; fall back to the current dir
        # instead of calling makedirs('') or writing to '/checkpoint'.
        directory = os.path.dirname(path) or os.curdir
        os.makedirs(directory, exist_ok=True)
        torch.save(result, path)
        with open(os.path.join(directory, 'checkpoint'), 'w') as ckpt:
            ckpt.write(os.path.basename(path))
        logger.info('Classifier saved to: %s' % path)
Example #20
0
def _ranks_on_same_node(rank, world_size):
    """List the ranks that report the same hostname as ``rank``.

    Each process encodes its hostname as a list of character codes, padded
    with -1 to the longest hostname length (found via an all-reduce MAX),
    and the encodings are all-gathered so they can be compared.

    Returns:
        list[int]: every ``i`` in ``range(world_size)`` whose gathered
        encoding equals the one gathered for ``rank`` (``rank`` included).
    """
    host = socket.gethostname()
    longest = torch.IntTensor([len(host)])
    dist.all_reduce(longest, op=dist.reduce_op.MAX)
    width = longest.item()

    # Pad with -1 so every rank contributes a tensor of identical length.
    codes = [ord(ch) for ch in host]
    codes.extend([-1] * (width - len(host)))
    local_codes = torch.IntTensor(codes)

    slots = [torch.IntTensor([0] * width) for _ in range(world_size)]
    dist.all_gather(slots, local_codes)

    hosts = [slot.numpy().tolist() for slot in slots]
    return [i for i in range(world_size) if hosts[rank] == hosts[i]]
Example #21
0
def all_gather(data):
    """All-gather tensors whose first dimension may differ between ranks.

    Batch sizes (``data.shape[0]``) are exchanged first; every rank then
    zero-pads ``data`` along dimension 0 to the largest batch, the padded
    tensors are gathered, and each gathered tensor is trimmed back to its
    rank's batch size before concatenation.

    Returns:
        ``data`` unchanged when the world size is 1, otherwise the
        concatenation of every rank's (unpadded) tensor.
    """
    world_size = get_world_size()
    if world_size == 1:
        return data

    device = data.device
    batch = torch.tensor(data.shape[0], dtype=torch.long, device=device)
    batches = [torch.tensor(0, dtype=torch.long, device=device) for _ in range(world_size)]
    dist.all_gather(batches, batch)
    max_batch = max(batches).item()

    max_shape = list(data.shape)
    max_shape[0] = max_batch

    datas = [torch.zeros(max_shape, dtype=data.dtype, device=device) for _ in range(world_size)]
    if batch != max_batch:
        # Same trailing dimensions, only the missing rows along dim 0.
        pad_shape = [max_batch - batch] + max_shape[1:]
        filler = torch.zeros(pad_shape, dtype=data.dtype, device=device)
        data = torch.cat([data, filler])
    dist.all_gather(datas, data)

    trimmed = [gathered[:count] for count, gathered in zip(batches, datas)]
    return torch.cat(trimmed)
Example #22
0
def distmm_thinthin_outer(matA, matB, tmpout=None, out=None):
    '''
    Multiply a row-distributed A by a column-distributed B.

    A ((p) x r), B (r x (q)) => AB((p) x q), out row-major
    (parenthesised dimensions are the distributed ones).

    matA: distributed matrix with byrow set; its local chunk is the left
          operand of the local product.
    matB: distributed matrix with byrow unset; its chunks are all_gathered.
    tmpout: optional r x q buffer that receives the gathered B.
    out: optional (local rows of A) x q buffer for the local product.

    Returns a THDistMat of shape [p, q] with matA's sizes, byrow=True.
    in NMF: to compute objective.
    '''
    rank = dist.get_rank()
    assert matA.byrow and (not matB.byrow)
    p = matA.shape[0]
    q = matB.shape[1]
    r = matA.shape[1]
    assert r == matB.shape[0]
    shape = [p, q]
    sizes = matA.sizes
    byrow = True

    # all_gather
    if tmpout is None:
        # r x q view over a freshly allocated q x r buffer.
        tmpout = torch.t(matB.chunk.new(q, r))
    else:
        assert tmpout.size() == torch.Size([r, q])
        # Result is discarded; presumably a layout check, since view(-1)
        # raises when the transposed buffer cannot be viewed flat - TODO confirm.
        torch.t(tmpout).view(-1)
    # Column blocks of tmpout sized by matB.sizes, each transposed; these are
    # the output slots handed to all_gather.
    split_tmpout_pre = torch.split(tmpout, matB.sizes, dim=1)
    split_tmpout = [x.t() for x in split_tmpout_pre]
    #print(split_tmpout)
    synchronize()
    # The slots are views derived from tmpout, so the gathered chunks are
    # expected to land in tmpout, which is used in the product below.
    dist.all_gather(split_tmpout, matB.chunk.t())

    # compute
    if out is None:
        out = matA.chunk.new(matA.sizes[rank], q)
    else:
        assert out.size() == torch.Size([matA.sizes[rank], q])
        # Result is discarded; presumably a check that ``out`` can be viewed
        # flat - TODO confirm.
        out.view(-1)

    # Local rows of A times the full gathered B, written into ``out``.
    chunk = torch.mm(matA.chunk, tmpout, out=out)

    return THDistMat(shape, sizes, chunk, byrow)
Example #23
0
 def prepare_self_train_data(self, rank, model, idx):
     """Select the next window of training data and build self-training targets.

     A window of ``world_size * train_batch_size * update_interval *
     accum_steps`` samples (capped at the dataset size) starting at ``idx``
     is selected, wrapping around to the start of the data when needed.
     ``self.inference`` is run on it, the results are all-gathered from
     every rank, and a target distribution is derived from the gathered
     predictions.

     Returns:
         tuple: ``(self_train_dict, idx, agree)`` where ``self_train_dict``
         holds the gathered inputs, masks, target distribution and its
         argmax; ``idx`` is the start index for the next call; ``agree`` is
         the fraction of samples whose prediction argmax equals the target
         argmax.
     """
     total = len(self.train_data["input_ids"])
     target_num = min(
         self.world_size * self.train_batch_size * self.update_interval * self.accum_steps,
         total)
     window_end = idx + target_num
     if window_end >= total:
         # Wrap around: tail of the data followed by the head.
         select_idx = torch.cat((torch.arange(idx, total),
                                 torch.arange(window_end - total)))
     else:
         select_idx = torch.arange(idx, window_end)
     assert len(select_idx) == target_num
     idx = (idx + len(select_idx)) % total

     select_dataset = {"input_ids": self.train_data["input_ids"][select_idx],
                       "attention_masks": self.train_data["attention_masks"][select_idx]}
     dataset_loader = self.make_dataloader(rank, select_dataset, self.eval_batch_size)
     input_ids, input_mask, preds = self.inference(model, dataset_loader, rank, return_type="data")

     def gather_to_cpu(local):
         # All-gather ``local`` from every rank and concatenate on the CPU.
         slots = [torch.ones_like(local) for _ in range(self.world_size)]
         dist.all_gather(slots, local)
         return torch.cat(slots, dim=0).cpu()

     input_ids = gather_to_cpu(input_ids)
     input_mask = gather_to_cpu(input_mask)
     all_preds = gather_to_cpu(preds)

     # Square the predictions, normalise by the per-column sums, then
     # renormalise every row to sum to one.
     weight = all_preds ** 2 / torch.sum(all_preds, dim=0)
     target_dist = (weight.t() / torch.sum(weight, dim=1)).t()
     all_target_pred = target_dist.argmax(dim=-1)
     matches = (all_preds.argmax(dim=-1) == all_target_pred).int().sum().item()
     agree = matches / len(all_target_pred)

     self_train_dict = {"input_ids": input_ids, "attention_masks": input_mask, "labels": target_dist,
                        "all_target_pred": all_target_pred}
     return self_train_dict, idx, agree
Example #24
0
    def __call__(
        self,
        predictions: torch.Tensor,
        gold_labels: torch.Tensor,
        mask: Optional[torch.BoolTensor] = None,
    ):
        """
        Accumulate flattened predictions and gold labels into
        `self.total_predictions` and `self.total_gold_labels`. When running
        distributed, the accumulated tensors of all workers are gathered and
        concatenated into those attributes.

        # Parameters

        predictions : `torch.Tensor`, required.
            A tensor of predictions of shape (batch_size, ...).
        gold_labels : `torch.Tensor`, required.
            A tensor of the same shape as `predictions`.
        mask : `torch.BoolTensor`, optional (default = `None`).
            A tensor of the same shape as `predictions`.

        # Raises

        `RuntimeError`
            If, when distributed, the workers hold different numbers of
            accumulated predictions.
        """
        predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)
        # Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between
        # the vectors, since each element in the predictions and gold_labels tensor is assumed
        # to be a separate observation.
        predictions = predictions.reshape(-1)
        gold_labels = gold_labels.reshape(-1)

        self.total_predictions = self.total_predictions.to(predictions.device)
        self.total_gold_labels = self.total_gold_labels.to(gold_labels.device)

        if mask is not None:
            # Masked-out entries are multiplied by zero rather than removed.
            mask = mask.reshape(-1)
            self.total_predictions = torch.cat((self.total_predictions, predictions * mask), 0)
            self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels * mask), 0)
        else:
            self.total_predictions = torch.cat((self.total_predictions, predictions), 0)
            self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels), 0)

        if is_distributed():
            world_size = dist.get_world_size()
            device = gold_labels.device
            # Check if batch lengths are equal. The output slots live on the
            # same device as the tensor being gathered.
            _all_batch_lengths = [torch.tensor(0, device=device) for _ in range(world_size)]
            dist.all_gather(
                _all_batch_lengths, torch.tensor(self.total_predictions.shape[0], device=device)
            )
            _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths]

            if len(set(_all_batch_lengths)) > 1:
                # Subsequent dist.all_gather() calls currently do not handle tensors of different length.
                raise RuntimeError(
                    "Distributed aggregation for SpearmanCorrelation is currently not supported "
                    "for batches of unequal length."
                )
            _total_predictions = [
                torch.zeros(self.total_predictions.shape, device=device) for _ in range(world_size)
            ]
            _total_gold_labels = [
                torch.zeros(self.total_gold_labels.shape, device=device) for _ in range(world_size)
            ]
            dist.all_gather(_total_predictions, self.total_predictions)
            dist.all_gather(_total_gold_labels, self.total_gold_labels)
            self.total_predictions = torch.cat(_total_predictions, dim=0)
            self.total_gold_labels = torch.cat(_total_gold_labels, dim=0)
Example #25
0
    def visualize_pseudo_proj(self, logger, iteration):
        """Log image grids of the projected images, masks and pseudo images.

        When ``self.distributed`` is set, ``self.pseudo_im`` (if present and
        not None), ``self.proj_im`` and ``self.mask`` are all-gathered from
        every rank and concatenated along dimension 0.  Only rank 0 writes
        to ``logger``; the pseudo images are logged only in mode 'step2'.
        """
        def log_grid_image(label, im, iteration=iteration):
            # Roughly square grid: ceil(sqrt(batch)) images per row.
            per_row = int(math.ceil(im.size(0)**0.5))
            grid = torchvision.utils.make_grid(im, nrow=per_row)
            logger.add_image(label, grid, iteration)

        def gather_batch(local):
            # All-gather ``local`` from every rank and stack along dim 0.
            slots = [local.clone().zero_() for _ in range(dist.get_world_size())]
            dist.all_gather(slots, local)
            return torch.cat(slots, dim=0)

        if not self.distributed:
            if hasattr(self, 'pseudo_im') and self.pseudo_im is not None:
                pseudo_imgs = self.pseudo_im
            proj_imgs = self.proj_im
            masks = self.mask
        else:
            if hasattr(self, 'pseudo_im') and self.pseudo_im is not None:
                pseudo_imgs = gather_batch(self.pseudo_im)
            proj_imgs = gather_batch(self.proj_im)
            masks = gather_batch(self.mask)

        ## write summary
        if self.rank != 0:
            return
        if self.mode == 'step2':
            log_grid_image('Image/pseudo_images', pseudo_imgs/2+0.5, iteration)
        log_grid_image('Image/proj_images', proj_imgs/2+0.5, iteration)
        log_grid_image('Image/mask', masks, iteration)
Example #26
0
def collect_results_gpu(result_part, size):
    """Collect results in gpu mode.
    The local results are pickled into a CUDA byte tensor, padded to the
    longest payload among ranks and exchanged with all_gather.
    Args:
        result_part (list): Results to be collected
        size (int): Number of results to keep after interleaving.
    Returns:
        list: On rank 0, the results of all ranks interleaved element by
        element and truncated to ``size``; ``None`` on every other rank.
    """
    rank, world_size = get_dist_info()

    # Pickle the local results into a uint8 tensor on the GPU.
    payload = pickle.dumps(result_part)
    part_tensor = torch.tensor(bytearray(payload),
                               dtype=torch.uint8,
                               device='cuda')

    # Exchange payload lengths between all ranks.
    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
    shape_list = [shape_tensor.clone() for _ in range(world_size)]
    dist.all_gather(shape_list, shape_tensor)

    # Zero-pad the local payload up to the longest one.
    shape_max = torch.tensor(shape_list).max()
    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
    part_send[:shape_tensor[0]] = part_tensor
    part_recv_list = [
        part_tensor.new_zeros(shape_max) for _ in range(world_size)
    ]

    # Exchange the padded payloads.
    dist.all_gather(part_recv_list, part_send)

    if rank != 0:
        return None

    part_list = [
        pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())
        for recv, shape in zip(part_recv_list, shape_list)
    ]
    # Interleave: first item of every rank, then second item of every rank...
    ordered_results = [item for group in zip(*part_list) for item in group]
    # the dataloader may pad some samples
    return ordered_results[:size]
Example #27
0
def gather_partitioned_activations(tensors, device=None):
    """Rebuild full activations from (partition, size) pairs.

    ``tensors`` is read as consecutive pairs ``(item, size)``.  Items for
    which ``is_activation_to_checkpoint`` is false are passed through.  For
    the others a flat buffer of ``item.numel() * mp_size`` elements is
    allocated (on ``device`` if given, else on the item's device), the local
    partition is copied into slot ``mp_rank``, the remaining slots are
    filled by an all-gather over ``mp_group`` (skipped when ``mp_group`` is
    None), and ``item.data`` is replaced by the buffer viewed with the shape
    stored in ``size``.

    Returns:
        tuple: one entry per input pair, in order.
    """
    global mp_rank, mp_size, mp_group
    assert len(
        tensors
    ) % 2 == 0, f'Expected even count of tensors, instead got {len(tensors)}'

    inputs = []
    for pair_start in range(0, len(tensors), 2):
        item = tensors[pair_start]
        size = tensors[pair_start + 1]

        if not is_activation_to_checkpoint(item):
            inputs.append(item)
            continue

        partition_size = item.numel()
        target_device = device if device is not None else item.device
        flat_tensor = torch.zeros([partition_size * mp_size],
                                  dtype=item.dtype,
                                  device=target_device)

        # Slice the flat buffer into one view per model-parallel rank.
        partitions = []
        for slot in range(mp_size):
            part = flat_tensor.narrow(0, partition_size * slot, partition_size)
            if slot == mp_rank:
                part.copy_(item)
            partitions.append(part)

        if mp_group is not None:
            dist.all_gather(partitions, partitions[mp_rank], group=mp_group)

        input_tensor = flat_tensor.view(list(size.numpy()))
        item.data = input_tensor.data
        inputs.append(item)

    return tuple(inputs)
Example #28
0
    def report_epoch_stats(self):
        """Compute accuracy and mean loss from ``self.epoch_stats``.

        For the 'train' prefix the local counters are used directly.  For
        any other prefix a new process group over all ranks is created and
        the counters are combined according to ``self.args.dist_method``
        ('reduce' sums onto rank 0, 'all_gather' sums the gathered copies).
        A summary line is logged unless the prefix is 'test'.

        Returns:
            tuple: ``(accuracy, loss)``, each divided by the total count.
        """
        stats = self.epoch_stats
        if stats['prefix'] == 'train':
            statistics = [stats['num_correct'], stats['num_total'], stats['loss']]
        else:
            # aggregate the results from all nodes
            group = dist.new_group(range(self.args.world_size))
            statistics = th.tensor(
                [stats['num_correct'], stats['num_total'], stats['loss']],
                dtype=th.float32).cuda()

            method = self.args.dist_method
            if method == 'reduce':
                dist.reduce(tensor=statistics,
                            dst=0,
                            op=dist.ReduceOp.SUM,
                            group=group)
            elif method == 'all_gather':
                all_statistics = [
                    th.zeros((1, 3)).cuda()
                    for _ in range(self.args.world_size)
                ]
                dist.all_gather(tensor=statistics,
                                tensor_list=all_statistics,
                                group=group)
                summed = th.sum(th.cat(all_statistics, dim=0), dim=0)
                statistics = summed.cpu().numpy()

        accuracy = float(statistics[0]) / statistics[1]
        loss = statistics[2] / statistics[1]
        if stats['prefix'] != 'test':
            self.logger.info(
                "rank %d, %s phase of epoch %d: accuracy %.6f, loss %.6f, num_correct %d, total %d"
                % (self.args.distributed_rank, stats['prefix'],
                   stats['epoch'], accuracy, loss, statistics[0],
                   statistics[1]))
        return accuracy, loss
Example #29
0
def peak_gpu_memory() -> Dict[int, int]:
    """
    Report the peak allocated GPU memory in bytes, then reset the peak counter
    of the current device.

    # Returns

    `Dict[int, int]`
        In a distributed run the keys are the global ranks of the workers and
        the values are the peak bytes each worker reported for its current device.
        Otherwise the dict has the single key `0`, mapped to the peak bytes of
        the current device.
        Returns an empty `dict` if GPUs are not available, or if the
        distributed backend is not "nccl".
    """
    if not torch.cuda.is_available():
        return {}

    current_device = torch.cuda.current_device()

    usage: Dict[int, int]
    if not is_distributed():
        usage = {0: torch.cuda.max_memory_allocated()}
    elif dist.get_backend() != "nccl":
        # If the backend is not 'nccl', we're training on CPU.
        return {}
    else:
        rank = dist.get_rank()
        num_workers = dist.get_world_size()
        local_peak = torch.cuda.max_memory_allocated(current_device)
        local_entry = torch.tensor([rank, local_peak], device=current_device)
        # One (rank, peak_bytes) receive buffer per worker.
        gathered = [
            torch.tensor([0, 0], device=current_device) for _ in range(num_workers)
        ]

        dist.all_gather(gathered, local_entry)

        usage = {int(entry[0]): int(entry[1]) for entry in gathered}

    # Reset peak stats.
    torch.cuda.reset_max_memory_allocated(current_device)

    return usage
Example #30
0
def get_full_inputs(tensors):
    """Rebuild full tensors from their model-parallel partitions.

    ``tensors`` is read as consecutive ``(partition, size)`` pairs. For every
    pair except the last, the local partition is all-gathered across
    ``mp_group`` into a flat buffer of ``mp_size`` partitions, the buffer is
    viewed with the shape stored in ``size``, and the result is written into
    ``partition.data`` in place. The second-to-last element of ``tensors`` is
    appended unchanged.

    Returns:
        tuple: the updated partition tensors followed by ``tensors[-2]``.
    """
    gathered_inputs = []
    pairs_to_gather = len(tensors) // 2 - 1
    for pair_idx in range(pairs_to_gather):
        item = tensors[2 * pair_idx]
        size = tensors[2 * pair_idx + 1]

        partition_size = item.numel()
        flat_tensor = torch.zeros([partition_size * mp_size],
                                  dtype=item.dtype,
                                  device=item.device)

        # Slice the flat buffer into one window per model-parallel rank and
        # seed this rank's window with the local partition.
        partitions = []
        for rank in range(mp_size):
            window = flat_tensor.narrow(0, partition_size * rank, partition_size)
            if rank == mp_rank:
                window.copy_(item)
            partitions.append(window)

        dist.all_gather(partitions, partitions[mp_rank], group=mp_group)

        # `size` is converted through NumPy, so it is presumably a CPU tensor
        # holding the target shape -- TODO confirm against the caller.
        item.data = flat_tensor.view(list(size.numpy())).data
        gathered_inputs.append(item)

    gathered_inputs.append(tensors[-2])
    return tuple(gathered_inputs)
Example #31
0
    def feed_op(self, batch, mode):
        """Feed data to the metric.

        Real batches are ignored. Fake batches are optionally channel-swapped,
        resized and quantised, passed through ``self.get_pred``, gathered over
        all ranks when ``torch.distributed`` is initialised, and stored in
        ``self.preds`` (on rank 0 only in the distributed case).

        Args:
            batch (Tensor): Input tensor.
            mode (str): The mode of current data batch. 'reals' or 'fakes'.

        Raises:
            ValueError: If ``mode`` is neither 'reals' nor 'fakes'.
        """
        if mode == 'reals':
            # Nothing is collected from real samples.
            return
        if mode != 'fakes':
            raise ValueError(f'{mode} is not a implemented feed mode.')

        if self.bgr2rgb:
            # Reverse the order of the first three channels.
            batch = batch[:, [2, 1, 0], ...]

        if self.resize:
            if self.use_pil_resize:
                batch = self.pil_resize(batch)
            else:
                batch = F.interpolate(batch, size=(299, 299), mode='bilinear')

        if self.use_tero_script:
            batch = (batch * 127.5 + 128).clamp(0, 255).to(torch.uint8)

        # get prediction
        pred = self.get_pred(batch.to(self.device))

        distributed = dist.is_initialized()
        if distributed:
            world_size = dist.get_world_size()
            gathered = [torch.zeros_like(pred) for _ in range(world_size)]
            dist.all_gather(gathered, pred)
            pred = torch.cat(gathered, dim=0)

        # in distributed training, we only collect features at rank-0.
        if not distributed or dist.get_rank() == 0:
            self.preds.append(pred.cpu().numpy())
Example #32
0
    def prepare(self, label, optimizer):
        """
        Gather the labels of every rank, sample class centers for the softmax
        computation and point the optimizer at the sampled weight.

        label: tensor
            Label tensor on each rank.
        optimizer: opt
            Optimizer for partial fc. The first parameter of its last param
            group is replaced by ``self.sub_weight`` and its momentum buffer
            is set to ``self.sub_weight_mom``.

        Returns a tuple ``(total_label, norm_weight)``: the gathered labels
        after ``self.sample`` has processed them, and ``normalize`` applied to
        ``self.sub_weight``.
        """
        with torch.cuda.stream(self.stream):
            total_label = torch.zeros(
                size=[self.batch_size * self.world_size],
                device=self.device,
                dtype=torch.long)
            # The chunks are used as receive buffers; total_label is read
            # afterwards, relying on chunk() returning views of it.
            label_slots = list(total_label.chunk(self.world_size, dim=0))
            dist.all_gather(label_slots, label)

            # NOTE(review): presumably updates self.sub_weight /
            # self.sub_weight_mom for the sampled classes -- confirm in sample().
            self.sample(total_label)

            # Drop the optimizer state of the previously registered weight
            # before swapping in the new one.
            last_group = optimizer.param_groups[-1]
            optimizer.state.pop(last_group['params'][0], None)
            last_group['params'][0] = self.sub_weight
            sub_weight_state = optimizer.state[self.sub_weight]
            sub_weight_state['momentum_buffer'] = self.sub_weight_mom

            norm_weight = normalize(self.sub_weight)
        return total_label, norm_weight
Example #33
0
def _pad_to_largest_tensor(tensor, group):
    """Zero-pad ``tensor`` to the largest element count found in ``group``.

    Every rank in ``group`` publishes ``tensor.numel()`` through an
    all_gather. If the local tensor is smaller than the maximum, uint8 zeros
    are concatenated along dim 0 to make up the difference.

    Args:
        tensor: local tensor to pad. It is concatenated with a 1-D uint8
            padding, so presumably a 1-D uint8 tensor -- TODO confirm.
        group: process group over which the sizes are exchanged.

    Returns:
        tuple: ``(size_list, tensor)`` where ``size_list`` holds the
        element count reported by each rank as Python ints and ``tensor`` is
        the (possibly padded) input.
    """
    world_size = dist.get_world_size(group=group)
    assert world_size >= 1, \
        "comm.gather/all_gather must be called from ranks within the given group!"

    device = tensor.device
    local_numel = tensor.numel()
    local_size = torch.tensor([local_numel], dtype=torch.int64, device=device)

    # One single-element receive buffer per rank.
    gathered_sizes = []
    for _ in range(world_size):
        gathered_sizes.append(torch.zeros([1], dtype=torch.int64, device=device))
    dist.all_gather(gathered_sizes, local_size, group=group)
    size_list = [int(entry.item()) for entry in gathered_sizes]

    max_size = max(size_list)

    missing = max_size - local_numel
    if missing != 0:
        padding = torch.zeros((missing, ), dtype=torch.uint8, device=device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor
Example #34
0
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.gather(tensor, dst=0)
dist.barrier()

# all_gather benchmark: rank 0 times the calls and prints the statistics,
# every other rank issues the same sequence of calls without timing them.
if rank == 0:
    print_header("all gather")
    # Payload sizes are powers of two from 2**MIN_BYTES up to (excluding) 2**MAX_BYTES.
    # NOTE(review): the loop variable `bytes` shadows the builtin of the same name.
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        # Every entry of the output list is the same object as `tensor`; no
        # separate receive buffers are allocated.
        tensors = [tensor for n in range(0, dist.get_world_size())]
        # Call counts are powers of ten from 10**MIN_NUM_TENSORS up to
        # (excluding) 10**MAX_NUM_TENSORS.
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.all_gather(tensors, tensor)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    # Non-zero ranks: same payload sizes and call counts as rank 0, but no
    # timing and no output.
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        tensors = [tensor for n in range(0, dist.get_world_size())]
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.all_gather(tensors, tensor)
# Synchronise all ranks once the benchmark section is finished.
dist.barrier()