def train(state, dir):
        """Run the training loop, resuming from the position stored in ``state``.

        Each call bumps ``state.rendezvous`` and then iterates epochs and
        batches starting at ``state.epoch`` / ``state.batch``. The loop targets
        are the ``state`` attributes themselves, so the current position is
        always recorded on ``state``. ``state.commit()`` is called every
        ``batches_per_commit`` batches and again at the end of every epoch,
        each time preceded by an ``hvd.allgather`` of
        ``[rank, epoch, batch, rendezvous]`` whose result is discarded.

        ``epochs``, ``batches_per_epoch``, ``batches_per_commit``, ``model``,
        ``data``, ``target``, ``optimizer`` and ``check_fail`` are taken from
        the enclosing scope.

        Args:
            state: object carrying the ``rendezvous``, ``epoch``, ``batch`` and
                ``commits`` counters and a ``commit()`` method.
            dir: forwarded unchanged to ``check_fail``.
                NOTE(review): the name shadows the ``dir`` builtin; it is kept
                because it is part of the signature.

        Returns:
            A ``(res, rank)`` tuple where ``res`` is the list form of a final
            allgather of ``[rank, epoch, batch, rendezvous]`` and ``rank`` is
            ``hvd.rank()``.
        """
        state.rendezvous += 1
        logging.info('rank %s: rendezvous %s', hvd.rank(), state.rendezvous)

        # The loop target is the attribute itself: every iteration writes the
        # current epoch back onto `state`.
        for state.epoch in range(state.epoch, epochs):
            logging.info('rank %s: start epoch %s at batch %s', hvd.rank(),
                         state.epoch, state.batch)

            # Same pattern for the batch counter, starting at the stored batch.
            for state.batch in range(state.batch, batches_per_epoch):
                # presumably the failure-injection hook of the test; its
                # behavior is not visible here -- TODO confirm
                check_fail(dir, hvd.rank(), state.epoch, state.batch)

                optimizer.zero_grad()
                output = model(data)
                loss = F.cross_entropy(output, target)
                loss.backward()
                optimizer.step()

                # TODO: this sleep makes the fault tolerant test fail
                #       torch allgather throws a RuntimeError which should be a HorovodInternalError
                #import time
                #time.sleep(0.2)

                if state.batch % batches_per_commit == 0:
                    logging.info('rank %s: allgather', hvd.rank())
                    # Result is intentionally discarded; only the collective
                    # call itself matters here.
                    hvd.allgather(
                        torch.tensor([
                            hvd.rank(), state.epoch, state.batch,
                            state.rendezvous
                        ]), 'state').tolist()
                    logging.info('rank %s: commit epoch %s batch %s',
                                 hvd.rank(), state.epoch, state.batch)
                    state.commits += 1
                    state.commit()

            # End-of-epoch commit (again preceded by a discarded allgather).
            logging.info('rank %s: allgather', hvd.rank())
            hvd.allgather(
                torch.tensor(
                    [hvd.rank(), state.epoch, state.batch, state.rendezvous]),
                'state').tolist()
            logging.info('rank %s: commit epoch %s', hvd.rank(), state.epoch)
            state.commits += 1
            state.commit()
            # The next epoch starts from its first batch.
            state.batch = 0

        # Final gather: this one is returned to the caller.
        res = hvd.allgather(
            torch.tensor(
                [hvd.rank(), state.epoch, state.batch, state.rendezvous]),
            'state').tolist()
        logging.info('rank %s: returning', hvd.rank())
        return res, hvd.rank()
Exemple #2
0
 def forward(ctx, tensor, name):
     """Gather ``tensor`` from all ranks and remember every rank's dim-0 size.

     The local first-dimension length is stored on ``ctx.dim`` and the
     lengths of all ranks on ``ctx.all_dims`` (one entry per rank).

     Returns:
         The synchronized result of ``hvd.allgather_async(tensor, name)``.
     """
     ctx.dim = tensor.shape[0]
     # Keep all blocking collectives in the forward pass.
     local_dim = torch.tensor([ctx.dim], device=tensor.device)
     gathered_dims = hvd.allgather(local_dim)
     ctx.all_dims = gathered_dims.view(hvd.size())
     pending = hvd.allgather_async(tensor, name)
     return hvd.synchronize(pending)
Exemple #3
0
    def test_horovod_allgather(self):
        """Check that hvd.allgather stacks equally sized 1D, 2D and 3D tensors.

        Every rank contributes a tensor of side 17 filled with its own rank.
        Slice ``i`` of the gathered result (17 rows along dim 0) must have the
        per-rank shape and contain only the value ``i``.
        """
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor,
                  torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes.extend([torch.cuda.ByteTensor, torch.cuda.CharTensor,
                           torch.cuda.ShortTensor, torch.cuda.IntTensor,
                           torch.cuda.LongTensor, torch.cuda.FloatTensor,
                           torch.cuda.DoubleTensor])

        for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
            local = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
            gathered = hvd.allgather(local.type(dtype))

            assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

            for i in range(size):
                start = i * 17
                rank_tensor = gathered[start:start + 17]
                assert list(rank_tensor.shape) == [17] * dim, \
                    'hvd.allgather produces incorrect gathered shape'
                values = rank_tensor.data
                assert values.min() == i, 'hvd.allgather produces incorrect gathered tensor'
                assert values.max() == i, 'hvd.allgather produces incorrect gathered tensor'
Exemple #4
0
 def forward(self, data):
     """
     Gather ``data`` from all processes with ``hvd.allgather``.

     Arguments:
         data:
             Tensor to be gathered across all processes.

     Returns:
         The result of ``hvd.allgather(data, name=self.name)``.
     """
     # The operation name comes from this instance's `name` attribute.
     return hvd.allgather(data, name=self.name)
Exemple #5
0
    def calculate_shuffle_buffer_size(hvd, avg_row_size, train_row_count_per_worker):
        """
        Compute a shuffle buffer size (in rows) that is identical on every worker.

        The largest number of workers sharing one machine is found with an
        allgather of ``hvd.local_size()``. If that number exceeds
        TOTAL_BUFFER_MEMORY_CAP_GIB, the cap is split evenly among the workers
        of that machine; otherwise each worker gets 1 GiB. The byte budget is
        converted to rows with ``avg_row_size`` and capped by
        ``train_row_count_per_worker``.

        Examples with a cap of 4 GiB:
            machines with 8 and 3 workers     -> 0.5 GiB per worker
            machines with 2 and 3 workers     -> 1 GiB per worker
            machines with 2, 8 and 5 workers  -> 0.5 GiB per worker
        """
        workers_per_machine = hvd.allgather(torch.tensor([hvd.local_size()]))
        max_local_size = torch.max(workers_per_machine).item()

        shuffle_buffer_size = (
            TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
            if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB
            else BYTES_PER_GIB / avg_row_size
        )
        return int(min(shuffle_buffer_size, train_row_count_per_worker))
Exemple #6
0
    def test_horovod_allgather(self):
        """Verify hvd.allgather on 1D, 2D and 3D tensors of identical size.

        Each rank gathers a 17-sided tensor filled with its rank number and
        checks the total shape and the contents of every 17-row slice.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        cpu_types = [
            torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
            torch.IntTensor, torch.LongTensor, torch.FloatTensor,
            torch.DoubleTensor
        ]
        cuda_types = []
        if torch.cuda.is_available():
            cuda_types = [
                torch.cuda.ByteTensor, torch.cuda.CharTensor,
                torch.cuda.ShortTensor, torch.cuda.IntTensor,
                torch.cuda.LongTensor, torch.cuda.FloatTensor,
                torch.cuda.DoubleTensor
            ]
        dtypes = cpu_types + cuda_types
        dims = [1, 2, 3]

        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            gathered = hvd.allgather(tensor)

            expected_shape = [17 * size] + [17] * (dim - 1)
            assert list(gathered.shape) == expected_shape

            per_rank_shape = [17] * dim
            for i in range(size):
                rank_tensor = gathered[i * 17:(i + 1) * 17]
                assert list(rank_tensor.shape) == per_rank_shape, \
                    'hvd.allgather produces incorrect gathered shape'
                lowest = rank_tensor.data.min()
                assert lowest == i, 'hvd.allgather produces incorrect gathered tensor'
                highest = rank_tensor.data.max()
                assert highest == i, 'hvd.allgather produces incorrect gathered tensor'
Exemple #7
0
def all_gather_hvd(data, group=None):
    """Gather a picklable object from every rank and return them as a list.

    The object is serialized with ``_serialize_to_tensor``, padded with
    ``_pad_to_largest_tensor``, gathered, and each rank's bytes are unpickled
    again (truncated to that rank's original size).

    Args:
        data: object to gather; it is round-tripped through pickle.
        group: forwarded to the serialization/padding helpers and to
            ``dist.all_gather`` in the non-Horovod branch.

    Returns:
        ``[data]`` when the world size is 1, otherwise one unpickled object
        per entry of the gathered size list.

    Raises:
        AssertionError: if ``_USE_HVD`` is falsy (unless asserts are disabled).
    """
    assert _USE_HVD, f"_USE_HVD: {_USE_HVD}"
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)

    if _USE_HVD:
        # NOTE: concatenated on the first dimension, so add a leading axis to
        # get one row per rank.
        tensor_list = hvd.allgather(tensor[None, ])
    else:
        # Only reachable when asserts are disabled. Receive buffers are
        # allocated here, where they are actually used, instead of on every
        # call.
        max_size = max(size_list)
        tensor_list = [
            torch.empty((max_size, ), dtype=torch.uint8, device=tensor.device)
            for _ in size_list
        ]
        dist.all_gather(tensor_list, tensor, group=group)

    # NOTE(review): pickle.loads runs on bytes received from other ranks; this
    # is only safe while every rank is trusted.
    data_list = []
    for size, gathered in zip(size_list, tensor_list):
        buffer = gathered.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list
def all_gather_list(data, max_size=4096):
    """Gathers arbitrary data from all nodes into a list.

    The pickled payload is written into a fixed-size CUDA byte buffer behind a
    2-byte length header (base 255), the buffers of all ranks are gathered and
    each rank's payload is unpickled.

    Args:
        data: picklable object to gather.
        max_size: size in bytes of the per-rank buffer; must be below
            ``255 * 256`` so the length fits in the 2-byte header.

    Returns:
        A list with one unpickled object per rank.

    Raises:
        ValueError: if the pickled data plus header does not fit in
            ``max_size`` bytes.
    """
    world_size = hvd.size()
    # Compare element counts: Tensor.size() returns a torch.Size, which is
    # never equal to an int, so comparing it with max_size reallocated the
    # cached buffer on every call.
    if not hasattr(all_gather_list, '_in_buffer') or \
            max_size != all_gather_list._in_buffer.numel():
        all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size)
    in_buffer = all_gather_list._in_buffer

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size +
                                                                    2))
    assert max_size < 255 * 256
    in_buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    in_buffer[1] = enc_size % 255
    in_buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))

    # FIXME cannot create buffer
    out = hvd.allgather(in_buffer.cuda())

    # NOTE(review): pickle.loads runs on bytes received from other ranks; this
    # is only safe while every rank is trusted.
    results = []
    for i in range(0, max_size * world_size, max_size):
        out_buffer = out[i:i + max_size]
        size = (255 * out_buffer[0].item()) + out_buffer[1].item()

        bytes_list = bytes(out_buffer[2:size + 2].tolist())
        result = pickle.loads(bytes_list)
        results.append(result)
    return results
Exemple #9
0
    def async_send(self, tensors_compressed, name):
        """
        Start one asynchronous allgather per compressed tensor.

        :param tensors_compressed: list of flat tensors to communicate
        :param name: for the all_gather operation (not used by this body)
        :return: handles to synchronize, tensor sizes per rank
        """
        # element count of every tensor on this rank
        local_sizes = [t.numel() for t in tensors_compressed]

        if not self.compressor.tensors_size_are_same:
            # sizes differ between ranks: exchange them first
            local_sizes = torch.tensor(local_sizes)  # TODO: set device
            sizes_all_ranks = allgather(local_sizes)
            # one row per rank -> transpose -> plain lists
            per_rank_rows = sizes_all_ranks.view([self.world_size, -1])
            tensor_sizes = per_rank_rows.t().tolist()
        else:
            # every rank has the same sizes: replicate and transpose locally
            replicated = [local_sizes] * self.world_size
            tensor_sizes = zip(*replicated)

        handles = [allgather_async(chunk) for chunk in tensors_compressed]
        return handles, tensor_sizes
Exemple #10
0
def all_gather_list(data):
    """Gathers arbitrary data from all nodes into a list.

    The pickled payload is encoded with ``_encode`` using the largest payload
    size across ranks, gathered, and then decoded rank by rank with
    ``_decode``, which reports how far to advance in the gathered buffer.
    """
    payload = pickle.dumps(data)
    payload_len = len(payload)

    local_len = torch.tensor([payload_len]).cuda()
    max_size = hvd.allgather(local_len).max().item()
    in_buffer, enc_byte = _encode(payload, max_size)

    remaining = hvd.allgather(in_buffer[:enc_byte + payload_len])

    results = []
    for _ in range(hvd.size()):
        chunk, shift = _decode(remaining, enc_byte)
        remaining = remaining[shift:]
        results.append(pickle.loads(chunk))
    return results
Exemple #11
0
def test_gradient(tensors, models, device):
    """Compare gradients of the distributed model with a reference model.

    The distributed model runs on the local ``k``/``q`` shards; the reference
    model runs on the allgathered tensors. Input gradients (gathered across
    ranks) and parameter gradients (sum-allreduced across ranks) must match
    the reference within ``atol=1e-5``.
    """
    k, q, mask = (t.to(device) for t in tensors)
    k = k.requires_grad_(True)
    q = q.requires_grad_(True)

    model, gt_model = (m.to(device) for m in models)

    # Backward pass of the distributed model on the local shard.
    model(k, q, k, mask).sum().backward()

    def gather_as_sequence(t):
        # Gather over ranks, then fold everything into one batch entry.
        full = hvd.allgather(t)
        return full.view(1, -1, full.size(-1))

    k_gt = gather_as_sequence(k.detach()).requires_grad_(True)
    q_gt = gather_as_sequence(q.detach()).requires_grad_(True)
    gt_mask = torch.zeros(1, k_gt.size(1), q_gt.size(1), device=k_gt.device)

    # Backward pass of the reference model on the full tensors.
    gt_model(k_gt, q_gt, k_gt, gt_mask.bool()).sum().backward()

    k_grad = gather_as_sequence(k.grad)
    q_grad = gather_as_sequence(q.grad)

    assert torch.allclose(k_grad, k_gt.grad, atol=1e-5)
    assert torch.allclose(q_grad, q_gt.grad, atol=1e-5)

    param_pairs = zip(gt_model.named_parameters(), model.named_parameters())
    for (gt_name, gt_param), (name, param) in param_pairs:
        assert gt_name == name
        summed_grad = hvd.allreduce(param.grad, op=hvd.Sum)
        assert torch.allclose(gt_param.grad, summed_grad, atol=1e-5)
Exemple #12
0
    def get_video_level_scores(self,
                               modularized_query,
                               context_feat1,
                               context_mask,
                               val_gather_gpus=True):
        """ Calculate video2query scores for each pair of video
            and query inside the batch.
        Args:
            modularized_query: (N, D)
            context_feat1: (N, L, D),
                output of the first transformer encoder layer
            context_mask: (N, L)
            val_gather_gpus: when the module is not in training mode,
                controls whether queries and videos are gathered from all
                ranks before scoring
        Returns:
            query_context_scores: (N, N)
                score of each query w.r.t. each video inside the batch,
                diagonal positions are positive. used to get negative samples.
                NOTE(review): when gathering is active, N presumably becomes
                the number of samples over all ranks -- confirm against
                vsm_allgather.
        """
        modularized_query = F.normalize(modularized_query, dim=-1, eps=1e-5)
        context_feat1 = F.normalize(context_feat1, dim=-1, eps=1e-5)
        # gather all ranks to increase negative examples:
        # in training mode when self.gather_gpus is set, outside training
        # mode when val_gather_gpus is set
        if self.training and self.gather_gpus or\
                not self.training and val_gather_gpus:
            # need to pad video to same length
            bs, vlen, hid = context_feat1.size()
            device = context_feat1.device
            # one video length per rank
            all_vlens = hvd.allgather(torch.tensor(
                [vlen], device=device)).view(hvd.size())
            max_vlen = all_vlens.max().item()
            # amount of padding this rank needs to reach the longest video
            pad_len = max_vlen - all_vlens[hvd.rank()]
            if pad_len != 0:
                pad = torch.zeros(bs,
                                  pad_len,
                                  hid,
                                  dtype=context_feat1.dtype,
                                  device=device)
                context_feat1 = torch.cat([context_feat1, pad], dim=1)
                # padded positions get mask value 0
                mask_pad = pad[..., 0].long()
                context_mask = torch.cat([context_mask, mask_pad], dim=1)
            # our backprop compatible allgather
            modularized_query = vsm_allgather(modularized_query).contiguous()
            context_feat1 = vsm_allgather(context_feat1).contiguous()
            context_mask = vsm_allgather(context_mask).contiguous()

        query_context_scores = torch.einsum("md,nld->mln", modularized_query,
                                            context_feat1)  # (N, L, N)
        context_mask = context_mask.transpose(0, 1).unsqueeze(0)  # (1, L, N)
        context_mask = context_mask.to(
            dtype=query_context_scores.dtype)  # fp16 compatibility
        query_context_scores = mask_logits(query_context_scores,
                                           context_mask)  # (N, L, N)
        # reduce over the video length dimension L
        query_context_scores, _ = torch.max(
            query_context_scores,
            dim=1)  # (N, N) diagonal positions are positive pairs.
        return query_context_scores
def distributed_matmul_nt(left: Tensor, right: Tensor, offset=32) -> Tensor:
    r"""
    Multiply two sequence tensors to obtain the result of :math:`AB^T`.

    Left and right inputs can be N-dimensional tensors of size
    :math:`* \times \frac{T}{N} \times D`, where :math:`T` is the total length,
    :math:`N` is the total number of processes available and :math:`D`, the
    dimension of the sequence. The result of this function is a tensor of size
    :math:`* \times \frac{T}{N} \times T`, that contains the result chunk for
    each process of the resulting operation.

    Inputs
    ------
    left: Tensor
        :math:`A` in :math:`AB^T`, must be of size
        :math:`* \times \frac{T}{N} \times D`.
    right: Tensor
        :math:`B` in :math:`AB^T`, must be of size
        :math:`* \times \frac{T}{N} \times D`.
    offset: int
        Number of rows of ``right`` communicated during each distributed
        step, it must be a factor of :math:`\frac{T}{N}`. This factor should
        be modified in order to reduce the total computing time at the
        expense of the memory used.

    Returns
    -------
    result: Tensor
        For each process, this function computes the corresponding segment
        of the operation :math:`AB^T`, of size
        :math:`* \times \frac{T}{N} \times T`.
    """
    # presumably a barrier across processes; defined outside this block
    synchronize()
    rows = left.size(-2)
    world_size = get_world_size()
    total_rows = rows * world_size

    prefix_size = tuple(left.size())[:-2]
    size = (left.size(-2), right.size(-2))
    size = (world_size,) + prefix_size + size
    # (world_size, ...dims, T/N, T/N)
    result = torch.empty(size, device=left.device)
    final_size = prefix_size + (left.size(-2), total_rows)

    # Process `offset` rows of `right` per step to bound the gathered size.
    for row in range(0, rows, offset):
        end_bound = row + offset
        current_row = right[..., row:end_bound, :].contiguous()
        # [r0[row:end_bound], r1[row:end_bound], ..., rworld[row:end_bound]]
        # all_rows: world_size x ... x offset x dim
        current_row = current_row.unsqueeze(0)
        all_rows = hvd.allgather(current_row, name=f'scatter_rows_{row}')
        partial_results = left.matmul(all_rows.transpose(-1, -2))
        result[..., row:end_bound] = partial_results
    # Move the per-rank axis next to the last one and merge them, so the
    # last dimension has total_rows entries.
    result = result.unsqueeze(-2).transpose(0, -2).reshape(*final_size)
    return result
 def fn(a, b, c, d):
     """Allgather ``[rank, a + b + c + d]`` and return a rank-dependent value.

     Rank 0 returns the gathered list, rank 1 returns a fixed marker string
     and every other rank returns ``None``.
     """
     hvd.init()
     rank = hvd.rank()
     total = a + b + c + d
     gathered = hvd.allgather(torch.tensor([rank, total])).tolist()
     per_rank_result = {0: gathered, 1: "ret_val_of_rank_1"}
     return per_rank_result.get(rank)
Exemple #15
0
    def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered.

        Each rank builds a 3D tensor whose second dimension depends on its
        rank, so the shapes are incompatible for gathering along dim 0.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank)

        try:
            hvd.allgather(tensor)
        except (torch.FatalError, RuntimeError):
            # The failure may surface as a RuntimeError rather than a
            # torch.FatalError, so both count as the expected error.
            pass
        else:
            # Explicit raise so the check also runs with asserts disabled.
            raise AssertionError('hvd.allgather did not throw error')
Exemple #16
0
    def test_horovod_allgather_error(self):
        """Check that allgather fails when a non-leading dimension differs.

        The second dimension of each rank's tensor depends on the rank, so
        the gather is expected to raise.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # Nothing can mismatch with a single worker.
        if size == 1:
            return

        mismatched_shape = [17, 10 * (rank + 1), 17]
        tensor = torch.FloatTensor(*mismatched_shape).fill_(1).mul_(rank)

        try:
            hvd.allgather(tensor)
        except (torch.FatalError, RuntimeError):
            pass
        else:
            assert False, 'hvd.allgather did not throw error'
Exemple #17
0
def any_broadcast(data, root_rank):
    """broadcast arbitrary data from root_rank to all nodes.

    Every rank pickles its own ``data`` so that the largest payload size can
    be agreed on via allgather; the buffer built by ``_encode`` is then
    overwritten in place by the broadcast and decoded with ``_decode``.
    """
    payload = pickle.dumps(data)

    local_len = torch.tensor([len(payload)]).cuda()
    max_size = hvd.allgather(local_len).max().item()
    buffer_, enc_byte = _encode(payload, max_size, use_max_size=True)

    hvd.broadcast_(buffer_, root_rank)

    received, _ = _decode(buffer_, enc_byte)
    return pickle.loads(received)
Exemple #18
0
def test_distributed_nt(tensor_fixture):
    """Check the distributed function against the reference function.

    The local result of ``test_fn`` is gathered from all ranks, reshaped so
    that it lines up with the reference output and compared element-wise.
    """
    gt_tensors, test_tensors, fns = tensor_fixture
    gt_fn, test_fn = fns

    expected = gt_fn(*gt_tensors)
    gathered = hvd.allgather(test_fn(*test_tensors))

    if gathered.dim() != 3:
        # Put the gathered axis behind the second one, then merge the two.
        gathered = gathered.transpose(0, 1).reshape(
            1, gathered.size(1), -1, gathered.size(-1))
    else:
        # Merge the first two axes into one sequence axis.
        merged = gathered.size(0) * gathered.size(1)
        gathered = gathered.view(1, merged, -1)
    assert (expected == gathered).all()
def distributed_matmul_all(left: Tensor, right: Tensor, offset=32) -> Tensor:
    r"""
    Multiply two sequence tensors to obtain the result of :math:`AB`.

    Left and right inputs can be N-dimensional tensors, where the first one
    must be of size :math:`* \times \frac{T}{N} \times T` and the second one
    of size :math:`* \times \frac{T}{N} \times D`, where :math:`T` is the
    total length, :math:`N` is the total number of processes available and
    :math:`D`, the dimension of the sequence. The result of this function is a
    tensor of size :math:`* \times \frac{T}{N} \times D`, that contains the
    result chunk for each process of the resulting operation.

    Inputs
    ------
    left: Tensor
        :math:`A` in :math:`AB`, must be of size
        :math:`* \times \frac{T}{N} \times T`
    right: Tensor
        :math:`B` in :math:`AB`, must be of size
        :math:`* \times \frac{T}{N} \times D`
    offset: int
        Number of columns of ``right`` gathered and multiplied per
        distributed step.

    Returns
    -------
    result: Tensor
        For each process, this function computes the corresponding segment
        of the operation :math:`AB`, of size
        :math:`* \times \frac{T}{N} \times D`
    """
    world_size = get_world_size()
    total_cols = right.size(-1)

    # Cut the last dimension of `left` into one slab per process and stack
    # the slabs on a new leading axis.
    split_size = left.size(-1) // world_size
    splits = torch.stack(left.split(split_size, -1), dim=0)

    # One partial product per process: (world_size, ..., T/N, D)
    size = (world_size,) + tuple(left.size())[:-2] + (left.size(-2), total_cols)
    rank_block = torch.empty(*size, device=left.device)

    # presumably a barrier across processes; defined outside this block
    synchronize()
    for current_col in range(0, total_cols, offset):
        end_bound = current_col + offset
        col = right[..., current_col:end_bound].contiguous()
        all_cols = hvd.allgather(col.unsqueeze(0),
                                 name=f'matmul_all_{current_col}')
        # all_cols: torch.size([world_size, right.size(1), offset])
        rank_block[..., current_col:end_bound] = torch.matmul(splits, all_cols)
    # Summing the per-process partial products yields the local rows of AB.
    return rank_block.sum(dim=0)
Exemple #20
0
    def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes.

        Even ranks gather an IntTensor and odd ranks a FloatTensor of the
        same shape.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        tensor_size = [17] * 3
        if rank % 2 == 0:
            tensor = torch.IntTensor(*tensor_size)
        else:
            tensor = torch.FloatTensor(*tensor_size)

        try:
            hvd.allgather(tensor)
        except (torch.FatalError, RuntimeError):
            # The failure may surface as a RuntimeError rather than a
            # torch.FatalError, so both count as the expected error.
            pass
        else:
            # Explicit raise so the check also runs with asserts disabled.
            raise AssertionError('hvd.allgather did not throw error')
Exemple #21
0
    def test_horovod_allgather_type_error(self):
        """Check that allgather fails when ranks gather different dtypes.

        Even ranks use an IntTensor, odd ranks a FloatTensor, both 17x17x17.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # Nothing can mismatch with a single worker.
        if size == 1:
            return

        shape = [17] * 3
        tensor_cls = torch.IntTensor if rank % 2 == 0 else torch.FloatTensor
        tensor = tensor_cls(*shape)

        try:
            hvd.allgather(tensor)
        except (torch.FatalError, RuntimeError):
            pass
        else:
            assert False, 'hvd.allgather did not throw error'
Exemple #22
0
    def all_gather(
        self, result: torch.Tensor, group: Optional[Any] = dist_group.WORLD, sync_grads: bool = False
    ) -> torch.Tensor:
        """Gather ``result`` from all processes with ``hvd.allgather``.

        Args:
            result: tensor to gather; a 0-dim tensor is reshaped to one element.
            group: must be ``None`` or ``dist_group.WORLD``.
            sync_grads: accepted for interface compatibility; not used here.

        Raises:
            ValueError: if ``group`` is any other group.
        """
        wants_subgroup = group is not None and group != dist_group.WORLD
        if wants_subgroup:
            raise ValueError("Horovod does not support allgather using a subcommunicator at this time. Unset `group`.")

        # Scalars are turned into single-element tensors before gathering.
        tensor = result.reshape(1) if len(result.shape) == 0 else result

        # sync and gather all
        self.join()
        return hvd.allgather(tensor)
Exemple #23
0
    def single_point(self, pos=None, prt=True, ntherm=-1, ndecor=100):
        """Performs a single point calculation

        The wave function parameters are broadcast from rank 0, walkers are
        sampled on this process, the local energies are evaluated and then
        gathered from all processes to compute the mean and the variance.

        Keyword Arguments:
            pos {torch.tensor} -- positions of the walkers If none, sample (default: {None})
                NOTE(review): the passed value is never read; positions are
                always re-sampled below. Confirm whether this is intended.
            prt {bool} -- print energy/variance values (default: {True})
            ntherm {int} -- number of MC steps to thermalize (default: {-1})
                NOTE(review): not used; self.resample.ntherm is passed to
                the sampler instead.
            ndecor {int} -- number of MC step to decorelate  (default: {100})
                NOTE(review): not used by this body.

        Returns:
            tuple -- (position, energy, variance)
        """

        self.wf.eval()
        num_threads = 1
        # every process starts from the parameters of rank 0
        hvd.broadcast_parameters(self.wf.state_dict(), root_rank=0)
        torch.set_num_threads(num_threads)

        # sample the wave function
        pos = self.sample(ntherm=self.resample.ntherm)
        pos.requires_grad = False

        # handle the batch size
        batchsize = len(pos)

        # create the data loader
        self.dataset = DataSet(pos)

        if self.cuda:
            kwargs = {'num_workers': num_threads, 'pin_memory': True}
        else:
            kwargs = {'num_workers': num_threads}

        self.dataloader = DataLoader(self.dataset,
                                     batch_size=batchsize,
                                     **kwargs)

        # The batch size equals the number of walkers, so presumably a single
        # batch is produced; `eloc` keeps the value of the last batch only.
        for data in self.dataloader:

            lpos = data.to(self.device)
            lpos.requires_grad = True
            eloc = self.wf.local_energy(lpos)

        # combine the local energies of all processes
        eloc_all = hvd.allgather(eloc, name='local_energies')
        e = torch.mean(eloc_all)
        s = torch.var(eloc_all)

        if prt:
            printd(hvd.rank(), 'Energy   : ', e)
            printd(hvd.rank(), 'Variance : ', s)
        return pos, e, s
Exemple #24
0
    def test_horovod_allgather_variable_size(self):
        """Check allgather on 1D, 2D and 3D tensors whose first dimension
        differs between ranks.

        Rank ``r`` contributes ``tensor_sizes[r]`` rows filled with ``r``; the
        gathered tensor must contain those row blocks back to back.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [
            torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
            torch.IntTensor, torch.LongTensor, torch.FloatTensor,
            torch.DoubleTensor
        ]
        if _fp16_supported:
            dtypes.append(torch.HalfTensor)
        if torch.cuda.is_available():
            dtypes.extend([
                torch.cuda.ByteTensor, torch.cuda.CharTensor,
                torch.cuda.ShortTensor, torch.cuda.IntTensor,
                torch.cuda.LongTensor, torch.cuda.FloatTensor,
                torch.cuda.DoubleTensor
            ])
            if _fp16_supported:
                dtypes.append(torch.cuda.HalfTensor)

        for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
            trailing = [17] * (dim - 1)

            local_shape = [tensor_sizes[rank]] + trailing
            tensor = torch.FloatTensor(*local_shape).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            gathered = hvd.allgather(tensor)
            tensor, gathered = self.convert_cpu_fp16_to_fp32(tensor, gathered)

            assert list(gathered.shape) == [sum(tensor_sizes)] + trailing

            # Walk through the gathered tensor one rank block at a time.
            begin = 0
            for i, rows in enumerate(tensor_sizes):
                rank_tensor = gathered[begin:begin + rows]
                begin += rows
                assert list(rank_tensor.shape) == [rows] + trailing
                assert rank_tensor.data.min() == i
                assert rank_tensor.data.max() == i
    def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None):
        """Gather ``result`` from all processes and return it as a list.

        The gathered tensor is split into chunks of size 1 along dim 0.

        Raises:
            ValueError: if ``group`` is not ``None``.
        """
        if group is not None:
            raise ValueError(
                "Horovod does not support allgather using a subcommunicator at this time. "
                "Unset `group`."
            )

        # Scalars are turned into single-element tensors before gathering.
        tensor = result if len(result.shape) != 0 else result.reshape(1)

        # sync and gather all
        hvd.join()
        return list(hvd.allgather(tensor).split(1, dim=0))
Exemple #26
0
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient.

        Rank ``r`` contributes ``tensor_sizes[r]`` rows. The upstream gradient
        fed to ``backward`` is the same on every rank: the block belonging to
        rank ``r`` is filled with ``r``. The test expects the local gradient
        to equal ``rank * size``, with ``size`` being the number of workers.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # Only tensors of floating point dtype can require gradients.
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = torch.FloatTensor(*([tensor_sizes[rank]] + [17] *
                                         (dim - 1))).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            tensor = torch.autograd.Variable(tensor, requires_grad=True)

            # The loop variable must not be called `size`: that would
            # overwrite the worker count used for `expected` below and for
            # the slicing in the next iteration.
            grad_list = [
                torch.ones([rows] + [17] * (dim - 1)) * r
                for r, rows in enumerate(tensor_sizes)
            ]
            # Match the dtype/device of the gathered tensor.
            grad_ys = torch.cat(grad_list, dim=0).type(dtype)

            gathered = hvd.allgather(tensor)
            gathered.backward(grad_ys)
            # .cpu() is required before .numpy() for CUDA tensors.
            grad_out = tensor.grad.data.cpu().numpy()

            expected = np.ones([tensor_sizes[rank]] + [17] *
                               (dim - 1)) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(
                err, 0.00000001, "gradient %s differs from expected %s, "
                "error: %s" % (grad_out, expected, str(err)))
Exemple #27
0
def all_gather(data, max_size=65000):
    """ Gathers arbitrary data from all nodes into a list.

    This function is heavily borrowed from fairseq (https://github.com/pytorch/fairseq)

    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable.

    Each rank writes its pickled payload into a ``max_size``-byte CUDA buffer
    behind a 2-byte length header (base 255); the buffers are gathered and
    every non-empty payload is unpickled.

    Args:
        data (Any): data from the local worker to be gathered on other workers
        max_size (int, optional): maximum size of the data to be gathered
            across workers

    Returns:
        list: the unpickled payloads, in rank order; ranks whose header
        encodes size 0 are skipped.

    Raises:
        ValueError: if the pickled data plus header exceeds ``max_size`` or
            is too large for the 2-byte header.
    """

    world_size = hvd.size()

    if not hasattr(all_gather, '_buffer') or \
            all_gather._buffer.numel() < max_size:
        all_gather._buffer = torch.cuda.ByteTensor(max_size)

    # The cached buffer can be larger than max_size (earlier call with a
    # bigger max_size). Gather exactly max_size bytes per rank, otherwise the
    # max_size stride used for decoding below would be wrong.
    buffer = all_gather._buffer[:max_size]
    buffer.zero_()

    enc = pickle.dumps(data)
    enc_size = len(enc)

    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size +
                                                                    2))
    if enc_size >= 255 * 256:
        # The first header byte holds enc_size // 255 and must stay <= 255.
        raise ValueError(
            'encoded data too large for 2-byte header: {}'.format(enc_size))

    buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    buffer[1] = enc_size % 255
    buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))

    buffer_gathered = hvd.allgather(buffer)

    # NOTE(review): pickle.loads runs on bytes received from other ranks; this
    # is only safe while every rank is trusted.
    result = []
    for i in range(world_size):
        out_buffer = buffer_gathered[i * max_size:(i + 1) * max_size]
        size = (255 * item(out_buffer[0])) + item(out_buffer[1])
        if size > 0:
            result.append(pickle.loads(bytes(out_buffer[2:size + 2].tolist())))
    return result
Exemple #28
0
    def calculate_shuffle_buffer_size():
        """
        Compute a shuffle buffer size (in rows) that is identical on every worker.

        A non-None ``user_shuffle_buffer_size`` is returned as is (negative
        values are rejected). Otherwise the largest number of workers sharing
        one machine is found with an allgather of ``hvd.local_size()``: if it
        exceeds TOTAL_BUFFER_MEMORY_CAP_GIB, the cap is split evenly among the
        workers of that machine, else each worker gets 1 GiB. The byte budget
        is converted to rows with ``avg_row_size`` and capped by the number of
        training rows per worker.

        Examples with a cap of 4 GiB:
            machines with 8 and 3 workers     -> 0.5 GiB per worker
            machines with 2 and 3 workers     -> 1 GiB per worker
            machines with 2, 8 and 5 workers  -> 0.5 GiB per worker
        """
        import horovod.torch as hvd

        # An explicit user setting always wins, even when it is 0.
        if user_shuffle_buffer_size is not None:
            if user_shuffle_buffer_size < 0:
                raise ValueError(
                    "user_shuffle_buffer_size cannot be negative!")
            return user_shuffle_buffer_size

        workers_per_machine = hvd.allgather(torch.tensor([hvd.local_size()]))
        max_local_size = torch.max(workers_per_machine).item()

        shuffle_buffer_size = (
            TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
            if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB
            else BYTES_PER_GIB / avg_row_size
        )
        rows_per_worker = train_rows / hvd.size()
        return int(min(shuffle_buffer_size, rows_per_worker))
Exemple #29
0
def evaluate(model, eval_loader):
    """Run Image/Text Retrieval evaluation and score it on rank 0.

    Every rank runs ``inference`` on its own loader, then the score matrices
    and text ids are gathered across ranks. The gathered score matrix must have
    one row per gathered text id and one column per image id of the dataset.

    Returns:
        The result of ``itm_eval`` on rank 0; an empty dict on all other ranks.
    """
    st = time()
    LOGGER.info("start running Image/Text Retrieval evaluation ...")
    local_scores = inference(model, eval_loader)
    dset = eval_loader.dataset

    all_score = hvd.allgather(local_scores)
    # Flatten the per-rank id lists into one list, keeping rank order.
    all_txt_ids = []
    for rank_ids in all_gather_list(dset.ids):
        all_txt_ids.extend(rank_ids)
    all_img_ids = dset.all_img_ids
    assert all_score.size() == (len(all_txt_ids), len(all_img_ids))

    # NOTE: only use rank0 to compute final scores
    if hvd.rank() == 0:
        eval_log = itm_eval(all_score, all_txt_ids, all_img_ids,
                            dset.txt2img, dset.img2txts)
        tot_time = time() - st
        LOGGER.info(f"evaluation finished in {int(tot_time)} seconds")
        return eval_log
    return {}
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient.

        Each rank contributes a tensor filled with its own rank whose first
        dimension is ``tensor_sizes[rank]``. The gradient passed to
        ``backward`` is the concatenation of one block per rank, block ``r``
        being filled with ``r``. The gradient arriving at the local input is
        expected to equal ``rank * world_size`` everywhere.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # Only Tensors of floating point dtype can require gradients
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
            if _fp16_supported:
                dtypes += [torch.cuda.HalfTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = torch.FloatTensor(
                *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            tensor.requires_grad_()

            # The loop variable must not be called `size`: that would overwrite
            # the world size, which is still needed by the guard and the slice
            # at the top of the next iteration and by `expected` below.
            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                grad_list.append(self.cast_and_place(
                    torch.ones([tensor_size] + [17] * (dim - 1)), dtype) * r)
            grad_ys = torch.cat(grad_list, dim=0)

            gathered = hvd.allgather(tensor)
            gathered.backward(grad_ys)
            grad_out = tensor.grad.data.cpu().numpy()

            # `size` is the world size here.
            expected = np.ones(
                [tensor_sizes[rank]] + [17] * (dim - 1)
            ) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
Exemple #31
0
def measure_fid():
    """Compute FID for one or more output resolutions and return the largest one's.

    ``args.inception_path`` is a comma-separated list of pickle files, one per
    resolution to evaluate; each is read for its ``'mean'`` and ``'cov'``
    entries. Every rank generates samples and extracts features, the features
    are gathered across ranks, and only rank 0 computes the actual FID (other
    ranks store a placeholder of 1e9).

    Returns:
        The FID stored under ``args.resolution``, broadcast from rank 0.
    """
    # get all the resolutions to evaluate fid
    # WARNING: always assume that we measure the largest few resolutions
    inception_path_list = args.inception_path.split(',')  # support evaluating multiple resolutions
    n_res_to_eval = len(inception_path_list)

    n_batch = math.ceil(args.fid_n_sample / (args.fid_batch_size * hvd.size()))

    features_list = [None] * n_res_to_eval
    torch.manual_seed(int(time.time()) + hvd.rank() * 999)  # just make sure they use different seed

    # collect features
    with torch.no_grad():
        for _ in tqdm(range(n_batch), desc='FID', disable=hvd.rank() != 0):
            z = torch.randn(args.fid_batch_size, 1, 512, device=device)
            _, all_rgbs = g_ema(z, return_rgbs=True)
            for i_res in range(n_res_to_eval):
                # Resolutions are read from the end of the returned list.
                img = all_rgbs[-i_res - 1]
                feat = inception(img.clamp(-1., 1.))[0].view(img.shape[0], -1).to('cpu')
                collected = features_list[i_res]
                features_list[i_res] = (
                    feat if collected is None
                    else torch.cat((collected, feat), dim=0)
                )

    # compute the FID
    fid_dict = {}
    for i_res, features in enumerate(features_list):
        gathered = hvd.allgather(features, name='fid_features{}'.format(i_res)).numpy()
        gathered = gathered[:args.fid_n_sample]
        res_key = int(args.resolution / 2 ** i_res)
        if hvd.rank() != 0:
            # Placeholder on non-root ranks; the real value is computed on rank 0 only.
            fid_dict[res_key] = 1e9
            continue

        # only compute on rank 0, save some CPU
        sample_mean = np.mean(gathered, 0)
        sample_cov = np.cov(gathered, rowvar=False)
        with open(inception_path_list[i_res], 'rb') as f:
            embeds = pickle.load(f)
        fid_dict[res_key] = calc_fid(sample_mean, sample_cov,
                                     embeds['mean'], embeds['cov'])

    if hvd.rank() == 0:
        print('fid:', {k: round(v, 3) for k, v in fid_dict.items()})
    fid0 = hvd.broadcast(torch.tensor(fid_dict[args.resolution]).float(), root_rank=0, name='fid').item()
    return fid0  # only return the fid of the largest resolution
Exemple #32
0
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient.

        Each rank contributes a tensor filled with its own rank whose first
        dimension is ``tensor_sizes[rank]``. The gradient passed to
        ``backward`` is the concatenation of one block per rank, block ``r``
        being filled with ``r``. The gradient arriving at the local input is
        expected to equal ``rank * world_size`` everywhere.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = torch.FloatTensor(
                *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            tensor = torch.autograd.Variable(tensor, requires_grad=True)

            # The loop variable must not be called `size`: that would overwrite
            # the world size, which is still needed by the guard and the slice
            # at the top of the next iteration and by `expected` below.
            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                grad_list.append(
                    torch.ones([tensor_size] + [17] * (dim - 1)) * r)
            grad_ys = torch.cat(grad_list, dim=0)

            gathered = hvd.allgather(tensor)
            gathered.backward(grad_ys)
            grad_out = tensor.grad.data.numpy()

            # `size` is the world size here.
            expected = np.ones(
                [tensor_sizes[rank]] + [17] * (dim - 1)
            ) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
Exemple #33
0
    def _dequeue_and_enqueue(self, keys):
        """Overwrite the next slots of the queue with freshly gathered keys.

        ``keys`` is first gathered across workers (via Horovod when
        ``self.use_horovod`` is set, otherwise via ``concat_all_gather``). The
        gathered keys are transposed and written into the columns of
        ``self.queue`` starting at the position stored in ``self.queue_ptr``,
        and the pointer is then advanced modulo ``self.K``.
        """
        # gather keys before updating queue
        gather = hvd.allgather if self.use_horovod else concat_all_gather
        keys = gather(keys)

        n_new = keys.shape[0]
        start = int(self.queue_ptr.item())
        assert self.K % n_new == 0  # for simplicity

        # Rebuild the queue as [columns before | new keys | columns after]
        # rather than assigning into it in place (dequeue and enqueue).
        stop = start + n_new
        before = self.queue[:, :start]
        after = self.queue[:, stop:]
        self.queue = torch.cat([before, keys.T, after], dim=1).detach()

        # move pointer
        self.queue_ptr[0] = stop % self.K
Exemple #34
0
    def test_horovod_allgather_variable_size(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
        even if those tensors have different sizes along the first dim.

        Each rank contributes a tensor filled with its own rank; the gathered
        result must consist of consecutive per-rank slices, with slice ``i``
        having ``tensor_sizes[i]`` rows and containing only the value ``i``.
        """
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        cpu_types = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                     torch.IntTensor, torch.LongTensor, torch.FloatTensor,
                     torch.DoubleTensor]
        dtypes = list(cpu_types)
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]

        for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
            trailing_dims = [17] * (dim - 1)

            local_shape = [tensor_sizes[rank]] + trailing_dims
            tensor = torch.FloatTensor(*local_shape).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            gathered = hvd.allgather(tensor)

            assert list(gathered.shape) == [sum(tensor_sizes)] + trailing_dims

            # Walk the gathered tensor slice by slice using a running offset.
            offset = 0
            for i, rows in enumerate(tensor_sizes):
                rank_tensor = gathered[offset:offset + rows]
                assert list(rank_tensor.shape) == [rows] + trailing_dims
                assert rank_tensor.data.min() == i
                assert rank_tensor.data.max() == i
                offset += rows
Exemple #35
0
def _pad_to_largest_tensor(tensor, group):
    """
    Gather every rank's element count and zero-pad ``tensor`` to the largest.

    The element counts are exchanged through Horovod when ``_USE_HVD`` is set,
    otherwise through ``torch.distributed`` within ``group``. Padding is made
    of ``uint8`` zeros concatenated along dim 0 (so ``tensor`` is presumably a
    1-D ``uint8`` buffer -- TODO confirm against callers).

    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    if _USE_HVD:
        world_size = get_world_size()
    else:
        world_size = dist.get_world_size(group=group)
    assert world_size >= 1, "comm.gather/all_gather must be called from ranks within the given group!"

    device = tensor.device
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=device)

    if _USE_HVD:
        # a tensor with (world_size,) actually
        gathered_sizes = hvd.allgather(local_size)
    else:
        # torch.distributed fills a pre-allocated list, one slot per rank.
        gathered_sizes = [
            torch.zeros([1], dtype=torch.int64, device=device)
            for _ in range(world_size)
        ]
        dist.all_gather(gathered_sizes, local_size, group=group)
    size_list = [int(entry.item()) for entry in gathered_sizes]

    max_size = max(size_list)

    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        missing = max_size - local_size
        padding = torch.zeros((missing, ), dtype=torch.uint8, device=device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor