def _batch_shuffle_ddp(self, x):  # pragma: no-cover
        """
        Batch shuffle, for making use of BatchNorm.
        *** Only support DistributedDataParallel (DDP) model. ***

        The per-process batches are gathered, one permutation of the
        global batch (the one drawn on rank 0) is shared with every
        process, and each process keeps its own slice of that permutation.

        Args:
            x: tensor whose first dimension is the per-process batch.
                Assumes every process passes the same batch size, since
                the permutation is split evenly across processes
                -- TODO confirm against the data loader.

        Returns:
            A tuple ``(x_shuffled, idx_unshuffle)``: the rows of the
            gathered batch assigned to this process, and the inverse
            permutation to hand to ``_batch_unshuffle_ddp``.
        """
        # gather from all gpus
        batch_size_this = x.shape[0]
        x_gather = concat_all_gather(x)
        batch_size_all = x_gather.shape[0]

        # number of participating processes, derived from the gathered size
        num_gpus = batch_size_all // batch_size_this

        # random shuffle index; placed on the device of the gathered tensor
        # (rather than whatever the current CUDA device is) so the index
        # always matches the tensor it is used to index
        idx_shuffle = torch.randperm(batch_size_all).to(x_gather.device)

        # broadcast to all gpus: every process adopts rank 0's permutation
        torch.distributed.broadcast(idx_shuffle, src=0)

        # index for restoring (argsort of a permutation is its inverse)
        idx_unshuffle = torch.argsort(idx_shuffle)

        # shuffled index for this gpu
        gpu_idx = torch.distributed.get_rank()
        idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx]

        return x_gather[idx_this], idx_unshuffle
    def _batch_unshuffle_ddp(self, x, idx_unshuffle):  # pragma: no-cover
        """
        Undo batch shuffle.
        *** Only support DistributedDataParallel (DDP) model. ***

        Args:
            x: tensor whose first dimension is the per-process batch.
            idx_unshuffle: index tensor covering the whole gathered batch;
                it is split evenly across processes and this process uses
                its own slice. Presumably the inverse permutation returned
                by ``_batch_shuffle_ddp`` -- verify against the caller.

        Returns:
            The rows of the gathered batch selected by this process's
            slice of ``idx_unshuffle``.
        """
        # gather from all gpus
        batch_size_this = x.shape[0]
        x_gather = concat_all_gather(x)
        batch_size_all = x_gather.shape[0]

        # number of participating processes, derived from the gathered size
        num_gpus = batch_size_all // batch_size_this

        # restored index for this gpu
        gpu_idx = torch.distributed.get_rank()
        idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx]

        return x_gather[idx_this]
    def _dequeue_and_enqueue(self, keys):
        """Overwrite the oldest queue entries with ``keys`` and advance the pointer.

        ``self.moco_queue`` is treated as a ring buffer of
        ``self.config.loss_params.k`` rows. The new keys are written
        starting at the row stored in ``self.moco_queue_ptr``; a write that
        would run past the last row wraps around to row 0, so the batch
        size does not have to divide ``k``.

        Args:
            keys: tensor whose first dimension indexes the keys of the
                current batch. When ``self.use_ddp`` or ``self.use_ddp2``
                is set, the keys are first gathered via
                ``concat_all_gather``.

        Raises:
            ValueError: if the (gathered) batch holds more keys than the
                queue has rows.
        """
        # gather keys before updating queue
        if self.use_ddp or self.use_ddp2:
            keys = concat_all_gather(keys)

        config_batch_size = self.config.optim_params.batch_size
        batch_size = keys.shape[0]

        k = self.config.loss_params.k
        ptr = int(self.moco_queue_ptr)

        if batch_size > k:
            raise ValueError(
                f"cannot enqueue {batch_size} keys into a queue of size {k}"
            )

        # replace keys at ptr, wrapping around the end of the queue if needed
        end = ptr + batch_size
        if end <= k:
            self.moco_queue[ptr:end] = keys
        else:
            n_tail = k - ptr  # rows still available before the queue end
            self.moco_queue[ptr:] = keys[:n_tail]
            self.moco_queue[:end - k] = keys[n_tail:]

        # move pointer by the full configured batch size even if the current
        # batch is smaller, so that a short batch does not shift the slots
        # used by later full-size batches
        ptr = (ptr + config_batch_size) % k

        self.moco_queue_ptr[0] = ptr