def _batch_shuffle_ddp(self, x):  # pragma: no cover
    """
    Batch shuffle, for making use of BatchNorm.
    *** Only supports DistributedDataParallel (DDP). ***
    """
    # gather from all gpus
    batch_size_this = x.shape[0]
    x_gather = concat_all_gather(x)
    batch_size_all = x_gather.shape[0]

    num_gpus = batch_size_all // batch_size_this

    # random shuffle index
    idx_shuffle = torch.randperm(batch_size_all).cuda()

    # broadcast to all gpus
    torch.distributed.broadcast(idx_shuffle, src=0)

    # index for restoring
    idx_unshuffle = torch.argsort(idx_shuffle)

    # shuffled index for this gpu
    gpu_idx = torch.distributed.get_rank()
    idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx]

    return x_gather[idx_this], idx_unshuffle
def _batch_unshuffle_ddp(self, x, idx_unshuffle):  # pragma: no cover
    """
    Undo batch shuffle.
    *** Only supports DistributedDataParallel (DDP). ***
    """
    # gather from all gpus
    batch_size_this = x.shape[0]
    x_gather = concat_all_gather(x)
    batch_size_all = x_gather.shape[0]

    num_gpus = batch_size_all // batch_size_this

    # restored index for this gpu
    gpu_idx = torch.distributed.get_rank()
    idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx]

    return x_gather[idx_this]
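# Illustrative sketch only (not part of this module): in MoCo-style training the
# shuffle/unshuffle pair typically brackets the key-encoder forward pass, so that
# per-GPU BatchNorm statistics cannot act as a shortcut for matching queries to keys.
# `self.encoder_k` and `img_k` are assumed names used only for this example.
#
#     with torch.no_grad():
#         img_k, idx_unshuffle = self._batch_shuffle_ddp(img_k)
#         k = self.encoder_k(img_k)
#         k = torch.nn.functional.normalize(k, dim=1)
#         k = self._batch_unshuffle_ddp(k, idx_unshuffle)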
def _dequeue_and_enqueue(self, keys):
    # gather keys from all gpus before updating the queue
    if self.use_ddp or self.use_ddp2:
        keys = concat_all_gather(keys)

    config_batch_size = self.config.optim_params.batch_size
    batch_size = keys.size(0)
    k = self.config.loss_params.k
    ptr = int(self.moco_queue_ptr)
    # the queue is overwritten in contiguous blocks, so for simplicity the queue
    # size must be a multiple of the batch size
    assert k % batch_size == 0

    # replace the keys at ptr (dequeue and enqueue)
    self.moco_queue[ptr:ptr + batch_size] = keys

    # advance the pointer by the configured batch size even if the current batch
    # is smaller (e.g. the last batch of an epoch), wrapping around at k
    ptr = (ptr + config_batch_size) % k
    self.moco_queue_ptr[0] = ptr
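# A minimal sketch of the `concat_all_gather` helper assumed by the methods above;
# it is not defined in this section. This follows the standard MoCo-style pattern:
# gather the tensor from every rank and concatenate along the batch dimension.
# It assumes `torch` is imported and `torch.distributed` is initialized, and note
# that torch.distributed.all_gather carries no gradient.
def concat_all_gather(tensor):
    with torch.no_grad():
        tensors_gather = [
            torch.ones_like(tensor)
            for _ in range(torch.distributed.get_world_size())
        ]
        torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
        return torch.cat(tensors_gather, dim=0)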