def train(state, dir):
    """Run the training loop, resuming from the progress stored in ``state``.

    ``state.rendezvous`` is incremented on entry. ``state.epoch`` and
    ``state.batch`` are used directly as the loop variables, so the progress
    held in ``state`` is always up to date. The state is allgathered and
    committed whenever ``state.batch`` is a multiple of
    ``batches_per_commit`` and again at the end of every epoch, after which
    ``state.batch`` is reset to 0.

    Args:
        state: object exposing ``rendezvous``, ``epoch``, ``batch``,
            ``commits`` and ``commit()``.
        dir: passed through unchanged to ``check_fail`` before each batch.

    Returns:
        ``(res, rank)`` where ``res`` is the list obtained from the final
        state allgather and ``rank`` is ``hvd.rank()``.
    """
    def gather_state():
        # Share this worker's (rank, epoch, batch, rendezvous) with everyone.
        return hvd.allgather(
            torch.tensor([hvd.rank(), state.epoch, state.batch, state.rendezvous]),
            'state').tolist()

    def commit_state():
        state.commits += 1
        state.commit()

    state.rendezvous += 1
    logging.info('rank %s: rendezvous %s', hvd.rank(), state.rendezvous)

    for state.epoch in range(state.epoch, epochs):
        logging.info('rank %s: start epoch %s at batch %s',
                     hvd.rank(), state.epoch, state.batch)

        for state.batch in range(state.batch, batches_per_epoch):
            check_fail(dir, hvd.rank(), state.epoch, state.batch)

            optimizer.zero_grad()
            prediction = model(data)
            batch_loss = F.cross_entropy(prediction, target)
            batch_loss.backward()
            optimizer.step()

            # TODO: this sleep makes the fault tolerant test fail
            #       torch all gather throws an RuntimeError which should be a HorovodInternalError
            #import time
            #time.sleep(0.2)

            # Only commit on every ``batches_per_commit``-th batch.
            if state.batch % batches_per_commit != 0:
                continue

            logging.info('rank %s: allgather', hvd.rank())
            gather_state()
            logging.info('rank %s: commit epoch %s batch %s',
                         hvd.rank(), state.epoch, state.batch)
            commit_state()

        # End of epoch: always synchronise and commit, then restart batches.
        logging.info('rank %s: allgather', hvd.rank())
        gather_state()
        logging.info('rank %s: commit epoch %s', hvd.rank(), state.epoch)
        commit_state()
        state.batch = 0

    res = gather_state()
    logging.info('rank %s: returning', hvd.rank())
    return res, hvd.rank()
def forward(ctx, tensor, name):
    """Allgather ``tensor`` and record per-rank first-dimension sizes on ``ctx``.

    Args:
        ctx: context object; receives ``dim`` (this rank's first-dimension
            size) and ``all_dims`` (the gathered sizes, viewed as a vector of
            length ``hvd.size()``).
        tensor: tensor to gather.
        name: name passed to ``hvd.allgather_async``.

    Returns:
        The synchronized result of the asynchronous allgather.
    """
    ctx.dim = tensor.shape[0]
    # we try to put all sync ops in forward pass
    local_dim = torch.tensor([ctx.dim], device=tensor.device)
    ctx.all_dims = hvd.allgather(local_dim).view(hvd.size())
    return hvd.synchronize(hvd.allgather_async(tensor, name))
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes.extend([torch.cuda.ByteTensor, torch.cuda.CharTensor,
                       torch.cuda.ShortTensor, torch.cuda.IntTensor,
                       torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor])

    side = 17  # every dimension of the per-rank tensor has this length
    for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
        # Each rank contributes a tensor filled with its own rank id.
        local = torch.FloatTensor(*([side] * dim)).fill_(1).mul_(rank)
        local = local.type(dtype)

        gathered = hvd.allgather(local)
        assert list(gathered.shape) == [side * size] + [side] * (dim - 1)

        for i in range(size):
            rank_tensor = gathered[i * side:(i + 1) * side]
            assert list(rank_tensor.shape) == [side] * dim, \
                'hvd.allgather produces incorrect gathered shape'
            assert rank_tensor.data.min() == i, 'hvd.allgather produces incorrect gathered tensor'
            assert rank_tensor.data.max() == i, 'hvd.allgather produces incorrect gathered tensor'
def forward(self, data):
    """Gather ``data`` from all processes.

    Arguments:
        data: Tensor to be gathered across all processes.

    Returns:
        The result of ``hvd.allgather`` called with ``name=self.name``.
    """
    gathered = hvd.allgather(data, name=self.name)
    return gathered
def calculate_shuffle_buffer_size(hvd, avg_row_size, train_row_count_per_worker):
    """Compute a shuffle buffer size that is identical on every worker.

    The largest ``local_size`` across all workers is found with an allgather.
    If it exceeds ``TOTAL_BUFFER_MEMORY_CAP_GIB``, the cap (in bytes) is
    split evenly between that many workers; otherwise each worker gets one
    GiB. The byte budget is divided by ``avg_row_size`` and the result is
    capped at ``train_row_count_per_worker``.

    Examples from the original notes, with a memory cap of 4 GiB:
        machines with 8 and 3 workers     -> 0.5 GB per worker
        machines with 2 and 3 workers     -> 1 GB per worker
        machines with 2, 8 and 5 workers  -> 0.5 GB per worker

    Args:
        hvd: object providing ``local_size()`` and ``allgather()``.
        avg_row_size: average row size used to convert bytes to a row count
            (presumably in bytes -- confirm with the caller).
        train_row_count_per_worker: upper bound for the returned value.

    Returns:
        int: the buffer size, truncated to an integer.
    """
    gathered_local_sizes = hvd.allgather(torch.tensor([hvd.local_size()]))
    max_local_size = torch.max(gathered_local_sizes).item()

    if max_local_size <= TOTAL_BUFFER_MEMORY_CAP_GIB:
        shuffle_buffer_size = BYTES_PER_GIB / avg_row_size
    else:
        shuffle_buffer_size = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size

    return int(min(shuffle_buffer_size, train_row_count_per_worker))
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [
        torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
        torch.IntTensor, torch.LongTensor, torch.FloatTensor,
        torch.DoubleTensor,
    ]
    if torch.cuda.is_available():
        dtypes = dtypes + [
            torch.cuda.ByteTensor, torch.cuda.CharTensor,
            torch.cuda.ShortTensor, torch.cuda.IntTensor,
            torch.cuda.LongTensor, torch.cuda.FloatTensor,
            torch.cuda.DoubleTensor,
        ]

    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        trailing = [17] * (dim - 1)

        # Rank-valued tensor with every dimension of length 17.
        tensor = torch.FloatTensor(*([17] + trailing)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)

        assert list(gathered.shape) == [17 * size] + trailing

        for i in range(size):
            start = i * 17
            rank_tensor = gathered[start:start + 17]
            assert list(rank_tensor.shape) == [17] * dim, \
                'hvd.allgather produces incorrect gathered shape'
            assert rank_tensor.data.min() == i, \
                'hvd.allgather produces incorrect gathered tensor'
            assert rank_tensor.data.max() == i, \
                'hvd.allgather produces incorrect gathered tensor'
def all_gather_hvd(data, group=None):
    """Gather arbitrary picklable ``data`` from every rank into a list.

    ``data`` is serialized to a byte tensor, padded to the largest size among
    the ranks, gathered, and each rank's bytes are truncated to their true
    length and unpickled.

    Args:
        data: object to gather.
        group: forwarded to the serialization/padding helpers and to
            ``dist.all_gather`` on the non-Horovod path.

    Returns:
        list: one unpickled object per entry of the gathered size list;
        ``[data]`` when the world size is 1.
    """
    global _USE_HVD
    assert _USE_HVD, f"_USE_HVD: {_USE_HVD}"

    if get_world_size() == 1:
        return [data]

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)

    # receiving Tensor from all ranks
    tensor_list = [
        torch.empty((max_size, ), dtype=torch.uint8, device=tensor.device)
        for _ in size_list
    ]
    if not _USE_HVD:
        dist.all_gather(tensor_list, tensor, group=group)
    else:
        # NOTE: concatenated on the first dimension
        tensor_list = hvd.allgather(tensor[None, ])

    # Drop the padding of each rank's payload before unpickling it.
    return [
        pickle.loads(chunk.cpu().numpy().tobytes()[:size])
        for size, chunk in zip(size_list, tensor_list)
    ]
def all_gather_list(data, max_size=4096):
    """Gathers arbitrary data from all nodes into a list.

    The pickled payload is written into a fixed-size CUDA byte buffer behind
    a two-byte length header (``enc_size // 255`` and ``enc_size % 255``).
    The buffers of all workers are concatenated with ``hvd.allgather`` and
    decoded slot by slot.

    Args:
        data: picklable object contributed by this worker.
        max_size (int): size in bytes of each worker's slot; must be smaller
            than ``255 * 256`` for the two-byte header to be sufficient.

    Returns:
        list: the unpickled object of each worker, in slot order.

    Raises:
        ValueError: if the pickled payload plus header exceeds ``max_size``.
    """
    world_size = hvd.size()
    # Reuse the cached buffer when it already has the requested length.
    # ``Tensor.size()`` returns a ``torch.Size``, which never compares equal
    # to an int, so comparing against it reallocated the buffer on every
    # call; ``numel()`` gives the element count as an int.
    if not hasattr(all_gather_list, '_in_buffer') or \
            max_size != all_gather_list._in_buffer.numel():
        all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size)
    in_buffer = all_gather_list._in_buffer

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size + 2))
    assert max_size < 255 * 256
    in_buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    in_buffer[1] = enc_size % 255
    in_buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))

    # FIXME cannot create buffer
    out = hvd.allgather(in_buffer.cuda())

    results = []
    for i in range(0, max_size * world_size, max_size):
        out_buffer = out[i:i + max_size]
        # Bytes past the encoded length are stale and must be ignored.
        size = (255 * out_buffer[0].item()) + out_buffer[1].item()
        bytes_list = bytes(out_buffer[2:size + 2].tolist())
        # NOTE: pickle.loads executes arbitrary code; the payload is assumed
        # to come only from trusted peer workers.
        result = pickle.loads(bytes_list)
        results.append(result)
    return results
def async_send(self, tensors_compressed, name):
    """
    Start an asynchronous allgather for every compressed tensor.

    :param tensors_compressed: list of flat tensors to communicate
    :param name: for the all_gather operation (not used by this body)
    :return: handles to synchronize, tensor sizes per rank
    """
    # list of tensor size for this rank
    local_sizes = [t.numel() for t in tensors_compressed]

    if not self.compressor.tensors_size_are_same:
        # TODO: set device
        local_sizes = torch.tensor(local_sizes)
        # tensor of tensor sizes per rank
        gathered = allgather(local_sizes)
        # transpose, to list
        tensor_sizes = gathered.view([self.world_size, -1]).t().tolist()
    else:
        # Every rank has the same sizes, so replicate locally and transpose.
        sizes_per_rank = [local_sizes] * self.world_size
        tensor_sizes = zip(*sizes_per_rank)

    handles = [allgather_async(compressed) for compressed in tensors_compressed]
    return handles, tensor_sizes
def all_gather_list(data):
    """Gathers arbitrary data from all nodes into a list.

    The pickled payload lengths are allgathered first to find the maximum,
    which is passed to ``_encode``. The encoded buffers are then allgathered
    and decoded one worker at a time with ``_decode``.

    Returns:
        list: one unpickled object per worker (``hvd.size()`` entries).
    """
    payload = pickle.dumps(data)
    payload_len = len(payload)

    all_lengths = hvd.allgather(torch.tensor([payload_len]).cuda())
    max_size = all_lengths.max().item()

    in_buffer, enc_byte = _encode(payload, max_size)
    remaining = hvd.allgather(in_buffer[:enc_byte + payload_len])

    results = []
    for _ in range(hvd.size()):
        raw_bytes, shift = _decode(remaining, enc_byte)
        # Advance past the record that was just decoded.
        remaining = remaining[shift:]
        results.append(pickle.loads(raw_bytes))
    return results
def test_gradient(tensors, models, device):
    """Compare gradients of the distributed model with a reference model.

    The distributed model runs on this rank's ``k``/``q`` shards; the
    reference model runs on the allgathered tensors with an all-zero mask.
    Input gradients (allgathered) and parameter gradients (sum-allreduced)
    of the distributed model must match the reference within ``1e-5``.
    """
    k, q, mask = (t.to(device) for t in tensors)
    k = k.requires_grad_(True)
    q = q.requires_grad_(True)

    model, gt_model = (m.to(device) for m in models)

    # Backward pass through the distributed model.
    model(k, q, k, mask).sum().backward()

    # Build the full-sequence inputs for the reference model.
    k_gt = hvd.allgather(k.detach())
    q_gt = hvd.allgather(q.detach())
    k_gt = k_gt.view(1, -1, k_gt.size(-1)).requires_grad_(True)
    q_gt = q_gt.view(1, -1, q_gt.size(-1)).requires_grad_(True)

    gt_mask = torch.zeros(1, k_gt.size(1), q_gt.size(1), device=k_gt.device)
    gt_model(k_gt, q_gt, k_gt, gt_mask.bool()).sum().backward()

    k_grad = hvd.allgather(k.grad)
    k_grad = k_grad.view(1, -1, k_grad.size(-1))
    q_grad = hvd.allgather(q.grad)
    q_grad = q_grad.view(1, -1, q_grad.size(-1))

    assert torch.allclose(k_grad, k_gt.grad, atol=1e-5)
    assert torch.allclose(q_grad, q_gt.grad, atol=1e-5)

    param_pairs = zip(gt_model.named_parameters(), model.named_parameters())
    for (gt_name, gt_param), (name, param) in param_pairs:
        assert gt_name == name
        summed_grad = hvd.allreduce(param.grad, op=hvd.Sum)
        assert torch.allclose(gt_param.grad, summed_grad, atol=1e-5)
def get_video_level_scores(self, modularized_query, context_feat1,
                           context_mask, val_gather_gpus=True):
    """
    Score every query against every video in the batch.

    Both inputs are L2-normalized along the last dimension. When gathering
    is enabled (``self.gather_gpus`` while training, ``val_gather_gpus``
    otherwise), videos are zero-padded to the longest length among ranks and
    queries, features and masks are gathered from all ranks first.

    Args:
        modularized_query: (N, D)
        context_feat1: (N, L, D), output of the first transformer encoder layer
        context_mask: (N, L)
        val_gather_gpus: whether to gather across ranks when not training
    Returns:
        context_query_scores: (N, N) score of each query w.r.t. each video
            inside the batch, diagonal positions are positive.
            used to get negative samples.
    """
    modularized_query = F.normalize(modularized_query, dim=-1, eps=1e-5)
    context_feat1 = F.normalize(context_feat1, dim=-1, eps=1e-5)

    # gather all ranks to increase negative examples
    gather_across_ranks = (self.training and self.gather_gpus) or \
        (not self.training and val_gather_gpus)
    if gather_across_ranks:
        # need to pad video to same length
        bs, vlen, hid = context_feat1.size()
        device = context_feat1.device
        local_vlen = torch.tensor([vlen], device=device)
        all_vlens = hvd.allgather(local_vlen).view(hvd.size())
        max_vlen = all_vlens.max().item()
        pad_len = max_vlen - all_vlens[hvd.rank()]
        if pad_len != 0:
            pad = torch.zeros(bs, pad_len, hid,
                              dtype=context_feat1.dtype, device=device)
            context_feat1 = torch.cat([context_feat1, pad], dim=1)
            # Padded positions get a zero mask entry.
            mask_pad = pad[..., 0].long()
            context_mask = torch.cat([context_mask, mask_pad], dim=1)

        # our backprop compatible allgather
        modularized_query = vsm_allgather(modularized_query).contiguous()
        context_feat1 = vsm_allgather(context_feat1).contiguous()
        context_mask = vsm_allgather(context_mask).contiguous()

    # (N, L, N)
    query_context_scores = torch.einsum(
        "md,nld->mln", modularized_query, context_feat1)

    # (1, L, N), cast for fp16 compatibility
    context_mask = context_mask.transpose(0, 1).unsqueeze(0)
    context_mask = context_mask.to(dtype=query_context_scores.dtype)

    # (N, L, N)
    query_context_scores = mask_logits(query_context_scores, context_mask)

    # (N, N) diagonal positions are positive pairs.
    query_context_scores, _ = torch.max(query_context_scores, dim=1)
    return query_context_scores
def distributed_matmul_nt(left: Tensor, right: Tensor, offset=32) -> Tensor:
    r"""
    Multiply two sequence tensors to obtain the result of :math:`AB^T`.

    Left and right inputs can be N-dimensional tensors of size
    :math:`* \times \frac{T}{N} \times D`, where :math:`T` is the total
    length, :math:`N` is the total number of processes available and
    :math:`D`, the dimension of the sequence. The result of this function is
    a tensor of size :math:`* \times \frac{T}{N} \times T`, that contains the
    result chunk for each process of the resulting operation.

    The rows of ``right`` are exchanged ``offset`` at a time: each step
    allgathers one slice from every process and multiplies ``left`` with the
    transposed slices.

    Inputs
    ------
    left: Tensor
        :math:`A` in :math:`AB^T`, must be of size
        :math:`* \times \frac{T}{N} \times D`.
    right: Tensor
        :math:`B` in :math:`AB^T`, must be of size
        :math:`* \times \frac{T}{N} \times D`.
    offset: int
        Number of chunks to communicate during each distributed step, it
        must be a factor of :math:`\frac{T}{N}`. This factor should be
        modified in order to reduce the total computing time at the expense
        of the memory used.

    Returns
    -------
    result: Tensor
        For each process, this function computes the corresponding segment
        of the operation :math:`AB^T`, of size
        :math:`* \times \frac{T}{N} \times T`.
    """
    synchronize()
    rows = left.size(-2)
    world_size = get_world_size()
    total_rows = rows * world_size

    prefix_size = tuple(left.size())[:-2]
    # (world_size, ...dims, T/N, T/N)
    size = (world_size,) + prefix_size + (left.size(-2), right.size(-2))
    result = torch.empty(size, device=left.device)
    final_size = prefix_size + (left.size(-2), total_rows)

    for row in range(0, rows, offset):
        end_bound = row + offset
        current_row = right[..., row:end_bound, :].contiguous()
        # [r0[row:end_bound], r1[row:end_bound], ..., rworld[row:end_bound]]
        # all_rows: world_size x ... x offset x dim
        current_row = current_row.unsqueeze(0)
        all_rows = hvd.allgather(current_row, name=f'scatter_rows_{row}')
        partial_results = left.matmul(all_rows.transpose(-1, -2))
        result[..., row:end_bound] = partial_results

    # Move the per-process axis next to the column axis and merge the two,
    # giving T columns in total.
    result = result.unsqueeze(-2).transpose(0, -2).reshape(*final_size)
    return result
def fn(a, b, c, d):
    """Allgather ``[rank, a + b + c + d]`` and return a rank-dependent value.

    Returns:
        The gathered values as a list on rank 0, the string
        ``"ret_val_of_rank_1"`` on rank 1, and ``None`` on any other rank.
    """
    hvd.init()
    rank = hvd.rank()
    total = a + b + c + d
    res = hvd.allgather(torch.tensor([rank, total])).tolist()

    if rank == 0:
        return res
    if rank == 1:
        return "ret_val_of_rank_1"
    return None
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # The second dimension depends on the rank, so shapes disagree.
    tensor_size = [17, 10 * (rank + 1), 17]
    tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank)

    try:
        hvd.allgather(tensor)
    except torch.FatalError:
        pass
    else:
        assert False, 'hvd.allgather did not throw error'
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()

    # This test does not apply if there is only one worker.
    if hvd.size() == 1:
        return

    # Make the middle dimension rank-dependent so the shapes are incompatible.
    shape = [17] * 3
    shape[1] = 10 * (rank + 1)
    tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)

    try:
        hvd.allgather(tensor)
    except (torch.FatalError, RuntimeError):
        pass
    else:
        assert False, 'hvd.allgather did not throw error'
def any_broadcast(data, root_rank):
    """broadcast arbitrary data from root_rank to all nodes.

    Every worker pickles its own ``data`` so that the largest payload length
    can be agreed on with an allgather. The buffer produced by ``_encode`` is
    then overwritten in place by the broadcast from ``root_rank`` and decoded.

    Returns:
        The object unpickled from the broadcast buffer.
    """
    payload = pickle.dumps(data)
    payload_len = torch.tensor([len(payload)]).cuda()
    max_size = hvd.allgather(payload_len).max().item()

    buffer_, enc_byte = _encode(payload, max_size, use_max_size=True)
    hvd.broadcast_(buffer_, root_rank)

    raw_bytes, _ = _decode(buffer_, enc_byte)
    return pickle.loads(raw_bytes)
def test_distributed_nt(tensor_fixture):
    """Check that the distributed result, once gathered, equals the reference.

    ``tensor_fixture`` supplies the reference inputs, the distributed inputs
    and the pair ``(reference_fn, distributed_fn)``.
    """
    gt_tensors, test_tensors, (gt_fn, test_fn) = tensor_fixture

    expected = gt_fn(*gt_tensors)
    local_result = test_fn(*test_tensors)
    gathered = hvd.allgather(local_result)

    if gathered.dim() != 3:
        # Bring the gathered axis behind the second axis, then merge it away.
        second, last = gathered.size(1), gathered.size(-1)
        gathered = gathered.transpose(0, 1).reshape(1, second, -1, last)
    else:
        # Fold the gathered axis into the row axis.
        collapsed = gathered.size(0) * gathered.size(1)
        gathered = gathered.view(1, collapsed, -1)

    assert (expected == gathered).all()
def distributed_matmul_all(left: Tensor, right: Tensor, offset=32) -> Tensor:
    r"""
    Multiply two sequence tensors to obtain the result of :math:`AB`.

    Left and right inputs can be N-dimensional tensors, where the first one
    must be of size :math:`* \times \frac{T}{N} \times T` and the second one
    of size :math:`* \times \frac{T}{N} \times D`, where :math:`T` is the
    total length, :math:`N` is the total number of processes available and
    :math:`D`, the dimension of the sequence. The result of this function is
    a tensor of size :math:`* \times \frac{T}{N} \times D`, that contains the
    result chunk for each process of the resulting operation.

    ``left`` is split along its last dimension into one block per process.
    The columns of ``right`` are allgathered ``offset`` at a time, each block
    is multiplied with the slice coming from the matching process, and the
    per-process partial products are summed.

    Inputs
    ------
    left: Tensor
        :math:`A` in :math:`AB`, must be of size
        :math:`* \times \frac{T}{N} \times T`
    right: Tensor
        :math:`B` in :math:`AB`, must be of size
        :math:`* \times \frac{T}{N} \times D`
    offset: int
        Number of columns of ``right`` communicated in each distributed step.

    Returns
    -------
    result: Tensor
        For each process, this function computes the corresponding segment
        of the operation :math:`AB`, of size
        :math:`* \times \frac{T}{N} \times D`
    """
    dims = left.dim()
    cols = left.size(dims - 1)
    world_size = get_world_size()
    total_cols = right.size(-1)

    # One block of columns of ``left`` per process, stacked on a new axis 0.
    split_size = cols // world_size
    splits = torch.stack(left.split(split_size, -1), dim=0)

    left_sizes = tuple(left.size())
    size = (world_size,) + left_sizes[:-2] + (left.size(-2), total_cols)
    rank_block = torch.empty(*size, device=left.device)

    synchronize()
    for current_col in range(0, total_cols, offset):
        end_bound = current_col + offset
        col = right[..., current_col:end_bound]
        col = col.contiguous()
        all_cols = hvd.allgather(col.unsqueeze(0),
                                 name=f'matmul_all_{current_col}')
        # all_cols: torch.size([world_size, right.size(1), offset])
        block_result = torch.matmul(splits, all_cols)
        rank_block[..., current_col:end_bound] = block_result

    # Sum the per-process partial products.
    result = rank_block.sum(dim=0)
    return result
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor_size = [17] * 3
    # Even ranks send integers, odd ranks send floats.
    make_tensor = torch.IntTensor if rank % 2 == 0 else torch.FloatTensor
    tensor = make_tensor(*tensor_size)

    try:
        hvd.allgather(tensor)
    except torch.FatalError:
        pass
    else:
        assert False, 'hvd.allgather did not throw error'
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()

    # This test does not apply if there is only one worker.
    if hvd.size() == 1:
        return

    shape = [17] * 3
    # Alternate the dtype between neighbouring ranks.
    if rank % 2 != 0:
        tensor = torch.FloatTensor(*shape)
    else:
        tensor = torch.IntTensor(*shape)

    try:
        hvd.allgather(tensor)
    except (torch.FatalError, RuntimeError):
        pass
    else:
        assert False, 'hvd.allgather did not throw error'
def all_gather(
    self, result: torch.Tensor, group: Optional[Any] = dist_group.WORLD, sync_grads: bool = False
) -> torch.Tensor:
    """Gather ``result`` from all workers with ``hvd.allgather``.

    Args:
        result: tensor to gather; a zero-dimensional tensor is reshaped to
            one element first.
        group: must be ``None`` or ``dist_group.WORLD``.
        sync_grads: accepted but not used by this body.

    Returns:
        The tensor returned by ``hvd.allgather``.

    Raises:
        ValueError: if ``group`` names anything other than the world group.
    """
    uses_subgroup = group is not None and group != dist_group.WORLD
    if uses_subgroup:
        raise ValueError("Horovod does not support allgather using a subcommunicator at this time. Unset `group`.")

    if not result.shape:
        # Convert scalars to single dimension tensors
        result = result.reshape(1)

    # sync and gather all
    self.join()
    gathered = hvd.allgather(result)
    return gathered
def single_point(self, pos=None, prt=True, ntherm=-1, ndecor=100):
    """Performs a single point calculation

    The wave function parameters are broadcast from rank 0, walkers are
    sampled, local energies are evaluated and gathered from all workers, and
    their mean and variance are returned.

    NOTE: in this body ``pos``, ``ntherm`` and ``ndecor`` are not read; the
    positions are always re-sampled with ``self.resample.ntherm``.

    Keyword Arguments:
        pos {torch.tensor} -- positions of the walkers (overwritten by a
                              fresh sample) (default: {None})
        prt {bool} -- print energy/variance values (default: {True})
        ntherm {int} -- number of MC steps to thermalize (default: {-1})
        ndecor {int} -- number of MC step to decorelate (default: {100})

    Returns:
        tuple -- (position, energy, variance)
    """
    self.wf.eval()
    num_threads = 1
    hvd.broadcast_parameters(self.wf.state_dict(), root_rank=0)
    torch.set_num_threads(num_threads)

    # sample the wave function
    pos = self.sample(ntherm=self.resample.ntherm)
    pos.requires_grad = False

    # create the data loader, using all sampled positions as one batch
    self.dataset = DataSet(pos)
    kwargs = {'num_workers': num_threads}
    if self.cuda:
        kwargs['pin_memory'] = True
    self.dataloader = DataLoader(self.dataset, batch_size=len(pos), **kwargs)

    for data in self.dataloader:
        lpos = data.to(self.device)
        lpos.requires_grad = True
        eloc = self.wf.local_energy(lpos)

    # Combine the local energies of every worker before taking statistics.
    eloc_all = hvd.allgather(eloc, name='local_energies')
    e = torch.mean(eloc_all)
    s = torch.var(eloc_all)

    if prt:
        printd(hvd.rank(), 'Energy : ', e)
        printd(hvd.rank(), 'Variance : ', s)

    return pos, e, s
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [
        torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
        torch.IntTensor, torch.LongTensor, torch.FloatTensor,
        torch.DoubleTensor,
    ]
    if _fp16_supported:
        dtypes.append(torch.HalfTensor)
    if torch.cuda.is_available():
        dtypes.extend([
            torch.cuda.ByteTensor, torch.cuda.CharTensor,
            torch.cuda.ShortTensor, torch.cuda.IntTensor,
            torch.cuda.LongTensor, torch.cuda.FloatTensor,
            torch.cuda.DoubleTensor,
        ])
        if _fp16_supported:
            dtypes.append(torch.cuda.HalfTensor)

    for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        # First-dimension length contributed by each rank.
        tensor_sizes = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
        trailing = [17] * (dim - 1)

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + trailing)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)
        tensor, gathered = self.convert_cpu_fp16_to_fp32(tensor, gathered)

        assert list(gathered.shape) == [sum(tensor_sizes)] + trailing

        for i in range(size):
            start = sum(tensor_sizes[:i])
            stop = sum(tensor_sizes[:i + 1])
            rank_tensor = gathered[start:stop]
            assert list(rank_tensor.shape) == [tensor_sizes[i]] + trailing
            assert rank_tensor.data.min() == i
            assert rank_tensor.data.max() == i
def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None):
    """Gather ``result`` from all workers and return it as a list of tensors.

    Args:
        result: tensor to gather; a zero-dimensional tensor is reshaped to
            one element first.
        group: must be ``None``.

    Returns:
        list: the gathered tensor split into chunks of length 1 along
        dimension 0.

    Raises:
        ValueError: if ``group`` is given.
    """
    if group is not None:
        raise ValueError(
            "Horovod does not support allgather using a subcommunicator at this time. "
            "Unset `group`."
        )

    if not result.shape:
        # Convert scalars to single dimension tensors
        result = result.reshape(1)

    # sync and gather all
    hvd.join()
    pieces = hvd.allgather(result).split(1, dim=0)
    return list(pieces)
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient.

    Every rank gathers a rank-valued tensor and backpropagates an upstream
    gradient whose chunk for rank ``r`` is filled with ``r``. The gradient
    reaching the local tensor is compared against ``rank * size``.
    """
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # NOTE(review): recent PyTorch releases only let floating point tensors
    # require gradients -- confirm this dtype list against the PyTorch
    # version the suite targets.
    dtypes = [
        torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
        torch.IntTensor, torch.LongTensor, torch.FloatTensor,
        torch.DoubleTensor
    ]
    if torch.cuda.is_available():
        dtypes += [
            torch.cuda.ByteTensor, torch.cuda.CharTensor,
            torch.cuda.ShortTensor, torch.cuda.IntTensor,
            torch.cuda.LongTensor, torch.cuda.FloatTensor,
            torch.cuda.DoubleTensor
        ]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(*([tensor_sizes[rank]] + [17] *
                                     (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)

        # The loop variable must not be named ``size``: that would overwrite
        # the world size used below and on the next iteration.
        grad_list = [
            torch.ones([rank_rows] + [17] * (dim - 1)) * r
            for r, rank_rows in enumerate(tensor_sizes)
        ]
        grad_ys = torch.cat(grad_list, dim=0)

        gathered = hvd.allgather(tensor)
        gathered.backward(grad_ys)
        grad_out = tensor.grad.data.numpy()

        expected = np.ones([tensor_sizes[rank]] + [17] *
                           (dim - 1)) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(
            err, 0.00000001, "gradient %s differs from expected %s, "
            "error: %s" % (grad_out, expected, str(err)))
def all_gather(data, max_size=65000):
    """
    Gathers arbitrary data from all nodes into a list.

    This function is heavily borrowed from fairseq (https://github.com/pytorch/fairseq)

    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable.

    Each worker writes a two-byte length header followed by its pickled
    payload into a ``max_size``-byte CUDA buffer. The buffers are
    concatenated with ``hvd.allgather`` and decoded slot by slot. Slots whose
    decoded length is 0 are skipped.

    Args:
        data (Any): data from the local worker to be gathered on other workers
        max_size (int, optional): maximum size of the data to be gathered
            across workers

    Returns:
        list: the unpickled payload of every worker that sent a non-empty
        record, in rank order.

    Raises:
        ValueError: if the pickled payload plus the two header bytes exceeds
            ``max_size``.
    """
    world_size = hvd.size()
    buffer_size = max_size
    if not hasattr(all_gather, '_buffer') or \
            all_gather._buffer.numel() < buffer_size:
        all_gather._buffer = torch.cuda.ByteTensor(buffer_size)
    # The cached buffer can be longer than this call needs (left over from a
    # call with a larger max_size). Gather exactly ``max_size`` bytes per
    # worker so that the per-rank slot offsets below stay aligned.
    buffer = all_gather._buffer[:buffer_size]
    buffer.zero_()

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size + 2))

    buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    buffer[1] = enc_size % 255
    buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))

    buffer_gathered = hvd.allgather(buffer)

    result = []
    for i in range(world_size):
        out_buffer = buffer_gathered[i * max_size:(i + 1) * max_size]
        size = (255 * item(out_buffer[0])) + item(out_buffer[1])
        if size > 0:
            # NOTE: pickle.loads executes arbitrary code; the payload is
            # assumed to come only from trusted peer workers.
            result.append(pickle.loads(bytes(out_buffer[2:size + 2].tolist())))
    return result
def calculate_shuffle_buffer_size():
    """Compute a shuffle buffer size that is identical on every worker.

    A user-supplied ``user_shuffle_buffer_size`` (including 0) is returned
    as-is. Otherwise the largest ``local_size`` across all workers is found
    with an allgather: if it exceeds ``TOTAL_BUFFER_MEMORY_CAP_GIB`` the cap
    (in bytes) is split evenly between that many workers, else each worker
    gets one GiB. The byte budget is divided by ``avg_row_size`` and capped
    at ``train_rows / hvd.size()``.

    Examples from the original notes, with a memory cap of 4 GiB:
        machines with 8 and 3 workers     -> 0.5 GB per worker
        machines with 2 and 3 workers     -> 1 GB per worker
        machines with 2, 8 and 5 workers  -> 0.5 GB per worker

    Returns:
        The user-specified value, or the computed size truncated to an int.

    Raises:
        ValueError: if ``user_shuffle_buffer_size`` is negative.
    """
    import horovod.torch as hvd

    # If user specifies any user_shuffle_buffer_size (even 0), we should honor it.
    if user_shuffle_buffer_size is not None:
        if user_shuffle_buffer_size < 0:
            raise ValueError(
                "user_shuffle_buffer_size cannot be negative!")
        return user_shuffle_buffer_size

    gathered_local_sizes = hvd.allgather(torch.tensor([hvd.local_size()]))
    max_local_size = torch.max(gathered_local_sizes).item()

    if max_local_size <= TOTAL_BUFFER_MEMORY_CAP_GIB:
        shuffle_buffer_size = BYTES_PER_GIB / avg_row_size
    else:
        shuffle_buffer_size = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size

    rows_per_worker = train_rows / hvd.size()
    return int(min(shuffle_buffer_size, rows_per_worker))
def evaluate(model, eval_loader):
    """Run retrieval inference, gather scores from all ranks and evaluate.

    Args:
        model: model passed to ``inference``.
        eval_loader: data loader whose ``dataset`` provides ``ids``,
            ``all_img_ids``, ``txt2img`` and ``img2txts``.

    Returns:
        dict: the result of ``itm_eval`` on rank 0, an empty dict elsewhere.
    """
    st = time()
    LOGGER.info("start running Image/Text Retrieval evaluation ...")

    score_matrix = inference(model, eval_loader)
    dset = eval_loader.dataset

    all_score = hvd.allgather(score_matrix)

    # Flatten the per-rank id lists into a single list.
    all_txt_ids = []
    for rank_ids in all_gather_list(dset.ids):
        all_txt_ids.extend(rank_ids)
    all_img_ids = dset.all_img_ids

    assert all_score.size() == (len(all_txt_ids), len(all_img_ids))

    # NOTE: only use rank0 to compute final scores
    if hvd.rank() != 0:
        return {}

    eval_log = itm_eval(all_score, all_txt_ids, all_img_ids,
                        dset.txt2img, dset.img2txts)

    tot_time = time() - st
    LOGGER.info(f"evaluation finished in {int(tot_time)} seconds")
    return eval_log
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient.

    Every rank gathers a rank-valued tensor and backpropagates an upstream
    gradient whose chunk for rank ``r`` is filled with ``r``. The gradient
    reaching the local tensor is compared against ``rank * size``.
    """
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # Only Tensors of floating point dtype can require gradients
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        if _fp16_supported:
            dtypes += [torch.cuda.HalfTensor]

    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = self.cast_and_place(tensor, dtype)
        tensor.requires_grad_()

        # The loop variable must not be named ``size``: that would overwrite
        # the world size used below and on the next iteration.
        grad_list = []
        for r, rank_rows in enumerate(tensor_sizes):
            grad_list.append(self.cast_and_place(
                torch.ones([rank_rows] + [17] * (dim - 1)), dtype) * r)
        grad_ys = torch.cat(grad_list, dim=0)

        gathered = hvd.allgather(tensor)
        gathered.backward(grad_ys)
        grad_out = tensor.grad.data.cpu().numpy()

        expected = np.ones(
            [tensor_sizes[rank]] + [17] * (dim - 1)
        ) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def measure_fid():
    """Compute FID scores for the largest few resolutions of the generator.

    Each worker generates images with ``g_ema``, extracts inception features
    per resolution, and the features are allgathered. Rank 0 computes the FID
    against the statistics loaded from the paths in ``args.inception_path``
    (comma-separated); other ranks record a placeholder of ``1e9``.

    Returns:
        float: the FID of the largest resolution, broadcast from rank 0.
    """
    # get all the resolutions to evaluate fid
    # WARNING: always assume that we measure the largest few resolutions
    # support evaluating multiple resolutions
    n_res_to_eval = len(args.inception_path.split(','))
    inception_path_list = args.inception_path.split(',')

    samples_per_step = args.fid_batch_size * hvd.size()
    n_batch = math.ceil(args.fid_n_sample / samples_per_step)
    features_list = [None] * n_res_to_eval

    # just make sure they use different seed
    torch.manual_seed(int(time.time()) + hvd.rank() * 999)

    # collect features
    with torch.no_grad():
        for _ in tqdm(range(n_batch), desc='FID', disable=hvd.rank() != 0):
            z = torch.randn(args.fid_batch_size, 1, 512, device=device)
            out, all_rgbs = g_ema(z, return_rgbs=True)
            for i_res in range(n_res_to_eval):
                # Walk the RGB outputs from the last (largest) one backwards.
                img = all_rgbs[-i_res - 1]
                feat = inception(img.clamp(-1., 1.))[0].view(img.shape[0], -1).to('cpu')
                previous = features_list[i_res]
                if previous is None:
                    features_list[i_res] = feat
                else:
                    features_list[i_res] = torch.cat((previous, feat), dim=0)

    # compute the FID
    fid_dict = dict()
    for i_res, features in enumerate(features_list):
        features = hvd.allgather(features, name='fid_features{}'.format(i_res)).numpy()
        features = features[:args.fid_n_sample]

        if hvd.rank() != 0:
            fid_dict[int(args.resolution / 2 ** i_res)] = 1e9
            continue

        # only compute on node 1, save some CPU
        sample_mean = np.mean(features, 0)
        sample_cov = np.cov(features, rowvar=False)
        with open(inception_path_list[i_res], 'rb') as f:
            embeds = pickle.load(f)
        real_mean = embeds['mean']
        real_cov = embeds['cov']
        fid = calc_fid(sample_mean, sample_cov, real_mean, real_cov)
        fid_dict[int(args.resolution / 2 ** i_res)] = fid

    if hvd.rank() == 0:
        print('fid:', {k: round(v, 3) for k, v in fid_dict.items()})

    largest_res_fid = torch.tensor(fid_dict[args.resolution]).float()
    fid0 = hvd.broadcast(largest_res_fid, root_rank=0, name='fid').item()
    return fid0  # only return the fid of the largest resolution
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient.

    Every rank gathers a rank-valued tensor and backpropagates an upstream
    gradient whose chunk for rank ``r`` is filled with ``r``. The gradient
    reaching the local tensor is compared against ``rank * size``.
    """
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # NOTE(review): recent PyTorch releases only let floating point tensors
    # require gradients -- confirm this dtype list against the PyTorch
    # version the suite targets.
    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)

        # The loop variable must not be named ``size``: that would overwrite
        # the world size used below and on the next iteration.
        grad_list = []
        for r, rank_rows in enumerate(tensor_sizes):
            grad_list.append(torch.ones([rank_rows] + [17] * (dim - 1)) * r)
        grad_ys = torch.cat(grad_list, dim=0)

        gathered = hvd.allgather(tensor)
        gathered.backward(grad_ys)
        grad_out = tensor.grad.data.numpy()

        expected = np.ones(
            [tensor_sizes[rank]] + [17] * (dim - 1)
        ) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def _dequeue_and_enqueue(self, keys):
    """Replace the queue columns at the current pointer with gathered keys.

    ``keys`` is gathered from all workers, written (transposed) into
    ``self.queue`` starting at column ``self.queue_ptr``, and the pointer is
    advanced by the gathered batch size modulo ``self.K``.
    """
    # gather keys before updating queue
    gather = hvd.allgather if self.use_horovod else concat_all_gather
    keys = gather(keys)

    batch_size = keys.shape[0]
    ptr = int(self.queue_ptr.item())
    assert self.K % batch_size == 0  # for simplicity

    # replace the keys at ptr (dequeue and enqueue)
    head = self.queue[:, :ptr]
    tail = self.queue[:, ptr + batch_size:]
    self.queue = torch.cat([head, keys.T, tail], dim=1).detach()

    # move pointer
    self.queue_ptr[0] = (ptr + batch_size) % self.K
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes.extend([torch.cuda.ByteTensor, torch.cuda.CharTensor,
                       torch.cuda.ShortTensor, torch.cuda.IntTensor,
                       torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor])

    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        # First-dimension length contributed by each rank.
        tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
        tensor_sizes = tensor_sizes[:size]
        tail_shape = [17] * (dim - 1)

        local_shape = [tensor_sizes[rank]] + tail_shape
        tensor = torch.FloatTensor(*local_shape).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)

        expected_size = sum(tensor_sizes)
        assert list(gathered.shape) == [expected_size] + tail_shape

        for i in range(size):
            lower = sum(tensor_sizes[:i])
            upper = sum(tensor_sizes[:i + 1])
            rank_tensor = gathered[lower:upper]
            assert list(rank_tensor.shape) == [tensor_sizes[i]] + tail_shape
            assert rank_tensor.data.min() == i
            assert rank_tensor.data.max() == i
def _pad_to_largest_tensor(tensor, group):
    """
    Pad ``tensor`` with zero bytes up to the largest size among all ranks.

    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    global _USE_HVD
    world_size = get_world_size() if _USE_HVD else dist.get_world_size(group=group)
    assert world_size >= 1, "comm.gather/all_gather must be called from ranks within the given group!"

    device = tensor.device
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=device)
    size_list = [
        torch.zeros([1], dtype=torch.int64, device=device)
        for _ in range(world_size)
    ]
    if not _USE_HVD:
        dist.all_gather(size_list, local_size, group=group)
    else:
        # a tensor with (world_size,) actually
        size_list = hvd.allgather(local_size)

    size_list = [int(entry.item()) for entry in size_list]
    max_size = max(size_list)

    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        missing = max_size - local_size
        padding = torch.zeros((missing, ), dtype=torch.uint8, device=device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor