def forward(self, data):
    """
    Arguments:
        data: Tensor to be reduced across all processes.
    """
    return hvd.allreduce_async(data, name=self.name, op=self.reduction)
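# The forward() above returns a Horovod handle rather than a reduced tensor,
# so the caller has to wait on it explicitly. A minimal usage sketch, assuming
# a hypothetical AllreduceOp module whose constructor sets the self.name and
# self.reduction attributes used above (the class name and constructor
# arguments are illustrative, not taken from the original source):
import torch
import torch.nn as nn
import horovod.torch as hvd


class AllreduceOp(nn.Module):
    def __init__(self, name, reduction=hvd.Average):
        super().__init__()
        self.name = name
        self.reduction = reduction

    def forward(self, data):
        return hvd.allreduce_async(data, name=self.name, op=self.reduction)


hvd.init()
op = AllreduceOp(name='grad_buffer')
handle = op(torch.ones(4))         # enqueues the allreduce, returns a handle
reduced = hvd.synchronize(handle)  # blocks until the reduction completes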
def test_horovod_allreduce_async_fused(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors
    with Tensor Fusion."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if _fp16_supported:
        dtypes += [torch.HalfTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        if _fp16_supported:
            dtypes += [torch.cuda.HalfTensor]
    dims = [1, 2, 3]
    tests = []
    is_hvd_poll_false_once = False
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        handle = hvd.allreduce_async(tensor, average=False)
        if not hvd.poll(handle):
            is_hvd_poll_false_once = True
        tensor, = self.convert_cpu_fp16_to_fp32(tensor)
        multiplied = tensor * size
        tests.append((dtype, multiplied, handle))

    # Make sure it's an asynchronous operation.
    assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?'

    for dtype, multiplied, handle in tests:
        summed = hvd.synchronize(handle)
        summed, = self.convert_cpu_fp16_to_fp32(summed)
        max_difference = summed.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def test_horovod_allreduce_duplicate_name_error(self):
    """Test that the allreduce raises an error if there are
    two concurrent operations with the same name."""
    hvd.init()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dims = [17] * 3
    tensor = torch.FloatTensor(*dims)
    hvd.allreduce_async(tensor, name='duplicate_name')
    try:
        for i in range(10):
            hvd.allreduce_async(tensor, name='duplicate_name')
        assert False, 'hvd.allreduce_async did not throw error'
    except (torch.FatalError, ValueError):
        pass
def test_parallel(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    # Only do this test if there are GPUs available.
    if not hvd.mpi_enabled() or not torch.cuda.is_available():
        self.skipTest("No GPUs available")

    device = torch.device('cuda:{}'.format(hvd.local_rank()))
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
        all_Ns = [size * 20 - 13, size * 2 + 1, size + 2, 2**19]
        tensors = []
        all_qs = []
        for N in all_Ns:
            a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
            r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
            q = np.dot(a, r.T)
            q = q.astype(data_type)
            all_qs.append(q.astype(np.float64))
            tensors.append(q[:, hvd.rank()])

        tensors = list(map(lambda x: torch.from_numpy(x).to(device), tensors))

        handles = [
            hvd.allreduce_async(tensor, op=hvd.Adasum)
            for tensor in tensors
        ]

        reduced_tensors = [synchronize(h) for h in handles]

        expected = [np.sum(q, axis=1) / size for q in all_qs]
        all_comp = [
            self.are_close(data_type, e, rt.cpu().numpy())
            for e, rt in zip(expected, reduced_tensors)
        ]
        if np.alltrue(all_comp):
            print('Parallel test passed')
        else:
            for c, e, rt in zip(all_comp, expected, reduced_tensors):
                if c == False:
                    print('computed: ', rt)
                    print('expected: ', e)
                    print('off by: ', self.diff_ratio(e, rt.cpu().numpy()))
        assert np.alltrue(all_comp)
def distributed_matmul_tn(left: Tensor, right: Tensor) -> Tensor:
    r"""
    Multiply two sequence tensors to obtain the result of :math:`A^{T} B`.

    Left and right inputs can be N-dimensional tensors, where the first one
    must be of size :math:`* \times \frac{T}{N} \times T` and the second one
    of size :math:`* \times \frac{T}{N} \times D`, where :math:`T` is the
    total length, :math:`N` is the total number of processes available and
    :math:`D` is the dimension of the sequence. The result of this function
    is a tensor of size :math:`* \times \frac{T}{N} \times D` that contains
    the result chunk of the operation for each process.

    Inputs
    ------
    left: Tensor
        :math:`A` in :math:`A^T B`, must be of size
        :math:`* \times \frac{T}{N} \times T`
    right: Tensor
        :math:`B` in :math:`A^T B`, must be of size
        :math:`* \times \frac{T}{N} \times D`

    Returns
    -------
    result: Tensor
        For each process, the corresponding segment of the operation
        :math:`A^T B`, of size :math:`* \times \frac{T}{N} \times D`
    """
    cols = left.size(-1)
    world_size = get_world_size()
    rank = get_rank()
    split_size = cols // world_size
    splits = left.split(split_size, -1)
    rank_block = None
    synchronize()
    for r in range(world_size):
        rank_split = splits[r]
        rank_multiplication = torch.matmul(rank_split.transpose(-1, -2), right)
        handle = hvd.allreduce_async(rank_multiplication,
                                     name=f'matmul_tn_{r}',
                                     op=hvd.Sum)
        if r == rank:
            rank_block = hvd.synchronize(handle)
    return rank_block.contiguous()
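# A usage sketch for distributed_matmul_tn, assuming Horovod has been
# initialized and that the get_world_size/get_rank/synchronize helpers used
# above come from the surrounding project. Each of the N processes holds its
# own T/N-row chunk of A and of B; the sizes below are illustrative, and T
# must be divisible by the number of processes:
import torch
import horovod.torch as hvd

hvd.init()
T, D = 8 * hvd.size(), 16
left = torch.randn(T // hvd.size(), T)       # local chunk of A, shape (T/N, T)
right = torch.randn(T // hvd.size(), D)      # local chunk of B, shape (T/N, D)
block = distributed_matmul_tn(left, right)   # local (T/N, D) block of A^T B
print(hvd.rank(), block.shape)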
def test_horovod_allreduce_async_fused(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors
    with Tensor Fusion."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    tests = []
    is_hvd_poll_false_once = False
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        handle = hvd.allreduce_async(tensor, average=False)
        if not hvd.poll(handle):
            is_hvd_poll_false_once = True
        multiplied = tensor * size
        tests.append((dtype, multiplied, handle))

    # Make sure it's an asynchronous operation.
    assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?'

    for dtype, multiplied, handle in tests:
        summed = hvd.synchronize(handle)
        max_difference = summed.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def async_send(self, tensors_compressed, ctx):
    # assert only one tensor in tensors_compressed for allreduce
    return allreduce_async(tensors_compressed[0], name=ctx[0], op=Average)
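# async_send above only enqueues the reduction; the caller still has to
# synchronize the handle and invert whatever compression was applied before
# sending. A self-contained sketch of that flow using plain Horovod calls,
# where the fp16 cast is an illustrative stand-in for the hook's real
# compressor (not taken from the original source):
import torch
import horovod.torch as hvd

hvd.init()
grad = torch.randn(1024)

# "Compress" by casting to fp16, then launch the non-blocking allreduce,
# mirroring async_send above.
compressed = grad.half()
handle = hvd.allreduce_async(compressed, name='layer0.grad', op=hvd.Average)

# ... overlap other work here ...

# Wait for the result and "decompress" back to fp32.
reduced = hvd.synchronize(handle).float()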