def test_broadcast(self):
    expected = torch.FloatTensor(128).uniform_()
    tensors = [expected.cuda()]
    for device in range(1, torch.cuda.device_count()):
        with torch.cuda.device(device):
            tensors.append(torch.cuda.FloatTensor(128))
    nccl.broadcast(tensors)
    for i in range(torch.cuda.device_count()):
        self.assertEqual(tensors[i], expected)
def broadcast(tensor, devices):
    """Broadcasts a tensor to a number of GPUs."""
    if nccl.is_available([tensor]) and len(set(devices)) == len(devices):
        # NCCL path: keep the source tensor on devices[0], allocate empty
        # destination buffers on the remaining devices, and broadcast in place.
        tensors = [tensor]
        for device in devices[1:]:
            with torch.cuda.device(device):
                tensors.append(type(tensor)(tensor.size()))
        nccl.broadcast(tensors)
        return tuple(tensors)

    # Fallback: plain per-device copies.
    # TODO: copy to a pinned buffer first (if copy is from CPU)
    return tuple(tensor.cuda(gpu, non_blocking=True) for gpu in devices)
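A minimal usage sketch of the function above, assuming at least two visible CUDA devices and that broadcast (along with torch) is in scope; the tensor size and device indices are illustrative:

import torch

# Hypothetical call: replicate a tensor living on GPU 0 onto GPUs 0 and 1.
src = torch.randn(128, device="cuda:0")
copies = broadcast(src, [0, 1])  # returns a tuple with one copy per device
assert all(c.device.index == d for c, d in zip(copies, [0, 1]))
assert all(torch.equal(c.cpu(), src.cpu()) for c in copies)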
def test_broadcast(self, device, dtype):
    expected = torch.zeros(128).uniform_().to(dtype=dtype)
    tensors = [expected.cuda()]
    for device in range(1, torch.cuda.device_count()):
        tensors.append(torch.zeros(128, dtype=dtype, device=device))
    nccl.broadcast(tensors)
    for i in range(torch.cuda.device_count()):
        self.assertEqual(tensors[i], expected)

    # Test with tuple
    tensors = [expected.cuda()]
    for device in range(1, torch.cuda.device_count()):
        tensors.append(torch.zeros(128, dtype=dtype, device=device))
    nccl.broadcast(tuple(tensors))
    for i in range(torch.cuda.device_count()):
        self.assertEqual(tensors[i], expected)
def test_collective_errors(self, device):
    t = torch.rand(10).cuda(0)
    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.all_reduce(t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.reduce(t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.broadcast(t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.all_gather(t, t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.reduce_scatter(t, t)