def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=device_ids[0], streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(grad_batch, _unflatten_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(
            device_ids, dev_grad_batch, dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(
                grad_batch, _unflatten_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
def _queue_reduction(self, bucket_idx):
    grads_batch = self.buckets[bucket_idx]
    grads_batch_coalesced = []

    # coalesce the bucket
    for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch):
        with torch.cuda.device(dev_id):
            dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
            grads_batch_coalesced.append(dev_grads_batch_coalesced)

    # reduce to the first GPU in self.device_ids
    if len(self.device_ids) > 1:
        nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams)

    # divide by the number of processes here to reduce chances of overflow
    grads_batch_coalesced[0] /= self.process_group.size()

    # now work on the first gpu
    reduction_work = self.process_group.allreduce(
        [grads_batch_coalesced[0]], self.allreduce_opts)
    self.reduction_works[bucket_idx] = reduction_work
    self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0]
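# For context, a minimal sketch of the coalescing round trip used above. It relies on
# the private torch._utils helpers (an internal detail, not a public API), and the
# gradient shapes here are made up purely for illustration.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

# Two gradients of arbitrary shape are packed into one contiguous 1-D buffer,
# processed as a single tensor, then copied back element-for-element.
grads = [torch.randn(3), torch.randn(5)]
flat = _flatten_dense_tensors(grads)        # shape: (8,)
flat /= 2                                   # stand-in for the actual reduction
for grad, chunk in zip(grads, _unflatten_dense_tensors(flat, grads)):
    grad.copy_(chunk)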
def reduce_add(inputs, destination=None): "Reduces tensors from multiple GPUs and returns a result a specified device" # TODO: try to find an input on another gpu, copy it, # and accumulate into the copy input_size = inputs[0].size() for i, inp in enumerate(inputs): assert inp.is_cuda, "reduce_add expects all inputs to be on GPUs" if inp.size() != input_size: got = 'x'.join(str(x) for x in inp.size()) expected = 'x'.join(str(x) for x in input_size) raise ValueError("input {} has invalid size: got {}, but expected " "{}".format(i, got, expected)) if destination is None: destination = torch.cuda.current_device() with torch.cuda.device(destination): result = type(inp)(input_size).zero_() if nccl.is_available(inputs) and inputs[0].get_device() == destination: outputs = [result] + [t.new(t.size()) for t in inputs[1:]] nccl.reduce(inputs, outputs) return result for inp in inputs: input_correct_gpu = inp.cuda(result.get_device()) result.add_(input_correct_gpu) return result
def test_reduce(self):
    tensors = [torch.FloatTensor(128).uniform_() for i in range(nGPUs)]
    expected = torch.FloatTensor(128).zero_()
    for t in tensors:
        expected.add_(t)

    tensors = [tensors[i].cuda(i) for i in range(nGPUs)]
    nccl.reduce(tensors)

    self.assertEqual(tensors[0], expected)
def test_reduce(self, device, dtype):
    tensors = [torch.zeros(128).uniform_().to(dtype=dtype) for i in range(nGPUs)]
    expected = torch.zeros(128, dtype=dtype)
    for t in tensors:
        expected.add_(t)

    tensors = [tensors[i].cuda(i) for i in range(nGPUs)]
    nccl.reduce(tensors)

    self.assertEqual(tensors[0], expected)
def reduce_add(inputs, destination=None): """Sums tensors from multiple GPUs. All inputs should have matching shapes, dtype, and layout. The output tensor will be of the same shape, dtype, and layout. Arguments: inputs (Iterable[Tensor]): an iterable of tensors to add. destination (int, optional): a device on which the output will be placed (default: current device). Returns: A tensor containing an elementwise sum of all inputs, placed on the :attr:`destination` device. """ destination = _get_device_index(destination, optional=True) input_size = inputs[0].size() root_index = None # index of input tensor that already is on the correct device for i, inp in enumerate(inputs): assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs" if inp.get_device() == destination: root_index = i if inp.size() != input_size: got = 'x'.join(str(x) for x in inp.size()) expected = 'x'.join(str(x) for x in input_size) raise ValueError("input {} has invalid size: got {}, but expected " "{}".format(i, got, expected)) if root_index is None: raise RuntimeError( "reduce_add expects destination to be on the same GPU with one of the tensors" ) if len(inputs) == 1: return inputs[0] if nccl.is_available(inputs): result = torch.empty_like(inputs[root_index]) nccl.reduce(inputs, output=result, root=root_index) else: destination_device = torch.device(inputs[root_index].device.type, destination) nonroot = [t for i, t in enumerate(inputs) if i != root_index] # make a new tensor w/o clone result = inputs[root_index] + nonroot[0].to(device=destination_device, non_blocking=True) for other in nonroot[1:]: result.add_(other.to(device=destination_device, non_blocking=True)) return result
def _queue_reduction(self, bucket_idx):
    grads_batch = self.buckets[bucket_idx]
    grads_batch_coalesced = []

    # coalesce the bucket
    for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch):
        with torch.cuda.device(dev_id):
            dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
            grads_batch_coalesced.append(dev_grads_batch_coalesced)

    # reduce to the first GPU in self.device_ids
    if len(self.device_ids) > 1:
        nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams)

    # now work on the first gpu
    reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group)
    self.reduction_works[bucket_idx] = reduction_work
    self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0]
def test_collective_errors(self, device):
    t = torch.rand(10).cuda(0)
    with self.assertRaisesRegex(TypeError, "Inputs should be a collection of tensors"):
        nccl.all_reduce(t)

    with self.assertRaisesRegex(TypeError, "Inputs should be a collection of tensors"):
        nccl.reduce(t)

    with self.assertRaisesRegex(TypeError, "Inputs should be a collection of tensors"):
        nccl.broadcast(t)

    with self.assertRaisesRegex(TypeError, "Inputs should be a collection of tensors"):
        nccl.all_gather(t, t)

    with self.assertRaisesRegex(TypeError, "Inputs should be a collection of tensors"):
        nccl.reduce_scatter(t, t)
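# By way of contrast with the error cases above, a sketch of the list-based calling
# convention that torch.cuda.nccl expects, assuming two visible CUDA devices.
import torch
import torch.cuda.nccl as nccl

# Pass a collection of same-sized tensors, one per device, rather than a bare tensor.
tensors = [torch.rand(10).cuda(i) for i in range(2)]
nccl.all_reduce(tensors)       # in-place: every tensor ends up holding the sum
nccl.reduce(tensors, root=0)   # the reduced result lands in tensors[0]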