Example 1
    def test_queue_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]

        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        # The first return value should be the allreduce work item.
        self.assertTrue(isinstance(work, c10d.Work))
        # The second return value is the coalesced tensor holding the allreduced gradients.
        self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

        # Wait for the allreduce to finish.
        work.wait()

        # The expected result of the allreduce is the average of (rank + 1) across all ranks.
        self.assertEqual(local_grad_sum,
                         torch.ones(10) * (self.world_size + 1) / 2.0)
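Worth noting: each rank contributes torch.ones(10) * (rank + 1), so the averaged allreduce result is (1 + 2 + ... + world_size) / world_size = (world_size + 1) / 2, which is exactly the value the assertion checks.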
Example 2
    def _queue_reduction(self, bucket_idx):
        # _queue_reduction uses a separate CUDA stream to coalesce the bucket's
        # small tensors, to get more parallelism, before handing the coalesced
        # tensor to the c10d CUDA stream for the reduction.
        result = dist._queue_reduction(self.process_group,
                                       self.buckets[bucket_idx],
                                       self.device_ids)
        self.reduction_works[bucket_idx] = result[0]
        self.buckets_coalesced[bucket_idx] = result[1]
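The work object and coalesced tensor saved here are later consumed by the matching sync step (dist._sync_reduction, shown as c10d._sync_reduction in Example 3). A minimal sketch of that companion method, reusing the same bucket bookkeeping; the method name and its exact placement in the class are assumptions rather than part of the quoted source:

    def _sync_reduction(self, bucket_idx):
        # Hypothetical companion to _queue_reduction above: wait for the queued
        # allreduce and write the averaged result back into the first device's
        # gradient chunks for this bucket, mirroring c10d._sync_reduction in
        # Example 3.
        dist._sync_reduction(self.reduction_works[bucket_idx],
                             self.buckets[bucket_idx][0],
                             self.buckets_coalesced[bucket_idx])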
Example 3
    def test_sync_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]
        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        c10d._sync_reduction(work, grads_batch[0], local_grad_sum)
        # The expected result of the allreduce is the average of (rank + 1)
        # across all ranks, chunked like the input.
        self.assertEqual(grads_batch[0],
                         (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
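Note that _sync_reduction does not return the reduced tensors; judging from the assertion above, it waits on the queued work and writes the averaged gradients back into grads_batch[0] (the chunks on the first device) in place.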