Example 1
    def test_queue_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]

        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        # The first return value should be the allreduce work item.
        self.assertTrue(isinstance(work, c10d.Work))
        # The second return value is the coalesced tensor holding the allreduced gradients.
        self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

        # Wait for the allreduce to finish.
        work.wait()

        # The expected result of the allreduce is the average of (rank + 1) across all ranks.
        self.assertEqual(local_grad_sum,
                         torch.ones(10) * (self.world_size + 1) / 2.0)
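Worth noting: each rank contributes torch.ones(10) * (rank + 1), so the averaged allreduce result is (1 + 2 + ... + world_size) / world_size = (world_size + 1) / 2, which is exactly the value the assertion checks.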
Example 2
    def _queue_reduction(self, bucket_idx):
        # _queue_reduction uses a separate CUDA stream to coalesce the bucket's
        # small tensors, to get more parallelism, before handing the coalesced
        # tensor to the c10d CUDA stream for the reduction.
        result = dist._queue_reduction(self.process_group,
                                       self.buckets[bucket_idx],
                                       self.device_ids)
        self.reduction_works[bucket_idx] = result[0]
        self.buckets_coalesced[bucket_idx] = result[1]
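The work object and coalesced tensor saved here are later consumed by the matching sync step (dist._sync_reduction, shown as c10d._sync_reduction in Example 3). A minimal sketch of that companion method, reusing the same bucket bookkeeping; the method name and its exact placement in the class are assumptions rather than part of the quoted source:

    def _sync_reduction(self, bucket_idx):
        # Hypothetical companion to _queue_reduction above: wait for the queued
        # allreduce and write the averaged result back into the first device's
        # gradient chunks for this bucket, mirroring c10d._sync_reduction in
        # Example 3.
        dist._sync_reduction(self.reduction_works[bucket_idx],
                             self.buckets[bucket_idx][0],
                             self.buckets_coalesced[bucket_idx])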
Example 3
    def test_sync_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]
        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        c10d._sync_reduction(work, grads_batch[0], local_grad_sum)
        # The expected result of the allreduce is the average of (rank + 1)
        # across all ranks, chunked like the input.
        self.assertEqual(grads_batch[0],
                         (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
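Note that _sync_reduction does not return the reduced tensors; judging from the assertion above, it waits on the queued work and writes the averaged gradients back into grads_batch[0] (the chunks on the first device) in place.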