Example #1
    def _sync_reduction_works(self):
        # Now only work on the first GPU of self.device_ids
        # _sync_reduction will use a separate CUDA stream to uncoalesce
        # the coalesced tensors to achieve more parallelism
        for bucket_idx, grads_batch in enumerate(self.buckets):
            dist._sync_reduction(self.reduction_works[bucket_idx],
                                 grads_batch[0],
                                 self.buckets_coalesced[bucket_idx])

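        # Scatter the reduced gradient for each parameter (taken from the
        # first device's bucket entry) back into the parameter's flattened .grad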
        for p, (bucket_idx, bucket_offset) in self.bucket_map.items():
            p.grad.data.view(-1).zero_().masked_scatter_(
                self.masks[p], self.buckets[bucket_idx][0][bucket_offset])

        # Reset the module states
        self.next_bucket = len(self.bucket_sizes) - 1
        self.ready_buckets_not_reduced = set()
        self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
        self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

        self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                         for _ in range(len(self.device_ids))]
                        for i in range(len(self.bucket_sizes))]
        self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
        self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                                   for i in range(len(self.bucket_sizes))]
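The copy-back loop above leans on Tensor.masked_scatter_: the reduced bucket entry is a flat tensor, and the per-parameter mask picks which positions of the flattened .grad receive its values. Below is a minimal CPU-only sketch of that pattern; the mask and values are made up for illustration and are not taken from the module above.

import torch

# Flattened gradient buffer, zeroed before the scatter
# (as p.grad.data.view(-1).zero_() does above)
flat_grad = torch.zeros(6)

# Hypothetical mask: True marks the positions that should receive reduced values
mask = torch.tensor([True, False, True, True, False, False])

# Reduced values coming out of a bucket entry: one value per True in the mask
reduced = torch.tensor([1.0, 2.0, 3.0])

flat_grad.masked_scatter_(mask, reduced)
print(flat_grad)  # tensor([1., 0., 2., 3., 0., 0.])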
Example #2
    def test_sync_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]
        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        c10d._sync_reduction(work, grads_batch[0], local_grad_sum)
        # The expected result of the allreduce should be the average
        self.assertEqual(grads_batch[0], (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
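Each rank fills its tensors with rank + 1, so across the group the contributions are 1 through world_size; averaging them gives (1 + 2 + ... + world_size) / world_size = (world_size + 1) / 2, which is exactly the value the assertion checks for.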