def _sync_reduction_works(self):
    """Wait for all pending bucket reductions, write results back to grads,
    and reset per-iteration bucketing state.

    Removed leftover commented-out debug prints and fixed comment typos;
    behavior is unchanged.
    """
    # Now only work on the first GPU of self.device_ids.
    # _sync_reduction will use a separate CUDA stream to uncoalesce
    # the coalesced tensors to achieve more parallelism.
    for bucket_idx, grads_batch in enumerate(self.buckets):
        dist._sync_reduction(self.reduction_works[bucket_idx],
                             grads_batch[0],
                             self.buckets_coalesced[bucket_idx])

    # Scatter each reduced bucket slot back into the corresponding
    # parameter's .grad, restricted to the positions selected by its mask.
    for p, (bucket_idx, bucket_offset) in self.bucket_map.items():
        p.grad.data.view(-1).zero_().masked_scatter_(
            self.masks[p],
            self.buckets[bucket_idx][0][bucket_offset])

    # Reset the module states for the next iteration.
    self.next_bucket = len(self.bucket_sizes) - 1
    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]
    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]
def test_sync_reduction(self):
    """Queue an NCCL allreduce over chunked per-device gradients and verify
    that _sync_reduction produces the averaged result on the first device."""
    # Set up the NCCL process group backed by a shared file store.
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # Get this process' split of devices.
    devices = gpus_for_rank(self.world_size)[self.rank]

    # Build one batch of gradient chunks per device; every element on this
    # rank holds the value (rank + 1).
    grads_batch = []
    for dev in devices:
        full = torch.ones(10, device=torch.device('cuda', dev)) * (self.rank + 1)
        grads_batch.append(full.chunk(5))

    work, local_grad_sum = c10d._queue_reduction(process_group,
                                                 grads_batch,
                                                 devices)
    c10d._sync_reduction(work, grads_batch[0], local_grad_sum)

    # The expected result of the allreduce should be the average:
    # ranks contribute 1..world_size, so the mean is (world_size + 1) / 2.
    expected = (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5)
    self.assertEqual(grads_batch[0], expected)
def _sync_reduction_works(self):
    """Block until every bucket's queued reduction has finished, then
    re-initialize all per-iteration bucketing state."""
    # Now only work on the first GPU of self.device_ids.
    # _sync_reduction will use a seperate CUDA stream to uncoalesce
    # the coalesced tensors to achieve more parallelisms.
    for idx, device_grads in enumerate(self.buckets):
        dist._sync_reduction(self.reduction_works[idx],
                             device_grads[0],
                             self.buckets_coalesced[idx])

    # Reset the module states.
    num_buckets = len(self.bucket_sizes)
    num_devices = len(self.device_ids)
    self.next_bucket = num_buckets - 1
    self.ready_buckets_not_reduced = set()
    # None and 0 are immutable, so replicated fills share no mutable state.
    self.reduction_works = [None] * num_buckets
    self.devs_ready = [0] * num_buckets
    self.buckets = [
        [[None] * self.bucket_sizes[i] for _ in range(num_devices)]
        for i in range(num_buckets)
    ]
    self.buckets_coalesced = [[] for _ in range(num_buckets)]
    self.buckets_ready_size = [
        [0] * num_devices for _ in range(num_buckets)
    ]