def _test_all_gather_multigpu_helper(self, group, group_id, rank, rankToGPUMapping):
    for dest in group:
        tensors = [
            _build_tensor(dest + 1).cuda(device=i)
            for i in rankToGPUMapping[rank]
        ]

        # construct expected output along with
        # a placeholder to receive all gather results
        output_tensors = []
        expected_output = []
        output_per_gpu = [_build_tensor(dest + 1, -1)] * len(
            rankToGPUMapping[0]) * len(group)
        expected_per_gpu = [_build_tensor(dest + 1)] * len(
            rankToGPUMapping[0]) * len(group)
        for gpu in rankToGPUMapping[rank]:
            output_tensors.append(
                [t.cuda(device=gpu) for t in output_per_gpu])
            expected_output.append(
                [t.cuda(device=gpu) for t in expected_per_gpu])

        dist.all_gather_multigpu(output_tensors, tensors, group_id)
        self.assertEqual(output_tensors, expected_output)

    self._barrier()
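# The helper above depends on a `_build_tensor` utility that is not shown
# in this snippet. A minimal sketch of what it presumably does (a cubic
# float tensor filled with `value`, defaulting to the size itself, which
# matches the `_build_tensor(dest + 1)` / `_build_tensor(dest + 1, -1)`
# calls) -- this definition is an assumption, not taken from this file:
import torch

def _build_tensor(size, value=None):
    # Assumed helper: (size x size x size) tensor filled with `value`,
    # or with `size` when no explicit value is given.
    if value is None:
        value = size
    return torch.FloatTensor(size, size, size).fill_(value)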
def all_gather_op(single_node_output):
    all_node_outputs = [
        torch.zeros_like(single_node_output).cuda()
        if torch.cuda.is_available()
        else torch.zeros_like(single_node_output)
        for _ in range(dist.get_world_size())
    ]
    if torch.cuda.is_available():
        single_node_output = single_node_output.cuda()
    # all_gather_multigpu expects a list of output tensor lists (one per
    # local GPU) and a list of input tensors, so wrap both arguments for
    # the one-GPU-per-process case.
    dist.all_gather_multigpu([all_node_outputs], [single_node_output])
    dist.barrier()
    return torch.cat(all_node_outputs)
def all_gather_op(single_node_output, verbose=False):
    try:
        all_node_outputs = [
            torch.zeros_like(single_node_output).cuda()
            if torch.cuda.is_available()
            else torch.zeros_like(single_node_output)
            for _ in range(dist.get_world_size())
        ]
        if torch.cuda.is_available():
            single_node_output = single_node_output.cuda()
        # Same nesting fix as above: one output list per local GPU, and
        # the input tensor wrapped in a single-element list.
        dist.all_gather_multigpu([all_node_outputs], [single_node_output])
        dist.barrier()
        return torch.cat(all_node_outputs)
    except AssertionError as e:
        if verbose:
            print(f'Distributed process group not initialized. '
                  f'Assuming 1 node. Error: {str(e)}')
        return single_node_output
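# A minimal usage sketch for `all_gather_op` above, assuming one GPU per
# process and a launcher (e.g. torchrun) that sets MASTER_ADDR,
# MASTER_PORT, RANK and WORLD_SIZE. The `run` entry point and tensor
# shapes are illustrative assumptions:
import torch
import torch.distributed as dist

def run():
    # Each rank contributes a small tensor; after the all-gather every
    # rank holds the concatenation of all ranks' tensors.
    rank = dist.get_rank()
    local = torch.full((2,), float(rank))
    gathered = all_gather_op(local)
    print(f"rank {rank}: {gathered.tolist()}")

if __name__ == "__main__":
    # NCCL requires CUDA tensors and one device per process.
    dist.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    run()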
def _test_all_gather_multigpu_helper(self, group, group_id, rank, rank_to_GPU):
    for dest in group:
        tensors = [
            _build_tensor(dest + 1).cuda(device=i) for i in rank_to_GPU[rank]
        ]

        # construct expected output along with
        # a placeholder to receive all gather results
        output_tensors = []
        expected_output = []
        output_per_gpu = (
            [_build_tensor(dest + 1, -1)] * len(rank_to_GPU[0]) * len(group)
        )
        expected_per_gpu = (
            [_build_tensor(dest + 1)] * len(rank_to_GPU[0]) * len(group)
        )
        for gpu in rank_to_GPU[rank]:
            output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
            expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu])

        dist.all_gather_multigpu(output_tensors, tensors, group_id)
        self.assertEqual(output_tensors, expected_output)

    self._barrier()
def all_gather_multigpu(
    self, output_tensor_lists, input_tensor_list, async_op=False
):  # pragma: no cover
    return dist.all_gather_multigpu(
        output_tensor_lists, input_tensor_list, self.group, async_op
    )
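# Shape conventions for the wrapper method above, as a hedged usage
# sketch: with G local GPUs per process and W processes,
# `input_tensor_list` holds one tensor per local GPU, and each inner
# output list must hold W * G slots (one per GPU across all processes).
# The `wrapper` object and `demo_all_gather_multigpu` name are
# assumptions for illustration:
import torch
import torch.distributed as dist

def demo_all_gather_multigpu(wrapper, local_gpus):
    world = dist.get_world_size()
    inputs = [
        torch.ones(4, device=f"cuda:{g}") * dist.get_rank()
        for g in local_gpus
    ]
    outputs = [
        [torch.empty(4, device=f"cuda:{g}")
         for _ in range(world * len(local_gpus))]
        for g in local_gpus
    ]
    wrapper.all_gather_multigpu(outputs, inputs)
    return outputs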
assert_mean(tensor_list[1], 3.)

# ---------------- ALL_GATHER -----------------
# all_gather semantics is quite complicated:
# https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_multigpu
tensor_list = get_tensor_list()
"""
Process 0: physical GPU 0, 1, output list device residence
    [[gpu0, gpu0, gpu0, gpu0],
     [gpu1, gpu1, gpu1, gpu1]]
values
    [[1., 2., 3., 4.],
     [1., 2., 3., 4.]]  from all GPUs across all procs

Process 1: physical GPU 2, 3, output list device residence
    [[gpu2, gpu2, gpu2, gpu2],
     [gpu3, gpu3, gpu3, gpu3]]
values
    [[1., 2., 3., 4.],
     [1., 2., 3., 4.]]  from all GPUs across all procs
"""
output_tensor_lists = [
    [new_tensor(i, value=0.) for _ in range(4)]
    for i in local_gpu_ids
]
dist.all_gather_multigpu(
    output_tensor_lists,
    tensor_list,
)
print('all_gather_multigpu rank ' + str(local_rank) + '\n'
      + '\n\t'.join(map(str, output_tensor_lists)))
for same_gpu_tensor_list in output_tensor_lists:
    assert_mean(same_gpu_tensor_list[0], 1.)
    assert_mean(same_gpu_tensor_list[1], 2.)
    assert_mean(same_gpu_tensor_list[2], 3.)
    assert_mean(same_gpu_tensor_list[3], 4.)
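# The snippet above leaves several helpers undefined. One plausible set
# of definitions, consistent with its 2-process x 2-GPU layout and the
# asserted values (rank 0 contributes 1., 2. and rank 1 contributes
# 3., 4.). All names below are assumptions for illustration and assume
# the process group is already initialized:
import torch
import torch.distributed as dist

local_rank = dist.get_rank()                       # 0 or 1 (single node)
local_gpu_ids = [2 * local_rank, 2 * local_rank + 1]

def new_tensor(gpu_id, value):
    # A small CUDA tensor placed on a specific physical GPU.
    return torch.full((4,), value, device=f"cuda:{gpu_id}")

def get_tensor_list():
    # One input tensor per local GPU: values 1., 2. on rank 0 and
    # 3., 4. on rank 1, matching the expected gathered means above.
    return [
        new_tensor(gpu, float(2 * local_rank + i + 1))
        for i, gpu in enumerate(local_gpu_ids)
    ]

def assert_mean(tensor, value):
    assert tensor.float().mean().item() == value, (tensor, value)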