def test_queue_reduction(self):
    # Set up process group.
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # Get this process' split of devices.
    devices = gpus_for_rank(self.world_size)[self.rank]
    grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                    (self.rank + 1)).chunk(5)
                   for d in devices]

    work, local_grad_sum = c10d._queue_reduction(process_group,
                                                 grads_batch,
                                                 devices)
    # The first return value should be the allreduce work item.
    self.assertTrue(isinstance(work, c10d.Work))
    # The second return value will be the finished allreduced gradients.
    self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

    # Wait for the allreduce to finish.
    work.wait()

    # The expected result of the allreduce should be the average.
    self.assertEqual(local_grad_sum, torch.ones(10) * (self.world_size + 1) / 2.0)
def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
    store = c10d.FileStore(self.file_name, self.world_size)
    c10d.init_process_group(
        backend="nccl",
        rank=self.rank,
        world_size=self.world_size,
        store=store,
        timeout=timedelta(seconds=timeout),
    )
    if with_new_group:
        pg = c10d.new_group(backend="nccl", timeout=timedelta(seconds=timeout))
    else:
        _pg = c10d.ProcessGroupNCCL(
            store, self.rank, self.world_size, timeout=timedelta(seconds=timeout)
        )
        pg = c10d._create_process_group_wrapper(
            _pg,
            "unused",
            store,
            self.rank,
            self.world_size,
            timeout=timeout,
        )
    return pg
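# A minimal usage sketch, not part of the original suite: it assumes the helper
# above lives on a MultiProcessTestCase-style class that defines self.rank and
# self.world_size, and only checks that the wrapped group still runs a working
# allreduce. The test name and tensor shape below are illustrative.
def test_wrapper_pg_allreduce(self):
    pg = self._create_wrapper_pg(with_new_group=False)
    tensor = torch.ones(2, 2).cuda(self.rank)
    pg.allreduce([tensor]).wait()
    self.assertEqual(tensor, torch.ones(2, 2) * self.world_size)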
def test_is_last_hook(self):
    store = dist.FileStore(self.file_name, self.world_size)
    process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

    def hook(flags, bucket):
        flags.append(bucket.is_last())
        fut = torch.futures.Future()
        fut.set_result(bucket.buffer())
        return fut

    flags = []
    device_id = gpus_for_rank(self.world_size)[self.rank][0]
    model = nn.Sequential(
        nn.Linear(2, 4000, bias=False),
        *[nn.Linear(4000, 4000, bias=False) for _ in range(10)]
    )
    gpu_model = DistributedDataParallel(
        model.to(device_id),
        device_ids=[device_id],
        process_group=process_group,
    )
    gpu_model.register_comm_hook(state=flags, hook=hook)
    input = torch.randn(10, 2)
    gpu_model(input).sum().backward()
    self.assertTrue(flags[-1])
    self.assertFalse(any(flags[:-1]))
def test_fp16(self):
    store = c10d.TCPStore('localhost', self.port, self.rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    gpus = gpus_for_rank(self.world_size)[self.rank]
    model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half()
    nn.init.constant_(model.weight, 1)
    ddp_model = DistributedDataParallel(
        model,
        device_ids=[gpus[0]],
        process_group=process_group,
        bucket_cap_mb=1,
    )

    # Input 2**15, so that the gradients will overflow with a
    # world_size of 2, unless we normalize the gradient by the
    # world_size before the reduction.
    input = torch.Tensor([[2**15]]).cuda(gpus[0]).half()

    # Step model
    ddp_model.train()
    output = ddp_model(input)
    loss = output.sum()
    loss.backward()

    self.assertFalse(
        any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
def train(self, model, data):
    torch.manual_seed(0)
    model = model.cuda(self.rank)
    for i in range(len(data)):
        data[i][0] = data[i][0].cuda(self.rank)
        data[i][1] = data[i][1].cuda(self.rank)
    torch.cuda.synchronize(self.rank)

    process_group_size = self.trainer_count
    store = c10d.FileStore("/tmp/tmpn_k_8so02", process_group_size)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, process_group_size)

    ddp_model = DDP(model, device_ids=[self.rank], process_group=process_group)
    hook_state = self.HookState(self, process_group)
    ddp_model.register_comm_hook(hook_state, DdpNcclTrainer.hook)
    criterion = nn.CrossEntropyLoss().cuda(self.rank)
    optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4)

    def epoch_key(epoch, index):
        return f"{epoch},{index}"

    for epoch in range(self.epochs):
        for index, batch in enumerate(data):
            hook_state.next_batch_state()
            input, target = batch[0], batch[1]

            self.record_batch_start(epoch_key(epoch, index))
            optimizer.zero_grad()

            self.record_forward_start(epoch_key(epoch, index))
            out = ddp_model(input)
            self.record_forward_end(epoch_key(epoch, index))

            loss = criterion(out, target)

            self.record_backward_start(epoch_key(epoch, index))
            loss.backward()
            self.record_backward_end(epoch_key(epoch, index))

            optimizer.step()
            self.record_batch_end(epoch_key(epoch, index))
    torch.cuda.synchronize(self.rank)
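# Hedged sketch of the hook shape expected by register_comm_hook above; the real
# DdpNcclTrainer.hook is defined elsewhere in the benchmark, so this only
# illustrates the (state, bucket) signature, assuming the hook state keeps a
# reference to the process group. A typical NCCL hook divides the bucket by the
# world size, allreduces it asynchronously, and returns a future with the result.
@staticmethod
def hook(state, bucket):
    tensor = bucket.buffer()
    tensor.div_(state.process_group.size())
    fut = state.process_group.allreduce([tensor]).get_future()
    # The future's value is a list holding the single reduced tensor.
    return fut.then(lambda f: f.value()[0])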
def test_nccl_backend(self):
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
    gpus = gpus_for_rank(self.world_size)[self.rank]
    self._test_ddp_with_process_group(process_group, gpus)
    self._test_ddp_with_process_group(
        process_group,
        list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
def test_nccl_backend(self):
    store = c10d.TCPStore('localhost', self.port, self.is_master)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
    gpus = gpus_for_rank(self.world_size)[self.rank]
    self._test_ddp_with_process_group(process_group, gpus)
    self._test_ddp_with_process_group(
        process_group,
        list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
def test_allreduce_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def allreduce(tensors, op):
        opts = c10d.AllreduceOptions()
        opts.reduceOp = op
        work = pg.allreduce(tensors, opts)
        work.wait()

    # Sum
    tensors = []
    for i in range(self.num_gpus):
        tensors.append(torch.Tensor([i + 1]).cuda(i))

    allreduce(tensors, c10d.ReduceOp.SUM)

    for i in range(self.num_gpus):
        self.assertEqual(
            torch.Tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]),
            tensors[i])

    # Product
    tensors = []
    for i in range(self.num_gpus):
        tensors.append(torch.Tensor([i + 1]).cuda(i))

    allreduce(tensors, c10d.ReduceOp.PRODUCT)

    for i in range(self.num_gpus):
        self.assertEqual(
            torch.Tensor([float(math.factorial(self.num_gpus))]),
            tensors[i])

    # Min
    tensors = []
    for i in range(self.num_gpus):
        tensors.append(torch.Tensor([i + 1]).cuda(i))

    allreduce(tensors, c10d.ReduceOp.MIN)

    for i in range(self.num_gpus):
        self.assertEqual(torch.Tensor([1.0]), tensors[i])

    # Max
    tensors = []
    for i in range(self.num_gpus):
        tensors.append(torch.Tensor([i + 1]).cuda(i))

    allreduce(tensors, c10d.ReduceOp.MAX)

    for i in range(self.num_gpus):
        self.assertEqual(torch.Tensor([self.num_gpus]), tensors[i])
def _nccl_init(self, nccl_addr, nccl_ip, nccl_port):
    self.nccl_ip, self.nccl_addr, self.nccl_port = nccl_ip, nccl_addr, nccl_port
    print('Rank {} calling init_process_group. Addr: {}'.format(self.rank, nccl_addr))
    # from https://github.com/pytorch/pytorch/blob/master/test/simulate_nccl_errors.py
    store = dist.TCPStore(self.nccl_ip, self.nccl_port, self.nb_learners, self.rank == 0)
    process_group = dist.ProcessGroupNCCL(store, self.rank, self.nb_learners)
    print('Rank {} initialized process group.'.format(self.rank))
    process_group.barrier()
    print('Rank {} process group barrier finished.'.format(self.rank))
    self.process_group = process_group
    # set optimizer process_group
    self.optimizer.set_process_group(self.process_group)
def test_ddp_comm_hook_allreduce_hook(self):
    """
    This unit test verifies that the ``allreduce`` hook registered case gives
    the same result as the no hook registered case.
    """
    store = c10d.FileStore(self.file_name, self.world_size)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # No hook registered case, get the reference grads.
    reference_grads = self._get_grads(process_group, None)
    # Register hook case, get the hook grads.
    hook_grads = self._get_grads(process_group, DDPCommHookType.ALLREDUCE)

    np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
def test_ddp_comm_hook_quantize_per_tensor_hook(self):
    """
    This unit test verifies that the ``quantize per tensor`` hook registered
    case gives a result close to the no hook registered case.
    """
    store = dist.FileStore(self.file_name, self.world_size)
    process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

    # No hook registered case, get the reference grads.
    reference_grads = self._get_grads(process_group, None)
    # Register hook case, get the hook grads.
    hook_grads = self._get_grads(process_group, DDPCommHookType.QUANTIZE_PER_TENSOR)

    np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
def test_ddp_comm_hook_fp16compress_hook(self):
    """
    This unit test verifies that the ``fp16 compress`` hook registered case
    gives a result close to the no hook registered case.
    """
    store = dist.FileStore(self.file_name, self.world_size)
    process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

    # No hook registered case, get the reference grads.
    reference_grads = self._get_grads(process_group, None)
    # Register hook case, get the hook grads.
    hook_grads = self._get_grads(process_group, DDPCommHookType.FP16_COMPRESS)

    np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
def test_sync_reduction(self):
    # Set up process group.
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # Get this process' split of devices.
    devices = gpus_for_rank(self.world_size)[self.rank]
    grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                    (self.rank + 1)).chunk(5)
                   for d in devices]
    work, local_grad_sum = c10d._queue_reduction(process_group,
                                                 grads_batch,
                                                 devices)
    c10d._sync_reduction(work, grads_batch[0], local_grad_sum)

    # The expected result of the allreduce should be the average.
    self.assertEqual(grads_batch[0],
                     (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
def run_trainer(args, extra_args, model, data, rank, server_rref):
    trainer_class = get_benchmark_trainer_map()[str(args.trainer)]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(store, rank, trainer_count)
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(
        rank,
        args.ntrainer + args.ncudatrainer,
        process_group,
        use_cuda_rpc,
        server_rref,
        args.backend,
        args.epochs,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]
def test_broadcast_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def broadcast(xs, rootRank, rootTensor):
        opts = c10d.BroadcastOptions()
        opts.rootRank = rootRank
        opts.rootTensor = rootTensor
        work = pg.broadcast(xs, opts)
        work.wait()

    # for every root tensor
    for rt in range(self.num_gpus):
        tensors = []
        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i]).cuda(i))

        broadcast(tensors, self.rank, rt)

        for i in range(self.num_gpus):
            self.assertEqual(tensors[i], tensors[rt])
def test_ddp_comm_hook_noop_hook(self):
    """
    This unit test verifies that the ``noop`` hook registered case, followed by
    a subsequent allreduce, gives the same result as the no hook registered case.
    """
    store = dist.FileStore(self.file_name, self.world_size)
    process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

    # No hook registered case, get the reference grads.
    reference_grads = self._get_grads(process_group, None)
    # Register hook case, get the hook grads.
    hook_grads = self._get_grads(process_group, DDPCommHookType.NOOP)
    # Apply a subsequent allreduce to average grads.
    hook_grads.div_(self.world_size)
    dist.all_reduce(hook_grads, group=process_group)

    torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
def test_reduce_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def reduce(xs, rootRank, rootTensor):
        opts = c10d.ReduceOptions()
        opts.rootRank = rootRank
        opts.rootTensor = rootTensor
        work = pg.reduce(xs, opts)
        work.wait()

    # for every root tensor
    for rt in range(self.num_gpus):
        tensors = []
        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i + 1]).cuda(i))

        reduce(tensors, self.rank, rt)

        self.assertEqual(
            torch.Tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]),
            tensors[rt])
def test_allgather_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def allgather(output_ts, input_ts):
        work = pg.allgather(output_ts, input_ts)
        work.wait()

    tensors = []
    output_ts = [[] for _ in range(self.num_gpus)]

    for idx, ls in enumerate(output_ts):
        for _ in range(self.world_size * self.num_gpus):
            ls.append(torch.Tensor([0]).cuda(idx))

    for i in range(self.num_gpus):
        tensors.append(torch.Tensor([i]).cuda(i))

    allgather(output_ts, tensors)

    # Verification
    for device_ts in output_ts:
        for s_idx, t in enumerate(device_ts):
            self.assertEqual(torch.Tensor([s_idx]), t)
def test_nccl_backend(self):
    store = c10d.TCPStore('localhost', self.port, self.is_master)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
    self._test_ddp_with_process_group(process_group)
import argparse
import logging
import os

import torch
import torch.distributed as c10d

# Make the logging.info messages below visible.
logging.basicConfig(level=logging.INFO)

parser = argparse.ArgumentParser(
    description='Simple script to simulate NCCL errors. The script is '
    'supposed to be run on multiple different nodes simultaneously with '
    'appropriate rank and world_size. The script runs an allreduce() on '
    'the rank 0 node and aborts all the other nodes to simulate an error '
    'in NCCL')
parser.add_argument('addr', help='address of the master node to connect to.')
parser.add_argument('port', help='port of the master node to connect to.')
parser.add_argument('rank', help='rank of this node')
parser.add_argument('world_size', help='number of nodes in process group')
args = parser.parse_args()
rank = int(args.rank)
world_size = int(args.world_size)
port = int(args.port)

store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
logging.info('Running first allreduce')
process_group.allreduce(torch.rand(10).cuda(rank)).wait()
if rank == 0:
    logging.info('Running second allreduce only on rank 0')
    work = process_group.allreduce(torch.rand(10).cuda(rank))
    logging.info('Waiting for allreduce to complete...')
    work.wait()
    logging.info('Second allreduce successful: {}'.format(work.is_success()))
else:
    logging.info('Aborting all other ranks.')
    os.abort()
def _init_pg_nccl(cls, rank, filename, world_size):
    store = c10d.FileStore(filename, world_size)
    return c10d.ProcessGroupNCCL(store, rank, world_size)
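# Hypothetical usage sketch (class and variable names below are illustrative,
# not from the source): the factory above can hand its NCCL group straight to
# DistributedDataParallel so a test exercises DDP against an explicitly
# constructed process group.
pg = SomeTestCase._init_pg_nccl(rank, shared_file, world_size)
ddp_model = torch.nn.parallel.DistributedDataParallel(
    torch.nn.Linear(8, 8).cuda(rank),
    device_ids=[rank],
    process_group=pg,
)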
def run_trainer(args, extra_args, data, rank, server_rref):
    r"""
    Obtains a trainer instance and calls its train method.
    Args:
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
        data (list): training samples
        rank (int): process number in the world
        server_rref (dict): a dictionary containing server RRefs
    """
    trainer_class = trainer_map[args.trainer]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(
            store, rank, trainer_count
        )
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(
            store, rank, trainer_count
        )
    elif args.backend == "multi":
        process_group = c10d.ProcessGroupNCCL(
            store, rank, trainer_count
        )
        if c10d.is_initialized() is False:
            c10d.init_process_group(backend="gloo", rank=rank, world_size=trainer_count)

    model = load_model(args)
    preprocess_data = preprocess_data_map[args.preprocess_data]
    create_criterion = criterion_map[args.create_criterion]
    create_ddp_model = ddp_model_map[args.create_ddp_model]
    iteration_step = iteration_step_map[args.iteration_step]
    hook_state_class = hook_state_map[args.hook_state]
    hook = ddp_hook_map[args.ddp_hook]
    # check if this is a cudatrainer
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(
        process_group,
        use_cuda_rpc,
        server_rref,
        args.backend,
        args.epochs,
        preprocess_data,
        create_criterion,
        create_ddp_model,
        hook_state_class,
        hook,
        iteration_step,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]