def _init_process_group(store, rank, world_size):
    # Initialize ProcessGroup.
    process_group_timeout = rpc_constants.DEFAULT_PROCESS_GROUP_TIMEOUT

    # We're using a bunch of private APIs here since `new_group` requires the
    # default group to be initialized.
    group = dist.ProcessGroupGloo(store, rank, world_size, process_group_timeout)

    assert group is not None, "Failed to initialize default ProcessGroup."

    if (rank != -1) and (rank != group.rank()):
        raise RuntimeError(
            "rank argument {} doesn't match pg rank {}".format(rank, group.rank())
        )
    if (world_size != -1) and (world_size != group.size()):
        raise RuntimeError(
            "world_size argument {} doesn't match pg size {}".format(
                world_size, group.size()
            )
        )
    return group
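# --- Illustrative usage (not part of the original source) ---
# A minimal, hedged sketch of how _init_process_group might be driven: each of the
# `world_size` processes constructs a store pointing at the same shared file and calls
# the helper with its own rank. The helper name and store path below are assumptions
# for the example; the Gloo constructor blocks until every rank has joined.
def _example_bootstrap_group(rank, world_size, store_path="/tmp/example_pg_store"):
    # One FileStore per process, all referring to the same shared file.
    store = dist.FileStore(store_path, world_size)
    # Blocks until all `world_size` ranks have called in.
    return _init_process_group(store, rank, world_size)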
def run_trainer(args, extra_args, model, data, rank, server_rref):
    trainer_class = get_benchmark_trainer_map()[str(args.trainer)]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(store, rank, trainer_count)
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(
        rank,
        args.ntrainer + args.ncudatrainer,
        process_group,
        use_cuda_rpc,
        server_rref,
        args.backend,
        args.epochs,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]
def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
    store = c10d.FileStore(self.file_name, self.world_size)
    c10d.init_process_group(
        backend="gloo", rank=self.rank, world_size=self.world_size, store=store
    )
    if with_new_group:
        pg = c10d.new_group(backend="gloo")
    else:
        _pg = c10d.ProcessGroupGloo(
            store, self.rank, self.world_size, self.opts(timeout=timeout)
        )
        pg = c10d._create_process_group_wrapper(
            _pg,
            "unused",
            store,
            self.rank,
            self.world_size,
            timeout=timeout,
        )
    return pg
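# --- Illustrative usage (not part of the original source) ---
# A hedged sketch of exercising the wrapped group returned by _create_wrapper_pg: the
# wrapper checks that all ranks issue a consistent collective before delegating to the
# underlying Gloo group. The method name is hypothetical; it assumes the surrounding
# test class provides self.rank / self.world_size / self.file_name as above.
def _example_wrapper_allreduce(self):
    pg = self._create_wrapper_pg(with_new_group=False, timeout=10.0)
    t = torch.ones(2, 2) * self.rank
    pg.allreduce([t]).wait()  # consistency-checked, then run on the inner Gloo group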
def test_allreduce_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    def allreduce(x, op):
        opts = c10d.AllreduceOptions()
        opts.reduceOp = op
        work = pg.allreduce([x], opts)
        work.wait()

    # Sum
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.SUM)
    self.assertEqual(
        torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x
    )

    # Product
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.PRODUCT)
    self.assertEqual(torch.Tensor([float(math.factorial(self.world_size))]), x)

    # Min
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.MIN)
    self.assertEqual(torch.Tensor([1.0]), x)

    # Max
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.MAX)
    self.assertEqual(torch.Tensor([self.world_size]), x)

    # Test overloaded convenience function (defaults to using sum)
    x = torch.Tensor([self.rank + 1.0])
    work = pg.allreduce(x)
    work.wait()
    self.assertEqual(
        torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x
    )
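# --- Illustrative comparison (not part of the original source) ---
# The test above drives the ProcessGroup object directly. For reference, a hedged sketch
# of the same SUM reduction through the public torch.distributed API, assuming the
# default group has already been set up via init_process_group and that torch.distributed
# is imported as `dist`. The function name is hypothetical.
def _example_public_allreduce(rank, world_size):
    x = torch.Tensor([rank + 1.0])
    # In-place; x ends up holding world_size * (world_size + 1) / 2 on every rank.
    dist.all_reduce(x, op=dist.ReduceOp.SUM)
    return x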
def _test_base(self, net, inp, check_allclose=True):
    store = c10d.FileStore(self.file.name, self.world_size)
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
    if inp[0].is_cuda:
        num_gpus = torch.cuda.device_count()
        batch_size = inp[0].size(0)
        # batch_size must be evenly divisible by num_gpus_used; take the largest one
        num_gpus_used = [
            i for i in range(1, num_gpus + 1) if batch_size % i == 0
        ][-1]
        device_ids = list(range(num_gpus_used))
    else:
        device_ids = None

    ddp = nn.parallel.DistributedDataParallel(
        copy.deepcopy(net), device_ids=device_ids, process_group=process_group
    )

    net_opt = torch.optim.Adam(net.parameters(), lr=0.001)
    ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001)

    for i, j in zip(ddp.parameters(), net.parameters()):
        self.assertTrue(i.allclose(j))

    for _ in range(10):
        net_out = net(*inp)
        ddp_out = ddp(*inp)

        net_out.sum().backward()
        ddp_out.sum().backward()

        net_opt.step()
        ddp_opt.step()

    if check_allclose:
        for i, j in zip(ddp.parameters(), net.parameters()):
            self.assertTrue(i.allclose(j))
def test_send_recv_all_to_all(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    # Preallocate tensors for input/output
    inputs = [torch.Tensor([self.rank]) for _ in range(self.world_size)]
    outputs = [torch.Tensor([-1]) for _ in range(self.world_size)]

    # Issue sends
    send_work = []
    for i in range(self.world_size):
        if i == self.rank:
            continue
        send_work.append(pg.send([inputs[i]], i, 0))

    # Issue recvs
    recv_work = []
    for i in range(self.world_size):
        if i == self.rank:
            continue
        recv_work.append(pg.recv([outputs[i]], i, 0))

    # Wait for sends to complete
    for work in send_work:
        work.wait()

    # Wait for recvs to complete
    for work in recv_work:
        work.wait()

    # Test that every output other than our own contains the respective rank
    for i in range(self.world_size):
        if i == self.rank:
            continue
        self.assertEqual(torch.Tensor([i]), outputs[i])
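# --- Illustrative comparison (not part of the original source) ---
# A hedged sketch of the same all-to-all exchange using the public point-to-point API
# (torch.distributed.isend / irecv), assuming the default process group has been
# initialized and torch.distributed is imported as `dist`; the semantics mirror the
# pg.send / pg.recv calls above. The function name is hypothetical.
def _example_public_all_to_all(rank, world_size):
    inputs = [torch.Tensor([rank]) for _ in range(world_size)]
    outputs = [torch.Tensor([-1]) for _ in range(world_size)]
    reqs = []
    for i in range(world_size):
        if i == rank:
            continue
        reqs.append(dist.isend(inputs[i], dst=i, tag=0))
        reqs.append(dist.irecv(outputs[i], src=i, tag=0))
    # Wait for all outstanding sends and recvs to complete.
    for req in reqs:
        req.wait()
    return outputs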
def test_sync_params_no_buffers(self):
    store = c10d.FileStore(self.file.name)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    # Use all available devices on every process here (data is small, so should be fine).
    devices = gpus_for_rank(self.world_size)[self.rank]
    target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
    parameter_data = [target]
    parameter_data += [
        torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
        for d in devices[1:]
    ]
    buffer_data = [[]] * len(parameter_data)

    c10d._sync_params(
        process_group,
        parameter_data=parameter_data,
        buffer_data=buffer_data,
        devices=devices,
        broadcast_bucket_size=10,
        broadcast_buffers=False,
    )

    for device_data in parameter_data:
        for i, parameter in enumerate(device_data):
            self.assertEqual(parameter, target[i])
def test_sync_params_with_buffers(self):
    # Set up process group.
    store = c10d.TCPStore('localhost', self.port, self.is_master)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    devices = gpus_for_rank(self.world_size)[self.rank]
    target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
    parameter_data = [target]
    parameter_data += [
        torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
        for d in devices[1:]
    ]

    # sync_params should do a dist_broadcast for buffers, so we only populate the
    # master buffers and then check that other processes' tensors end up matching.
    if self.is_master:
        buffer_data = [target]
        buffer_data += [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices[1:]
        ]
    else:
        buffer_data = [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices
        ]

    c10d._sync_params(
        process_group,
        parameter_data=parameter_data,
        buffer_data=buffer_data,
        devices=devices,
        broadcast_bucket_size=10,
        broadcast_buffers=True,
    )

    for device_data in parameter_data:
        for i, parameter in enumerate(device_data):
            self.assertEqual(parameter, target[i])

    for device_data in buffer_data:
        for i, buffer in enumerate(device_data):
            self.assertEqual(buffer, target[i])
def _init_pg_gloo(cls, rank, filename, world_size):
    store = c10d.FileStore(filename, world_size)
    return c10d.ProcessGroupGloo(
        store, rank, world_size, ProcessGroupShareTensorTest.opts()
    )
def run_trainer(
    args, extra_args, data, rank, server_rref
):
    r"""
    Obtains a trainer instance and calls its train method.
    Args:
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
        data (list): training samples
        rank (int): process number in the world
        server_rref (dict): a dictionary containing server RRefs
    """
    trainer_class = trainer_map[args.trainer]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(
            store, rank, trainer_count
        )
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(
            store, rank, trainer_count
        )
    elif args.backend == "multi":
        process_group = c10d.ProcessGroupNCCL(
            store, rank, trainer_count
        )
        if c10d.is_initialized() is False:
            c10d.init_process_group(backend="gloo", rank=rank, world_size=trainer_count)

    model = load_model(args)
    preprocess_data = preprocess_data_map[args.preprocess_data]
    create_criterion = criterion_map[args.create_criterion]
    create_ddp_model = ddp_model_map[args.create_ddp_model]
    iteration_step = iteration_step_map[args.iteration_step]
    hook_state_class = hook_state_map[args.hook_state]
    hook = ddp_hook_map[args.ddp_hook]
    # check if this is a cudatrainer
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(
        process_group,
        use_cuda_rpc,
        server_rref,
        args.backend,
        args.epochs,
        preprocess_data,
        create_criterion,
        create_ddp_model,
        hook_state_class,
        hook,
        iteration_step,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]
def test_gloo_backend(self):
    store = c10d.TCPStore('localhost', self.port, self.is_master)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)
    self._test_ddp_with_process_group(process_group)