def init_processes(rank, size, model, train_pics, train_bsz, fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, model, train_pics, train_bsz)
def init_processes(rank, size, workers, model, save_path, train_dataset, test_dataset, fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, workers, model, save_path, train_dataset, test_dataset)
def init_processes(rank, size, model, train_dataset, test_dataset, q, param_q, stop_flag, fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, model, train_dataset, test_dataset, q, param_q, stop_flag)
def init_processes(master_address, world_size, rank, epoch_per_round, batch_size, run):
    # change 'tcp' to 'nccl' if running on GPU worker
    dist.init_process_group(backend='tcp', init_method=master_address,
                            world_size=world_size, rank=rank)
    group = dist.new_group(list(range(world_size)))
    run(world_size, rank, group, epoch_per_round, batch_size)
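The comment above hints that the backend should match the hardware. A minimal sketch of picking it at runtime instead of editing the call site (my addition, not from the original snippet; 'gloo' stands in for the legacy 'tcp' backend, which recent PyTorch releases no longer ship):

import torch
import torch.distributed as dist

def pick_backend():
    # NCCL when every worker owns a GPU, Gloo otherwise
    # (assumption: one process per device)
    return 'nccl' if torch.cuda.is_available() else 'gloo'

# usage, with init_method/world_size/rank as in the snippet above:
# dist.init_process_group(backend=pick_backend(), init_method=master_address,
#                         world_size=world_size, rank=rank)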
def init_processes(rank, size, model, test_data, queue, param_q, stop_signal, train_pics, fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(model, test_data, queue, param_q, stop_signal, train_pics)
def _run(self, rank):
    self.rank = rank
    try:
        dist.init_process_group(
            init_method=INIT_METHOD,
            backend=BACKEND,
            world_size=int(WORLD_SIZE)
        )
    except RuntimeError as e:
        if "recompile" in e.args[0]:
            sys.exit(SKIP_IF_BACKEND_UNAVAILABLE)
        # sys.exit(0)
        raise

    # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
    # We're retrieving the corresponding test and executing it.
    getattr(self, self.id().split(".")[2])()
    sys.exit(0)
def init_processes(size, rank, run):
    dist.init_process_group(backend='tcp', init_method='tcp://127.0.0.1:5000',
                            world_size=size, rank=rank)
    run(size, rank)
    first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE
    or first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE
)
if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE:
    raise unittest.SkipTest("cuda is not available")
if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE:
    raise unittest.SkipTest(
        "One unique gpu per process is not available"
    )
if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE:
    raise unittest.SkipTest("worldsize is too small to run group tests")
self.assertEqual(first_process.exitcode, 0)

elif BACKEND == "mpi":
    WORLD_SIZE = os.environ["WORLD_SIZE"]
    dist.init_process_group(init_method=INIT_METHOD, backend="mpi")

    class TestMPI(TestCase, _DistTestBase):
        pass

if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"

    unittest.main()
        group_list.append(i)
    group = dist.new_group(group_list)
    while True:
        for param in modell.parameters():
            # for dst in range(1, size):
            #     dist.send(param.data, dst=dst)
            dist.broadcast(param.data, src=0, group=group)
        for param in modell.parameters():
            tensor_temp = torch.zeros_like(param.data)
            dist.reduce(tensor_temp, dst=0, op=dist.reduce_op.SUM, group=group)
            param.data = tensor_temp / (size - 1)

# run "python master.py -size NUM", where NUM is the value of size.
if __name__ == "__main__":
    rank = 0
    if len(sys.argv) != 3 or sys.argv[1] != "-size" or not sys.argv[2].isdigit():
        print('Parameter list error!')
        sys.exit(0)
    size = int(sys.argv[2])
    dist.init_process_group(backend='tcp', init_method='tcp://127.0.0.1:5000',
                            world_size=size, rank=rank)
    run()
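The master loop above implies a matching loop on each worker: receive the broadcast parameters, train locally, then feed the reduce that the master averages over size - 1 workers. The sketch below is an assumption about that counterpart (the name worker_round and the placement of the local step are mine, not from the source), kept on the same legacy dist.reduce_op API as the snippet:

def worker_round(modell, group):
    # pull the current global parameters from the master (rank 0)
    for param in modell.parameters():
        dist.broadcast(param.data, src=0, group=group)
    # ... local training on this worker's shard goes here ...
    # push the locally updated parameters back; the master sums them
    # and divides by (size - 1) to average over the workers
    for param in modell.parameters():
        dist.reduce(param.data, dst=0, op=dist.reduce_op.SUM, group=group)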
def init_processes(size, rank, epoch, batchsize, run):
    dist.init_process_group(backend='tcp', init_method='tcp://127.0.0.1:22222',
                            world_size=size, rank=rank)
    run(size, rank, epoch, batchsize)
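For single-machine experiments, helpers like the ones above are usually driven by a small launcher that forks one process per rank and joins them at the end. A runnable sketch of that pattern (the trivial run body is a placeholder; 'gloo' is used so it works on current PyTorch):

import torch.distributed as dist
from torch.multiprocessing import Process

def run(size, rank, epoch, batchsize):
    # placeholder body; a real run() would hold the training loop
    print('rank %d of %d up (epoch=%d, batchsize=%d)' % (rank, size, epoch, batchsize))

def init_processes(size, rank, epoch, batchsize, run):
    dist.init_process_group(backend='gloo', init_method='tcp://127.0.0.1:22222',
                            world_size=size, rank=rank)
    run(size, rank, epoch, batchsize)

if __name__ == '__main__':
    size = 2
    processes = []
    for rank in range(size):
        p = Process(target=init_processes, args=(size, rank, 1, 32, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()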