Example #1
def init_processes(rank, size,
                   model, train_pics, train_bsz,
                   fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, model, train_pics, train_bsz)
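Examples #1-#3 share a pattern: every spawned process sets the rendezvous point through the MASTER_ADDR/MASTER_PORT environment variables, joins the process group, and hands control to the per-role function fn. Note that args is a module-level argparse namespace defined in the enclosing script, and that the 'tcp' backend seen throughout these examples dates from pre-1.0 PyTorch; current releases removed it in favor of 'gloo' (CPU) and 'nccl' (GPU). A runnable driver in this style, updated for current PyTorch, is sketched after Example #10.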
Example #2
def init_processes(rank, size, workers,
                   model, save_path,
                   train_dataset, test_dataset,
                   fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, workers, model, save_path, train_dataset, test_dataset)
Example #3
def init_processes(rank, size, model,
                   train_dataset, test_dataset,
                   q, param_q, stop_flag,
                   fn, backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, model, train_dataset, test_dataset, q, param_q, stop_flag)
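This variant additionally threads two multiprocessing queues (q, param_q) and a stop_flag through to fn, which suggests a parameter-server setup in which the training loop exchanges model state with a companion thread or process and can be told when to shut down.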
Example #4
def init_processes(master_address, world_size, rank, epoch_per_round,
                   batch_size, run):
    # change 'tcp' to 'nccl' if running on GPU worker
    dist.init_process_group(backend='tcp',
                            init_method=master_address,
                            world_size=world_size,
                            rank=rank)
    group = dist.new_group([i for i in range(world_size)])
    run(world_size, rank, group, epoch_per_round, batch_size)
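Unlike Examples #1-#3, this helper passes an explicit init_method URL rather than relying on the MASTER_ADDR/MASTER_PORT environment variables; the two are interchangeable ways of telling init_process_group where to rendezvous (the default init_method, env://, reads exactly those variables). It also builds a group covering all ranks up front so that later collectives can target it.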
Example #5
def init_processes(rank,
                   size,
                   model,
                   test_data,
                   queue,
                   param_q,
                   stop_signal,
                   train_pics,
                   fn,
                   backend='tcp'):
    os.environ['MASTER_ADDR'] = args.ps_ip
    os.environ['MASTER_PORT'] = args.ps_port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(model, test_data, queue, param_q, stop_signal, train_pics)
Example #6
def _run(self, rank):
    self.rank = rank
    try:
        dist.init_process_group(
            init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE)
        )
    except RuntimeError as e:
        # The backend may be compiled out of this build; skip instead of failing.
        if "recompile" in e.args[0]:
            sys.exit(SKIP_IF_BACKEND_UNAVAILABLE)
        raise
    # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
    # We're retrieving the corresponding test method and executing it.
    getattr(self, self.id().split(".")[2])()
    sys.exit(0)
Example #7
def init_processes(size, rank, run):
    dist.init_process_group(backend='tcp',
                            init_method='tcp://127.0.0.1:5000',
                            world_size=size, rank=rank)
    run(size, rank)
Example #8
                assert (
                    first_process.exitcode == 0 or
                    first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE or
                    first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE or
                    first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE
                )

                if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE:
                    raise unittest.SkipTest("cuda is not available")
                if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE:
                    raise unittest.SkipTest(
                        "One unique gpu per process is not available"
                    )
                if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE:
                    raise unittest.SkipTest("worldsize is too small to run group tests")

            self.assertEqual(first_process.exitcode, 0)


elif BACKEND == "mpi":
    WORLD_SIZE = os.environ["WORLD_SIZE"]
    dist.init_process_group(init_method=INIT_METHOD, backend="mpi")

    class TestMPI(TestCase, _DistTestBase):
        pass


if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"

    unittest.main()
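This harness runs each test method in one spawned process per rank. A worker that cannot run on the current machine exits with a dedicated status code instead of failing, and the parent translates those codes into unittest.SkipTest; only a uniform exit code of 0 counts as a pass. The MPI backend is the exception: its group is initialized once, directly in the main process.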
Example #9
def run():
    # Build a process group containing the master and every worker rank.
    group_list = []
    for i in range(size):
        group_list.append(i)
    group = dist.new_group(group_list)

    while True:

        # Push the current parameters from the master (rank 0) to every worker.
        for param in modell.parameters():
            # for dst in range(1, size):
            # dist.send(param.data, dst=dst)
            dist.broadcast(param.data, src=0, group=group)

        # Collect the workers' updated parameters and average them; the
        # master contributes a zero tensor to the sum, so dividing by
        # (size - 1) averages over the workers only.
        for param in modell.parameters():
            tensor_temp = torch.zeros_like(param.data)
            dist.reduce(tensor_temp, dst=0, op=dist.reduce_op.SUM, group=group)
            param.data = tensor_temp / (size - 1)


# run " python master.py -size NUM ", NUM is the value of size.
if __name__ == "__main__":

    rank = 0

    if len(sys.argv) != 3 or sys.argv[1] != "-size" or not sys.argv[2].isdigit():
        print('Parameter list error!')
        sys.exit(1)

    size = int(sys.argv[2])

    dist.init_process_group(backend='tcp',
                            init_method='tcp://127.0.0.1:5000',
                            world_size=size, rank=rank)

    run()
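The loop above is the master's half of a synchronous parameter-server exchange: it broadcasts the current parameters from rank 0, then sums the workers' updated parameters with reduce and divides by (size - 1), which averages over the workers only because rank 0 contributes a zero tensor. For context, a worker-side counterpart might look like the following sketch. It is not part of this example; worker_model is a hypothetical name, and it uses the same legacy dist.reduce_op API as the snippet.

import torch.distributed as dist

def worker_loop(worker_model, group):
    # Hypothetical worker-side counterpart to the master loop above.
    while True:
        # Receive the current parameters broadcast by the master (rank 0).
        for param in worker_model.parameters():
            dist.broadcast(param.data, src=0, group=group)

        # ... run a local training step on worker_model here ...

        # Contribute the updated parameters to the master's reduce; the
        # master averages the (size - 1) worker contributions.
        for param in worker_model.parameters():
            dist.reduce(param.data, dst=0, op=dist.reduce_op.SUM, group=group)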
Example #10
def init_processes(size, rank, epoch, batchsize, run):
    dist.init_process_group(backend='tcp',
                            init_method='tcp://127.0.0.1:22222',
                            world_size=size,
                            rank=rank)
    run(size, rank, epoch, batchsize)
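To close, here is a minimal self-contained driver in the same style that runs on a current PyTorch release. It is a sketch under stated assumptions: 'gloo' stands in for the removed 'tcp' backend, the address and port are arbitrary local values, and demo_allreduce is a hypothetical worker function.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def demo_allreduce(rank, size):
    # Every rank contributes its rank; all_reduce sums the tensors in place.
    t = torch.ones(1) * rank
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    print(f"rank {rank}: sum of ranks = {t.item()}")


def init_processes(rank, size, fn, backend='gloo'):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == "__main__":
    size = 2
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_processes, args=(rank, size, demo_allreduce))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()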