Example #1
 def _run(self, rank):
     self.rank = rank
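     # Join the process group; if the backend is not available in this build,
     # init raises a RuntimeError mentioning "recompile", which we turn into
     # a skip exit code for the parent process to interpret.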
     try:
         dist.init_process_group(init_method=INIT_METHOD,
                                 backend=BACKEND,
                                 world_size=int(WORLD_SIZE),
                                 rank=self.rank)
     except RuntimeError as e:
         if "recompile" in e.args[0]:
             sys.exit(SKIP_IF_BACKEND_UNAVAILABLE)
             # sys.exit(0)
         raise
     # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
     # We're retrieving the corresponding test and executing it.
     getattr(self, self.id().split(".")[2])()
     sys.exit(0)
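
Example #1 only shows the body each child process executes; the code that forks one child per rank and collects their exit codes is not part of the excerpt. Below is a minimal sketch of that launch pattern using multiprocessing, where run_test, spawn_and_join, and the world size of 4 are illustrative stand-ins rather than names from the source:

import multiprocessing as mp
import sys


def run_test(rank, world_size):
    # Stand-in for _run(): each child would initialize the process group,
    # run exactly one test method, and exit with a status code.
    print("rank %d of %d running" % (rank, world_size))
    sys.exit(0)


def spawn_and_join(world_size):
    # Fork one process per rank and wait for all of them to finish.
    processes = [
        mp.Process(target=run_test, args=(rank, world_size))
        for rank in range(world_size)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # The exit codes carry the pass/skip signal back to the parent,
    # which is what the checks in Example #2 consume.
    return [p.exitcode for p in processes]


if __name__ == "__main__":
    print(spawn_and_join(4))
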
Example #2
                        or first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE
                        or first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE
                        or first_process.exitcode
                        == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE)

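                # Map the skip exit codes reported by the child processes
                # back onto unittest skips in the parent process.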
                if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE:
                    raise unittest.SkipTest("cuda is not available")
                if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE:
                    raise unittest.SkipTest(
                        "One unique gpu per process is not available")
                if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE:
                    raise unittest.SkipTest(
                        "worldsize is too small to run group tests")

            self.assertEqual(first_process.exitcode, 0)

elif BACKEND == "mpi":
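    # The MPI backend takes its rank and world size from the MPI launcher,
    # so the process group can be initialized directly here.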
    WORLD_SIZE = os.environ["WORLD_SIZE"]
    dist.init_process_group(init_method=INIT_METHOD, backend="mpi")

    class TestMPI(TestCase, _DistTestBase):
        pass


if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"

    unittest.main()
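
The SKIP_IF_* constants and the decorators that produce those exit codes are defined elsewhere in the test module and are not part of these excerpts. A minimal sketch of the pattern under that assumption, with an illustrative constant value and a hypothetical skip_if_no_cuda decorator (in the real harness the decorated test runs in a forked child, so sys.exit only terminates that child):

import sys
import unittest
from functools import wraps

import torch

# Illustrative value only; the real constant lives elsewhere in the module.
SKIP_IF_NO_CUDA_EXIT_CODE = 75


def skip_if_no_cuda(func):
    # Instead of failing inside the child process, exit with a code that
    # the parent process (see Example #2) translates into unittest.SkipTest.
    @wraps(func)
    def wrapper(*args, **kwargs):
        if not torch.cuda.is_available():
            sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE)
        return func(*args, **kwargs)
    return wrapper


class _ExampleTests(unittest.TestCase):
    @skip_if_no_cuda
    def test_needs_cuda(self):
        self.assertTrue(torch.cuda.is_available())
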