import sys

import torch.distributed.c10d as c10d  # assumed module path for the c10d bindings this excerpt tests

from common import TestCase

TCP_ADDR = '127.0.0.1'
TCP_PORT = 29500

TIMEOUT_DEFAULT = 5
TIMEOUT_OVERRIDE = {}


def get_timeout(test_id):
    return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT)


if not c10d.is_available():
    print('c10d not available, skipping tests')
    sys.exit(0)


class StoreTestBase(object):
    def _create_store(self, i):
        raise RuntimeError("not implemented")

    def _test_set_get(self, fs):
        fs.set("key0", "value0")
        fs.set("key1", "value1")
        fs.set("key2", "value2")
        self.assertEqual(b"value0", fs.get("key0"))
        self.assertEqual(b"value1", fs.get("key1"))
        self.assertEqual(b"value2", fs.get("key2"))
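# A minimal concrete fixture sketch, not part of the original excerpt:
# it assumes c10d exposes a FileStore constructed from a file path, and
# the class name FileStoreTest is illustrative.
import tempfile


class FileStoreTest(TestCase, StoreTestBase):
    def setUp(self):
        self.file = tempfile.NamedTemporaryFile(delete=False)

    def _create_store(self, i):
        # Every caller i opens the same backing file, so keys set through
        # one store instance are visible through the others.
        return c10d.FileStore(self.file.name)

    def test_set_get(self):
        self._test_set_get(self._create_store(0))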
# --- Separate excerpt: setup for the torch.distributed (THD) tests ---
import os
import sys
from functools import wraps

import torch
import torch.distributed as dist

# INIT_METHOD comes from the environment; the "env://" default is an
# assumption made to keep this excerpt self-contained.
INIT_METHOD = os.getenv("INIT_METHOD", "env://")

DEFAULT_TIMEOUT = 300
CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500}

if INIT_METHOD.startswith("file://"):
    FOLDER = INIT_METHOD[7:]


def get_timeout(test_id):
    test_name = test_id.split(".")[-1]
    if test_name in CUSTOMIZED_TIMEOUT:
        return CUSTOMIZED_TIMEOUT[test_name]
    else:
        return DEFAULT_TIMEOUT


if not dist.is_available():
    print("Distributed not available, skipping tests")
    sys.exit(0)

SKIP_IF_NO_CUDA_EXIT_CODE = 75
SKIP_IF_NO_GPU_EXIT_CODE = 76
SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77
SKIP_IF_BACKEND_UNAVAILABLE = 78


def skip_if_no_cuda_distributed(func):
    func.skip_if_no_cuda_distributed = True

    @wraps(func)
    def wrapper(*args, **kwargs):
        if not torch.cuda.is_available():
            # Exit with the dedicated code so the parent runner can
            # record a skip rather than a failure.
            sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE)
        return func(*args, **kwargs)

    return wrapper
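# A hedged sketch, not from the original file, of how a parent process
# might translate the skip exit codes above into reported skips; the
# helper name run_single_test and its reporting are assumptions.
import subprocess

SKIP_REASONS = {
    SKIP_IF_NO_CUDA_EXIT_CODE: "CUDA not available",
    SKIP_IF_NO_GPU_EXIT_CODE: "not enough GPUs",
    SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE: "world size too small",
    SKIP_IF_BACKEND_UNAVAILABLE: "backend unavailable",
}


def run_single_test(cmd):
    returncode = subprocess.call(cmd)
    if returncode in SKIP_REASONS:
        print("skipped: " + SKIP_REASONS[returncode])
    elif returncode != 0:
        raise RuntimeError("test failed with exit code %d" % returncode)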
# --- Separate excerpt: test runner configuration ---
# The module list is cut off above this point; "TESTS" is an assumed
# name for it, with earlier entries omitted.
TESTS = [
    'cpp_extensions',
    'distributed',
    'distributions',
    'multiprocessing',
    'nccl',
    'thd_distributed',
    'utils',
]

DISTRIBUTED_TESTS_CONFIG = {
    'gloo': {
        'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
    },
}

if c10d.is_available():
    if c10d.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {'WORLD_SIZE': '3'}
    if c10d.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
        }

THD_DISTRIBUTED_TESTS_CONFIG = {
    'tcp': {
        'WORLD_SIZE': '3'
    },
    'gloo': {
        'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
    },
    # THD NCCL and MPI tests are known to be flaky in CI
}
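# A minimal sketch, not shown in the excerpt, of how the WORLD_SIZE
# entries above could be exported into a spawned test's environment;
# launch_distributed_test and the BACKEND variable are assumptions.
import os
import subprocess


def launch_distributed_test(backend, test_file='test_distributed.py'):
    env = os.environ.copy()
    env.update(DISTRIBUTED_TESTS_CONFIG[backend])
    env['BACKEND'] = backend
    return subprocess.call(['python', test_file], env=env)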