def test_dist_package_related(self):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = f"{PYTORCH_COMM_PORT}"
    if dist.is_mpi_available():
        backend = 'mpi'
    elif dist.is_nccl_available():
        backend = 'nccl'
    elif dist.is_gloo_available():
        backend = 'gloo'
    else:
        raise NotImplementedError(
            "None of mpi/nccl/gloo torch backends installed.")
    # One machine, one GPU:
    world_size = 1
    rank = 0
    dist.init_process_group(
        backend, init_method=f'env://?world_size={world_size}&rank={rank}')
    self.assertTrue(dist.is_available())
    self.assertTrue(dist.is_initialized())
    self.assertEqual(dist.get_rank(), rank)
    self.assertEqual(dist.get_world_size(), world_size)
    dist.destroy_process_group()
    self.assertFalse(dist.is_initialized())
def get_backend():
    if dist.is_nccl_available():
        return 'nccl'
    if dist.is_mpi_available():
        return 'mpi'
    if dist.is_gloo_available():
        return 'gloo'
    raise ValueError('No backend found.')
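# A minimal usage sketch for get_backend() above (added illustration, not from
# the original source). Assumes a single process and the standard env://
# rendezvous; note that 'nccl' wins the probe order, so on a CPU-only machine
# you would typically want gloo instead.
import os
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(get_backend(), rank=0, world_size=1)
assert dist.get_world_size() == 1
dist.destroy_process_group()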
def init() -> None:
    if dist.is_mpi_available():
        backend = "mpi"
    elif (
        torch.cuda.is_available()
        and dist.is_nccl_available()
        and torch.cuda.device_count() > 0
    ):
        backend = "nccl"
    elif dist.is_gloo_available():
        backend = "gloo"
    else:
        # Without this branch, `backend` is unbound when no backend is
        # available and the call below raises UnboundLocalError.
        raise RuntimeError("No distributed backend (mpi/nccl/gloo) available.")
    dist.init_process_group(backend)
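# Hypothetical driver for init() above (added illustration, not from the
# original source): with the usual env:// variables set, each worker calls
# init() and can then run collectives. Shown for a one-rank CPU run where
# gloo would typically be selected.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
init()
t = torch.ones(1)
dist.all_reduce(t)  # sums across ranks; a no-op with a single rank
dist.destroy_process_group()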
def launch(
    world_size=-1,
    rank_start=0,
    rank_end=None,
    backend='gloo',
    method=None,
    timeout=None,
    store=None,
    group_name=None,
    target=None,
    args=(),
    kwargs=None,  # avoid a shared mutable default argument
):
    kwargs = kwargs if kwargs is not None else {}
    # check
    if not dist.is_available():
        raise Exception('Distributed is not available')
    if method is None or method == 'env://':
        address = os.environ.get('MASTER_ADDR', None)
        port = os.environ.get('MASTER_PORT', None)
        if address is None:
            raise Exception('MASTER_ADDR should be set in environment')
        if port is None:
            raise Exception('MASTER_PORT should be set in environment')
    if world_size < 0:
        # os.environ values are strings; convert before comparing
        world_size = int(os.environ.get('WORLD_SIZE', -1))
    if world_size < 0:
        raise Exception('Invalid world size {}'.format(world_size))
    rank_end = rank_end or world_size
    if rank_start >= rank_end:
        raise Exception('invalid rank range {}'.format((rank_start, rank_end)))
    if target is None:
        raise Exception('invalid target {}'.format(target))
    if backend == 'gloo':
        if not dist.is_gloo_available():
            raise Exception('backend gloo is not available')
    elif backend == 'nccl':
        if not dist.is_nccl_available():
            raise Exception('backend nccl is not available')
    elif backend == 'mpi':
        if not dist.is_mpi_available():
            raise Exception('backend mpi is not available')
    else:
        raise Exception('invalid backend {}'.format(backend))
    # launch one process per rank in [rank_start, rank_end)
    processes = []
    for rank in range(rank_start, rank_end):
        p = Process(
            target=_on_process_launch,
            args=(
                rank,
                world_size,
                backend,
                method,
                timeout,
                store,
                group_name,
                target,
                args,
                kwargs,
            )
        )
        p.start()
        processes.append(p)
    # join
    for p in processes:
        p.join()
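# A hedged usage sketch for launch() above (added illustration; _demo_target
# is hypothetical, while _on_process_launch and Process come from the
# snippet's own module). The call is left commented out because it spawns
# real processes.
def _demo_target():
    print("hello from rank %d of %d" % (dist.get_rank(), dist.get_world_size()))

# os.environ['MASTER_ADDR'] = 'localhost'
# os.environ['MASTER_PORT'] = '29500'
# launch(world_size=2, backend='gloo', target=_demo_target)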
def init_multiprocessing(self, rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = f"{PYTORCH_COMM_PORT}"
    if dist.is_mpi_available():
        backend = 'mpi'
    elif dist.is_nccl_available():
        backend = 'nccl'
    elif dist.is_gloo_available():
        backend = 'gloo'
    else:
        raise NotImplementedError(
            "None of mpi/nccl/gloo torch backends installed.")
    dist.init_process_group(
        backend, init_method=f'env://?world_size={world_size}&rank={rank}')
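# Hypothetical companion to init_multiprocessing() above (added illustration,
# not from the original source): torch.multiprocessing.spawn passes the
# process index as the first argument, which matches the (rank, world_size)
# signature; `helper` stands for whatever object defines the method.
import torch.multiprocessing as mp

def _spawn_worker(rank, helper, world_size):
    helper.init_multiprocessing(rank, world_size)
    dist.barrier()  # simple sanity check that the group is usable
    dist.destroy_process_group()

# mp.spawn(_spawn_worker, args=(helper, 2), nprocs=2)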
def test__native_dist_model():
    available_backends = _NativeDistModel.available_backends

    if dist.is_nccl_available():
        assert "nccl" in available_backends
    else:
        assert "nccl" not in available_backends

    if dist.is_gloo_available():
        assert "gloo" in available_backends
    else:
        assert "gloo" not in available_backends

    if dist.is_mpi_available():
        assert "mpi" in available_backends
    else:
        assert "mpi" not in available_backends

    with pytest.raises(ValueError, match=r"Backend should be one of"):
        _NativeDistModel.create_from_backend("abc")
def requires_mpi():
    return unittest.skipUnless(
        c10d.is_mpi_available(),
        "c10d was not compiled with the MPI backend",
    )
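# Hypothetical usage of the requires_mpi() guard above (added illustration,
# not from the original source): applied as a decorator, the test is skipped
# when PyTorch was built without the MPI backend.
import unittest

class MpiSmokeTest(unittest.TestCase):
    @requires_mpi()
    def test_mpi_backend_available(self):
        self.assertTrue(c10d.is_mpi_available())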
    'distributed',
    'distributions',
    'multiprocessing',
    'nccl',
    'thd_distributed',
    'utils',
]

DISTRIBUTED_TESTS_CONFIG = {
    'gloo': {
        'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
    },
}

if dist.is_available():
    if dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {'WORLD_SIZE': '3'}
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
        }

THD_DISTRIBUTED_TESTS_CONFIG = {
    'tcp': {
        'WORLD_SIZE': '3'
    },
    'gloo': {
        'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
    },
    # THD NCCL and MPI tests are known to be flaky in CI
}
    'distributed/pipeline/sync/test_worker',
]

# the JSON file to store the S3 test stats
TEST_TIMES_FILE = '.pytorch-test-times.json'

# if a test file takes longer than 5 min, we add it to TARGET_DET_LIST
SLOW_TEST_THRESHOLD = 300

_DEP_MODULES_CACHE: Dict[str, set] = {}

DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG['test'] = {'WORLD_SIZE': '1'}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
            'WORLD_SIZE': '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
        }
    if dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
        }
def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
    global myreq
    global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported

    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int(
        ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
    )
    if backend == "" and num_mpi_ranks > 1:
        if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
            backend = "ccl"
        elif use_gpu and dist.is_nccl_available():
            backend = "nccl"
        elif dist.is_mpi_available():
            backend = "mpi"
        else:
            print(
                "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
            )
            backend = "gloo"

    if backend != "":
        # guess rank and size
        if rank == -1:
            rank = env2int(
                ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
            )
        if size == -1:
            size = env2int(
                ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"],
                1,
            )
        if not os.environ.get("RANK", None) and rank != -1:
            os.environ["RANK"] = str(rank)
        if not os.environ.get("WORLD_SIZE", None) and size != -1:
            os.environ["WORLD_SIZE"] = str(size)
        if not os.environ.get("MASTER_PORT", None):
            os.environ["MASTER_PORT"] = "29500"
        if not os.environ.get("MASTER_ADDR", None):
            local_size = env2int(
                ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
                1,
            )
            if local_size != size and backend != "mpi":
                print(
                    "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
                )
                print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
            os.environ["MASTER_ADDR"] = "127.0.0.1"

    if size > 1:
        if local_rank == -1:
            my_local_rank = env2int(
                ["MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK", "LOCAL_RANK"],
                0,
            )
        else:
            my_local_rank = local_rank
        my_local_size = env2int(
            ["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"],
            1,
        )
        if use_gpu:
            if my_local_size > torch.cuda.device_count():
                print(
                    "Not sufficient GPUs available... local_size = %d, ngpus = %d"
                    % (my_local_size, torch.cuda.device_count())
                )
                sys.exit(1)
            torch.cuda.set_device(my_local_rank)
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        if my_rank == 0:
            print("Running on %d ranks using %s backend" % (my_size, backend))
        if hasattr(dist, "all_to_all_single"):
            try:
                # probe whether the backend actually implements all_to_all_single
                t = torch.zeros([4])
                if use_gpu:
                    t = t.cuda()
                dist.all_to_all_single(t, t)
                alltoall_supported = True
            except RuntimeError as err:
                print("fail to enable all_to_all_single primitive: %s" % err)
        if a2a_impl == "alltoall" and not alltoall_supported:
            print(
                "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
                % (a2a_impl, backend)
            )
            a2a_impl = "scatter"
        if a2a_impl != "":
            print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
    else:
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    print_all(
        "world size: %d, current rank: %d, local rank: %d"
        % (my_size, my_rank, my_local_rank)
    )
    myreq = Request()
def requires_mpi():
    return sandcastle_skip_if(
        not c10d.is_mpi_available(),
        "c10d was not compiled with the MPI backend",
    )
def init_distributed(rank=-1, size=-1, backend=''):
    global myreq
    # global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported
    global allgatherv_supported

    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'])
    if backend == '' and num_mpi_ranks > 1:
        if torch_ccl and env2int(['CCL_WORKER_COUNT']) > 0:
            backend = 'ccl'
        elif dist.is_mpi_available():
            backend = 'mpi'
        else:
            print("WARNING: MPI multi-process launch detected but PyTorch MPI backend not available.")
            backend = 'gloo'

    if backend != '':
        # guess rank and size
        if rank == -1:
            rank = env2int(['PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'MV2_COMM_WORLD_RANK', 'RANK'], 0)
        if size == -1:
            size = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'], 1)
        if not os.environ.get('RANK', None) and rank != -1:
            os.environ['RANK'] = str(rank)
        if not os.environ.get('WORLD_SIZE', None) and size != -1:
            os.environ['WORLD_SIZE'] = str(size)
        if not os.environ.get('MASTER_PORT', None):
            os.environ['MASTER_PORT'] = '29500'
        if not os.environ.get('MASTER_ADDR', None):
            local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1)
            if local_size != size and backend != 'mpi':
                print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default")
                print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
            os.environ['MASTER_ADDR'] = '127.0.0.1'

    if size > 1:
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        my_local_rank = env2int(['MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'MV2_COMM_WORLD_LOCAL_RANK'], 0)
        my_local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1)
        if my_rank == 0:
            print("Running on %d ranks using %s backend" % (my_size, backend))
            if backend == 'ccl':
                print("Using CCL_ATL_TRANSPORT=%s" % os.environ.get('CCL_ATL_TRANSPORT', '(default)'))
                print("Using CCL_ATL_SHM=%s" % os.environ.get('CCL_ATL_SHM', '(default)'))
        if hasattr(dist, 'all_to_all_single'):
            try:
                # NOTE: the probe call below is commented out in the original,
                # so all_to_all support is assumed whenever the attribute exists
                # dist.all_to_all_single(torch.empty([0]), torch.empty([0]))
                alltoall_supported = True
            except RuntimeError:
                pass
        if a2a_impl == 'alltoall' and not alltoall_supported:
            print("Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" % (a2a_impl, backend))
            a2a_impl = 'scatter'
        if a2a_impl != '':
            print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
        try:
            # probe all_gather with unequal per-rank tensor sizes (allgatherv)
            x = torch.ones([my_rank])
            y = torch.zeros([(my_size * (my_size - 1)) // 2])
            y = list(y.split([r for r in range(my_size)]))
            dist.all_gather(y, x)
            allgatherv_supported = True
        except RuntimeError:
            pass
    else:
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    myreq = Request()
def __init__(self, apex_config: Config, logger: Optional[OrderedDictLogger]) -> None:
    # region conf vars
    self._enabled = apex_config['enabled']  # global switch to disable anything apex
    self._distributed_enabled = apex_config['distributed_enabled']  # enable/disable distributed mode
    self._mixed_prec_enabled = apex_config['mixed_prec_enabled']  # enable/disable mixed precision
    self._opt_level = apex_config['opt_level']  # optimization level for mixed precision
    self._bn_fp32 = apex_config['bn_fp32']  # keep BN in fp32
    self._loss_scale = apex_config['loss_scale']  # loss scaling mode for mixed prec
    self._sync_bn = apex_config['sync_bn']  # whether to replace BNs with sync BNs for distributed model
    self._scale_lr = apex_config['scale_lr']  # enable/disable lr scaling for distributed mode
    self._min_world_size = apex_config['min_world_size']  # allows to confirm we are indeed in distributed setting
    seed = apex_config['seed']
    detect_anomaly = apex_config['detect_anomaly']
    conf_gpu_ids = apex_config['gpus']

    conf_ray = apex_config['ray']
    self.ray_enabled = conf_ray['enabled']
    self.ray_local_mode = conf_ray['local_mode']
    # endregion

    # to avoid circular references with common, logger is passed from outside
    self.logger = logger

    # defaults for non-distributed mode
    self._amp, self._ddp = None, None
    self._set_ranks(conf_gpu_ids)

    # _log_info({'apex_config': apex_config.to_dict()})
    self._log_info({'ray.enabled': self.is_ray(), 'apex.enabled': self._enabled})
    self._log_info({'torch.distributed.is_available': dist.is_available(),
                    'apex.distributed_enabled': self._distributed_enabled,
                    'apex.mixed_prec_enabled': self._mixed_prec_enabled})

    if dist.is_available():
        # dist.* properties are otherwise not accessible
        # 'mean' maps to SUM; presumably the caller divides by world size after the reduce
        self._op_map = {'mean': dist.ReduceOp.SUM, 'sum': dist.ReduceOp.SUM,
                        'min': dist.ReduceOp.MIN, 'max': dist.ReduceOp.MAX}
        self._log_info({'gloo_available': dist.is_gloo_available(),
                        'mpi_available': dist.is_mpi_available(),
                        'nccl_available': dist.is_nccl_available()})

    if self.is_mixed():
        # initialize mixed precision
        assert cudnn.enabled, "Amp requires cudnn backend to be enabled."
        from apex import amp
        self._amp = amp

    # enable distributed processing
    if self.is_dist():
        assert not self.is_ray(), "Ray is not yet enabled for Apex distributed mode"

        from apex import parallel
        self._ddp = parallel

        assert dist.is_available()  # distributed module is available
        assert dist.is_nccl_available()
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl', init_method='env://')
        assert dist.is_initialized()
        assert dist.get_world_size() == self.world_size
        assert dist.get_rank() == self.global_rank

    if self.is_ray():
        assert not self.is_dist(), "Ray is not yet enabled for Apex distributed mode"

        import ray
        if not ray.is_initialized():
            ray.init(local_mode=self.ray_local_mode,
                     include_dashboard=False,
                     # for some reason Ray is detecting wrong number of GPUs
                     num_gpus=torch.cuda.device_count())
        ray_cpus = ray.nodes()[0]['Resources']['CPU']
        ray_gpus = ray.nodes()[0]['Resources']['GPU']
        self._log_info({'ray_cpus': ray_cpus, 'ray_gpus': ray_gpus})

    assert self.world_size >= 1
    assert not self._min_world_size or self.world_size >= self._min_world_size
    assert self.local_rank >= 0 and self.local_rank < self.world_size
    assert self.global_rank >= 0 and self.global_rank < self.world_size
    assert self._gpu < torch.cuda.device_count()

    torch.cuda.set_device(self._gpu)
    self.device = torch.device('cuda', self._gpu)
    self._setup_gpus(seed, detect_anomaly)

    self._log_info({'amp_available': self._amp is not None,
                    'distributed_available': self._ddp is not None})
    self._log_info({'dist_initialized': dist.is_initialized() if dist.is_available() else False,
                    'world_size': self.world_size,
                    'gpu': self._gpu, 'gpu_ids': self.gpu_ids,
                    'local_rank': self.local_rank,
                    'global_rank': self.global_rank})