DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG["mpi"] = {
            "WORLD_SIZE": "3",
            "TEST_REPORT_SOURCE_OVERRIDE": "dist-mpi",
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG["nccl"] = {
            "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3",
            "TEST_REPORT_SOURCE_OVERRIDE": "dist-nccl",
        }
    if dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG["gloo"] = {
            "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3",
            "TEST_REPORT_SOURCE_OVERRIDE": "dist-gloo",
        }

# https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python
SIGNALS_TO_NAMES_DICT = {
    getattr(signal, n): n for n in dir(signal) if n.startswith("SIG") and "_" not in n
}

CPP_EXTENSIONS_ERROR = """
Ninja (https://ninja-build.org) is required for some of the C++ extensions
tests, but it could not be found. Install ninja with `pip install ninja`
or `conda install ninja`. Alternatively, disable said tests with
`run_test.py --exclude test_cpp_extensions_aot_ninja test_cpp_extensions_jit`.
"""
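# Hedged illustration (not part of the snippet above): one plausible way a backend entry
# from DISTRIBUTED_TESTS_CONFIG could be consumed is to export its key/value pairs into the
# environment of a subprocess that runs the distributed test. The helper name
# `run_distributed_tests` and the `BACKEND` variable are assumptions for this sketch only.
import os
import subprocess
import sys

def run_distributed_tests(test_module: str, config=None) -> None:
    # Launch `test_module` once per configured backend, passing WORLD_SIZE and any
    # TEST_REPORT_SOURCE_OVERRIDE through environment variables.
    config = config if config is not None else DISTRIBUTED_TESTS_CONFIG
    for backend, env_vars in config.items():
        env = os.environ.copy()
        env["BACKEND"] = backend
        env.update(env_vars)
        subprocess.check_call([sys.executable, test_module], env=env)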
def requires_gloo():
    return unittest.skipUnless(
        c10d.is_gloo_available(),
        "c10d was not compiled with the Gloo backend",
    )
def requires_gloo():
    return sandcastle_skip_if(
        not c10d.is_gloo_available(),
        "c10d was not compiled with the Gloo backend",
    )
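# Both requires_gloo() variants are applied the same way: as a decorator that skips a test
# when c10d was built without Gloo support. The test class and body below are illustrative
# only, not taken from the original test suite.
import unittest

class GlooAvailabilityTest(unittest.TestCase):
    @requires_gloo()
    def test_gloo_is_available(self):
        # Runs only when c10d was compiled with the Gloo backend; skipped otherwise.
        self.assertTrue(c10d.is_gloo_available())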
DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG['test'] = {'WORLD_SIZE': '1'}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
            'WORLD_SIZE': '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
        }
    if not TEST_WITH_ROCM and dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
        }

# https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python
SIGNALS_TO_NAMES_DICT = {
    getattr(signal, n): n for n in dir(signal)
    if n.startswith('SIG') and '_' not in n
}

CPP_EXTENSIONS_ERROR = """
Ninja (https://ninja-build.org) is required for some of the C++ extensions
tests, but it could not be found. Install ninja with `pip install ninja`
or `conda install ninja`. Alternatively, disable said tests with
`run_test.py --exclude test_cpp_extensions_aot_ninja test_cpp_extensions_jit`.
"""
def __init__(self, apex_config: Config, logger: Optional[OrderedDictLogger]) -> None:
    # region conf vars
    self._enabled = apex_config['enabled']  # global switch to disable anything apex
    self._distributed_enabled = apex_config['distributed_enabled']  # enable/disable distributed mode
    self._mixed_prec_enabled = apex_config['mixed_prec_enabled']  # enable/disable mixed precision mode
    self._opt_level = apex_config['opt_level']  # optimization level for mixed precision
    self._bn_fp32 = apex_config['bn_fp32']  # keep BN in fp32
    self._loss_scale = apex_config['loss_scale']  # loss scaling mode for mixed prec
    self._sync_bn = apex_config['sync_bn']  # should we replace BNs with sync BNs for distributed model
    self._scale_lr = apex_config['scale_lr']  # enable/disable scaling of learning rate in distributed mode
    self._min_world_size = apex_config['min_world_size']  # allows to confirm we are indeed in distributed setting
    seed = apex_config['seed']
    detect_anomaly = apex_config['detect_anomaly']
    conf_gpu_ids = apex_config['gpus']
    # endregion

    # to avoid circular references with common, logger is passed from outside
    self.logger = logger

    # defaults for non-distributed mode
    self._amp, self._ddp = None, None

    self._set_ranks(conf_gpu_ids)

    #_log_info({'apex_config': apex_config.to_dict()})
    self._log_info({'torch.distributed.is_available': dist.is_available(),
                    'apex.enabled': self._enabled,
                    'apex.distributed_enabled': self._distributed_enabled,
                    'apex.mixed_prec_enabled': self._mixed_prec_enabled})

    if dist.is_available():
        # dist.* properties are otherwise not accessible
        self._op_map = {'mean': dist.ReduceOp.SUM, 'sum': dist.ReduceOp.SUM,
                        'min': dist.ReduceOp.MIN, 'max': dist.ReduceOp.MAX}
        self._log_info({'gloo_available': dist.is_gloo_available(),
                        'mpi_available': dist.is_mpi_available(),
                        'nccl_available': dist.is_nccl_available()})

    if self.is_mixed():
        # enable mixed precision
        assert cudnn.enabled, "Amp requires cudnn backend to be enabled."
        from apex import amp
        self._amp = amp

    # enable distributed processing
    if self.is_dist():
        from apex import parallel
        self._ddp = parallel

        assert dist.is_available()  # distributed module is available
        assert dist.is_nccl_available()
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl', init_method='env://')
        assert dist.is_initialized()
        assert dist.get_world_size() == self.world_size
        assert dist.get_rank() == self.global_rank

    assert self.world_size >= 1
    assert not self._min_world_size or self.world_size >= self._min_world_size
    assert self.local_rank >= 0 and self.local_rank < self.world_size
    assert self.global_rank >= 0 and self.global_rank < self.world_size

    assert self._gpu < torch.cuda.device_count()
    torch.cuda.set_device(self._gpu)
    self.device = torch.device('cuda', self._gpu)

    self._setup_gpus(seed, detect_anomaly)

    self._log_info({'amp_available': self._amp is not None,
                    'distributed_available': self._ddp is not None})
    self._log_info({'dist_initialized': dist.is_initialized() if dist.is_available() else False,
                    'world_size': self.world_size,
                    'gpu': self._gpu, 'gpu_ids': self.gpu_ids,
                    'local_rank': self.local_rank,
                    'global_rank': self.global_rank})
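# Hedged sketch (not from the original class): the _op_map built above implies a companion
# scalar reduction helper. Assuming the class exposes is_dist(), self.device and
# self.world_size as used in __init__, such a method could look like the one below;
# 'mean' maps to ReduceOp.SUM because the summed value is divided by world_size afterwards.
def reduce(self, val, op='mean'):
    if self.is_dist():
        rt = torch.tensor(val, dtype=torch.float32, device=self.device)
        dist.all_reduce(rt, op=self._op_map[op])
        if op == 'mean':
            rt /= self.world_size
        return rt.item()
    return val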