Example #1
# Imports this snippet relies on (assumed from the surrounding test harness):
import signal

import torch
import torch.distributed as dist
from torch.testing._internal.common_utils import TEST_WITH_ROCM

DISTRIBUTED_TESTS_CONFIG = {}


if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG["mpi"] = {
            "WORLD_SIZE": "3",
            "TEST_REPORT_SOURCE_OVERRIDE": "dist-mpi",
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG["nccl"] = {
            "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3",
            "TEST_REPORT_SOURCE_OVERRIDE": "dist-nccl",
        }
    if dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG["gloo"] = {
            "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3",
            "TEST_REPORT_SOURCE_OVERRIDE": "dist-gloo",
        }

# https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python
SIGNALS_TO_NAMES_DICT = {
    getattr(signal, n): n for n in dir(signal) if n.startswith("SIG") and "_" not in n
}

CPP_EXTENSIONS_ERROR = """
Ninja (https://ninja-build.org) is required for some of the C++ extensions
tests, but it could not be found. Install ninja with `pip install ninja`
or `conda install ninja`. Alternatively, disable said tests with
`run_test.py --exclude test_cpp_extensions_aot_ninja test_cpp_extensions_jit`.
"""
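For context, here is a hedged sketch of how a test harness might consume these tables: export a backend's entries as environment variables before launching a distributed test file, and translate a negative child return code into a readable signal name via SIGNALS_TO_NAMES_DICT. The helper function and the test file name are illustrative, not taken from run_test.py.

import os
import subprocess
import sys

def run_distributed_configs(test_file="distributed/test_distributed_spawn.py"):
    for backend, extra_env in DISTRIBUTED_TESTS_CONFIG.items():
        env = os.environ.copy()
        env["BACKEND"] = backend
        env.update(extra_env)  # WORLD_SIZE, TEST_REPORT_SOURCE_OVERRIDE, ...
        proc = subprocess.run([sys.executable, test_file], env=env)
        if proc.returncode < 0:
            # a negative return code means the child was killed by a signal
            name = SIGNALS_TO_NAMES_DICT.get(-proc.returncode, "UNKNOWN")
            raise RuntimeError(f"{test_file} ({backend}) terminated by {name}")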
Example #2
# Imports assumed by this snippet:
import unittest

import torch.distributed as c10d


def requires_gloo():
    return unittest.skipUnless(
        c10d.is_gloo_available(),
        "c10d was not compiled with the Gloo backend",
    )
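A decorator like this is applied per test method. A minimal usage sketch building on the definitions above; the class and test names are illustrative, not from the source:

class ProcessGroupGlooTest(unittest.TestCase):

    @requires_gloo()
    def test_gloo_backend_available(self):
        # Runs only when c10d was compiled with Gloo; otherwise unittest
        # reports the test as skipped with the message above.
        self.assertTrue(c10d.is_gloo_available())

if __name__ == "__main__":
    unittest.main()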
Example #3
# Imports assumed by this snippet:
import torch.distributed as c10d
from torch.testing._internal.common_utils import sandcastle_skip_if


def requires_gloo():
    return sandcastle_skip_if(
        not c10d.is_gloo_available(),
        "c10d was not compiled with the Gloo backend",
    )
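This variant differs from Example #2 only in how an unavailable backend is reported: unittest.skipUnless records an ordinary skip, whereas sandcastle_skip_if is a PyTorch test helper that, on Meta's internal Sandcastle CI, passes the test as a no-op instead of emitting a skip result. The sketch below is a conceptual illustration of that idea only, not the torch.testing implementation; the environment check is an assumption.

import functools
import os
import unittest

def skip_if_sketch(condition, reason):
    # Conceptual stand-in for sandcastle_skip_if: outside Sandcastle it
    # behaves like unittest.skipIf; inside Sandcastle it turns the test
    # body into a no-op pass rather than reporting a skip.
    if not condition:
        return lambda fn: fn
    if os.getenv("SANDCASTLE"):  # assumed detection flag, illustrative
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                print(f"Skipping {fn.__name__}: {reason}")
            return wrapper
        return decorator
    return unittest.skip(reason)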
Example #4
DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG['test'] = {'WORLD_SIZE': '1'}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
            'WORLD_SIZE': '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
        }
    if not TEST_WITH_ROCM and dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
        }

# https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python
SIGNALS_TO_NAMES_DICT = {
    getattr(signal, n): n
    for n in dir(signal) if n.startswith('SIG') and '_' not in n
}

CPP_EXTENSIONS_ERROR = """
Ninja (https://ninja-build.org) is required for some of the C++ extensions
tests, but it could not be found. Install ninja with `pip install ninja`
or `conda install ninja`. Alternatively, disable said tests with
`run_test.py --exclude test_cpp_extensions_aot_ninja test_cpp_extensions_jit`.
"""
Example #5
    def __init__(self, apex_config: Config,
                 logger: Optional[OrderedDictLogger]) -> None:
        # region conf vars
        self._enabled = apex_config['enabled']  # global switch to disable anything apex
        self._distributed_enabled = apex_config['distributed_enabled']  # enable/disable distributed mode
        self._mixed_prec_enabled = apex_config['mixed_prec_enabled']  # enable/disable mixed precision
        self._opt_level = apex_config['opt_level']  # optimization level for mixed precision
        self._bn_fp32 = apex_config['bn_fp32']  # keep BN in fp32
        self._loss_scale = apex_config['loss_scale']  # loss scaling mode for mixed precision
        self._sync_bn = apex_config['sync_bn']  # whether to replace BNs with sync BNs for the distributed model
        self._scale_lr = apex_config['scale_lr']  # whether to scale the learning rate for distributed training
        self._min_world_size = apex_config['min_world_size']  # minimum world size, to confirm we are indeed in a distributed setting
        seed = apex_config['seed']
        detect_anomaly = apex_config['detect_anomaly']
        conf_gpu_ids = apex_config['gpus']
        # endregion

        # to avoid circular references with common, logger is passed from outside
        self.logger = logger

        # defaults for non-distributed mode
        self._amp, self._ddp = None, None
        self._set_ranks(conf_gpu_ids)

        #_log_info({'apex_config': apex_config.to_dict()})
        self._log_info({
            'torch.distributed.is_available': dist.is_available(),
            'apex.enabled': self._enabled,
            'apex.distributed_enabled': self._distributed_enabled,
            'apex.mixed_prec_enabled': self._mixed_prec_enabled
        })

        if dist.is_available():
            # dist.* properties are otherwise not accessible
            self._op_map = {
                'mean': dist.ReduceOp.SUM,
                'sum': dist.ReduceOp.SUM,
                'min': dist.ReduceOp.MIN,
                'max': dist.ReduceOp.MAX
            }
            self._log_info({
                'gloo_available': dist.is_gloo_available(),
                'mpi_available': dist.is_mpi_available(),
                'nccl_available': dist.is_nccl_available()
            })

        if self.is_mixed():
            # initialize mixed precision (Apex AMP)
            assert cudnn.enabled, "Amp requires cudnn backend to be enabled."
            from apex import amp
            self._amp = amp

        # enable distributed processing
        if self.is_dist():
            from apex import parallel
            self._ddp = parallel

            assert dist.is_available()  # distributed module is available
            assert dist.is_nccl_available()
            if not dist.is_initialized():
                dist.init_process_group(backend='nccl', init_method='env://')
                assert dist.is_initialized()
            assert dist.get_world_size() == self.world_size
            assert dist.get_rank() == self.global_rank

        assert self.world_size >= 1
        assert not self._min_world_size or self.world_size >= self._min_world_size
        assert self.local_rank >= 0 and self.local_rank < self.world_size
        assert self.global_rank >= 0 and self.global_rank < self.world_size

        assert self._gpu < torch.cuda.device_count()
        torch.cuda.set_device(self._gpu)
        self.device = torch.device('cuda', self._gpu)
        self._setup_gpus(seed, detect_anomaly)

        self._log_info({
            'amp_available': self._amp is not None,
            'distributed_available': self._ddp is not None
        })
        self._log_info({
            'dist_initialized': dist.is_initialized() if dist.is_available() else False,
            'world_size': self.world_size,
            'gpu': self._gpu,
            'gpu_ids': self.gpu_ids,
            'local_rank': self.local_rank,
            'global_rank': self.global_rank
        })
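To show how the _op_map built in this constructor is typically consumed, and why 'mean' maps to dist.ReduceOp.SUM, here is a hedged sketch of a reduction helper; the method is illustrative and not part of the quoted class.

    def reduce_tensor(self, t: torch.Tensor, op: str = 'mean') -> torch.Tensor:
        # Illustrative helper, not from the source: all-reduce a tensor
        # across ranks using the cached ReduceOp. 'mean' is implemented as
        # SUM followed by division by the world size.
        if self._ddp is None:  # non-distributed mode: nothing to reduce
            return t
        rt = t.clone()
        dist.all_reduce(rt, op=self._op_map[op])
        if op == 'mean':
            rt /= self.world_size
        return rt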