Example #1
    def test_dist_package_related(self):

        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = f"{PYTORCH_COMM_PORT}"

        if dist.is_mpi_available():
            backend = 'mpi'
        elif dist.is_nccl_available():
            backend = 'nccl'
        elif dist.is_gloo_available():
            backend = 'gloo'
        else:
            raise NotImplementedError(
                "None of mpi/nccl/gloo torch backends installed.")

        # One machine, one GPU:
        world_size = 1
        rank = 0
        dist.init_process_group(
            backend, init_method=f'env://?world_size={world_size}&rank={rank}')
        self.assertTrue(dist.is_available())
        self.assertTrue(dist.is_initialized())
        self.assertEqual(dist.get_rank(), rank)
        self.assertEqual(dist.get_world_size(), world_size)

        dist.destroy_process_group()
        self.assertFalse(dist.is_initialized())
Example #2
def get_backend():
    if dist.is_nccl_available():
        return 'nccl'
    if dist.is_mpi_available():
        return 'mpi'
    if dist.is_gloo_available():
        return 'gloo'

    raise ValueError('No backend found.')
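A minimal usage sketch, assuming get_backend() is defined as above (the address/port defaults below are assumptions for a local single-process run); the selected backend is passed straight to init_process_group:

import os
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')

# single-process group using whichever backend get_backend() selected
dist.init_process_group(get_backend(), init_method='env://', world_size=1, rank=0)
# ... collective work would go here ...
dist.destroy_process_group()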
Example #3
def init() -> None:
    if dist.is_mpi_available():
        backend = "mpi"
    elif (
        torch.cuda.is_available()
        and dist.is_nccl_available()
        and torch.cuda.device_count() > 0
    ):
        backend = "nccl"
    elif dist.is_gloo_available():
        backend = "gloo"
    else:
        # guard against falling through with `backend` undefined
        raise RuntimeError("None of the mpi/nccl/gloo torch backends is available.")
    dist.init_process_group(backend)
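Because init() relies on the default env:// rendezvous, it expects MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE to be provided by a launcher such as torchrun. A minimal single-process sketch (the environment defaults below are assumptions for local testing):

import os

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')

init()  # picks a backend and calls dist.init_process_group(backend)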
Example #4
def launch(
        world_size=-1,
        rank_start=0,
        rank_end=None,
        backend='gloo',
        method=None,
        timeout=None,
        store=None,
        group_name=None,
        target=None,
        args=(),
        kwargs=None,
):
    # check
    kwargs = kwargs or {}  # avoid a shared mutable default argument
    if not dist.is_available(): raise Exception('Distributed is not available')
    if method is None or method == 'env://':
        address, port = os.environ.get('MASTER_ADDR', None), os.environ.get('MASTER_PORT', None)
        if address is None: raise Exception('MASTER_ADDR should be set in environment')
        if port is None: raise Exception('MASTER_PORT should be set in environment')

    if world_size < 0: world_size = int(os.environ.get('WORLD_SIZE', -1))  # env values are strings
    if world_size < 0: raise Exception('Invalid world size {}'.format(world_size))
    rank_end = rank_end or world_size
    if rank_start >= rank_end: raise Exception('invalid rank range {}'.format((rank_start, rank_end)))
    if target is None: raise Exception('invalid target {}'.format(target))
    if backend == 'gloo':
        if not dist.is_gloo_available(): raise Exception('backend gloo is not available')
    elif backend == 'nccl':
        if not dist.is_nccl_available(): raise Exception('backend nccl is not available')
    elif backend == 'mpi':
        if not dist.is_mpi_available(): raise Exception('backend mpi is not available')
    else:
        raise Exception('invalid backend {}'.format(backend))


    # launch process
    processes = []
    for rank in range(rank_start, rank_end, 1):
        p = Process(
            target=_on_process_launch,
            args=(
                rank, world_size, backend, method, timeout, store, group_name,
                target, args, kwargs,
            )
        )
        p.start()
        processes.append(p)

    # join
    for p in processes:
        p.join()
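A hedged usage sketch for launch(); the worker function is hypothetical, and how _on_process_launch (not shown) forwards args to the target is an assumption:

import os

def worker(message):  # hypothetical per-rank target
    print('worker received:', message)

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'
launch(world_size=2, backend='gloo', target=worker, args=('hello',))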
Example #5
    def init_multiprocessing(self, rank, world_size):

        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = f"{PYTORCH_COMM_PORT}"

        if dist.is_mpi_available():
            backend = 'mpi'
        elif dist.is_nccl_available():
            backend = 'nccl'
        elif dist.is_gloo_available():
            backend = 'gloo'
        else:
            raise NotImplementedError(
                "None of mpi/nccl/gloo torch backends installed.")

        dist.init_process_group(
            backend, init_method=f'env://?world_size={world_size}&rank={rank}')
Example #6
def test__native_dist_model():
    available_backends = _NativeDistModel.available_backends

    if dist.is_nccl_available():
        assert "nccl" in available_backends
    else:
        assert "nccl" not in available_backends

    if dist.is_gloo_available():
        assert "gloo" in available_backends
    else:
        assert "gloo" not in available_backends

    if dist.is_mpi_available():
        assert "mpi" in available_backends
    else:
        assert "mpi" not in available_backends
Example #7
def test__native_dist_model():
    available_backends = _NativeDistModel.available_backends

    if dist.is_nccl_available():
        assert "nccl" in available_backends
    else:
        assert "nccl" not in available_backends

    if dist.is_gloo_available():
        assert "gloo" in available_backends
    else:
        assert "gloo" not in available_backends

    if dist.is_mpi_available():
        assert "mpi" in available_backends
    else:
        assert "mpi" not in available_backends

    with pytest.raises(ValueError, match=r"Backend should be one of"):
        _NativeDistModel.create_from_backend("abc")
Example #8
def requires_mpi():
    return unittest.skipUnless(
        c10d.is_mpi_available(),
        "c10d was not compiled with the MPI backend",
    )
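requires_mpi() returns a standard unittest skip decorator, so it is applied directly to tests; a brief sketch with a hypothetical test class and method:

import unittest

class ProcessGroupMPITest(unittest.TestCase):  # hypothetical test class
    @requires_mpi()
    def test_broadcast(self):
        # body runs only when c10d was compiled with the MPI backend
        self.assertTrue(c10d.is_mpi_available())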
Example #9
    'distributed',
    'distributions',
    'multiprocessing',
    'nccl',
    'thd_distributed',
    'utils',
]

DISTRIBUTED_TESTS_CONFIG = {
    'gloo': {
        'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
    },
}

if dist.is_available():
    if dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {'WORLD_SIZE': '3'}
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
        }

THD_DISTRIBUTED_TESTS_CONFIG = {
    'tcp': {
        'WORLD_SIZE': '3'
    },
    'gloo': {
        'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3'
    },
    # THD NCCL and MPI tests are known to be flaky in CI
}
Example #10
    'distributed/pipeline/sync/test_worker',
]

# the JSON file to store the S3 test stats
TEST_TIMES_FILE = '.pytorch-test-times.json'

# if a test file takes longer than 5 min, we add it to TARGET_DET_LIST
SLOW_TEST_THRESHOLD = 300

_DEP_MODULES_CACHE: Dict[str, set] = {}

DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG['test'] = {'WORLD_SIZE': '1'}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
            'WORLD_SIZE': '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
        }
    if dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
        }
Example #11
def init_distributed(rank=-1,
                     local_rank=-1,
                     size=-1,
                     use_gpu=False,
                     backend=""):
    global myreq
    global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported

    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int([
        "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"
    ])
    if backend == "" and num_mpi_ranks > 1:
        if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
            backend = "ccl"
        elif use_gpu and dist.is_nccl_available():
            backend = "nccl"
        elif dist.is_mpi_available():
            backend = "mpi"
        else:
            print(
                "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
            )
            backend = "gloo"

    if backend != "":
        # guess Rank and size
        if rank == -1:
            rank = env2int([
                "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK",
                "RANK"
            ], 0)
        if size == -1:
            size = env2int(
                [
                    "PMI_SIZE",
                    "OMPI_COMM_WORLD_SIZE",
                    "MV2_COMM_WORLD_SIZE",
                    "WORLD_SIZE",
                ],
                1,
            )
        if not os.environ.get("RANK", None) and rank != -1:
            os.environ["RANK"] = str(rank)
        if not os.environ.get("WORLD_SIZE", None) and size != -1:
            os.environ["WORLD_SIZE"] = str(size)
        if not os.environ.get("MASTER_PORT", None):
            os.environ["MASTER_PORT"] = "29500"
        if not os.environ.get("MASTER_ADDR", None):
            local_size = env2int(
                [
                    "MPI_LOCALNRANKS",
                    "OMPI_COMM_WORLD_LOCAL_SIZE",
                    "MV2_COMM_WORLD_LOCAL_SIZE",
                ],
                1,
            )
            if local_size != size and backend != "mpi":
                print(
                    "Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default"
                )
                print(
                    "If this run hangs, try exporting rank 0's hostname as MASTER_ADDR"
                )
            os.environ["MASTER_ADDR"] = "127.0.0.1"

    if size > 1:
        if local_rank == -1:
            my_local_rank = env2int(
                [
                    "MPI_LOCALRANKID",
                    "OMPI_COMM_WORLD_LOCAL_RANK",
                    "MV2_COMM_WORLD_LOCAL_RANK",
                    "LOCAL_RANK",
                ],
                0,
            )
        else:
            my_local_rank = local_rank
        my_local_size = env2int(
            [
                "MPI_LOCALNRANKS",
                "OMPI_COMM_WORLD_LOCAL_SIZE",
                "MV2_COMM_WORLD_LOCAL_SIZE",
            ],
            1,
        )
        if use_gpu:
            if my_local_size > torch.cuda.device_count():
                print(
                    "Not sufficient GPUs available... local_size = %d, ngpus = %d"
                    % (my_local_size, torch.cuda.device_count()))
                sys.exit(1)
            torch.cuda.set_device(my_local_rank)
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        if my_rank == 0:
            print("Running on %d ranks using %s backend" % (my_size, backend))
        if hasattr(dist, "all_to_all_single"):
            try:
                t = torch.zeros([4])
                if use_gpu:
                    t = t.cuda()
                dist.all_to_all_single(t, t)
                alltoall_supported = True
            except RuntimeError as err:
                print("fail to enable all_to_all_single primitive: %s" % err)
        if a2a_impl == "alltoall" and alltoall_supported == False:
            print(
                "Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall"
                % (a2a_impl, backend))
            a2a_impl = "scatter"
        if a2a_impl != "":
            print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
    else:
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    print_all("world size: %d, current rank: %d, local rank: %d" %
              (my_size, my_rank, my_local_rank))
    myreq = Request()
Example #12
def requires_mpi():
    return sandcastle_skip_if(
        not c10d.is_mpi_available(),
        "c10d was not compiled with the MPI backend",
    )
Example #13
def init_distributed(rank = -1, size = -1, backend=''):
    global myreq
    #global my_rank
    global my_size
    global my_local_rank
    global my_local_size
    global a2a_impl
    global alltoall_supported
    global allgatherv_supported
    # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
    num_mpi_ranks = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'])
    if backend == '' and num_mpi_ranks > 1:
        if torch_ccl and env2int(['CCL_WORKER_COUNT']) > 0:
            backend = 'ccl'
        elif dist.is_mpi_available():
            backend = 'mpi'
        else:
            print("WARNING: MPI multi-process launch detected but PyTorch MPI backend not available.")
            backend = 'gloo'
    if backend != '':
        #guess Rank and size
        if rank == -1:
            rank = env2int(['PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'MV2_COMM_WORLD_RANK', 'RANK'], 0)
        if size == -1:
            size = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'], 1)
        if not os.environ.get('RANK', None) and rank != -1: os.environ['RANK'] = str(rank)
        if not os.environ.get('WORLD_SIZE', None) and size != -1: os.environ['WORLD_SIZE'] = str(size)
        if not os.environ.get('MASTER_PORT', None): os.environ['MASTER_PORT'] = '29500'
        if not os.environ.get('MASTER_ADDR', None):
            local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1)
            if local_size != size and backend != 'mpi':
                print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default")
                print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
            os.environ['MASTER_ADDR'] = '127.0.0.1'
    if size > 1:
        dist.init_process_group(backend, rank=rank, world_size=size)
        my_rank = dist.get_rank()
        my_size = dist.get_world_size()
        my_local_rank = env2int(['MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'MV2_COMM_WORLD_LOCAL_RANK'], 0)
        my_local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1)
        if my_rank == 0: print("Running on %d ranks using %s backend" % (my_size, backend))
        if backend == 'ccl':
            print("Using CCL_ATL_TRANSPORT=%s" % os.environ.get('CCL_ATL_TRANSPORT', '(default)'))
            print("Using CCL_ATL_SHM=%s" % os.environ.get('CCL_ATL_SHM', '(default)'))
        if hasattr(dist, 'all_to_all_single'):
            try:
               # dist.all_to_all_single(torch.empty([0]), torch.empty([0]))
                alltoall_supported = True
            except RuntimeError:
                pass
        if a2a_impl == 'alltoall' and not alltoall_supported:
            print("Requested DLRM_ALLTOALL_IMPL=%s but backend %s does not support it, use scatter/gather based alltoall" % (a2a_impl, backend))
            a2a_impl = 'scatter'
        if a2a_impl != '': print("Using DLRM_ALLTOALL_IMPL=%s" % a2a_impl)
        try:
            x = torch.ones([my_rank])
            y = torch.zeros([(my_size*(my_size-1))//2])
            y = list(y.split([r for r in range(my_size)]))
            dist.all_gather(y, x)
            allgatherv_supported = True
        except RuntimeError:
            pass
    else:
        my_rank = 0
        my_size = 1
        my_local_rank = 0
        my_local_size = 1
    myreq = Request()
Example #14
    def __init__(self, apex_config:Config, logger:Optional[OrderedDictLogger])->None:
        # region conf vars
        self._enabled = apex_config['enabled'] # global switch to disable anything apex
        self._distributed_enabled = apex_config['distributed_enabled'] # enable/disable distributed mode
        self._mixed_prec_enabled = apex_config['mixed_prec_enabled'] # enable/disable mixed precision
        self._opt_level = apex_config['opt_level'] # optimization level for mixed precision
        self._bn_fp32 = apex_config['bn_fp32'] # keep BN in fp32
        self._loss_scale = apex_config['loss_scale'] # loss scaling mode for mixed precision
        self._sync_bn = apex_config['sync_bn'] # whether to replace BNs with sync BNs for the distributed model
        self._scale_lr = apex_config['scale_lr'] # scale the learning rate for distributed training
        self._min_world_size = apex_config['min_world_size'] # allows confirming we are indeed in a distributed setting
        seed = apex_config['seed']
        detect_anomaly = apex_config['detect_anomaly']
        conf_gpu_ids = apex_config['gpus']

        conf_ray = apex_config['ray']
        self.ray_enabled = conf_ray['enabled']
        self.ray_local_mode = conf_ray['local_mode']
        # endregion

        # to avoid circular references with common, logger is passed in from outside
        self.logger = logger

        # defaults for non-distributed mode
        self._amp, self._ddp = None, None
        self._set_ranks(conf_gpu_ids)

        #_log_info({'apex_config': apex_config.to_dict()})
        self._log_info({'ray.enabled':  self.is_ray(), 'apex.enabled': self._enabled})
        self._log_info({'torch.distributed.is_available': dist.is_available(),
                        'apex.distributed_enabled': self._distributed_enabled,
                        'apex.mixed_prec_enabled': self._mixed_prec_enabled})

        if dist.is_available():
            # dist.* properties are otherwise not accessible
            self._op_map = {'mean': dist.ReduceOp.SUM, 'sum': dist.ReduceOp.SUM,
                        'min': dist.ReduceOp.MIN, 'max': dist.ReduceOp.MAX}
            self._log_info({'gloo_available': dist.is_gloo_available(),
                        'mpi_available': dist.is_mpi_available(),
                        'nccl_available': dist.is_nccl_available()})

        if self.is_mixed():
            # enable mixed precision (Apex AMP)
            assert cudnn.enabled, "Amp requires cudnn backend to be enabled."
            from apex import amp
            self._amp = amp

        # enable distributed processing
        if self.is_dist():
            assert not self.is_ray(), "Ray is not yet enabled for Apex distributed mode"

            from apex import parallel
            self._ddp = parallel

            assert dist.is_available() # distributed module is available
            assert dist.is_nccl_available()
            if not dist.is_initialized():
                dist.init_process_group(backend='nccl', init_method='env://')
                assert dist.is_initialized()
            assert dist.get_world_size() == self.world_size
            assert dist.get_rank() == self.global_rank

        if self.is_ray():
            assert not self.is_dist(), "Ray is not yet enabled for Apex distributed mode"

            import ray

            if not ray.is_initialized():
                ray.init(local_mode=self.ray_local_mode, include_dashboard=False,
                         # for some reason Ray is detecting wrong number of GPUs
                         num_gpus=torch.cuda.device_count())
                ray_cpus = ray.nodes()[0]['Resources']['CPU']
                ray_gpus = ray.nodes()[0]['Resources']['GPU']
                self._log_info({'ray_cpus': ray_cpus, 'ray_gpus':ray_gpus})

        assert self.world_size >= 1
        assert not self._min_world_size or self.world_size >= self._min_world_size
        assert self.local_rank >= 0 and self.local_rank < self.world_size
        assert self.global_rank >= 0 and self.global_rank < self.world_size

        assert self._gpu < torch.cuda.device_count()
        torch.cuda.set_device(self._gpu)
        self.device = torch.device('cuda', self._gpu)
        self._setup_gpus(seed, detect_anomaly)

        self._log_info({'amp_available': self._amp is not None,
                     'distributed_available': self._ddp is not None})
        self._log_info({'dist_initialized': dist.is_initialized() if dist.is_available() else False,
                     'world_size': self.world_size,
                     'gpu': self._gpu, 'gpu_ids':self.gpu_ids,
                     'local_rank': self.local_rank,
                     'global_rank': self.global_rank})