Example 1
def _infer_slurm_init(cfg: DistributedTrainingConfig, num_pipelines_per_node):
    node_list = os.environ.get("SLURM_STEP_NODELIST")
    if node_list is None:
        node_list = os.environ.get("SLURM_JOB_NODELIST")
    if node_list is not None:
        try:
            hostnames = subprocess.check_output(
                ["scontrol", "show", "hostnames", node_list]
            )
            cfg.distributed_init_method = "tcp://{host}:{port}".format(
                host=hostnames.split()[0].decode("utf-8"),
                port=cfg.distributed_port,
            )
            nnodes = int(os.environ.get("SLURM_NNODES"))
            ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
            if ntasks_per_node is not None:
                ntasks_per_node = int(ntasks_per_node)
            else:
                ntasks = int(os.environ.get("SLURM_NTASKS"))
                assert ntasks % nnodes == 0
                ntasks_per_node = ntasks // nnodes
            if ntasks_per_node == 1:
                gpus_per_node = torch.cuda.device_count()
                node_id = int(os.environ.get("SLURM_NODEID"))
                cfg.distributed_rank = node_id * gpus_per_node
                cfg.distributed_world_size = nnodes * gpus_per_node
            elif cfg.pipeline_model_parallel:
                assert ntasks_per_node == num_pipelines_per_node, (
                    "SLURM --ntasks-per-node must match number of pipelines per "
                    "node (={})".format(num_pipelines_per_node)
                )
                cfg.distributed_no_spawn = True
                # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                # the first node, [2, 3] on the second node, etc. This
                # matches torch.distributed.launch.
                node_id = int(os.environ.get("SLURM_NODEID"))
                local_id = int(os.environ.get("SLURM_LOCALID"))
                cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
                # In the above example, device_id will always be in [0, 1],
                # which also matches torch.distributed.launch.
                cfg.device_id = local_id
                # We also want to set distributed_world_size to be the total
                # number of pipelines across all nodes.
                cfg.distributed_world_size = nnodes * num_pipelines_per_node
            else:
                assert ntasks_per_node == cfg.distributed_world_size // nnodes
                cfg.distributed_no_spawn = True
                cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
                cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
        except subprocess.CalledProcessError:  # scontrol failed
            raise
        except FileNotFoundError:  # Slurm is not installed
            pass
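
For orientation, here is a minimal, hypothetical sketch of how the Slurm branch above could be exercised, assuming the function above plus its module-level imports (os, subprocess, torch). The environment values and the SimpleNamespace stand-in are made up; the real code receives a fairseq DistributedTrainingConfig, and the scontrol call only succeeds where Slurm is installed (otherwise the FileNotFoundError branch silently skips everything).

import os
from types import SimpleNamespace

# Hypothetical 2-node, 8-tasks-per-node allocation; values are illustrative.
os.environ.update({
    "SLURM_JOB_NODELIST": "node[01-02]",
    "SLURM_NNODES": "2",
    "SLURM_NTASKS_PER_NODE": "8",
    "SLURM_PROCID": "3",
    "SLURM_LOCALID": "3",
})

cfg = SimpleNamespace(
    distributed_port=12345,
    distributed_init_method=None,
    distributed_world_size=16,
    distributed_rank=0,
    distributed_no_spawn=False,
    device_id=0,
    pipeline_model_parallel=False,
)
_infer_slurm_init(cfg, num_pipelines_per_node=None)
# If scontrol resolves the node list, cfg.distributed_init_method becomes
# "tcp://node01:12345", cfg.distributed_rank == 3 and cfg.device_id == 3.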
Example 2
def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
    if cfg.distributed_init_method is not None or cfg.tpu:
        return

    num_pipelines_per_node = None
    if cfg.pipeline_model_parallel:
        num_pipeline_devices, num_pipelines_per_node = _pipeline_parallel_pre_init(cfg)

    if all(
        key in os.environ
        for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]
    ):
        # support torch.distributed.launch
        _infer_torch_distributed_launch_init(cfg)
    elif cfg.distributed_port > 0:
        # we can determine the init method automatically for Slurm
        _infer_slurm_init(cfg, num_pipelines_per_node)
    elif cfg.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        _infer_single_node_init(cfg)

    if cfg.pipeline_model_parallel:
        _pipeline_parallel_post_init(cfg, num_pipeline_devices, num_pipelines_per_node)
    elif not cfg.distributed_no_spawn:
        with open_dict(cfg):
            cfg.distributed_num_procs = min(
                torch.cuda.device_count(), cfg.distributed_world_size
            )
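
The decision order above is: the torch.distributed.launch environment variables win, a positive --distributed-port selects the Slurm path, and otherwise a single multi-GPU node is assumed. A hedged sketch of the first branch, assuming the functions above and their module-level imports, with a stand-in object rather than the real DistributedTrainingConfig:

import os
from types import SimpleNamespace

os.environ.update({
    "MASTER_ADDR": "127.0.0.1",
    "MASTER_PORT": "29500",
    "WORLD_SIZE": "1",
    "RANK": "0",
})
cfg = SimpleNamespace(
    tpu=False,
    pipeline_model_parallel=False,
    distributed_init_method=None,
    distributed_port=-1,
    distributed_world_size=1,
    distributed_rank=0,
    distributed_no_spawn=False,
)
infer_init_method(cfg)
assert cfg.distributed_init_method == "env://"  # the launch-style branch was taken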
Example 3
def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig):
    from fairseq import utils

    balance_exists = (
        cfg.pipeline_balance is not None
        or cfg.pipeline_encoder_balance is not None
        or cfg.pipeline_decoder_balance is not None
    )
    devices_exist = (
        cfg.pipeline_devices is not None
        or cfg.pipeline_encoder_devices is not None
        or cfg.pipeline_decoder_devices is not None
    )
    if not balance_exists:
        raise ValueError(
            "--pipeline-balance is currently required for pipeline model parallelism"
        )
    if not devices_exist:
        raise ValueError(
            "--pipeline-devices is currently required for pipeline model parallelism"
        )

    cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
    if cfg.pipeline_devices is not None:
        cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
        num_pipeline_devices = len(set(cfg.pipeline_devices))
    else:
        cfg.pipeline_encoder_devices = utils.eval_str_list(
            cfg.pipeline_encoder_devices, type=int
        )
        cfg.pipeline_decoder_devices = utils.eval_str_list(
            cfg.pipeline_decoder_devices, type=int
        )
        num_pipeline_devices = len(
            set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
        )
    gpus_per_node = torch.cuda.device_count()
    assert (
        gpus_per_node >= num_pipeline_devices
        and gpus_per_node % num_pipeline_devices == 0
    ), (
        "the number of unique device IDs in --pipeline-devices must evenly divide "
        "the number of GPUs per node (multi-node pipelining is not yet supported)"
    )
    num_pipelines_per_node = gpus_per_node // num_pipeline_devices
    return num_pipeline_devices, num_pipelines_per_node
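
The arithmetic at the end is easy to check by hand; the following standalone lines (no fairseq involved) redo it for a hypothetical 8-GPU node running 4-way model parallelism:

pipeline_devices = [0, 1, 2, 3, 0, 1, 2, 3]  # illustrative --pipeline-devices value
gpus_per_node = 8                            # assumed node size
num_pipeline_devices = len(set(pipeline_devices))               # 4 unique devices
assert gpus_per_node % num_pipeline_devices == 0
num_pipelines_per_node = gpus_per_node // num_pipeline_devices  # -> 2 pipelines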
Example 4
def add_distributed_training_args(parser, default_world_size=None):
    group = parser.add_argument_group("distributed_training")
    if default_world_size is None:
        default_world_size = max(1, torch.cuda.device_count())
    gen_parser_from_dataclass(
        group, DistributedTrainingConfig(distributed_world_size=default_world_size)
    )
    return group
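
A hedged usage sketch, assuming the fairseq imports the excerpt relies on (gen_parser_from_dataclass, DistributedTrainingConfig, torch): the group is attached to an ordinary argparse parser, and since every generated option carries a default, parsing an empty argument list works.

import argparse

parser = argparse.ArgumentParser()
add_distributed_training_args(parser, default_world_size=1)
args = parser.parse_args([])  # all distributed-training options take their defaults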
Example 5
def _pipeline_parallel_post_init(
    cfg: DistributedTrainingConfig, num_pipeline_devices, num_pipelines_per_node
):
    if not cfg.distributed_no_spawn:
        # When distributed_no_spawn is False, we expect distributed_rank and
        # distributed_world_size to be based on the total number of GPUs, so
        # we need to correct them to be based on the number of pipelines.
        assert cfg.distributed_world_size % num_pipeline_devices == 0
        cfg.distributed_world_size = (
            cfg.distributed_world_size // num_pipeline_devices
        )
        # In the case of 4-way MP on nodes with 8 GPUs, distributed_rank
        # arrives here as the node's first GPU index (0, 8, ...); dividing by
        # num_pipeline_devices below turns it into the node's first pipeline
        # rank (0, 2, ...).
        gpus_per_node = torch.cuda.device_count()
        assert cfg.distributed_rank % gpus_per_node == 0
        assert cfg.distributed_rank % num_pipeline_devices == 0

        with open_dict(cfg):
            cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
            # launch one process per pipeline
            cfg.distributed_num_procs = num_pipelines_per_node

    # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
    # and 4, indicating the starting device IDs for each pipeline
    cfg.device_id *= num_pipeline_devices

    if cfg.device_id > 0:
        # if there's multiple pipelines on a node (e.g., 4-way MP on an 8
        # GPU node), we need to adjust pipeline_devices accordingly
        logger.debug(
            "setting CUDA device={} on rank {}".format(
                cfg.device_id, cfg.distributed_rank
            )
        )
        torch.cuda.set_device(cfg.device_id)
        with open_dict(cfg):
            cfg.pipeline_devices = [cfg.device_id + d for d in cfg.pipeline_devices]
        logger.info(
            "setting pipeline_devices={} on rank {}".format(
                cfg.pipeline_devices, cfg.distributed_rank
            )
        )
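
The worked numbers behind the comments above, for 4-way model parallelism on an 8-GPU node (pure arithmetic, no fairseq objects):

num_pipeline_devices = 4
num_pipelines_per_node = 2

device_id = 1                                # second pipeline on this node
device_id *= num_pipeline_devices            # -> 4, the pipeline's first GPU
pipeline_devices = [0, 1, 2, 3]              # illustrative per-pipeline layout
pipeline_devices = [device_id + d for d in pipeline_devices]  # -> [4, 5, 6, 7]

distributed_rank = 8                         # node 1's first GPU, pre-correction
distributed_rank //= num_pipeline_devices    # -> 2, node 1's first pipeline rank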
Example 6
def _infer_torch_distributed_launch_init(cfg: DistributedTrainingConfig):
    cfg.distributed_init_method = "env://"
    cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
    cfg.distributed_rank = int(os.environ["RANK"])
    # processes are created by torch.distributed.launch
    cfg.distributed_no_spawn = True
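
These are the variables torch.distributed.launch (and torchrun) export for each worker. A small sketch with illustrative values and a stand-in cfg, assuming the function above and its module-level imports:

import os
from types import SimpleNamespace

os.environ.update({"MASTER_ADDR": "127.0.0.1", "MASTER_PORT": "29500",
                   "WORLD_SIZE": "2", "RANK": "1"})
cfg = SimpleNamespace(distributed_init_method=None, distributed_world_size=1,
                      distributed_rank=0, distributed_no_spawn=False)
_infer_torch_distributed_launch_init(cfg)
assert cfg.distributed_init_method == "env://"
assert cfg.distributed_world_size == 2 and cfg.distributed_rank == 1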
Example 7
def _infer_single_node_init(cfg: DistributedTrainingConfig):
    assert (
        cfg.distributed_world_size <= torch.cuda.device_count()
    ), f"world size is {cfg.distributed_world_size} but only {torch.cuda.device_count()} devices are available"
    port = random.randint(10000, 20000)
    cfg.distributed_init_method = "tcp://localhost:{port}".format(port=port)
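
A minimal sketch, assuming the module-level imports of the original file (random, torch) and at least one visible GPU; the port is chosen at random, so the resulting address differs between runs:

from types import SimpleNamespace

cfg = SimpleNamespace(distributed_world_size=1, distributed_init_method=None)
_infer_single_node_init(cfg)
print(cfg.distributed_init_method)  # e.g. "tcp://localhost:14237"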