Example #1
def _infer_slurm_init(cfg: DistributedTrainingConfig, num_pipelines_per_node):
    node_list = os.environ.get("SLURM_STEP_NODELIST")
    if node_list is None:
        node_list = os.environ.get("SLURM_JOB_NODELIST")
    if node_list is not None:
        try:
            hostnames = subprocess.check_output(
                ["scontrol", "show", "hostnames", node_list]
            )
            cfg.distributed_init_method = "tcp://{host}:{port}".format(
                host=hostnames.split()[0].decode("utf-8"),
                port=cfg.distributed_port,
            )
            nnodes = int(os.environ.get("SLURM_NNODES"))
            ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
            if ntasks_per_node is not None:
                ntasks_per_node = int(ntasks_per_node)
            else:
                ntasks = int(os.environ.get("SLURM_NTASKS"))
                assert ntasks % nnodes == 0
                ntasks_per_node = ntasks // nnodes
            if ntasks_per_node == 1:
                gpus_per_node = torch.cuda.device_count()
                node_id = int(os.environ.get("SLURM_NODEID"))
                cfg.distributed_rank = node_id * gpus_per_node
                cfg.distributed_world_size = nnodes * gpus_per_node
            elif cfg.pipeline_model_parallel:
                assert ntasks_per_node == num_pipelines_per_node, (
                    "SLURM --ntasks-per-node must match number of pipelines per "
                    "node (={})".format(num_pipelines_per_node)
                )
                cfg.distributed_no_spawn = True
                # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                # the first node, [2, 3] on the second node, etc. This
                # matches torch.distributed.launch.
                node_id = int(os.environ.get("SLURM_NODEID"))
                local_id = int(os.environ.get("SLURM_LOCALID"))
                cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
                # In the above example, device_id will always be in [0, 1],
                # which also matches torch.distributed.launch.
                cfg.device_id = local_id
                # We also want to set distributed_world_size to be the total
                # number of pipelines across all nodes.
                cfg.distributed_world_size = nnodes * num_pipelines_per_node
            else:
                assert ntasks_per_node == cfg.distributed_world_size // nnodes
                cfg.distributed_no_spawn = True
                cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
                cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
        except subprocess.CalledProcessError as e:  # scontrol failed
            raise e
        except FileNotFoundError:  # Slurm is not installed
            pass
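A minimal arithmetic sketch (hypothetical values, not from any real job) checking the ntasks_per_node == 1 branch above:

nnodes = 4         # stand-in for SLURM_NNODES
node_id = 2        # stand-in for SLURM_NODEID
gpus_per_node = 8  # stand-in for torch.cuda.device_count()
distributed_rank = node_id * gpus_per_node        # 16: first rank on node 2
distributed_world_size = nnodes * gpus_per_node   # 32 processes overall
assert (distributed_rank, distributed_world_size) == (16, 32)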
Example #2
def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
    if cfg.distributed_init_method is not None or cfg.tpu:
        return

    num_pipelines_per_node = None
    if cfg.pipeline_model_parallel:
        num_pipeline_devices, num_pipelines_per_node = _pipeline_parallel_pre_init(
            cfg)

    if all(key in os.environ
           for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]):
        # support torch.distributed.launch
        _infer_torch_distributed_launch_init(cfg)
    elif cfg.distributed_port > 0:
        # we can determine the init method automatically for Slurm
        _infer_slurm_init(cfg, num_pipelines_per_node)
    elif cfg.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        _infer_single_node_init(cfg)

    if cfg.pipeline_model_parallel:
        _pipeline_parallel_post_init(cfg, num_pipeline_devices,
                                     num_pipelines_per_node)
    elif not cfg.distributed_no_spawn:
        with open_dict(cfg):
            cfg.distributed_num_procs = min(torch.cuda.device_count(),
                                            cfg.distributed_world_size)
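A hedged usage sketch; the import paths are assumptions based on fairseq's current module layout (older releases expose fairseq.distributed_utils instead) and may differ between versions:

from fairseq.dataclass.configs import DistributedTrainingConfig
from fairseq.distributed.utils import infer_init_method

cfg = DistributedTrainingConfig(distributed_world_size=2)
infer_init_method(cfg)
# On a machine with at least two GPUs this falls through to the
# single-node branch and picks a random local port.
print(cfg.distributed_init_method)  # e.g. "tcp://localhost:14312"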
Example #3
def add_distributed_training_args(parser, default_world_size=None):
    group = parser.add_argument_group("distributed_training")
    if default_world_size is None:
        default_world_size = max(1, torch.cuda.device_count())
    gen_parser_from_dataclass(
        group,
        DistributedTrainingConfig(distributed_world_size=default_world_size))
    return group
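A minimal usage sketch; it assumes gen_parser_from_dataclass derives a "--distributed-world-size" flag from the distributed_world_size field, as fairseq's option helpers do:

import argparse

parser = argparse.ArgumentParser()
add_distributed_training_args(parser, default_world_size=2)
args = parser.parse_args(["--distributed-world-size", "4"])
print(args.distributed_world_size)  # 4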
Example #4
def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig):
    from fairseq import utils

    balance_exists = (
        cfg.pipeline_balance is not None
        or cfg.pipeline_encoder_balance is not None
        or cfg.pipeline_decoder_balance is not None
    )
    devices_exist = (
        cfg.pipeline_devices is not None
        or cfg.pipeline_encoder_devices is not None
        or cfg.pipeline_decoder_devices is not None
    )
    if not balance_exists:
        raise ValueError(
            "--pipeline-balance is currently required for pipeline model parallelism"
        )
    if not devices_exist:
        raise ValueError(
            "--pipeline-devices is currently required for pipeline model parallelism"
        )

    cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
    if cfg.pipeline_devices is not None:
        cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
        num_pipeline_devices = len(set(cfg.pipeline_devices))
    else:
        cfg.pipeline_encoder_devices = utils.eval_str_list(
            cfg.pipeline_encoder_devices, type=int
        )
        cfg.pipeline_decoder_devices = utils.eval_str_list(
            cfg.pipeline_decoder_devices, type=int
        )
        num_pipeline_devices = len(
            set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
        )
    gpus_per_node = torch.cuda.device_count()
    assert (
        gpus_per_node >= num_pipeline_devices
        and gpus_per_node % num_pipeline_devices == 0
    ), (
        "the number of unique device IDs in --pipeline-devices must evenly divide "
        "the number of GPUs per node (multi-node pipelining is not yet supported)"
    )
    num_pipelines_per_node = gpus_per_node // num_pipeline_devices
    return num_pipeline_devices, num_pipelines_per_node
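The closing arithmetic, spelled out with hypothetical values:

pipeline_devices = [0, 1, 1, 2, 3]                 # parsed --pipeline-devices
num_pipeline_devices = len(set(pipeline_devices))  # 4 unique device IDs
gpus_per_node = 8                                  # torch.cuda.device_count()
assert gpus_per_node % num_pipeline_devices == 0
num_pipelines_per_node = gpus_per_node // num_pipeline_devices  # 2 replicas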
Example #5
class InferConfig(FairseqDataclass):
    task: Any = None
    decoding: DecodingConfig = DecodingConfig()
    common: CommonConfig = CommonConfig()
    common_eval: CommonEvalConfig = CommonEvalConfig()
    checkpoint: CheckpointConfig = CheckpointConfig()
    generation: GenerationConfig = GenerationConfig()
    distributed_training: DistributedTrainingConfig = DistributedTrainingConfig()
    dataset: DatasetConfig = DatasetConfig()
Example #6
def _pipeline_parallel_post_init(
    cfg: DistributedTrainingConfig, num_pipeline_devices, num_pipelines_per_node
):
    if not cfg.distributed_no_spawn:
        # When distributed_no_spawn is False, we expect distributed_rank and
        # distributed_world_size to be based on the total number of GPUs, so
        # we need to correct them to be based on the number of pipelines.
        assert cfg.distributed_world_size % num_pipeline_devices == 0
        cfg.distributed_world_size = (
            cfg.distributed_world_size // num_pipeline_devices
        )
        # In the case of 4-way MP on nodes with 8 GPUs, we want
        # distributed_rank to be the index of the first pipeline on each
        # node, i.e., 0, 2, ...
        gpus_per_node = torch.cuda.device_count()
        assert cfg.distributed_rank % gpus_per_node == 0
        assert cfg.distributed_rank % num_pipeline_devices == 0

        with open_dict(cfg):
            cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
            # launch one process per pipeline
            cfg.distributed_num_procs = num_pipelines_per_node

    # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
    # and 4, indicating the starting device IDs for each pipeline
    cfg.device_id *= num_pipeline_devices

    if cfg.device_id > 0:
        # if there are multiple pipelines on a node (e.g., 4-way MP on an
        # 8-GPU node), we need to adjust pipeline_devices accordingly
        logger.debug(
            "setting CUDA device={} on rank {}".format(
                cfg.device_id, cfg.distributed_rank
            )
        )
        torch.cuda.set_device(cfg.device_id)
        with open_dict(cfg):
            cfg.pipeline_devices = [cfg.device_id + d for d in cfg.pipeline_devices]
        logger.info(
            "setting pipeline_devices={} on rank {}".format(
                cfg.pipeline_devices, cfg.distributed_rank
            )
        )
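With hypothetical numbers, the device adjustments above reduce to:

num_pipeline_devices = 4            # 4-way MP on an 8-GPU node
device_id = 1                       # local pipeline index before scaling
device_id *= num_pipeline_devices   # 4: first GPU owned by this pipeline
pipeline_devices = [0, 1, 2, 3]
pipeline_devices = [device_id + d for d in pipeline_devices]
assert pipeline_devices == [4, 5, 6, 7]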
Example #7
class InferConfig(FairseqDataclass):
    task: Any = None
    decoding: DecodingConfig = DecodingConfig()
    common: CommonConfig = CommonConfig()
    common_eval: CommonEvalConfig = CommonEvalConfig()
    checkpoint: CheckpointConfig = CheckpointConfig()
    distributed_training: DistributedTrainingConfig = DistributedTrainingConfig()
    dataset: DatasetConfig = DatasetConfig()
    is_ax: bool = field(
        default=False,
        metadata={
            "help": "if true, assumes we are using ax for tuning and returns a tuple for ax to consume"
        },
    )
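A hedged sketch of composing such a config; it assumes OmegaConf, which fairseq's Hydra-based configs build on, and the checkpoint path is a placeholder:

from omegaconf import OmegaConf

cfg = OmegaConf.structured(InferConfig)
cfg.common_eval.path = "/path/to/model.pt"  # hypothetical checkpoint
cfg.distributed_training.distributed_world_size = 1
cfg.is_ax = False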
Example #8
def _infer_torch_distributed_launch_init(cfg: DistributedTrainingConfig):
    cfg.distributed_init_method = "env://"
    cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
    cfg.distributed_rank = int(os.environ["RANK"])
    # processes are created by torch.distributed.launch
    cfg.distributed_no_spawn = True
    if os.environ["CUDA_VISIBLE_DEVICES"]:
        cfg.device_id = int(
            os.environ["CUDA_VISIBLE_DEVICES"].split(',')[cfg.local_rank])
    else:
        cfg.device_id = cfg.local_rank
Example #9
def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
    if cfg.distributed_init_method is not None or cfg.tpu:
        return

    if cfg.pipeline_model_parallel:
        balance_exists = (cfg.pipeline_balance is not None
                          or cfg.pipeline_encoder_balance is not None
                          or cfg.pipeline_decoder_balance is not None)
        devices_exist = (cfg.pipeline_devices is not None
                         or cfg.pipeline_encoder_devices is not None
                         or cfg.pipeline_decoder_devices is not None)
        if not balance_exists:
            raise ValueError(
                "--pipeline-balance is currently required for pipeline model parallelism"
            )
        if not devices_exist:
            raise ValueError(
                "--pipeline-devices is currently required for pipeline model parallelism"
            )

        cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance,
                                                   type=int)
        if cfg.pipeline_devices is not None:
            cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices,
                                                       type=int)
            num_pipeline_devices = len(set(cfg.pipeline_devices))
        else:
            cfg.pipeline_encoder_devices = utils.eval_str_list(
                cfg.pipeline_encoder_devices, type=int)
            cfg.pipeline_decoder_devices = utils.eval_str_list(
                cfg.pipeline_decoder_devices, type=int)
            num_pipeline_devices = len(
                set(cfg.pipeline_encoder_devices +
                    cfg.pipeline_decoder_devices))
        gpus_per_node = torch.cuda.device_count()
        assert (
            gpus_per_node >= num_pipeline_devices
            and gpus_per_node % num_pipeline_devices == 0
        ), ("the number of unique device IDs in --pipeline-devices must evenly divide "
            "the number of GPUs per node (multi-node pipelining is not yet supported)"
            )
        num_pipelines_per_node = gpus_per_node // num_pipeline_devices

    # support torch.distributed.launch
    if all(key in os.environ
           for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]):
        cfg.distributed_init_method = "env://"
        cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
        cfg.distributed_rank = int(os.environ["RANK"])
        # processes are created by torch.distributed.launch
        cfg.distributed_no_spawn = True

    # we can determine the init method automatically for Slurm
    elif cfg.distributed_port > 0:
        node_list = os.environ.get("SLURM_STEP_NODELIST")
        if node_list is None:
            node_list = os.environ.get("SLURM_JOB_NODELIST")
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(
                    ["scontrol", "show", "hostnames", node_list])
                cfg.distributed_init_method = "tcp://{host}:{port}".format(
                    host=hostnames.split()[0].decode("utf-8"),
                    port=cfg.distributed_port,
                )
                nnodes = int(os.environ.get("SLURM_NNODES"))
                ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
                if ntasks_per_node is not None:
                    ntasks_per_node = int(ntasks_per_node)
                else:
                    ntasks = int(os.environ.get("SLURM_NTASKS"))
                    assert ntasks % nnodes == 0
                    ntasks_per_node = ntasks // nnodes
                if ntasks_per_node == 1:
                    gpus_per_node = torch.cuda.device_count()
                    node_id = int(os.environ.get("SLURM_NODEID"))
                    cfg.distributed_rank = node_id * gpus_per_node
                    cfg.distributed_world_size = nnodes * gpus_per_node
                elif cfg.pipeline_model_parallel:
                    assert ntasks_per_node == num_pipelines_per_node, (
                        "SLURM --ntasks-per-node must match number of pipelines per "
                        "node (={})".format(num_pipelines_per_node))
                    cfg.distributed_no_spawn = True
                    # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                    # the first node, [2, 3] on the second node, etc. This
                    # matches torch.distributed.launch.
                    node_id = int(os.environ.get("SLURM_NODEID"))
                    local_id = int(os.environ.get("SLURM_LOCALID"))
                    cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
                    # In the above example, device_id will always be in [0, 1],
                    # which also matches torch.distributed.launch.
                    cfg.device_id = local_id
                    # We also want to set distributed_world_size to be the total
                    # number of pipelines across all nodes.
                    cfg.distributed_world_size = nnodes * num_pipelines_per_node
                else:
                    assert ntasks_per_node == cfg.distributed_world_size // nnodes
                    cfg.distributed_no_spawn = True
                    cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
                    cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass

    elif cfg.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        assert (
            cfg.distributed_world_size <= torch.cuda.device_count()
        ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices"
        port = random.randint(10000, 20000)
        cfg.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)

    if cfg.pipeline_model_parallel:
        if not cfg.distributed_no_spawn:
            # When distributed_no_spawn is False, we expect distributed_rank and
            # distributed_world_size to be based on the total number of GPUs, so
            # we need to correct them to be based on the number of pipelines.
            assert cfg.distributed_world_size % num_pipeline_devices == 0
            cfg.distributed_world_size = (cfg.distributed_world_size //
                                          num_pipeline_devices)
            # In the case of 4-way MP on nodes with 8 GPUs, we want
            # distributed_rank to be the index of the first pipeline on each
            # node, i.e., 0, 2, ...
            assert cfg.distributed_rank % gpus_per_node == 0
            assert cfg.distributed_rank % num_pipeline_devices == 0

            with open_dict(cfg):
                cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
                # launch one process per pipeline
                cfg.distributed_num_procs = num_pipelines_per_node

        # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
        # and 4, indicating the starting device IDs for each pipeline
        cfg.device_id *= num_pipeline_devices

        if cfg.device_id > 0:
            # if there are multiple pipelines on a node (e.g., 4-way MP on an
            # 8-GPU node), we need to adjust pipeline_devices accordingly
            logger.debug("setting CUDA device={} on rank {}".format(
                cfg.device_id, cfg.distributed_rank))
            torch.cuda.set_device(cfg.device_id)
            with open_dict(cfg):
                cfg.pipeline_devices = [
                    cfg.device_id + d for d in cfg.pipeline_devices
                ]
            logger.info("setting pipeline_devices={} on rank {}".format(
                cfg.pipeline_devices, cfg.distributed_rank))
    elif not cfg.distributed_no_spawn:
        with open_dict(cfg):
            cfg.distributed_num_procs = min(torch.cuda.device_count(),
                                            cfg.distributed_world_size)
Example #10
def _infer_torch_distributed_launch_init(cfg: DistributedTrainingConfig):
    cfg.distributed_init_method = "env://"
    cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
    cfg.distributed_rank = int(os.environ["RANK"])
    # processes are created by torch.distributed.launch
    cfg.distributed_no_spawn = True
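A sketch of the environment this expects, as torch.distributed.launch (or torchrun) sets it per process; the values and import path are hypothetical:

import os

from fairseq.dataclass.configs import DistributedTrainingConfig  # assumed path

os.environ.update({
    "MASTER_ADDR": "127.0.0.1",
    "MASTER_PORT": "29500",
    "WORLD_SIZE": "2",
    "RANK": "0",
})
cfg = DistributedTrainingConfig()
_infer_torch_distributed_launch_init(cfg)
assert cfg.distributed_init_method == "env://" and cfg.distributed_rank == 0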
Example #11
def _infer_single_node_init(cfg: DistributedTrainingConfig):
    assert (
        cfg.distributed_world_size <= torch.cuda.device_count()
    ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices"
    port = random.randint(10000, 20000)
    cfg.distributed_init_method = "tcp://localhost:{port}".format(port=port)