def init_model_parallel(self, global_rank: int, world_size: int) -> None:
    """ Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
    """
    app_state = AppState()

    # we initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if torch.distributed.is_initialized():
            mpu.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = mpu.get_model_parallel_group()
            app_state.data_parallel_group = mpu.get_data_parallel_group()
            app_state.model_parallel_rank = mpu.get_tensor_model_parallel_rank()
            app_state.data_parallel_rank = mpu.get_data_parallel_rank()
            app_state.data_parallel_size = mpu.get_data_parallel_world_size()
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
            # TODO: get random seed from PTL
            # environment variables are strings, so cast before seeding
            seed = int(os.environ.get("PL_GLOBAL_SEED", 1234))
            # random seed must be set for megatron model parallel init
            _set_random_seed(seed)
def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None:
    """ Override for LightningModule DDP initialization.
        Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
        is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
    """
    LightningModule.init_ddp_connection(self, global_rank, world_size, is_slurm_managing_tasks)

    app_state = AppState()

    # we initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if app_state.model_parallel_group is None:
            mpu.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = mpu.get_model_parallel_group()
            app_state.data_parallel_group = mpu.get_data_parallel_group()
            app_state.model_parallel_rank = torch.distributed.get_rank(group=app_state.model_parallel_group)
            app_state.data_parallel_rank = torch.distributed.get_rank(group=app_state.data_parallel_group)
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
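# --- Hedged example (not part of the NeMo source above) ---------------------
# A minimal, pure-Python sketch of how a global rank decomposes into a
# model-parallel rank and a data-parallel rank under Megatron's default group
# layout (model-parallel groups are contiguous rank blocks; data-parallel
# groups stride across them), assuming no pipeline parallelism. The
# world_size / model_parallel_size values are made up for illustration; real
# code should query mpu or torch.distributed as the functions above do.


def sketch_rank_layout(world_size: int = 8, model_parallel_size: int = 2) -> None:
    for global_rank in range(world_size):
        mp_rank = global_rank % model_parallel_size   # rank within the model-parallel group
        dp_rank = global_rank // model_parallel_size  # rank within the data-parallel group
        print(f"rank {global_rank}: mp_rank={mp_rank}, dp_rank={dp_rank}")


# sketch_rank_layout() prints, e.g., "rank 3: mp_rank=1, dp_rank=1"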
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        # Call the init process
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size,
            rank=args.rank,
            init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(args.model_parallel_size)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        setup_deepspeed_random_and_activation_checkpointing(args)
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        # Call the init process
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size,
            rank=args.rank,
            timeout=timedelta(minutes=10))

    # Set the tensor model-parallel, pipeline model-parallel, and
    # data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(
                args.tensor_model_parallel_size,
                args.pipeline_model_parallel_size,
                args.virtual_pipeline_model_parallel_size,
                args.pipeline_model_parallel_split_rank)
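# --- Hedged example (not from either Megatron variant above) ----------------
# A minimal sketch of the environment-variable ("env://") rendezvous that the
# second variant relies on when it calls init_process_group without an
# init_method, including the timeout argument it passes. It runs as a
# single-process "gloo" group so it can be tried without GPUs; the address
# and port values are placeholders.
import os
from datetime import timedelta

import torch.distributed as dist


def sketch_env_init(rank: int = 0, world_size: int = 1) -> None:
    # env:// rendezvous reads MASTER_ADDR / MASTER_PORT from the environment.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "6000")
    dist.init_process_group(
        backend="gloo",  # Megatron would pass args.distributed_backend (usually "nccl")
        rank=rank,
        world_size=world_size,
        timeout=timedelta(minutes=10),
    )
    print(f"initialized: rank {dist.get_rank()} of {dist.get_world_size()}")
    dist.destroy_process_group()


if __name__ == "__main__":
    sketch_env_init()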
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        distributed.init_distributed(
            dist_backend=args.distributed_backend,
            auto_mpi_discovery=True,
            distributed_port=os.getenv('MASTER_PORT', '6000'),
            verbose=True,
        )

    # Setup 3D topology.
    if args.pipe_parallel_size > 0:
        pp = args.pipe_parallel_size
        mp = args.model_parallel_size
        assert args.world_size % (pp * mp) == 0
        dp = args.world_size // (pp * mp)

        from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

        # this does pipe on the most outside, then data, then model.
        # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order.
        topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

        # Offset base seeds for the interior pipeline stages.
        # TODO: adjust last stage too once IO is improved.
        stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
        if 0 < stage_id < topo.get_dim('pipe') - 1:
            offset = args.seed + 1138
            args.seed = offset + (stage_id * mp)
    else:
        topo = None

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(args.model_parallel_size, topology=topo)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        setup_deepspeed_random_and_activation_checkpointing(args)
def _initialize_distributed(neox_args):
    """Initialize torch.distributed and mpu."""
    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if neox_args.rank == 0:
            print(
                "torch distributed is already initialized, "
                "skipping initialization ...",
                flush=True,
            )
        neox_args.rank = torch.distributed.get_rank()
        neox_args.world_size = torch.distributed.get_world_size()
    else:
        if neox_args.rank == 0:
            print("> initializing torch distributed ...", flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = neox_args.rank % device_count
            if neox_args.local_rank is not None:
                assert (
                    neox_args.local_rank == device
                ), "expected local-rank to be the same as rank % device-count."
            else:
                neox_args.local_rank = device
            torch.cuda.set_device(device)

        distributed.init_distributed(
            dist_backend=neox_args.distributed_backend,
            auto_mpi_discovery=True,
            distributed_port=os.getenv("MASTER_PORT", "6000"),
            verbose=True,
        )

    # Setup 3D topology.
    pp = neox_args.pipe_parallel_size if neox_args.pipe_parallel_size >= 1 else 1
    mp = neox_args.model_parallel_size if neox_args.model_parallel_size >= 1 else 1
    assert (
        neox_args.world_size % (pp * mp) == 0
    ), f"world_size={neox_args.world_size}, pp={pp}, mp={mp}"
    dp = neox_args.world_size // (pp * mp)

    from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

    # this does pipe on the most outside, then data, then model.
    # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order.
    topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

    # Offset base seeds for the interior pipeline stages.
    # TODO: adjust last stage too once IO is improved.
    stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
    if 0 < stage_id < topo.get_dim("pipe") - 1:
        offset = neox_args.seed + 1138
        neox_args.seed = offset + (stage_id * mp)

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print(
                "_initialize_distributed() model parallel is already initialized",
                flush=True,
            )
        else:
            mpu.initialize_model_parallel(
                neox_args.model_parallel_size,
                topology=topo,
                fp32_allreduce=neox_args.fp32_allreduce,
            )

    # Init DeepSpeed Activation Checkpointing Features
    setup_deepspeed_random_and_activation_checkpointing(neox_args=neox_args)
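# --- Hedged example (not from the repositories above) -----------------------
# A small, pure-Python sketch of the seed adjustment both DeepSpeed-based
# variants apply: interior pipeline stages (neither first nor last) get the
# base seed offset by 1138 plus stage_id * model_parallel_size, so stages do
# not reuse the same randomness. The constant 1138 comes from the code above;
# the pp/mp/seed values below are made-up inputs for illustration.


def sketch_stage_seed(base_seed: int, stage_id: int, pp: int, mp: int) -> int:
    """Return the seed a given pipeline stage would end up using."""
    if 0 < stage_id < pp - 1:   # interior stage: offset the seed
        return base_seed + 1138 + stage_id * mp
    return base_seed            # first and last stages keep the base seed


# Example: pp=4, mp=2, base seed 1234 -> stages [0..3] use
# [1234, 2374, 2376, 1234] respectively.
print([sketch_stage_seed(1234, s, pp=4, mp=2) for s in range(4)])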