def init_model_parallel(self, global_rank: int, world_size: int) -> None:
    """ Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
    """
    app_state = AppState()

    # we initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if torch.distributed.is_initialized():
            mpu.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = mpu.get_model_parallel_group()
            app_state.data_parallel_group = mpu.get_data_parallel_group()
            app_state.model_parallel_rank = mpu.get_tensor_model_parallel_rank()
            app_state.data_parallel_rank = mpu.get_data_parallel_rank()
            app_state.data_parallel_size = mpu.get_data_parallel_world_size()
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
            # TODO: get random seed from PTL
            # environment variables are strings, so cast before seeding
            seed = int(os.environ.get("PL_GLOBAL_SEED", 1234))
            # random seed must be set for megatron model parallel init
            _set_random_seed(seed)
def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None:
    """ Override for LightningModule DDP initialization.
        Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
        is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
    """
    LightningModule.init_ddp_connection(self, global_rank, world_size, is_slurm_managing_tasks)

    app_state = AppState()

    # we initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if app_state.model_parallel_group is None:
            mpu.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = mpu.get_model_parallel_group()
            app_state.data_parallel_group = mpu.get_data_parallel_group()
            app_state.model_parallel_rank = torch.distributed.get_rank(group=app_state.model_parallel_group)
            app_state.data_parallel_rank = torch.distributed.get_rank(group=app_state.data_parallel_group)
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
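# --- Hedged example (not part of the NeMo source above) ---------------------
# A minimal, pure-Python sketch of how a global rank decomposes into a
# model-parallel rank and a data-parallel rank under Megatron's default group
# layout (model-parallel groups are contiguous rank blocks; data-parallel
# groups stride across them), assuming no pipeline parallelism. The
# world_size / model_parallel_size values are made up for illustration; real
# code should query mpu or torch.distributed as the functions above do.


def sketch_rank_layout(world_size: int = 8, model_parallel_size: int = 2) -> None:
    for global_rank in range(world_size):
        mp_rank = global_rank % model_parallel_size   # rank within the model-parallel group
        dp_rank = global_rank // model_parallel_size  # rank within the data-parallel group
        print(f"rank {global_rank}: mp_rank={mp_rank}, dp_rank={dp_rank}")


# sketch_rank_layout() prints, e.g., "rank 3: mp_rank=1, dp_rank=1"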
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        # Call the init process
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size,
            rank=args.rank,
            init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(args.model_parallel_size)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        setup_deepspeed_random_and_activation_checkpointing(args)
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        # Call the init process
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size,
            rank=args.rank,
            timeout=timedelta(minutes=10))

    # Set the tensor model-parallel, pipeline model-parallel, and
    # data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(
                args.tensor_model_parallel_size,
                args.pipeline_model_parallel_size,
                args.virtual_pipeline_model_parallel_size,
                args.pipeline_model_parallel_split_rank)
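# --- Hedged example (not from either Megatron variant above) ----------------
# A minimal sketch of the environment-variable ("env://") rendezvous that the
# second variant relies on when it calls init_process_group without an
# init_method, including the timeout argument it passes. It runs as a
# single-process "gloo" group so it can be tried without GPUs; the address
# and port values are placeholders.
import os
from datetime import timedelta

import torch.distributed as dist


def sketch_env_init(rank: int = 0, world_size: int = 1) -> None:
    # env:// rendezvous reads MASTER_ADDR / MASTER_PORT from the environment.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "6000")
    dist.init_process_group(
        backend="gloo",  # Megatron would pass args.distributed_backend (usually "nccl")
        rank=rank,
        world_size=world_size,
        timeout=timedelta(minutes=10),
    )
    print(f"initialized: rank {dist.get_rank()} of {dist.get_world_size()}")
    dist.destroy_process_group()


if __name__ == "__main__":
    sketch_env_init()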
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        distributed.init_distributed(
            dist_backend=args.distributed_backend,
            auto_mpi_discovery=True,
            distributed_port=os.getenv('MASTER_PORT', '6000'),
            verbose=True,
        )

    # Setup 3D topology.
    if args.pipe_parallel_size > 0:
        pp = args.pipe_parallel_size
        mp = args.model_parallel_size
        assert args.world_size % (pp * mp) == 0
        dp = args.world_size // (pp * mp)

        from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

        # this does pipe on the most outside, then data, then model.
        # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order.
        topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

        # Offset base seeds for the interior pipeline stages.
        # TODO: adjust last stage too once IO is improved.
        stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
        if 0 < stage_id < topo.get_dim('pipe') - 1:
            offset = args.seed + 1138
            args.seed = offset + (stage_id * mp)
    else:
        topo = None

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(args.model_parallel_size, topology=topo)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        setup_deepspeed_random_and_activation_checkpointing(args)
def _initialize_distributed(neox_args):
    """Initialize torch.distributed and mpu."""
    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        if neox_args.rank == 0:
            print(
                "torch distributed is already initialized, "
                "skipping initialization ...",
                flush=True,
            )
        neox_args.rank = torch.distributed.get_rank()
        neox_args.world_size = torch.distributed.get_world_size()
    else:
        if neox_args.rank == 0:
            print("> initializing torch distributed ...", flush=True)

        # Manually set the device ids.
        if device_count > 0:
            device = neox_args.rank % device_count
            if neox_args.local_rank is not None:
                assert (
                    neox_args.local_rank == device
                ), "expected local-rank to be the same as rank % device-count."
            else:
                neox_args.local_rank = device
            torch.cuda.set_device(device)

        distributed.init_distributed(
            dist_backend=neox_args.distributed_backend,
            auto_mpi_discovery=True,
            distributed_port=os.getenv("MASTER_PORT", "6000"),
            verbose=True,
        )

    # Setup 3D topology.
    pp = neox_args.pipe_parallel_size if neox_args.pipe_parallel_size >= 1 else 1
    mp = neox_args.model_parallel_size if neox_args.model_parallel_size >= 1 else 1
    assert (
        neox_args.world_size % (pp * mp) == 0
    ), f"world_size={neox_args.world_size}, pp={pp}, mp={mp}"
    dp = neox_args.world_size // (pp * mp)

    from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

    # this does pipe on the most outside, then data, then model.
    # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order.
    topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

    # Offset base seeds for the interior pipeline stages.
    # TODO: adjust last stage too once IO is improved.
    stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
    if 0 < stage_id < topo.get_dim("pipe") - 1:
        offset = neox_args.seed + 1138
        neox_args.seed = offset + (stage_id * mp)

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print(
                "_initialize_distributed() model parallel is already initialized",
                flush=True,
            )
        else:
            mpu.initialize_model_parallel(
                neox_args.model_parallel_size,
                topology=topo,
                fp32_allreduce=neox_args.fp32_allreduce,
            )

    # Init DeepSpeed Activation Checkpointing Features
    setup_deepspeed_random_and_activation_checkpointing(neox_args=neox_args)
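# --- Hedged example (not from the repositories above) -----------------------
# A small, pure-Python sketch of the seed adjustment both DeepSpeed-based
# variants apply: interior pipeline stages (neither first nor last) get the
# base seed offset by 1138 plus stage_id * model_parallel_size, so stages do
# not reuse the same randomness. The constant 1138 comes from the code above;
# the pp/mp/seed values below are made-up inputs for illustration.


def sketch_stage_seed(base_seed: int, stage_id: int, pp: int, mp: int) -> int:
    """Return the seed a given pipeline stage would end up using."""
    if 0 < stage_id < pp - 1:   # interior stage: offset the seed
        return base_seed + 1138 + stage_id * mp
    return base_seed            # first and last stages keep the base seed


# Example: pp=4, mp=2, base seed 1234 -> stages [0..3] use
# [1234, 2374, 2376, 1234] respectively.
print([sketch_stage_seed(1234, s, pp=4, mp=2) for s in range(4)])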