def initialize_model_parallel_for_nemo(
    world_size, global_rank, local_rank, tensor_model_parallel_size=1, seed=1234,
):
    # updating NeMo globals
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.local_rank = local_rank
    app_state.model_parallel_size = tensor_model_parallel_size
    app_state.model_parallel_rank = compute_model_parallel_rank(local_rank, tensor_model_parallel_size)

    # update apex.mpu globals
    set_tensor_model_parallel_world_size(tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.model_parallel_rank)

    # pipeline model parallelism not implemented in NeMo yet
    set_pipeline_model_parallel_rank(0)
    set_pipeline_model_parallel_world_size(1)

    _set_random_seed(seed)

    app_state._is_megatron_initialized = True
def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    micro_batch_size=None,
    global_batch_size=None,
    seed=1234,
    apex_transformer_log_level=30,
):
    # updating NeMo globals
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.local_rank = local_rank
    app_state.tensor_model_parallel_size = tensor_model_parallel_size
    app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
    ) = fake_initialize_model_parallel(
        world_size=world_size,
        rank=global_rank,
        tensor_model_parallel_size_=tensor_model_parallel_size,
        pipeline_model_parallel_size_=pipeline_model_parallel_size,
    )

    # update apex.transformer globals
    set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank)

    set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank)
    set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size)

    _set_random_seed(seed)

    if global_batch_size is not None and micro_batch_size is not None:
        # TODO: add rampup_batch_size here when we have it implemented
        setup_microbatch_calculator(
            rank=global_rank,
            global_batch_size=global_batch_size,
            micro_batch_size=micro_batch_size,
            data_parallel_size=app_state.data_parallel_size,
            rampup_batch_size=None,
        )

    app_state._is_megatron_initialized = True
    set_logging_level(apex_transformer_log_level)
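# --- Usage sketch (illustrative, not part of the NeMo source) ---
# A minimal sketch of how the newer initialize_model_parallel_for_nemo above could be
# called on each process. It assumes the job was launched with a torch.distributed-style
# launcher that exports WORLD_SIZE, RANK and LOCAL_RANK; the parallel sizes and batch
# sizes below are example values, not defaults taken from NeMo.

import os

initialize_model_parallel_for_nemo(
    world_size=int(os.environ["WORLD_SIZE"]),
    global_rank=int(os.environ["RANK"]),
    local_rank=int(os.environ["LOCAL_RANK"]),
    tensor_model_parallel_size=2,   # example: 2-way tensor model parallelism
    pipeline_model_parallel_size=1,
    micro_batch_size=4,             # passing both batch sizes sets up the microbatch calculator
    global_batch_size=32,
    seed=1234,
)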
def restore_from(
    cls,
    restore_path: str,
    override_config_path: Optional[Union[OmegaConf, str]] = None,
    map_location: Optional[torch.device] = None,
    strict: bool = True,
    return_config: bool = False,
    trainer: Trainer = None,
    save_restore_connector: SaveRestoreConnector = None,
):
    """
    Restores a model instance (weights and configuration) from a .nemo file.

    Args:
        restore_path: path to the .nemo file from which the model should be instantiated.
        override_config_path: path to a yaml config that will override the internal config file,
            or an OmegaConf / DictConfig object representing the model config.
        map_location: Optional torch.device() to map the instantiated model to a device.
            By default (None), it will select a GPU if available, falling back to CPU otherwise.
        strict: Passed to load_state_dict. Set to True by default.
        return_config: If set to True, returns just the underlying config of the restored model
            as an OmegaConf DictConfig object without instantiating the model.
        trainer: PyTorch Lightning trainer. Must be passed in order to restore model parallel
            .nemo files.
        save_restore_connector: Optional SaveRestoreConnector that implements the restore logic.

    Example:
        ```
        model = nemo.collections.nlp.models.TokenClassificationModel.restore_from('token_classification.nemo')
        assert isinstance(model, nemo.collections.nlp.models.TokenClassificationModel)
        ```

    Returns:
        An instance of type cls or its underlying config (if return_config is set).
    """
    if save_restore_connector is None:
        save_restore_connector = SaveRestoreConnector()

    if not os.path.exists(restore_path):
        raise FileNotFoundError(f"Can't find {restore_path}")

    app_state = AppState()
    app_state.model_restore_path = os.path.abspath(os.path.expanduser(restore_path))

    # detect if we have a model parallel .nemo file
    with tempfile.TemporaryDirectory() as tmpdir:
        cwd = os.getcwd()
        os.chdir(tmpdir)
        # detect model parallelism from the tarfile contents
        tar = tarfile.open(app_state.model_restore_path, "r:gz")
        names = tar.getnames()
        mp_ranks = []
        for name in names:
            if 'mp_rank' in name:
                mp_ranks.append(name)
        if mp_ranks:
            # both the directory and the file are included in getnames()
            app_state.model_parallel_size = len(mp_ranks) // 2

            # get checkpoint version
            checkpoint_version_member = None
            for member in tar.getmembers():
                if 'megatron_checkpoint_version.json' in member.name:
                    checkpoint_version_member = member
            tar.extract(checkpoint_version_member, tmpdir)
            with open(checkpoint_version_member.name, 'r') as f:
                checkpoint_version = json.load(f).get('checkpoint_version', None)
            logging.info(
                f'Detected model parallel .nemo file: {restore_path}. '
                f'Assuming megatron model parallelism with '
                f'model_parallel_size: {app_state.model_parallel_size} '
                f'and checkpoint version: {checkpoint_version}'
            )
        tar.close()
        os.chdir(cwd)

    if app_state.model_parallel_size is not None:
        if not isinstance(trainer, Trainer):
            raise ValueError(
                "trainer must be a PyTorch Lightning Trainer to restore model parallel .nemo files."
            )

        if checkpoint_version is None:
            raise ValueError(
                "Restoring from megatron model parallel .nemo but could not find megatron checkpoint version."
            )
        else:
            logging.info(f"Setting megatron checkpoint version: {checkpoint_version}")
            set_checkpoint_version(checkpoint_version)

        app_state.world_size = trainer.num_gpus * trainer.num_nodes

        if trainer.local_rank is not None:
            app_state.local_rank = trainer.local_rank
        else:
            raise ValueError(
                "trainer.local_rank is None. local_rank needed to restore model parallel models."
            )

        model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank

        cls.update_save_restore_connector(save_restore_connector)
        restored_model = cls._save_restore_connector.restore_from(
            cls, app_state.model_restore_path, override_config_path, map_location, strict, return_config
        )
        restored_model.set_trainer(trainer)
        return restored_model
    else:
        return super().restore_from(
            app_state.model_restore_path,
            override_config_path,
            map_location,
            strict,
            return_config,
            save_restore_connector=save_restore_connector,
        )
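# --- Usage sketch (illustrative, not part of the NeMo source) ---
# Restoring a model parallel .nemo file requires passing a PyTorch Lightning Trainer so that
# world size and local rank can be derived (see restore_from above). The model class and file
# name come from the docstring example; the Trainer arguments follow the PyTorch Lightning
# version contemporary with this code (gpus/accelerator) and may differ in newer releases.

from pytorch_lightning import Trainer
from nemo.collections.nlp.models import TokenClassificationModel

trainer = Trainer(gpus=2, num_nodes=1, accelerator="ddp")  # example: 2-GPU DDP job
model = TokenClassificationModel.restore_from(
    restore_path="token_classification.nemo",
    trainer=trainer,  # required when the .nemo file contains mp_rank_* shards
)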
def get_megatron_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
) -> Tuple[MegatronBertEncoder, str]:
    """
    Returns a MegatronBertEncoder and a default or user-specified path to the checkpoint file.

    Args:
        pretrained_model_name: model name from MEGATRON_CONFIG_MAP, for example: megatron-bert-cased
        config_dict: model configuration parameters
        config_file: path to model configuration file. Takes precedence over config_dict if both are supplied.
        checkpoint_file: path to checkpoint file, or directory if using model parallel.

    Returns:
        model: MegatronBertEncoder
        checkpoint_file: path to checkpoint file or directory
    """
    config = None
    # get default config and checkpoint
    if config_file:
        with open(config_file) as f:
            config = json.load(f)
        # replace dashes with underscores in config keys
        fixed_config = {}
        for key in config.keys():
            fixed_key = key.replace("-", "_")
            if fixed_key == 'max_seq_length':
                fixed_key = 'max_position_embeddings'
            fixed_config[fixed_key] = config[key]
        # 'vocab_size' is no longer used.
        if 'vocab_size' in fixed_config:
            fixed_config.pop('vocab_size')
        config = fixed_config
    elif config_dict:
        config = config_dict
    elif pretrained_model_name in get_megatron_lm_models_list():
        config = get_megatron_config(pretrained_model_name)
    else:
        raise ValueError(f"{pretrained_model_name} is not supported")

    if config is None:
        raise ValueError(f"config_file or config_dict is required for {pretrained_model_name}")

    if not checkpoint_file:
        checkpoint_file = get_megatron_checkpoint(pretrained_model_name)

    vocab = get_megatron_vocab_file(pretrained_model_name)

    # if the checkpoint path is a directory, then we automatically compute the model parallel size
    # and model parallel rank
    if os.path.isdir(checkpoint_file):
        app_state = AppState()
        model_parallel_size = len(os.listdir(checkpoint_file))
        app_state.model_parallel_size = model_parallel_size
        logging.info(
            f'restore_path: {checkpoint_file} is a directory. '
            f'Assuming megatron model parallelism with '
            f'model_parallel_size: {model_parallel_size}'
        )
        # try to get the local rank from the environment
        local_rank = None
        try:
            local_rank = int(os.environ['LOCAL_RANK'])
        except (KeyError, ValueError):
            logging.info('Environment variable LOCAL_RANK not yet specified')

        if local_rank is not None:
            app_state.local_rank = local_rank
        else:
            # if local_rank is None then we are on the main process
            local_rank = 0
        model_parallel_rank = compute_model_parallel_rank(local_rank, model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank
    else:
        model_parallel_size = None
        model_parallel_rank = None

    model = MegatronBertEncoder(
        model_name=pretrained_model_name,
        config=config,
        vocab_file=vocab,
        model_parallel_size=model_parallel_size,
        model_parallel_rank=model_parallel_rank,
    )

    return model, checkpoint_file
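# --- Usage sketch (illustrative, not part of the NeMo source) ---
# 'megatron-bert-cased' is the example model name from the docstring above; valid names come from
# get_megatron_lm_models_list(). The checkpoint path below is hypothetical: passing a directory
# (one subfolder per mp_rank) triggers the model parallel branch in get_megatron_lm_model.

model, checkpoint_path = get_megatron_lm_model(
    pretrained_model_name="megatron-bert-cased",
    checkpoint_file="/path/to/model_parallel_checkpoints",  # directory => model parallel size inferred
)
print(type(model).__name__, checkpoint_path)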