def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    seed=1234,
):
    """Populate NeMo's AppState and the apex.mpu globals for tensor model parallelism.

    Pipeline model parallelism is not implemented in NeMo yet, so its rank and
    world size are pinned to 0 and 1 respectively.

    Args:
        world_size: total number of processes in the job.
        global_rank: this process's global rank.
        local_rank: this process's rank on its node; used to derive the model
            parallel rank.
        tensor_model_parallel_size: tensor model parallel group size.
        seed: random seed forwarded to ``_set_random_seed``.
    """
    mp_rank = compute_model_parallel_rank(local_rank, tensor_model_parallel_size)

    # NeMo-side global state.
    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.model_parallel_size = tensor_model_parallel_size
    app_state.model_parallel_rank = mp_rank

    # apex.mpu-side global state.
    set_tensor_model_parallel_world_size(tensor_model_parallel_size)
    set_tensor_model_parallel_rank(mp_rank)
    # pipeline model parallelism not implemented in NeMo yet
    set_pipeline_model_parallel_rank(0)
    set_pipeline_model_parallel_world_size(1)

    _set_random_seed(seed)
    app_state._is_megatron_initialized = True
def __init__(self, cfg: DictConfig, trainer: Trainer):
    """Initialize megatron parallel state if needed and compile the C++ dataset helpers.

    Args:
        cfg: config; reads ``tensor_model_parallel_size`` (default 1) and
            ``seed`` (default 1234).
        trainer: PyTorch Lightning trainer supplying global/local rank and world size.

    Raises:
        ImportError: if the megatron dataset C++ helper functions cannot be
            compiled or imported.
    """
    app_state = AppState()
    if not app_state._is_megatron_initialized:
        # Fixed: dropped the f-prefix — the message contains no placeholders.
        logging.info(
            "Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
        )
        app_state.global_rank = trainer.global_rank
        app_state.world_size = trainer.world_size
        app_state.model_parallel_size = 1
        app_state.model_parallel_rank = trainer.global_rank
        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            # NOTE(review): the lookup above uses `cfg` while this one uses
            # `self.cfg` — presumably the same object; confirm and unify.
            seed=self.cfg.get('seed', 1234),
        )
    try:
        from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

        compile_helper()
        logging.info('Megatron dataset helper compiled successfully.')
        from nemo.collections.nlp.data.language_modeling.megatron import helpers
    except ImportError as e:
        # Fixed: chain the original ImportError so the compile failure is not
        # hidden, and dropped the pointless f-prefix.
        raise ImportError(
            'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
        ) from e
def convert(local_rank, rank, world_size, args):
    """Load a model-parallel Lightning checkpoint and save it as a single .nemo file.

    Args:
        local_rank: rank of this process on its node.
        rank: global rank (used for logging only).
        world_size: total number of processes; must be divisible by args.gpus_per_node.
        args: parsed CLI namespace; uses gpus_per_node, bcp, tensor/pipeline
            parallel sizes, checkpoint_folder, checkpoint_name, hparams_file,
            model_type and nemo_file_path.

    Raises:
        ValueError: if ``args.model_type`` is not one of gpt/bert/t5/nmt.
    """
    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node
    if args.bcp:
        # BCP clusters need the TorchElastic environment plugin.
        trainer = Trainer(
            devices=args.gpus_per_node,
            num_nodes=num_nodes,
            accelerator='gpu',
            plugins=[TorchElasticEnvironment()],
        )
    else:
        trainer = Trainer(devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu')

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = (
        app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size
    )

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    # inject model parallel rank
    checkpoint_path = inject_model_parallel_rank(
        os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'nmt':
        model = MegatronNMTModel.load_from_checkpoint(
            checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    else:
        # Fixed: previously an unknown model_type fell through and crashed
        # later with NameError on `model`; fail fast with a clear message.
        raise ValueError(f'Unsupported model_type: {args.model_type}')

    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)
    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
def __init__(self, cfg: DictConfig, trainer: Trainer):
    """Ensure megatron parallel state is initialized before this object is used.

    Args:
        cfg: config; reads ``tensor_model_parallel_size`` (default 1) and
            ``seed`` (default 1234).
        trainer: PyTorch Lightning trainer supplying global/local rank and world size.
    """
    app_state = AppState()
    if not app_state._is_megatron_initialized:
        # Fixed: dropped the f-prefix — the message contains no placeholders.
        logging.info(
            "Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
        )
        app_state.global_rank = trainer.global_rank
        app_state.world_size = trainer.world_size
        app_state.model_parallel_size = 1
        app_state.model_parallel_rank = trainer.global_rank
        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            # NOTE(review): the lookup above uses `cfg` while this one uses
            # `self.cfg` — presumably the same object; confirm and unify.
            seed=self.cfg.get('seed', 1234),
        )
def restore_from(
    cls,
    restore_path: str,
    override_config_path: Optional[Union[OmegaConf, str]] = None,
    map_location: Optional[torch.device] = None,
    strict: bool = True,
    return_config: bool = False,
    trainer: Trainer = None,
    save_restore_connector: SaveRestoreConnector = None,
):
    """
    Restores model instance (weights and configuration) from .nemo file.

    Args:
        restore_path: path to .nemo file from which model should be instantiated
        override_config_path: path to a yaml config that will override the internal
            config file or an OmegaConf / DictConfig object representing the model config.
        map_location: Optional torch.device() to map the instantiated model to a device.
            By default (None), it will select a GPU if available, falling back to CPU otherwise.
        strict: Passed to load_state_dict. Set to True by default.
        return_config: If set to true, will return just the underlying config of the
            restored model as an OmegaConf DictConfig object without instantiating the model.
        trainer: PyTorch Lightning trainer. Must be passed in order to use model
            parallel .nemo
        save_restore_connector: connector used to perform the actual restore; a
            default SaveRestoreConnector is created when None.

    Example:
        ```
        model = nemo.collections.nlp.models.TokenClassificationModel.restore_from('token_classification.nemo')
        assert isinstance(model, nemo.collections.nlp.models.TokenClassificationModel)
        ```

    Returns:
        An instance of type cls or its underlying config (if return_config is set).
    """
    if save_restore_connector is None:
        save_restore_connector = SaveRestoreConnector()
    if not os.path.exists(restore_path):
        raise FileNotFoundError(f"Can't find {restore_path}")

    app_state = AppState()
    app_state.model_restore_path = os.path.abspath(os.path.expanduser(restore_path))

    # Fixed: initialize up front — previously this was unbound when the archive
    # contained no mp_rank entries (or no version file), causing a NameError below.
    checkpoint_version = None

    # detect if we have a model parallel .nemo file
    with tempfile.TemporaryDirectory() as tmpdir:
        cwd = os.getcwd()
        os.chdir(tmpdir)
        try:
            # Fixed: use context manager so the tarfile is closed even on error.
            with tarfile.open(app_state.model_restore_path, "r:gz") as tar:
                names = tar.getnames()
                mp_ranks = [name for name in names if 'mp_rank' in name]
                if mp_ranks:
                    # directory and file are included in getnames()
                    app_state.model_parallel_size = len(mp_ranks) // 2

                    # get checkpoint version
                    checkpoint_version_member = None
                    for member in tar.getmembers():
                        if 'megatron_checkpoint_version.json' in member.name:
                            checkpoint_version_member = member
                    # Fixed: guard against a missing version file — extracting
                    # None raised an opaque error before.
                    if checkpoint_version_member is not None:
                        tar.extract(checkpoint_version_member, tmpdir)
                        with open(checkpoint_version_member.name, 'r') as f:
                            checkpoint_version = json.load(f).get('checkpoint_version', None)
                    logging.info(
                        (f'Detected model parallel .nemo file: {restore_path}. '
                         f'Assuming megatron model parallelism with '
                         f'model_parallel_size: {app_state.model_parallel_size} '
                         f'and checkpoint version: {checkpoint_version}'))
        finally:
            # Fixed: restore the working directory even if inspection fails.
            os.chdir(cwd)

    if app_state.model_parallel_size is not None:
        if not isinstance(trainer, Trainer):
            raise ValueError(
                "trainer must be a PyTorch Lightning Trainer to restore model parallel .nemo files."
            )
        if checkpoint_version is None:
            raise ValueError(
                "Restoring from megatron model parallel .nemo but could not find megatron checkpoint version."
            )
        else:
            logging.info(f"Setting megatron checkpoint version: {checkpoint_version}")
            set_checkpoint_version(checkpoint_version)

        app_state.world_size = trainer.num_gpus * trainer.num_nodes

        if trainer.local_rank is not None:
            app_state.local_rank = trainer.local_rank
        else:
            raise ValueError(
                "trainer.local_rank is None. local_rank needed to restore model parallel models."
            )

        model_parallel_rank = compute_model_parallel_rank(
            trainer.local_rank, app_state.model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank

        cls.update_save_restore_connector(save_restore_connector)
        restored_model = cls._save_restore_connector.restore_from(
            cls, app_state.model_restore_path, override_config_path, map_location,
            strict, return_config)
        restored_model.set_trainer(trainer)
        return restored_model
    else:
        return super().restore_from(
            app_state.model_restore_path,
            override_config_path,
            map_location,
            strict,
            return_config,
            save_restore_connector=save_restore_connector,
        )
def convert(local_rank, rank, world_size, args):
    """Convert a (TP/PP-sharded) megatron checkpoint into NeMo checkpoint/.nemo outputs.

    Loads the checkpoint shard for this rank (translating legacy megatron
    parameter names), optionally re-saves it as a NeMo-format checkpoint
    (``args.output_ckpt_file_path``) and/or a packaged ``.nemo`` file
    (``args.nemo_file_path``).

    Args:
        local_rank: rank of this process on its node (unused here beyond CLI parity).
        rank: global rank; determines which pipeline shard is loaded.
        world_size: total number of processes.
        args: parsed CLI namespace (parallel sizes, paths, model_type, ...).

    Raises:
        NotImplementedError: if ``args.model_type`` is not 'gpt' or 'bert'.
    """
    app_state = AppState()
    app_state.data_parallel_rank = 0
    tensor_model_parallel_size = args.tensor_model_parallel_size
    num_nodes = world_size // args.gpus_per_node
    pipeline_model_parallel_size = world_size // args.tensor_model_parallel_size
    assert args.pipeline_model_parallel_size == pipeline_model_parallel_size

    trainer = Trainer(devices=args.gpus_per_node, accelerator='gpu', num_nodes=num_nodes)

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = (
        app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size
    )

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    pipeline_rank = rank // tensor_model_parallel_size
    tensor_rank = app_state.tensor_model_parallel_rank
    assert pipeline_rank == app_state.pipeline_model_parallel_rank

    # Pick the shard directory layout: mp_rank_XX for TP-only,
    # mp_rank_XX_YYY when pipeline parallelism is involved.
    if tensor_model_parallel_size is not None and tensor_model_parallel_size > 1 and pipeline_model_parallel_size == 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(
            args.checkpoint_folder, f'mp_rank_{tensor_rank:02d}', args.checkpoint_name)
    elif tensor_model_parallel_size is not None and pipeline_model_parallel_size > 1:
        checkpoint_path = os.path.join(
            args.checkpoint_folder,
            f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}',
            args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder, args.checkpoint_name)
    logging.info(f"loading checkpoint {checkpoint_path}")

    if args.model_type == 'gpt':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        # Fixed: `NotImplemented` is a singleton, not an exception — calling it
        # raised TypeError instead of the intended error.
        raise NotImplementedError("{} is not supported".format(args.model_type))

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if args.output_ckpt_file_path:
        filepath = args.output_ckpt_file_path
        base_dir = pathlib.Path(filepath).parent
        filename_str = pathlib.Path(filepath).name
        suffix = '.ckpt'
        content = {}
        # Default missing progress counters to 0 so the filename template formats.
        if consumed is not None:
            content['consumed'] = consumed
        else:
            content['consumed'] = 0
        if steps is not None:
            content['steps'] = steps
        else:
            content['steps'] = 0
        filename = filename_str.format(**content) + suffix
        checkpoint_path_output = inject_model_parallel_rank(os.path.join(base_dir, filename))
        trainer.accelerator.training_type_plugin.checkpoint_io.save_checkpoint(
            checkpoint, checkpoint_path_output)
        logging.info(f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}')

    if args.nemo_file_path:
        if args.model_type == 'gpt':
            model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer)
        elif args.model_type == 'bert':
            model = load_model(MegatronBertModel, checkpoint, strict=False, trainer=trainer)
        else:
            # Fixed: same NotImplemented -> NotImplementedError correction as above.
            raise NotImplementedError("{} is not supported".format(args.model_type))

        # verify tensor parallel rank id and pipeline parallel rank id matches
        assert app_state.data_parallel_size == 1
        assert app_state.tensor_model_parallel_size == tensor_model_parallel_size
        assert app_state.tensor_model_parallel_rank == tensor_rank
        assert app_state.pipeline_model_parallel_size == pipeline_model_parallel_size
        assert app_state.pipeline_model_parallel_rank == pipeline_rank
        model._save_restore_connector = NLPSaveRestoreConnector()
        model.save_to(args.nemo_file_path)
        logging.info(f'NeMo model saved to: {args.nemo_file_path}')
def get_megatron_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
) -> Tuple[MegatronBertEncoder, str]:
    """
    Returns MegatronBertEncoder and a default or user specified path to the checkpoint file

    Args:
        pretrained_model_name: model name from MEGATRON_CONFIG_MAP
            for example: megatron-bert-cased
        config_dict: model configuration parameters
        config_file: path to model configuration file. Takes precedence over config_dict if both supplied.
        checkpoint_file: path to checkpoint file or directory if using model parallel.

    Returns:
        model: MegatronBertEncoder
        checkpoint_file: path to checkpoint file or directory

    Raises:
        ValueError: if no config source is available for the requested model.
    """
    config = None
    # get default config and checkpoint
    if config_file:
        with open(config_file) as f:
            config = json.load(f)
            # replace dashes with underscores in config keys
            fixed_config = {}
            for key in config.keys():
                fixed_key = key.replace("-", "_")
                if fixed_key == 'max_seq_length':
                    fixed_key = 'max_position_embeddings'
                fixed_config[fixed_key] = config[key]
            # 'vocab_size" no longer used.
            if 'vocab_size' in fixed_config:
                fixed_config.pop('vocab_size')
            config = fixed_config
    elif config_dict:
        config = config_dict
    elif pretrained_model_name in get_megatron_lm_models_list():
        config = get_megatron_config(pretrained_model_name)
    else:
        raise ValueError(f"{pretrained_model_name} is not supported")

    if config is None:
        raise ValueError(f"config_file or config_dict is required for {pretrained_model_name}")

    if not checkpoint_file:
        checkpoint_file = get_megatron_checkpoint(pretrained_model_name)

    vocab = get_megatron_vocab_file(pretrained_model_name)

    # if checkpoint path is a directory, then we automatically compute model parallel size,
    # and model parallel rank
    if os.path.isdir(checkpoint_file):
        app_state = AppState()
        model_parallel_size = len(os.listdir(checkpoint_file))
        app_state.model_parallel_size = model_parallel_size
        logging.info(
            (f'restore_path: {checkpoint_file} is a directory. '
             f'Assuming megatron model parallelism with '
             f'model_parallel_size: {model_parallel_size}'))
        # try to get local rank from global
        local_rank = None
        try:
            local_rank = int(os.environ['LOCAL_RANK'])
        # Fixed: narrowed from a bare `except:` — only catch an unset or
        # non-integer LOCAL_RANK, not SystemExit/KeyboardInterrupt/etc.
        except (KeyError, ValueError):
            logging.info('Global variable LOCAL_RANK not yet specified')
        if local_rank is not None:
            app_state.local_rank = local_rank
        else:
            # if local is None then we are on the main process
            local_rank = 0
        model_parallel_rank = compute_model_parallel_rank(local_rank, model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank
    else:
        model_parallel_size = None
        model_parallel_rank = None

    model = MegatronBertEncoder(
        model_name=pretrained_model_name,
        config=config,
        vocab_file=vocab,
        model_parallel_size=model_parallel_size,
        model_parallel_rank=model_parallel_rank,
    )

    return model, checkpoint_file