def get_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
) -> BertModule:
    """
    Helper function to instantiate a language model encoder, either from scratch or from a pretrained model.
    If only pretrained_model_name is passed, a pretrained model is returned.
    If a configuration is passed, whether as a file or a dictionary, the model is initialized with random weights.

    Args:
        pretrained_model_name: pretrained model name, for example, bert-base-uncased or megatron-bert-cased.
            See get_pretrained_lm_models_list() for the full list.
        config_dict: model configuration dictionary
        config_file: path to the model configuration file
        checkpoint_file: path to the pretrained model checkpoint
        vocab_file: path to the vocabulary file to be used with Megatron-LM

    Returns:
        Pretrained BertModule
    """

    # check valid model type
    if not pretrained_model_name or pretrained_model_name not in get_pretrained_lm_models_list(include_external=False):
        logging.warning(
            f'{pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
            f'will be using AutoModel from HuggingFace.'
        )

    # warn when the user passes both a configuration dict and a file
    if config_dict and config_file:
        logging.warning(f"Both config_dict and config_file were found, defaulting to config_file: {config_file}.")

    if "megatron" in pretrained_model_name:
        model, checkpoint_file = get_megatron_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=pretrained_model_name,
            checkpoint_file=checkpoint_file,
            vocab_file=vocab_file,
        )
    else:
        model = get_huggingface_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=pretrained_model_name,
        )

    if checkpoint_file:
        app_state = AppState()
        if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
            raise ValueError(f'{checkpoint_file} not found')
        model.restore_weights(restore_path=checkpoint_file)

    return model
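
# Usage sketch (illustrative, not part of the original source): the two instantiation paths
# described in the docstring above -- loading pretrained weights by name versus building the
# architecture from a config file with random weights. The model name and config path below
# are assumed values for the example.
def _example_get_lm_model_by_name_or_config():
    # Only a name: returns a pretrained encoder (HuggingFace or Megatron-LM).
    pretrained_encoder = get_lm_model(pretrained_model_name='bert-base-uncased')

    # Name plus a config file: the architecture comes from the config and weights are random.
    randomly_initialized_encoder = get_lm_model(
        pretrained_model_name='bert-base-uncased',
        config_file='bert_config.json',  # hypothetical path, for illustration only
    )
    return pretrained_encoder, randomly_initialized_encoder
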
def get_lm_model(
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
    trainer: Optional[Trainer] = None,
    cfg: DictConfig = None,
) -> BertModule:
    """
    Helper function to instantiate a language model encoder, either from scratch or from a pretrained model.
    If only the pretrained model name is passed (via cfg.language_model.pretrained_model_name), a pretrained model is returned.
    If a configuration is passed, whether as a file or a dictionary, the model is initialized with random weights.

    Args:
        config_dict: model configuration dictionary
        config_file: path to the model configuration file
        vocab_file: path to the vocabulary file to be used with Megatron-LM
        trainer: an instance of a PyTorch Lightning trainer
        cfg: a model configuration

    Returns:
        Pretrained BertModule
    """

    # check valid model type
    if cfg.language_model.get('pretrained_model_name'):
        if (
            not cfg.language_model.pretrained_model_name
            or cfg.language_model.pretrained_model_name not in get_pretrained_lm_models_list(include_external=False)
        ):
            logging.warning(
                f'{cfg.language_model.pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
                f'will be using AutoModel from HuggingFace.'
            )

    # warn when the user passes both a configuration dict and a file
    if config_dict and config_file:
        logging.warning(f"Both config_dict and config_file were found, defaulting to config_file: {config_file}.")

    pretrain_model_name = ''
    if cfg.get('language_model') and cfg.language_model.get('pretrained_model_name', ''):
        pretrain_model_name = cfg.language_model.get('pretrained_model_name', '')
    all_pretrained_megatron_bert_models = get_megatron_pretrained_bert_models()

    if (
        cfg.tokenizer is not None
        and cfg.tokenizer.get("tokenizer_name", "") is not None
        and "megatron" in cfg.tokenizer.get("tokenizer_name", "")
    ) or pretrain_model_name in all_pretrained_megatron_bert_models:
        import torch

        from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel

        class Identity(torch.nn.Module):
            def __init__(self):
                super(Identity, self).__init__()

            def forward(self, x, *args):
                return x

        if cfg.language_model.get("lm_checkpoint"):
            model = MegatronBertModel.restore_from(restore_path=cfg.language_model.lm_checkpoint, trainer=trainer)
        else:
            model = MegatronBertModel.from_pretrained(cfg.language_model.get('pretrained_model_name'), trainer=trainer)

        # remove the heads that are only relevant for pretraining
        model.model.lm_head = Identity()
        model.model.binary_head = Identity()
        model.model.language_model.pooler = Identity()
    else:
        model = get_huggingface_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=cfg.language_model.pretrained_model_name,
        )

        if cfg.language_model.get("lm_checkpoint"):
            app_state = AppState()
            if not app_state.is_model_being_restored and not os.path.exists(cfg.language_model.lm_checkpoint):
                raise ValueError(f'{cfg.language_model.lm_checkpoint} not found')
            model.restore_weights(restore_path=cfg.language_model.lm_checkpoint)

    return model
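
# Usage sketch (illustrative, not part of the original source): this version is driven by an
# OmegaConf config rather than explicit arguments. The config values below are assumptions
# for the example; a non-Megatron tokenizer and model name route to the HuggingFace branch.
def _example_get_lm_model_from_cfg():
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            'language_model': {'pretrained_model_name': 'bert-base-uncased', 'lm_checkpoint': None},
            'tokenizer': {'tokenizer_name': 'bert-base-uncased'},
        }
    )
    return get_lm_model(cfg=cfg)
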
def get_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
    nemo_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
    trainer: Optional[Trainer] = None,
) -> BertModule:
    """
    Helper function to instantiate a language model encoder, either from scratch or from a pretrained model.
    If only pretrained_model_name is passed, a pretrained model is returned.
    If a configuration is passed, whether as a file or a dictionary, the model is initialized with random weights.

    Args:
        pretrained_model_name: pretrained model name, for example, bert-base-uncased or megatron-bert-cased.
            See get_pretrained_lm_models_list() for the full list.
        config_dict: model configuration dictionary
        config_file: path to the model configuration file
        checkpoint_file: path to the pretrained model checkpoint
        nemo_file: path to a .nemo checkpoint of a pretrained MegatronBertModel
        vocab_file: path to the vocabulary file to be used with Megatron-LM
        trainer: an instance of a PyTorch Lightning trainer

    Returns:
        Pretrained BertModule
    """

    # check valid model type
    if not pretrained_model_name or pretrained_model_name not in get_pretrained_lm_models_list(include_external=False):
        logging.warning(
            f'{pretrained_model_name} is not in get_pretrained_lm_models_list(include_external=False), '
            f'will be using AutoModel from HuggingFace.'
        )

    # warn when the user passes both a configuration dict and a file
    if config_dict and config_file:
        logging.warning(f"Both config_dict and config_file were found, defaulting to config_file: {config_file}.")

    if nemo_file is not None:
        import torch

        from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel

        class Identity(torch.nn.Module):
            def __init__(self):
                super(Identity, self).__init__()

            def forward(self, x, *args):
                return x

        model = MegatronBertModel.restore_from(restore_path=nemo_file, trainer=trainer)

        # remove the heads that are only relevant for pretraining
        model.model.lm_head = Identity()
        model.model.binary_head = Identity()
        model.model.language_model.pooler = Identity()
    else:
        model = get_huggingface_lm_model(
            config_dict=config_dict,
            config_file=config_file,
            pretrained_model_name=pretrained_model_name,
        )

    if checkpoint_file:
        app_state = AppState()
        if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
            raise ValueError(f'{checkpoint_file} not found')
        model.restore_weights(restore_path=checkpoint_file)

    return model
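
# Usage sketch (illustrative, not part of the original source): this version adds nemo_file,
# which restores a Megatron-BERT encoder from a .nemo checkpoint and strips its
# pretraining-only heads; without it the HuggingFace path is used as before. The model name
# and checkpoint path below are assumed values for the example.
def _example_get_lm_model_from_nemo_file(trainer=None):
    megatron_encoder = get_lm_model(
        pretrained_model_name='megatron-bert-cased',  # assumed name, for illustration
        nemo_file='megatron_bert.nemo',  # assumed path, for illustration
        trainer=trainer,
    )
    return megatron_encoder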