def __init__(self, auto_model: str, auto_path: str):
    super().__init__()
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer

        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer

        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer

        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer

        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer

        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False

    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
def __init__(self,
             vocabs: Dict[str, Vocabulary],
             config: Config,
             pre_load_model: bool = True):
    super().__init__(config=config)

    if pre_load_model:
        self.xlm = XLMModel.from_pretrained(self.config.model_name,
                                            output_hidden_states=True)
    else:
        xlm_config = XLMConfig.from_pretrained(self.config.model_name,
                                               output_hidden_states=True)
        self.xlm = XLMModel(xlm_config)

    self.source_lang_id = self.xlm.config.lang2id.get(
        self.config.source_language)
    self.target_lang_id = self.xlm.config.lang2id.get(
        self.config.target_language)
    if None in (self.source_lang_id, self.target_lang_id):
        raise ValueError(
            f'Invalid lang_id for XLM model.'
            f' Valid ids are: {self.xlm.config.lang2id.keys()}')

    self.mlp = None
    if self.config.use_mlp:
        self.mlp = nn.Sequential(
            nn.Linear(self.xlm.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
        )
        output_size = self.config.hidden_size
    else:
        output_size = self.xlm.config.hidden_size

    self._sizes = {
        const.TARGET: output_size,
        const.TARGET_LOGITS: output_size,
        const.TARGET_SENTENCE: 2 * output_size,
        const.SOURCE: output_size,
        const.SOURCE_LOGITS: output_size,
    }

    self.vocabs = {
        const.TARGET: vocabs[const.TARGET],
        const.SOURCE: vocabs[const.SOURCE],
    }

    self.output_embeddings = self.xlm.embeddings

    if self.config.freeze:
        for param in self.xlm.parameters():
            param.requires_grad = False
def get_attentions():
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')

    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:  # BERT
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

    inputs = tokenizer.encode_plus(source, target, return_tensors='pt',
                                   add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)

    return {'attention': format_attention(attention)[0].tolist(),
            'source': tokens,
            'target': tokens}
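The endpoint above relies on a format_attention helper that is not shown here. A minimal sketch of what such a helper might look like, assuming each element of attention is a per-layer tensor of shape (batch, heads, seq_len, seq_len); the real helper in the surrounding project may differ.

import torch

def format_attention(attention):
    # Hypothetical stand-in for the helper used above: stack the tuple of
    # per-layer attention tensors, each (batch, heads, seq_len, seq_len),
    # into one tensor of shape (batch, layers, heads, seq_len, seq_len).
    return torch.stack(attention, dim=1)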
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.transformer = XLMModel(config)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.dropout = nn.Dropout(0.1)
    # Initialize weights after all submodules are registered.
    self.init_weights()
def get_transformers_model(
    settings: Dict[str, Any],
    model_name: str,
    pretrained: bool = True,
    ckptdir: Optional[Path] = None,
) -> PreTrainedModel:
    model_path = model_name if pretrained else str(ckptdir)
    config = AutoConfig.from_pretrained(model_path)
    config.attention_probs_dropout_prob = settings.get(
        'encoder_attn_dropout_rate', 0.1)
    config.hidden_dropout_prob = settings.get('encoder_ffn_dropout_rate', 0.1)
    config.layer_norm_eps = settings.get('layer_norm_eps', 1e-5)

    if pretrained:
        model = AutoModel.from_pretrained(model_name, config=config)
        return model

    # To build only the model structure (no pretrained parameters),
    # the concrete model class for each architecture is needed.
    if 'xlm' in model_name:
        model = XLMModel(config=config)
    elif 'albert' in model_name:
        model = AlbertModel(config=config)
    elif 'roberta' in model_name:
        model = RobertaModel(config=config)
    elif 'deberta-v2' in model_name:
        model = DebertaV2Model(config=config)
    elif 'deberta' in model_name:
        model = DebertaModel(config=config)
    elif 'bert' in model_name:
        model = BertModel(config=config)
    elif 'electra' in model_name:
        model = ElectraModel(config=config)
    else:
        model = BertModel(config=config)
    return model
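A minimal usage sketch for get_transformers_model; the settings keys are the ones read above, while the model name and values are illustrative.

settings = {'encoder_attn_dropout_rate': 0.2, 'encoder_ffn_dropout_rate': 0.2}
model = get_transformers_model(settings, 'bert-base-uncased', pretrained=True)
print(model.config.hidden_dropout_prob)  # 0.2, overridden from the settings dict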
def xlm_model():
    config = XLMConfig(
        vocab_size=93000,
        emb_dim=32,
        n_layers=5,
        n_heads=4,
        dropout=0.1,
        max_position_embeddings=512,
        lang2id={
            "ar": 0, "bg": 1, "de": 2, "el": 3, "en": 4,
            "es": 5, "fr": 6, "hi": 7, "ru": 8, "sw": 9,
            "th": 10, "tr": 11, "ur": 12, "vi": 13, "zh": 14,
        },
    )
    return XLMModel(config=config)
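A quick forward pass through this randomly initialized fixture to confirm the output shape; the token ids and language below are illustrative.

import torch

model = xlm_model()
input_ids = torch.tensor([[0, 1, 2, 3]])                      # arbitrary ids < vocab_size
langs = torch.full_like(input_ids, model.config.lang2id["en"])
outputs = model(input_ids, langs=langs)
print(outputs[0].shape)  # torch.Size([1, 4, 32]): (batch, seq_len, emb_dim)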
def test_model(modelname):
    model, log = XLMModel.from_pretrained(modelname, output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)
    # This line is important: by default, XLMTokenizer removes diacritics,
    # even with the do_lower_case=False flag.
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(log)
def __init__(self, config):
    super(XLMForMultiLabelSequenceClassification, self).__init__(config)
    self.num_labels = config.num_labels

    self.transformer = XLMModel(config)
    self.sequence_summary = SequenceSummary(config)

    self.init_weights()
def create_and_check_xlm_model(self, config, input_ids, token_type_ids,
                               input_lengths, sequence_labels, token_labels,
                               is_impossible_labels, input_mask):
    model = XLMModel(config=config)
    model.eval()
    outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
    outputs = model(input_ids, langs=token_type_ids)
    outputs = model(input_ids)
    sequence_output = outputs[0]
    result = {
        "sequence_output": sequence_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
def get_model_and_tokenizer(model_name, device, random_weights=False):
    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600,
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        # Read the embedding size from the config; the original code left
        # emb_dim unset in this branch, which breaks the return statement.
        emb_dim = model.config.emb_dim
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()

    if random_weights:
        print('Randomizing weights')
        model.init_weights()

    return model, tokenizer, sep, emb_dim
def init(args):
    BERTTool.multi_bert = XLMModel.from_pretrained(args.multi_bert.location)
    BERTTool.multi_tokener = XLMTokenizer.from_pretrained(args.multi_bert.location)
    BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(["<pad>"])[0]
    BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(["</s>"])[0]
    BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(["<s>"])[0]
def get_model_and_tokenizer(model_name, device="cpu", random_weights=False,
                            model_path=None):
    """
    model_path: if given, initialize from path instead of official repo
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # Use init_model here as well so that model_path is honored for RoBERTa.
        model = RobertaModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
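A minimal call of get_model_and_tokenizer, here with the English XLM checkpoint; it only illustrates the returned triple and the subword-separator convention.

model, tokenizer, sep = get_model_and_tokenizer("xlm-mlm-en-2048", device="cpu")
tokens = tokenizer.tokenize("puppeteer")
print(tokens)   # XLM BPE pieces; word-final pieces end with the '</w>' marker (sep)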
def __init__(self, model_type):
    """Constructor

    :param model_type: which model is used, xlm or mbert
    """
    if model_type == 'xlm':
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
        model = XLMModel.from_pretrained('xlm-mlm-100-1280')
        self.embeddings = model.embeddings.weight
    elif model_type == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
        model = BertModel.from_pretrained('bert-base-multilingual-uncased')
        self.embeddings = model.embeddings.word_embeddings.weight
    self.emb_dim = self.embeddings.shape[1]
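The constructor above only stores the static embedding matrix. A standalone sketch of how those weights could be used to look up subword vectors for a word, assuming the same 'xlm-mlm-100-1280' checkpoint.

from transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
model = XLMModel.from_pretrained('xlm-mlm-100-1280')
embeddings = model.embeddings.weight                       # (vocab_size, 1280)

ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('bonjour'))
word_vector = embeddings[ids].mean(dim=0)                  # average over subword pieces
print(word_vector.shape)                                   # torch.Size([1280])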
def build_model(model, time_pooling, layer_pooling, layer, new_num_tokens,
                device, **kwargs):
    n_class = 2
    if model == 'mbert':
        base_model = BertModel.from_pretrained('bert-base-multilingual-uncased',
                                               output_hidden_states=True,
                                               **kwargs)
    elif model == 'xlm':
        base_model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                              output_hidden_states=True,
                                              **kwargs)
    base_model.resize_token_embeddings(new_num_tokens)  # all transformers models
    model = PoolClassifier(base_model, n_class, time_pooling, layer_pooling, layer)
    return model.to(device)
def __init__(self, config):
    BertPreTrainedModel.__init__(self, config)
    XLMPreTrainedModel.__init__(self, config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.transformer = XLMModel(config)
    self.dropout = nn.Dropout(0.1)
    # The classifier consumes the concatenation of the BERT and XLM hidden states.
    self.classifier = nn.Linear(config.hidden_size + config.hidden_size,
                                config.num_labels)
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels_list
    self.transformer = XLMModel(config)
    self.dropout = nn.Dropout(config.dropout)
    self.pooler = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size),
        nn.Tanh())
    self.classifiers = nn.ModuleList([
        nn.Linear(config.hidden_size, num_label)
        for num_label in self.num_labels
    ])
    self.init_weights()
def __init__(self, model_type):
    """Constructor

    :param model_type: whether an xlm or a bert model is used
    """
    # Instantiate model and tokenizer from pre-trained multilingual versions
    if model_type == 'xlm':
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
        self.model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                              output_hidden_states=True)
    elif model_type == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-uncased')
        self.model = BertModel.from_pretrained(
            'bert-base-multilingual-uncased', output_hidden_states=True)
    else:
        raise ValueError(
            'Unrecognized model type. Only bert and xlm supported')
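A usage sketch for the wrapper above. The class name is not shown, so MultilingualEncoder is a placeholder, and the output indexing assumes the tuple-style returns of older transformers versions (as in the other snippets here).

import torch

encoder = MultilingualEncoder('xlm')          # placeholder name for the class above
ids = torch.tensor([encoder.tokenizer.encode("Hello world")])
with torch.no_grad():
    outputs = encoder.model(ids)
hidden_states = outputs[-1]                   # tuple: embedding layer + one tensor per block
print(len(hidden_states), hidden_states[-1].shape)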
def __init__(self, config: Munch = None, **kwargs):
    """ Initialize a new XLM synapse module.

    Args:
        config (:obj:`munch.Munch`, `required`): munched config class.
    """
    super(XLMSynapse, self).__init__(config=config, **kwargs)
    if config is None:
        config = XLMSynapse.default_config()
    bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
    XLMSynapse.check_config(config)
    self.config = config

    # Build config.
    xlm_config = XLMConfig(
        vocab_size=bittensor.__vocab_size__,
        emb_dim=bittensor.__network_dim__,
        n_layers=config.synapse.n_layers,
        n_heads=config.synapse.n_heads,
        # More needed
    )

    # model layer: encodes tokenized sequences to network dim.
    self.xlm = XLMModel(xlm_config)

    # pooler layer: pools the hidden units for use by the pkm dendrite rpc query.
    self.pooler = XLMPooler(xlm_config)

    # router: (PKM layer) queries network using embeddings as context.
    self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

    # hidden layer: transforms context and encoding to network dimension hidden units.
    self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

    # target layer: maps from hidden layer to vocab dimension for each token.
    self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False)

    # Loss function.
    self.loss_fct = nn.CrossEntropyLoss()

    self.to(self.device)
def xlm_convert_to_huggingface(args):
    """
    Given Facebook's XLM model checkpoint and a BPE merges file, create and
    save a HuggingFace XLMTokenizer and an XLMModel.
    """
    xlm_pth = torch.load(args.checkpoint, map_location=torch.device('cpu'))

    with NamedTemporaryFile() as tfile:
        tfile.write(b'{}')
        tfile.flush()
        tokenizer = XLMTokenizer(
            tfile.name, args.merges, do_lowercase_and_remove_accent=False)
    tokenizer.encoder = convert_vocab(xlm_pth['dico_word2id'])
    vocab_size = len(tokenizer)

    params = xlm_pth['params']
    xlm_config = XLMConfig(
        emb_dim=params['emb_dim'],
        vocab_size=params['n_words'],
        n_layers=params['n_layers'],
        n_heads=params['n_heads'],
        n_langs=params['n_langs'],
        sinusoidal_embeddings=params['sinusoidal_embeddings'],
        use_lang_emb=params['use_lang_emb'],
        is_encoder=params['encoder_only'],
        output_hidden_states=True,
        n_words=params['n_words'],
    )

    # Provide both config and state dict to model init.
    model = XLMModel.from_pretrained(
        None, config=xlm_config, state_dict=xlm_pth['model'])

    # Save
    save_directory = Path(args.output_dir)
    if not save_directory.exists():
        save_directory.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(save_directory))
    tokenizer.save_pretrained(str(save_directory))
    tokenizer.save_vocabulary(str(save_directory))
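A hypothetical invocation of the conversion helper above; the attribute names mirror what the function reads, and the file paths are purely illustrative.

import argparse

args = argparse.Namespace(
    checkpoint='models/mlm_tlm_xnli15_1024.pth',   # Facebook XLM .pth checkpoint
    merges='codes_xnli_15',                        # BPE merges/codes file (illustrative path)
    output_dir='converted_xlm',
)
xlm_convert_to_huggingface(args)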
def create_and_check_xlm_model(
    self,
    config,
    input_ids,
    token_type_ids,
    input_lengths,
    sequence_labels,
    token_labels,
    is_impossible_labels,
    choice_labels,
    input_mask,
):
    model = XLMModel(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
    result = model(input_ids, langs=token_type_ids)
    result = model(input_ids)
    self.parent.assertEqual(
        result.last_hidden_state.shape,
        (self.batch_size, self.seq_length, self.hidden_size))
def test_model_from_pretrained(self):
    for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = XLMModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """
    model_path: if given, initialize from path instead of official repo

    models typically cached in ~/.cache/torch/transformers/
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define QARiB https://huggingface.co/qarib/bert-base-qarib
    elif model_name.startswith("qarib"):
        model = BertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define AraBERT https://huggingface.co/aubmindlab/bert-base-arabert
    elif model_name.startswith("aubmindlab"):
        model = BertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define ArabicBERT https://huggingface.co/asafaya/bert-base-arabic
    elif model_name.startswith("asafaya"):
        model = BertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define https://huggingface.co/UBC-NLP/MARBERT
    elif model_name.startswith("UBC-NLP"):
        model = BertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("bert-base-multilingual"):
        model = BertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
def test_model_from_pretrained(self):
    for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = XLMModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
def test_xlm_embeddings(): xlm_model: str = "xlm-mlm-en-2048" tokenizer = XLMTokenizer.from_pretrained(xlm_model) model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model, output_hidden_states=True) model.to(flair.device) model.eval() s: str = "Berlin and Munich have a lot of puppeteer to see ." with torch.no_grad(): tokens = tokenizer.tokenize("<s>" + s + "</s>") indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) tokens_tensor = torch.tensor([indexed_tokens]) tokens_tensor = tokens_tensor.to(flair.device) hidden_states = model(tokens_tensor)[-1] first_layer = hidden_states[1][0] assert len(first_layer) == len(tokens) # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 # # <s> 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s> # | | | | | | | \ | / | | | # Berlin and Munich have a lot of puppeteer to see . # # 0 1 2 3 4 5 6 7 8 9 10 def embed_sentence( sentence: str, pooling_operation, layers: str = "1", use_scalar_mix: bool = False, ) -> Sentence: embeddings = XLMEmbeddings( pretrained_model_name_or_path=xlm_model, layers=layers, pooling_operation=pooling_operation, use_scalar_mix=use_scalar_mix, ) flair_sentence = Sentence(sentence) embeddings.embed(flair_sentence) return flair_sentence # First subword embedding sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first") first_token_embedding_ref = first_layer[1].tolist() first_token_embedding_actual = sentence_first_subword.tokens[ 0].embedding.tolist() puppeteer_first_subword_embedding_ref = first_layer[8].tolist() puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[ 7].embedding.tolist() assert first_token_embedding_ref == first_token_embedding_actual assert (puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual) # Last subword embedding sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last") first_token_embedding_ref = first_layer[1].tolist() first_token_embedding_actual = sentence_last_subword.tokens[ 0].embedding.tolist() puppeteer_last_subword_embedding_ref = first_layer[10].tolist() puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[ 7].embedding.tolist() assert first_token_embedding_ref == first_token_embedding_actual assert (puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual) # First and last subword embedding sentence_first_last_subword = embed_sentence( sentence=s, pooling_operation="first_last") first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist() first_token_embedding_actual = sentence_first_last_subword.tokens[ 0].embedding.tolist() puppeteer_first_last_subword_embedding_ref = torch.cat( [first_layer[8], first_layer[10]]).tolist() puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[ 7].embedding.tolist() assert first_token_embedding_ref == first_token_embedding_actual assert (puppeteer_first_last_subword_embedding_ref == puppeteer_first_last_subword_embedding_actual) # Mean of all subword embeddings sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean") first_token_embedding_ref = calculate_mean_embedding([first_layer[1] ]).tolist() first_token_embedding_actual = sentence_mean_subword.tokens[ 0].embedding.tolist() puppeteer_mean_subword_embedding_ref = calculate_mean_embedding( [first_layer[8], first_layer[9], first_layer[10]]).tolist() puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[ 
7].embedding.tolist() assert first_token_embedding_ref == first_token_embedding_actual assert (puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual) # Check embedding dimension when using multiple layers sentence_mult_layers = embed_sentence(sentence="Munich", pooling_operation="first", layers="1,2,3,4") ref_embedding_size = 4 * model.embeddings.embedding_dim actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) assert ref_embedding_size == actual_embedding_size # Check embedding dimension when using multiple layers and scalar mix sentence_mult_layers_scalar_mix = embed_sentence( sentence="Berlin", pooling_operation="first", layers="1,2,3,4", use_scalar_mix=True, ) ref_embedding_size = 1 * model.embeddings.embedding_dim actual_embedding_size = len( sentence_mult_layers_scalar_mix.tokens[0].embedding) assert ref_embedding_size == actual_embedding_size
def test_model_from_pretrained(self):
    cache_dir = "/tmp/transformers_test/"
    for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)
def get_embedding(type_embedding, data): if type_embedding.split('_')[0] == 'BERT' or type_embedding.split( '_')[0] == 'bert': if type_embedding == 'BERT_portuguese_large_neural_mind': path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/' elif type_embedding == 'BERT_portuguese_base_neural_mind': path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/' elif type_embedding == 'bert_base_multilingual_cased': path = 'bert-base-multilingual-cased' elif type_embedding == 'bert_base_multilingual_uncased': data = [x.lower() for x in data] path = 'bert-base-multilingual-uncased' #load tokenizer and model tokenizer = BertTokenizer.from_pretrained(path) model = BertModel.from_pretrained(path, output_hidden_states=True) elif type_embedding.split('_')[0] == 'xlmroberta': if type_embedding == 'xlmroberta_base': path = 'xlm-roberta-base' elif type_embedding == 'xlmroberta_large': path = 'xlm-roberta-large' #load tokenizer and model tokenizer = XLMRobertaTokenizer.from_pretrained(path) model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'xlm': path = 'xlm-mlm-100-1280' #load tokenizer and model tokenizer = XLMTokenizer.from_pretrained(path) model = XLMModel.from_pretrained(path, output_hidden_states=True) ######### #ENGLISH ######### elif type_embedding == 'en_bert_base_uncased': path = 'bert-base-uncased' #load tokenizer and model tokenizer = BertTokenizer.from_pretrained(path) model = BertModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'en_xlm_mlm_enfr_1024': path = 'xlm-mlm-enfr-1024' #load tokenizer and model tokenizer = XLMTokenizer.from_pretrained(path) model = XLMModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'en_xlm_roberta_base': path = 'xlm-roberta-base' #load tokenizer and model tokenizer = XLMRobertaTokenizer.from_pretrained(path) model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'distilbert_base_cased': path = 'distilbert-base-cased' #load tokenizer and model tokenizer = DistilBertTokenizer.from_pretrained(path) model = DistilBertModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'Mobile_Bert': path = 'google/mobilebert-uncased' #load tokenizer and model tokenizer = MobileBertTokenizer.from_pretrained(path) model = MobileBertModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'Electra': path = 'google/electra-small-discriminator' #load tokenizer and model tokenizer = ElectraTokenizer.from_pretrained(path) model = ElectraModel.from_pretrained(path, output_hidden_states=True) elif type_embedding == 'BART': path = 'facebook/bart-large' #load tokenizer and model tokenizer = BartTokenizer.from_pretrained(path) model = BartModel.from_pretrained(path, output_hidden_states=True) # Set the device to GPU (cuda) if available, otherwise stick with CPU device = 'cuda' if torch.cuda.is_available() else 'cpu' list_of_four_last_embeddings = [] list_of_mean = [] for l in data: # Convert the string "granola bars" to tokenized vocabulary IDs input_ids = tokenizer.encode(l) #print(input_ids) # Convert the list of IDs to a tensor of IDs input_ids = torch.LongTensor(input_ids) #print(input_ids) model = model.to(device) input_ids = input_ids.to(device) #print(input_ids) model.eval() # unsqueeze IDs to get batch size of 1 as added dimension input_ids = input_ids.unsqueeze(0) with torch.no_grad(): out = 
model(input_ids=input_ids) # we only want the hidden_states if type_embedding == 'xlm': hidden_states = out[1] else: hidden_states = out[2] #mean of layers sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze() list_of_mean.append(sentence_embedding.tolist()) # get last four layers last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)] # cast layers to a tuple and concatenate over the last dimension cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1) # take the mean of the concatenated vector over the token dimension cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze() list_of_four_last_embeddings.append(cat_sentence_embedding.tolist()) #print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape) #print('list of mean', np.array(list_of_mean).shape) return list_of_mean, list_of_four_last_embeddings
def generate_embedding(objectives, model_name, batch_size=100, output_attention=False): """ Takes in a pandas dataframe and generates embeddings for the text column using the hugging face implemented models - Inputs: pd_dataframe (pandas dataframe): The dataframe containing all text column and their ids model_name (str): name of the model to be used for generating embeddings batch_size (int): batch size to use when generating embeddings for sentences - Output: sentence_embedding (tensor): tensor of shape n by 1024 where n is the number of sentence """ if model_name == "bert": # Load pre-trained bert model (weights) model = BertModel.from_pretrained("bert-base-uncased", output_attentions=output_attention) elif model_name == "xlnet": # Load pre-trained xlnet model (weights) model = XLNetModel.from_pretrained("xlnet-base-cased", output_attentions=output_attention) elif model_name == "xlm": # Load pre-trained xlm model (weights) model = XLMModel.from_pretrained("xlm-mlm-en-2048", output_attentions=output_attention) elif model_name == "electra": # Load pre-trained electra model (weights) model = ElectraModel.from_pretrained( "google/electra-small-discriminator", output_attentions=output_attention) elif model_name == "albert": # Load pre-trained albert model (weights) model = AlbertForMaskedLM.from_pretrained( "albert-base-v2", output_attentions=output_attention) else: print( "Please select an implemented model name. {} doesn't exist".format( model_name)) return sentences_per_batch = batch_size # setting up the device if torch.cuda.is_available(): dev = "cuda:0" else: dev = "cpu" device = torch.device(dev) print("using ", device) # Put the model in "evaluation" mode, meaning feed-forward operation. model.eval() model.to(device) num_sentences = len(objectives) sentence_embedding = [] attention_layers = None if num_sentences > sentences_per_batch: num_batches = num_sentences // sentences_per_batch for i in range(num_batches): start = i * sentences_per_batch end = (i + 1) * sentences_per_batch if i == num_batches - 1: end = num_sentences mini_objective = list(objectives[start:end]) # Convert inputs to PyTorch tensors tokens_tensor = torch.tensor([mini_objective]).squeeze() tokens_tensor = tokens_tensor.to(device) # Predict hidden states features for each layer with torch.no_grad(): encoded_layers = model(tokens_tensor) # taking embeddings of the last layer. # token_vecs` is a tensor with shape [n x k x 1024] token_vecs = encoded_layers[0] # take the vector corresponing to the [CLS] token if it has a cls token. if model_name in ["bert", "albert", "electra"]: sentence_embedding += token_vecs[:, 0, :].tolist() # for those without a cls token, Calculate the average of all k token vectors and adding to the main list else: sentence_embedding += torch.mean(token_vecs, dim=1).tolist() if output_attention is True: attention_layer = [al.tolist() for al in encoded_layers[-1]] attention_layer = np.array(attention_layer) if len(attention_layers) == 0: attention_layers = attention_layer else: attention_layers = np.concatenate( (attention_layers, attention_layer), axis=1) print("Embedding for batch {} out of {} batches Completed.".format( i, num_batches)) else: # Convert inputs to PyTorch tensors tokens_tensor = torch.tensor([objectives]).squeeze() tokens_tensor = tokens_tensor.to(device) # Predict hidden states features for each layer with torch.no_grad(): encoded_layers = model(tokens_tensor) # taking embeddings of the last layer. 
# token_vecs` is a tensor with shape [n x k x 1024] token_vecs = encoded_layers[0] # take the vector corresponing to the [CLS] token if it has a cls token. if model_name in ["bert", "albert", "electra"]: sentence_embedding = token_vecs[:, 0, :].tolist() # for those without a cls token, Calculate the average of all k token vectors and adding to the main list else: sentence_embedding = torch.mean(token_vecs, dim=1).tolist() if output_attention is True: attention_layers = [al.tolist() for al in encoded_layers[-1]] attention_layers = np.array(attention_layers) print( "Our final sentence embedding vector of shape:", len(sentence_embedding), len(sentence_embedding[0]), ) if output_attention: print("And the corresponding attention vector of shape:", attention_layers.shape) return sentence_embedding, attention_layers
class XLMEncoder(MetaModule): """XLM model using Hugging Face's transformers library. The following command was used to fine-tune XLM on the in-domain data (retrieved from .pth file):: python train.py --exp_name tlm_clm --dump_path './dumped/' \ --data_path '/mnt/shared/datasets/kiwi/parallel/en_de_indomain' \ --lgs 'ar-bg-de-el-en-es-fr-hi-ru-sw-th-tr-ur-vi-zh' \ --clm_steps 'en-de,de-en' --mlm_steps 'en-de,de-en' \ --reload_model 'models/mlm_tlm_xnli15_1024.pth' --encoder_only True \ --emb_dim 1024 --n_layers 12 --n_heads 8 --dropout '0.1' \ --attention_dropout '0.1' --gelu_activation true --batch_size 32 \ --bptt 256 --optimizer 'adam_inverse_sqrt,beta1=0.9,beta2=0.98,lr=0.0001,weight_decay=0' \ --epoch_size 200000 --validation_metrics _valid_mlm_ppl --max_vocab 95000 \ --tokens_per_batch 1200 --exp_id "5114" Old version was converted using hf-transformers util method:: convert_xlm_checkpoint_to_pytorch( self.config.model_name / 'indomain.pth', self.config.model_name / 'finetuned_wmt_en-de' ) Old settings in QE not really used for the best run and submission: .. code-block:: yaml fb-causal-lambda: 0.0 fb-keep-prob: 0.1 fb-mask-prob: 0.8 fb-model: data/trained_models/fb_pretrain/xnli/indomain.pth fb-pred-prob: 0.15 fb-rand-prob: 0.1 fb-src-lang: en fb-tgt-lang: de fb-tlm-lambda: 0.0 fb-vocab: data/trained_models/fb_pretrain/xnli/vocab_xnli_15.txt """ class Config(BaseConfig): encode_source: bool = False model_name: Union[str, Path] = 'xlm-mlm-tlm-xnli15-1024' """Pre-trained XLM model to use.""" source_language: str = 'en' target_language: str = 'de' use_mismatch_features: bool = False """Use Alibaba's mismatch features.""" use_predictor_features: bool = False """Use features originally proposed in the Predictor model.""" interleave_input: bool = False """Concatenate SOURCE and TARGET without internal padding (111222000 instead of 111002220)""" freeze: bool = False """Freeze XLM during training.""" use_mlp: bool = True """Apply a linear layer on top of XLM.""" hidden_size: int = 100 """Size of the linear layer on top of XLM.""" @validator('model_name', pre=True) def fix_relative_path(cls, v): if v not in XLM_PRETRAINED_MODEL_ARCHIVE_LIST: v = Path(v) if not v.is_absolute(): v = Path.cwd().joinpath(v) return v @validator('use_mismatch_features', 'use_predictor_features', pre=True) def no_implementation(cls, v): if v: raise NotImplementedError('Not yet implemented') return False def __init__(self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True): super().__init__(config=config) if pre_load_model: self.xlm = XLMModel.from_pretrained(self.config.model_name, output_hidden_states=True) else: xlm_config = XLMConfig.from_pretrained(self.config.model_name, output_hidden_states=True) self.xlm = XLMModel(xlm_config) self.source_lang_id = self.xlm.config.lang2id.get( self.config.source_language) self.target_lang_id = self.xlm.config.lang2id.get( self.config.target_language) if None in (self.source_lang_id, self.target_lang_id): raise ValueError( f'Invalid lang_id for XLM model.' 
f' Valid ids are: {self.xlm.config.lang2id.keys()}') self.mlp = None if self.config.use_mlp: self.mlp = nn.Sequential( nn.Linear(self.xlm.config.hidden_size, self.config.hidden_size), nn.Tanh(), ) output_size = self.config.hidden_size else: output_size = self.xlm.config.hidden_size self._sizes = { const.TARGET: output_size, const.TARGET_LOGITS: output_size, const.TARGET_SENTENCE: 2 * output_size, const.SOURCE: output_size, const.SOURCE_LOGITS: output_size, } self.vocabs = { const.TARGET: vocabs[const.TARGET], const.SOURCE: vocabs[const.SOURCE], } self.output_embeddings = self.xlm.embeddings if self.config.freeze: for param in self.xlm.parameters(): param.requires_grad = False def load_state_dict( self, state_dict: Union[Dict[str, Tensor], Dict[str, Tensor]], strict: bool = True, ): try: keys = super().load_state_dict(state_dict, strict) except RuntimeError as e: if "position_ids" in str(e): # FIXME: hack to get around Transformers 3.1 breaking changes # https://github.com/huggingface/transformers/issues/6882 self.xlm.embeddings._non_persistent_buffers_set.add( 'position_ids') keys = super().load_state_dict(state_dict, strict) self.xlm.embeddings._non_persistent_buffers_set.discard( 'position_ids') else: raise e return keys @classmethod def input_data_encoders(cls, config: Config): return { const.SOURCE: XLMTextEncoder(tokenizer_name=config.model_name), const.TARGET: XLMTextEncoder(tokenizer_name=config.model_name), } def size(self, field=None): if field: return self._sizes[field] return self._sizes def forward( self, batch_inputs, *args, include_target_logits=False, include_source_logits=False, ): # XLM gets it's input as a concatenation of both embeddings # or as an interleave of inputs if self.config.interleave_input: merge_input_fn = self.interleave_input else: merge_input_fn = self.concat_input input_ids, _, attention_mask, position_ids, lang_ids = merge_input_fn( batch_a=batch_inputs[const.SOURCE], batch_b=batch_inputs[const.TARGET], pad_id=self.vocabs[const.TARGET].pad_id, lang_a=self.source_lang_id, lang_b=self.target_lang_id, ) # encoded_layers also includes the embedding layer # encoded_layers[-1] is the last layer last_layer, encoded_layers = self.xlm( input_ids=input_ids, attention_mask=attention_mask, langs=lang_ids, position_ids=position_ids, ) # TODO: select one of these strategies via cli # TODO: get a BETTER strategy # features = sum(encoded_layers[-5:-1]) # features = encoded_layers[-2] features = last_layer if self.config.use_mlp: features = self.mlp(features) # Build the feature dictionary to be returned to the system output_features = self.split_outputs( features, batch_inputs, interleaved=self.config.interleave_input, label_a=const.SOURCE, label_b=const.TARGET, ) # Convert pieces to tokens output_features[const.TARGET] = pieces_to_tokens( output_features[const.TARGET], batch_inputs[const.TARGET]) output_features[const.SOURCE] = pieces_to_tokens( output_features[const.SOURCE], batch_inputs[const.SOURCE]) source_len = batch_inputs[const.SOURCE].bounds_lengths target_len = batch_inputs[const.TARGET].bounds_lengths # NOTE: assuming here that features is already split into target and source source_features = output_features[const.SOURCE] target_features = output_features[const.TARGET] # Sentence-level features sentence_target_features = target_features[:, 0].unsqueeze( 1) + select_positions(target_features, (target_len - 1).unsqueeze(1)) sentence_source_features = source_features[:, 0].unsqueeze( 1) + select_positions(source_features, (source_len - 1).unsqueeze(1)) 
sentence_features = torch.cat( (sentence_target_features, sentence_source_features), dim=-1) output_features[const.TARGET_SENTENCE] = sentence_features output_features[const.TARGET] = target_features output_features[const.SOURCE] = source_features # Logits for multi-task fine-tuning if include_target_logits: output_features[const.TARGET_LOGITS] = torch.einsum( 'vh,bsh->bsv', self.output_embeddings.weight, output_features[const.TARGET], ) if include_source_logits: output_features[const.SOURCE_LOGITS] = torch.einsum( 'vh,bsh->bsv', self.output_embeddings.weight, output_features[const.SOURCE], ) # Additional features if self.config.use_mismatch_features: raise NotImplementedError return output_features @staticmethod def concat_input(batch_a, batch_b, pad_id, lang_a=None, lang_b=None): """Concatenate tensors of two batches into one tensor. Return: the concatenation, a mask of types (a as zeroes and b as ones) and concatenation of attention_mask. """ ids_a = batch_a.tensor ids_b = batch_b.tensor attention_mask_a = retrieve_tokens_mask(batch_a) attention_mask_b = retrieve_tokens_mask(batch_b) types_a = torch.zeros_like(ids_a) types_b = torch.ones_like(ids_b) position_ids_a = torch.arange(ids_a.size(1), dtype=torch.long, device=ids_a.device) position_ids_a = position_ids_a.unsqueeze(0).expand(ids_a.size()) position_ids_b = torch.arange(ids_b.size(1), dtype=torch.long, device=ids_b.device) position_ids_b = position_ids_b.unsqueeze(0).expand(ids_b.size()) input_ids = torch.cat((ids_a, ids_b), dim=1) token_type_ids = torch.cat((types_a, types_b), dim=1) attention_mask = torch.cat((attention_mask_a, attention_mask_b), dim=1) position_ids = torch.cat((position_ids_a, position_ids_b), dim=1) if lang_a is not None and lang_b is not None: lang_id_a = torch.ones_like(ids_a) * lang_a lang_id_b = torch.ones_like(ids_b) * lang_b lang_ids = torch.cat((lang_id_a, lang_id_b), dim=1) # lang_ids *= attention_mask.unsqueeze(-1).to(lang_ids.dtype) lang_ids *= attention_mask.to(lang_ids.dtype) return input_ids, token_type_ids, attention_mask, position_ids, lang_ids return input_ids, token_type_ids, attention_mask, position_ids @staticmethod def interleave_input(batch_a, batch_b, pad_id, lang_a=None, lang_b=None): """Interleave the source + target embeddings into one tensor. This means making the input as [batch, target [SEP] source]. Return: interleave of embds, mask of target (as zeroes) and source (as ones) and concatenation of attention_mask. """ ids_a = batch_a.tensor ids_b = batch_b.tensor batch_size = ids_a.size(0) lengths_a = batch_a.lengths lengths_b = batch_b.lengths # max_pair_length = ids_a.size(1) + ids_b.size(1) max_pair_length = lengths_a + lengths_b input_ids = torch.full( (batch_size, max_pair_length), pad_id, dtype=ids_a.dtype, device=ids_a.device, ) token_type_ids = torch.zeros_like(input_ids) attention_mask = torch.zeros_like(input_ids) for i in range(batch_size): # <s> and </s> are included in the mask (=1) len_a = lengths_a[i].item() len_b = lengths_b[i].item() input_ids[i, :len_b] = ids_b[i, :len_b] token_type_ids[i, :len_b] = 0 attention_mask[i, :len_b] = 1 input_ids[i, len_b:len_b + len_a] = ids_a[i, :len_a] token_type_ids[i, len_b:len_b + len_a] = 1 attention_mask[i, len_b:len_b + len_a] = 1 # TODO, why is attention mask 1 for all positions? return input_ids, token_type_ids, attention_mask @staticmethod def split_outputs( features: torch.Tensor, batch_inputs, interleaved: bool = False, label_a: str = const.SOURCE, label_b: str = const.TARGET, ): """Split contexts to get tag_side outputs. 
Arguments: features (tensor): XLM output: <s> source </s> </s> target </s> Shape of (bs, 1 + source_len + 2 + target_len + 1, 2) batch_inputs: interleaved (bool): whether the concat strategy was 'interleaved'. label_a: dictionary key for sequence A in ``features``. label_b: dictionary key for sequence B in ``features``. Return: dict of tensors, one per tag side. """ outputs = OrderedDict() if interleaved: raise NotImplementedError('interleaving not supported.') # TODO: fix code below to use the lengths information and not bounds # if interleaved, shift each source sample by its correspondent length lengths_a = batch_inputs[const.TARGET].lengths shift = lengths_a.unsqueeze(-1) range_vector = torch.arange(features.size(0), device=features.device).unsqueeze(1) target_bounds = batch_inputs[const.TARGET].bounds features_a = features[range_vector, target_bounds] # Shift bounds by target length and preserve padding source_bounds = batch_inputs[const.SOURCE].bounds m = (source_bounds != -1).long() # for masking out padding (which is -1) shifted_bounds = (source_bounds + shift) * m + source_bounds * (1 - m) features_b = features[range_vector, shifted_bounds] else: # otherwise, shift all by max_length lengths_a = batch_inputs[label_a].lengths # if we'd like to maintain the word pieces we merely select all features_a = features[:, :lengths_a.max()] features_b = features[:, lengths_a.max():] outputs[label_a] = features_a outputs[label_b] = features_b return outputs
def tokenizer_and_model(type_embedding): ######### #PORTUGUESE ######### if type_embedding.split('_')[0] == 'BERT' or type_embedding.split( '_')[0] == 'bert': if type_embedding == 'BERT_portuguese_large_neural_mind': path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/' elif type_embedding == 'BERT_portuguese_base_neural_mind': path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/' elif type_embedding == 'bert_base_multilingual_cased': path = 'bert-base-multilingual-cased' elif type_embedding == 'bert_base_multilingual_uncased': path = 'bert-base-multilingual-uncased' #load tokenizer and model tokenizer = BertTokenizer.from_pretrained(path) model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True) special_tokens_dict = { 'additional_special_tokens': ['[USER]', '[SYSTEM]'] } orig_num_tokens = len(tokenizer) num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict) total_num_tokens = orig_num_tokens + num_added_tokens model.resize_token_embeddings(total_num_tokens) elif type_embedding.split('_')[0] == 'xlmroberta': if type_embedding == 'xlmroberta_base': path = 'xlm-roberta-base' elif type_embedding == 'xlmroberta_large': path = 'xlm-roberta-large' #load tokenizer and model tokenizer = XLMRobertaTokenizer.from_pretrained(path) model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'xlm': path = 'xlm-mlm-100-1280' #load tokenizer and model tokenizer = XLMTokenizer.from_pretrained(path) model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True) ######### #ENGLISH ######### elif type_embedding == 'en_bert_base_uncased': path = 'bert-base-uncased' #load tokenizer and model tokenizer = BertTokenizer.from_pretrained(path) model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'en_xlm_mlm_enfr_1024': path = 'xlm-mlm-enfr-1024' #load tokenizer and model tokenizer = XLMTokenizer.from_pretrained(path) model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'en_xlm_roberta_base': path = 'xlm-roberta-base' #load tokenizer and model tokenizer = XLMRobertaTokenizer.from_pretrained(path) model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'distilbert_base_cased': path = 'distilbert-base-cased' #load tokenizer and model tokenizer = DistilBertTokenizer.from_pretrained(path) model = DistilBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'Mobile_Bert': path = 'google/mobilebert-uncased' #load tokenizer and model tokenizer = MobileBertTokenizer.from_pretrained(path) model = MobileBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'Electra': path = 'google/electra-small-discriminator' #load tokenizer and model tokenizer = ElectraTokenizer.from_pretrained(path) model = ElectraModel.from_pretrained(path, output_hidden_states=True, return_dict=True) elif type_embedding == 'BART': path = 'facebook/bart-large' #load tokenizer and model tokenizer = BartTokenizer.from_pretrained(path) model = BartModel.from_pretrained(path, output_hidden_states=True, return_dict=True) return tokenizer, model