def __init__(self, auto_model: str, auto_path: str):
    """Load a pretrained transformer encoder and its matching tokenizer,
    freeze all encoder parameters, and record embedding metadata.

    :param auto_model: family name used for substring dispatch; the order of
        the checks matters ("camembert" before "bert", "flaubert" before "xlm")
    :param auto_path: path or hub id handed to ``from_pretrained``
    """
    super().__init__()
    # Imports are kept local so transformers is only required when this class is used.
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        # FlauBERT is cased; the XLM tokenizer default would lowercase and strip accents.
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        # Fallback: let AutoModel/AutoTokenizer resolve the architecture.
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False
    # The encoder is used as a fixed feature extractor: freeze everything.
    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
def get_attentions():
    """HTTP handler: run the requested model on the (source, target) pair and
    return its attention weights together with the token lists."""
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')

    if model_name == 'XLM':
        version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(version)
    elif model_name == 'GPT-2':
        version = 'gpt2'
        model = GPT2Model.from_pretrained(version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(version)
    else:  # BERT
        version = 'bert-base-uncased'
        model = BertModel.from_pretrained(version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(version, do_lower_case=True)

    encoded = tokenizer.encode_plus(source, target, return_tensors='pt',
                                    add_special_tokens=True)
    # Attention tensors are the last element of the model output tuple.
    attention = model(encoded['input_ids'],
                      token_type_ids=encoded['token_type_ids'])[-1]
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0].tolist())
    return {'attention': format_attention(attention)[0].tolist(),
            'source': tokens,
            'target': tokens}
def load_model_tokenizer(self, pretrained):
    """
    Load transformer model and tokenizer for given pre-trained name

    :param pretrained: pre-trained name
    :return: (model, tokenizer) — (None, None) when the method or name is unknown
    """
    method = self.method
    if method == "T5" and pretrained in T5_PRETRAINED_MODELS:
        return (T5ForConditionalGeneration.from_pretrained(pretrained),
                T5Tokenizer.from_pretrained(pretrained))
    if method == "BART" and pretrained in BART_PRETRAINED_MODELS:
        return (BartForConditionalGeneration.from_pretrained(pretrained),
                BartTokenizer.from_pretrained(pretrained))
    if method == "GPT-2" and pretrained in GPT2_PRETRAINED_MODELS:
        model = GPT2LMHeadModel.from_pretrained(pretrained)
        # Generation length must be capped explicitly for the LM-head models.
        model.config.max_length = self.max_length
        return model, GPT2Tokenizer.from_pretrained(pretrained)
    if method == "XLM" and pretrained in XLM_PRETRAINED_MODELS:
        model = XLMWithLMHeadModel.from_pretrained(pretrained)
        model.config.max_length = self.max_length
        return model, XLMTokenizer.from_pretrained(pretrained)
    return None, None
def get_tokenizer(tokenizer_name):
    """Return the tokenizer instance matching *tokenizer_name* (None if unknown)."""
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    uncased = tokenizer_name.endswith("uncased")
    if tokenizer_name.startswith("bert-"):
        return BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=uncased)
    if tokenizer_name.startswith("roberta-"):
        return RobertaTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("albert-"):
        return AlbertTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("xlnet-"):
        return XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=uncased)
    if tokenizer_name.startswith("openai-gpt"):
        return OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("gpt2"):
        return GPT2Tokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        return MosesTokenizer()
    if tokenizer_name.startswith("xlm-"):
        return XLMTokenizer.from_pretrained(tokenizer_name)
    if tokenizer_name == "MosesTokenizer":
        return MosesTokenizer()
    if tokenizer_name == "SplitChars":
        return SplitCharsTokenizer()
    if tokenizer_name == "":
        return SpaceTokenizer()
    return None
def __init__(
    self,
    pretrained_embedding=None,
    architecture_function=None,
    text_input_column="clean_text",
    meta_input_list=("extension", "dayofweek", "hour", "min"),
    vocab_size=25000,
    seq_size=100,
    embedding_dim=200,
    loss="categorical_crossentropy",
    activation="softmax",
    batch_size=4096,
    n_epochs=15,
    bert_tokenizer="jplu/tf-camembert-base",
    bert_model="jplu/tf-camembert-base",
    **kwargs,
):
    """Configure the model and pick a tokenizer matching the architecture.

    :param architecture_function: callable building the Keras model; when its
        name is "bert_model" a HuggingFace tokenizer is loaded instead of the
        in-house Tokenizer
    :param bert_tokenizer: HuggingFace id of the tokenizer (camembert/flaubert)
    :raises ModuleNotFoundError: when transformers is not installed
    :raises NotImplementedError: for an unsupported bert tokenizer
    """
    self.architecture_function = architecture_function
    self.pretrained_embedding = pretrained_embedding
    if self.architecture_function.__name__ != "bert_model":
        self.tokenizer = Tokenizer(input_column=text_input_column)
    elif "camembert" in bert_tokenizer.lower():
        # Prevent the HuggingFace dependency
        try:
            from transformers import CamembertTokenizer
            self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
        except ModuleNotFoundError:
            # BUG FIX: this previously did `raise ("""...""")`, which raises the
            # str itself and crashes with TypeError instead of showing the message.
            raise ModuleNotFoundError(
                """Please install transformers 3.4.0 (only version currently supported) pip install melusine[transformers]""")
    elif "flaubert" in bert_tokenizer.lower():
        # Prevent the HuggingFace dependency
        try:
            from transformers import XLMTokenizer
            self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
        except ModuleNotFoundError:
            # BUG FIX: same str-raise defect as above.
            raise ModuleNotFoundError(
                """Please install transformers 3.4.0 (only version currently supported) pip install melusine[transformers]""")
    else:
        raise NotImplementedError(
            "Bert tokenizer {} not implemented".format(bert_tokenizer))
    self.text_input_column = text_input_column
    self.meta_input_list = meta_input_list
    self.vocab_size = vocab_size
    self.seq_size = seq_size
    self.embedding_dim = embedding_dim
    self.loss = loss
    self.activation = activation
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.bert_model = bert_model
    self.nb_labels = 0
    self.nb_meta_features = 0
    self.vocabulary = []
    self.vocabulary_dict = {}
def _test_TFXLM(self, size, large=False):
    """Round-trip a TFXLM checkpoint of the given *size* through the converter
    and compare the last hidden state against the TF output."""
    from transformers import TFXLMModel, XLMTokenizer

    model = TFXLMModel.from_pretrained(size)
    tokenizer = XLMTokenizer.from_pretrained(size)
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict)
    self.run_test(model, input_dict, input_signature=spec,
                  outputs=["last_hidden_state"], large=large, atol=0.005)
def test_model(modelname):
    """Load an XLM checkpoint with loading diagnostics and print the log
    (all dictionary values are expected to be empty lists)."""
    model, log = XLMModel.from_pretrained(modelname, output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)
    # Important: XLMTokenizer strips diacritics by default even when
    # do_lower_case=False, so it must be disabled explicitly.
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(log)
def test_space_tokenization_and_xlm_uncased_tokenization_normalization():
    """Normalized space- and XLM-tokenizations must yield the same character stream."""
    text = "Jeff Immelt chose to focus on the incomprehensibility of accounting rules ."
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    by_space = text.split(" ")
    by_model = tokenizer.tokenize(text)
    normed_space, normed_model = tn.normalize_tokenizations(
        by_space, by_model, tokenizer)
    assert "".join(normed_space) == "".join(normed_model)
def create_tokenizer(bert_pretrained):
    """ Wrapper function returning a tokenizer for BERT. """
    # XLM checkpoints need the XLM tokenizer; everything else uses BERT's.
    tokenizer_cls = XLMTokenizer if bert_pretrained.startswith("xlm") else BertTokenizer
    return tokenizer_cls.from_pretrained(bert_pretrained)
def init(args):
    """Initialise the shared multilingual model, tokenizer and special-token ids."""
    location = args.multi_bert.location
    BERTTool.multi_bert = XLMModel.from_pretrained(location)
    BERTTool.multi_tokener = XLMTokenizer.from_pretrained(location)

    def token_id(token):
        # convert_tokens_to_ids takes a list and returns a list of ids.
        return BERTTool.multi_tokener.convert_tokens_to_ids([token])[0]

    BERTTool.multi_pad = token_id("<pad>")
    BERTTool.multi_sep = token_id("</s>")
    BERTTool.multi_cls = token_id("<s>")
def get_model_and_tokenizer(model_name, device, random_weights=False):
    """Return (model, tokenizer, subword-separator, embedding-dim) for *model_name*.

    :param model_name: HuggingFace model id; dispatched by prefix
    :param device: torch device the model is moved to
    :param random_weights: when True, re-initialise the model weights
    :raises SystemExit: for an unrecognized model name
    """
    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        # BUG FIX: emb_dim was never assigned in this branch, so the return
        # statement raised UnboundLocalError for every xlm model. Read it from
        # the loaded config instead of hard-coding per checkpoint.
        emb_dim = model.config.emb_dim
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()
    if random_weights:
        print('Randomizing weights')
        model.init_weights()
    return model, tokenizer, sep, emb_dim
def __init__(self):
    """Load the HerBERT tokenizer/model pair, preferring GPU when available."""
    use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda:0" if use_cuda else "cpu")
    self.tokenizer = XLMTokenizer.from_pretrained(
        'allegro/herbert-klej-cased-tokenizer-v1')
    self.model = RobertaModel.from_pretrained(
        'allegro/herbert-klej-cased-v1').to(self.device)
def test_TFXLMForQuestionAnsweringSimple(self):
    """Convert the QA-simple XLM head to ONNX and compare runtime outputs."""
    from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple

    weights = 'xlm-mlm-enfr-1024'
    tokenizer = XLMTokenizer.from_pretrained(weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFXLMForQuestionAnsweringSimple.from_pretrained(weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.

    :param vocab: vocabulary object supporting add_token_to_namespace
    :param tokenizer_name: HuggingFace tokenizer id, dispatched by prefix
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-") or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    # NOTE(review): an unmatched tokenizer_name leaves `tokenizer` unbound and
    # crashes with NameError below — consider raising an explicit error.
    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            # BUG FIX: this check was misspelled "transo-xl-", so TransfoXL
            # tokenizers never received their special tokens.
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess"
    # reorganization; we can pass tokenizer created in model here, see issue <TBD>
    vocab_size = len(
        tokenizer)  # do not use tokenizer.vocab_size, it does not include newly added tokens
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
def get_model_and_tokenizer(model_name, device="cpu", random_weights=False, model_path=None):
    """Return (model, tokenizer, subword-separator) for *model_name*.

    :param model_path: if given, initialize from path instead of official repo
    :raises SystemExit: for an unrecognized model name
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path
    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # BUG FIX: this branch used `model_name` instead of `init_model`,
        # silently ignoring a user-supplied model_path for RoBERTa.
        model = RobertaModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()
    if random_weights:
        print("Randomizing weights")
        model.init_weights()
    return model, tokenizer, sep
def __init__(self, model_type):
    """Constructor

    :param model_type: which model is used, xlm or mbert
    """
    if model_type == 'xlm':
        checkpoint = 'xlm-mlm-100-1280'
        self.tokenizer = XLMTokenizer.from_pretrained(checkpoint)
        backbone = XLMModel.from_pretrained(checkpoint)
        # XLM exposes the token embedding matrix directly on `embeddings`.
        self.embeddings = backbone.embeddings.weight
    elif model_type == 'bert':
        checkpoint = 'bert-base-multilingual-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        backbone = BertModel.from_pretrained(checkpoint)
        self.embeddings = backbone.embeddings.word_embeddings.weight
    self.emb_dim = self.embeddings.shape[1]
def test_TFXLMWithLMHeadModel(self):
    """Convert the XLM LM-head model to ONNX and compare runtime outputs
    (looser tolerances than the base-model test)."""
    from transformers import XLMTokenizer, TFXLMWithLMHeadModel

    weights = 'xlm-mlm-enfr-1024'
    tokenizer = XLMTokenizer.from_pretrained(weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFXLMWithLMHeadModel.from_pretrained(weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files,
                         rtol=1.e-2, atol=1.e-4))
def __init__(self, model_type):
    """Constructor

    :param model_type: if and xlm or bert model is used
    """
    # Instantiate model and tokenizers from pre-trained multilingual versions
    if model_type == 'xlm':
        checkpoint = 'xlm-mlm-100-1280'
        tokenizer_cls, model_cls = XLMTokenizer, XLMModel
    elif model_type == 'bert':
        checkpoint = 'bert-base-multilingual-uncased'
        tokenizer_cls, model_cls = BertTokenizer, BertModel
    else:
        raise ValueError(
            'Unrecognized model type. Only bert and xlm supported')
    self.tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    self.model = model_cls.from_pretrained(checkpoint,
                                           output_hidden_states=True)
def get_embedding_for_text(text: str) -> (torch.tensor, torch.tensor):
    """
    For a given sentence the function return embedding generated by BERT
    :param text: Sentence for which u want to get an embedding
    :return: (tensor of embeddings for each token in sentnece, average embedding of a sentence)
    """
    models_dir = join(dirname(realpath(__file__)), "models")
    tokenizer = XLMTokenizer.from_pretrained(join(models_dir, "tokenizer"))
    bert_model = RobertaModel.from_pretrained(join(models_dir, "bert"))

    encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
    outputs = bert_model(encoded_input)
    # outputs[0]: per-token hidden states; outputs[1]: pooled sentence vector.
    per_token = outputs[0].squeeze(dim=0)
    pooled = outputs[1].squeeze(dim=0)
    return per_token, pooled
def convert_id_to_token(indexed_tokens, model_name):
    """Map nested token-id sequences back to token strings.

    :param indexed_tokens: iterable of id sequences
    :param model_name: one of "bert", "xlnet", "xlm", "electra", "albert"
    :return: list of token lists, one per input sequence
    :raises ValueError: for an unsupported model_name
    """
    if model_name == "bert":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif model_name == "xlnet":
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    elif model_name == "xlm":
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    elif model_name == "electra":
        tokenizer = ElectraTokenizer.from_pretrained(
            "google/electra-small-discriminator")
    elif model_name == "albert":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    else:
        # BUG FIX: an unknown model_name previously fell through and crashed
        # with NameError on `tokenizer`; fail explicitly instead.
        raise ValueError(f"Unsupported model name: {model_name!r}")
    word_tokens = [
        tokenizer.convert_ids_to_tokens(indexed_token)
        for indexed_token in indexed_tokens
    ]
    return word_tokens
def __init__(
    self,
    pretrained_embedding=None,
    architecture_function=None,
    text_input_column="clean_text",
    # BUG FIX: the default was a mutable list shared across all instances;
    # a tuple keeps the same values without the shared-state hazard (and
    # matches the sibling variant of this constructor).
    meta_input_list=("extension", "dayofweek", "hour", "min"),
    vocab_size=25000,
    seq_size=100,
    embedding_dim=200,
    loss="categorical_crossentropy",
    activation="softmax",
    batch_size=4096,
    n_epochs=15,
    bert_tokenizer="jplu/tf-camembert-base",
    bert_model="jplu/tf-camembert-base",
    **kwargs,
):
    """Configure the model and pick a tokenizer matching the architecture.

    :param architecture_function: callable building the model; when its name is
        "bert_model" a HuggingFace tokenizer is loaded instead of Tokenizer
    :param bert_tokenizer: HuggingFace id of the tokenizer (camembert/flaubert)
    :raises NotImplementedError: for an unsupported bert tokenizer
    """
    self.architecture_function = architecture_function
    self.pretrained_embedding = pretrained_embedding
    if self.architecture_function.__name__ != "bert_model":
        self.tokenizer = Tokenizer(input_column=text_input_column)
    elif "camembert" in bert_tokenizer.lower():
        self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
    elif "flaubert" in bert_tokenizer.lower():
        # FlauBERT reuses the XLM tokenizer implementation.
        self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
    else:
        raise NotImplementedError(
            "Bert tokenizer {} not implemented".format(bert_tokenizer))
    self.text_input_column = text_input_column
    self.meta_input_list = meta_input_list
    self.vocab_size = vocab_size
    self.seq_size = seq_size
    self.embedding_dim = embedding_dim
    self.loss = loss
    self.activation = activation
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.bert_model = bert_model
    self.nb_labels = 0
    self.nb_meta_features = 0
    self.vocabulary = []
    self.vocabulary_dict = {}
def __init__(self,
             from_pretrained=None,
             tokenizer="allegro/herbert-klej-cased-tokenizer-v1",
             embed_model="allegro/herbert-klej-cased-v1"):
    """HerBERT encoder with a small regression head in [-1, 1].

    :param from_pretrained: optional name of a bundled head checkpoint to load
    """
    super().__init__()
    self.tokenizer = XLMTokenizer.from_pretrained(tokenizer)
    self.embed_model = RobertaModel.from_pretrained(embed_model,
                                                    return_dict=True)
    head_layers = [
        nn.Dropout(0.5),
        nn.Linear(768, 256),
        nn.LeakyReLU(),
        nn.Linear(256, 16),
        nn.LeakyReLU(),
        nn.Linear(16, 1),
        nn.Tanh(),
    ]
    self.fc = nn.Sequential(*head_layers)
    if from_pretrained is not None:
        # Head weights ship inside the package as binary resources.
        raw = importlib.resources.read_binary(trained_models,
                                              f'{from_pretrained}.pth')
        self.fc.load_state_dict(torch.load(io.BytesIO(raw)))
    self.eval()
def get_embedding_for_list_of_texts(
    list_of_texts: List[str], ) -> (torch.tensor, torch.tensor):
    """
    For a given list of sentences the function return embedding generated by BERT
    :param text: Sentence for which u want to get an embedding
    :return: (tensor of embeddings for each token in sentneces, average embedding of a sentences)
    """
    base_dir = dirname(realpath(__file__))
    tokenizer = XLMTokenizer.from_pretrained(join(base_dir, "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(join(base_dir, "models", "bert"))

    with open(join(base_dir, "emote_to_text.json"), encoding='utf8') as file:
        emote_to_text = json.load(file)

    # Lazily replace emotes with their textual form, then strip URLs.
    cleaned_texts = (
        _remove_urls_from_text(_replace_emotes_with_text(text, emote_to_text))
        for text in list_of_texts)

    sequence_embeddings = []
    sentence_embeddings = []
    for text in cleaned_texts:
        encoded = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        outputs = bert_model(encoded)
        sequence_embeddings.append(outputs[0].squeeze(dim=0))
        sentence_embeddings.append(outputs[1].squeeze(dim=0))

    return merge(sequence_embeddings), torch.stack(sentence_embeddings, dim=0)
def build_tokenizer(model, add_cap_sign, textify_emoji, segment_hashtag, preprocess):
    """Build a tokenizer for 'mbert' or 'xlm', registering optional marker tokens.

    :param model: 'mbert' or 'xlm'
    :param add_cap_sign: add <has_cap>/<all_cap> marker tokens
    :param textify_emoji: add <emoji>/</emoji> marker tokens
    :param segment_hashtag: add <hashtag>/</hashtag> marker tokens
    :param preprocess: optional callable composed in front of tokenize
    :raises ValueError: for an unsupported model
    """
    if model == 'mbert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
    elif model == 'xlm':
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
    else:
        # BUG FIX: an unknown model previously fell through and crashed with
        # NameError on `tokenizer`; fail explicitly instead.
        raise ValueError(f"Unsupported model: {model!r}")
    tokenizer.add_tokens(['@USER'])
    # All Transformers models
    if add_cap_sign:
        tokenizer.add_tokens(['<has_cap>', '<all_cap>'])
    if textify_emoji:
        tokenizer.add_tokens(['<emoji>', '</emoji>'])
    if segment_hashtag:
        tokenizer.add_tokens(['<hashtag>', '</hashtag>'])
    #tokenizer.add_tokens([w.strip() for w in open('../resources/log_odds.txt').readlines()])
    # TODO: the monkey-patched tokenize is not saved when calling `save_pretrained`
    if preprocess is not None:
        tokenizer.tokenize = compose(preprocess, tokenizer.tokenize)
    return tokenizer
def __init__(self, device):
    """Two-class head on top of the fine-tuned politicalHerBERT encoder."""
    super().__init__()
    classifier_layers = [
        nn.Dropout(0.3),
        nn.Linear(768, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, 2),
        nn.Tanh(),
    ]
    self.net = nn.Sequential(*classifier_layers)
    self.device = device
    # Tokenizer comes from the hub; encoder weights from the local fine-tune.
    self.tokenizer = XLMTokenizer.from_pretrained(
        "allegro/herbert-klej-cased-tokenizer-v1")
    self.model = RobertaModel.from_pretrained(
        "models/politicalHerBERT",
        return_dict=True)
rw_vocab = get_vocab(filename, 10000) filename2 = "SUBTLEX-US frequency list with PoS information text version.txt" pos_dict = get_pos_dict(filename2) GPT2 = ModelInfo(GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True), GPT2Tokenizer.from_pretrained('gpt2'), "Ġ", vocab, "GTP2") Roberta = ModelInfo( RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True), RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta") XLM = ModelInfo( XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', return_dict=True), XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM") T5 = ModelInfo( T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True), T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5") Albert = ModelInfo( AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True), AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert") TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'), TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_", vocab, "TXL") if __name__ == "__main__":
) tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False) model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config) # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture elif args.LM == 'XLM': from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel config = XLMConfig(vocab_size=64139, emb_dim=1024, max_position_embeddings=512, n_heads=8, n_layers=6, ) tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False) model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config) # 6-layer, 1024-hidden, 8-heads # XLM English-French model trained on the concatenation of English and French wikipedia else: print('need to define LM from Bert,RoBerta,XLM') print(model) def freeze_layer_fun(freeze_layer): for name, param in model.named_parameters(): if freeze_layer in name: print(name) param.requires_grad = False else:
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """Return (model, tokenizer, subword-separator) for *model_name*.

    model_path: if given, initialize from path instead of official repo
    models typically cached in ~/.cache/torch/transformers/

    :raises SystemExit: for an unrecognized model name
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path
    # All BERT-architecture checkpoints share one loading path:
    # plain BERT / multilingual BERT, QARiB, AraBERT (aubmindlab),
    # ArabicBERT (asafaya) and MARBERT (UBC-NLP).
    bert_like_prefixes = ("bert", "qarib", "aubmindlab", "asafaya", "UBC-NLP")
    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith(bert_like_prefixes):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True
        ).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # BUG FIX: this branch used `model_name` instead of `init_model`,
        # silently ignoring a user-supplied model_path for RoBERTa.
        model = RobertaModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()
    if random_weights:
        print("Randomizing weights")
        model.init_weights()
    return model, tokenizer, sep
from transformers import XLMTokenizer, RobertaModel
from sentimentpl.models import SentimentPLModel
import os
import pandas as pd
import pickle
import numpy as np
import tqdm
import torch

device = torch.device("cuda")
tokenizer = XLMTokenizer.from_pretrained("models/politicalBERT")
model = RobertaModel.from_pretrained("models/politicalBERT", return_dict=True)
model = model.to(device)


def text2vec(text):
    """Embed one text; returns the pooled output as a 1-D numpy array."""
    # BUG FIX: the input tensor stayed on CPU while the model lives on CUDA,
    # and .numpy() was called on a CUDA tensor — both crash at runtime.
    encoded = tokenizer.encode(text, return_tensors='pt').to(device)
    return model(encoded)['pooler_output'].detach().cpu().numpy()[0]


def texts2vec(text):
    """Embed a batch of texts; returns (pooled outputs, last hidden states)."""
    encoded = tokenizer(text, return_tensors='pt', padding=True)
    encoded = {k: v.to(device) for k, v in encoded.items()}
    output = model(**encoded)
    return output['pooler_output'].detach().cpu().numpy(
    ), output['last_hidden_state'].detach().cpu().numpy(),


sentiment_model = SentimentPLModel(from_pretrained='latest').cuda()
def test_xlm_embeddings():
    """Check that XLMEmbeddings subword pooling ("first", "last", "first_last",
    "mean") and the multi-layer / scalar-mix options reproduce the raw XLM
    hidden states for a reference sentence."""
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        # output_hidden_states=True puts all layers' states last in the tuple;
        # [1] is the first transformer layer, [0] the batch dimension.
        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    # Subword-to-word alignment for the reference sentence:
    #
    #   subword 0   1            2          3            4          5        6          7         8       9      10        11        12         13       14
    #   <s> 'berlin</w>' 'and</w>' 'munich</w>' 'have</w>' 'a</w>' 'lot</w>' 'of</w>' 'pupp' 'ete' 'er</w>' 'to</w>' 'see</w>' '.</w>' </s>
    #        |            |          |            |          |       |          |        \______|______/      |         |          |
    #   word 0 Berlin   1 and     2 Munich     3 have     4 a     5 lot      6 of      7 puppeteer          8 to      9 see     10 .

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        # Helper: embed *sentence* with XLMEmbeddings under the given config.
        embeddings = XLMEmbeddings(
            pretrained_model_name_or_path=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding: word "puppeteer" (word 7) -> subword 8 ('pupp')
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding: word "puppeteer" -> subword 10 ('er</w>')
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding: concat of subwords 8 and 10
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings: mean of subwords 8, 9, 10
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    # Scalar mix collapses the four layers into one weighted combination.
    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size