def init_model(self, device):
    """Initialize the masked language model and move it to the given device.

    Note: Transformers v4 and higher changed the default to return_dict=True,
    so we first try return_dict=False and fall back for older versions.

    Args:
        device (str): torch device (usually "cpu" or "cuda")

    Returns:
        model: a torch model for masked language modeling
    """
    model = None
    if self.model_name.lower().find('albert') >= 0:
        try:
            model = AlbertForMaskedLM.from_pretrained(
                self.model_name, return_dict=False).to(device)
        except Exception:
            # Older transformers versions do not accept return_dict here.
            model = AlbertForMaskedLM.from_pretrained(
                self.model_name).to(device)
    else:
        try:
            model = BertForMaskedLM.from_pretrained(
                self.model_name, return_dict=False).to(device)
        except Exception:
            model = BertForMaskedLM.from_pretrained(
                self.model_name).to(device)
    model.eval()
    return model
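# Why the try/except above matters: with return_dict=False the forward pass
# returns a plain tuple, so downstream code indexes outputs[0] instead of
# outputs.logits. A minimal standalone check (the model name and sentence
# here are illustrative assumptions, not from the original class):
import torch
from transformers import AlbertTokenizer, AlbertForMaskedLM

tok = AlbertTokenizer.from_pretrained("albert-base-v2")
mlm = AlbertForMaskedLM.from_pretrained("albert-base-v2", return_dict=False)
ids = tok(f"The capital of France is {tok.mask_token}.", return_tensors="pt")
with torch.no_grad():
    outputs = mlm(**ids)
print(type(outputs))      # tuple, because return_dict=False
print(outputs[0].shape)   # [1, seq_len, vocab_size]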
def __init__(self,
             vocab: Vocabulary,
             model_name: str = "bert-base",
             multi_choice: bool = False):
    super().__init__(vocab)
    self._model = None
    self._loss = CrossEntropyLoss()
    self.is_multi_choice = multi_choice
    if model_name.startswith('bert'):
        if self.is_multi_choice:
            self._model = BertMultiChoiceMLM.from_pretrained(model_name)
        else:
            self._model = BertForMaskedLM.from_pretrained(model_name)
    elif 'roberta' in model_name:
        if self.is_multi_choice:
            self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
        else:
            self._model = RobertaForMaskedLM.from_pretrained(model_name)
    elif 'albert' in model_name:
        self._model = AlbertForMaskedLM.from_pretrained(model_name)
    elif 'xlnet' in model_name:
        self._model = XLNetLMHeadModel.from_pretrained(model_name)
    else:
        raise ValueError("Required model is not supported.")
def __init__(self, config):
    super(LMDecodingModel, self).__init__()
    self.config = config
    self.dep_tree_baseline = config[MODEL_TYPE] == DEP_TREETRAIN_BASELINE
    self.albert = AlbertForMaskedLM.from_pretrained('albert-base-v2')
    self.albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
def __init__(self, transformer_model, is_train):
    super(LMNER, self).__init__()
    config = AlbertConfig.from_pretrained(transformer_model)
    self.transformer_model = AlbertForMaskedLM.from_pretrained(
        transformer_model, config=config)
    # Whether to fine-tune the transformer: freeze or unfreeze its parameters.
    for name, param in self.transformer_model.named_parameters():
        param.requires_grad = is_train
def setUp(self):
    super(TestAlbertMaskModel, self).setUp()
    albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_base_hf"
    # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_large_hf"
    # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_xlarge_hf"
    # Chinese ALBERT checkpoints ship a BERT-style vocab, hence BertTokenizer.
    self.tokenizer = BertTokenizer.from_pretrained(albert_pre_train)
    self.mask_model = AlbertForMaskedLM.from_pretrained(albert_pre_train)
    self.mask_token = self.tokenizer.mask_token
    self.mask_id = self.tokenizer.mask_token_id
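# A minimal sketch of a test method using the fixtures above (the method name
# and the sample sentence are assumptions, not from the original suite):
def test_predict_masked_token(self):
    text = f"中国的首都是北{self.mask_token}。"
    inputs = self.tokenizer(text, return_tensors="pt")
    logits = self.mask_model(**inputs).logits
    mask_pos = (inputs["input_ids"][0] == self.mask_id).nonzero(
        as_tuple=True)[0]
    pred_id = logits[0, mask_pos].argmax(dim=-1)
    self.assertTrue(len(self.tokenizer.decode(pred_id).strip()) > 0)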
def load_HFpretrained_weights(self):
    hf_state_dict = AlbertForMaskedLM.from_pretrained(
        FLAGS.hf_model_handle).state_dict()
    repl = {
        "albert.embeddings": 'embedder',
        'word_embeddings': 'idx_to_embedding',
        'albert.encoder.embedding_hidden_mapping_in':
            'embedder.embedding_to_hidden',
        'albert.encoder.albert_layer_groups.0.albert_layers.0':
            'shared_encoder_block',
        'attention.dense': 'multihead_attention.project_o',
        'attention': 'multihead_attention',
        'full_layer_layer_norm': 'feedforward.LayerNorm',
        'query': 'project_q',
        'key': 'project_k',
        'value': 'project_v',
        'ffn.': 'feedforward.linear_in.',
        'ffn_output': 'feedforward.linear_out',
        'predictions': 'lm_head',
    }
    # Use these three lines to do the replacement.
    repl = dict((re.escape(k), v) for k, v in repl.items())
    pattern = re.compile("|".join(repl.keys()))
    updated_hf_state_dict = OrderedDict(
        (pattern.sub(lambda m: repl[re.escape(m.group(0))], k), v)
        for k, v in hf_state_dict.items())
    # Allow for cutting the sequence length short.
    updated_hf_state_dict['embedder.position_embeddings.weight'] = \
        updated_hf_state_dict['embedder.position_embeddings.weight'][
            :FLAGS.max_seq_length, :].clone()
    missing, unexpected = self.load_state_dict(updated_hf_state_dict,
                                               strict=False)
    # Allowed discrepancies: we don't care about the pooler, the relative
    # attention bias is optional, and 'lm_head.bias' is only used to zero the
    # LM head decoder bias, which I'm currently ignoring.
    ignored_hf_parameters = ['pooler', 'position_embeddings', 'lm_head.bias']
    allowed_from_scratch_params = [
        'relative_attention_bias', 'top_down_regressor', 'combiner',
        'shared_top_down_predictor', 'shared_from_left_predictor',
        'shared_from_right_predictor'
    ]
    for m in missing:
        if not any(s in m for s in allowed_from_scratch_params):
            raise ValueError(
                f'Unexpected mismatch in loading state dict: {m} not present in pretrained.')
    for u in unexpected:
        if not any(s in u for s in ignored_hf_parameters):
            raise ValueError(
                f'Unexpected mismatch in loading state dict: {u} in pretrained but not in current model.')
    log.info(f"Loaded pretrained weights from {FLAGS.hf_model_handle}")
def __init__(self,
             model_name_or_path: str,
             max_seq_length: int = 128,
             model_args: Optional[Dict] = None,
             cache_dir: Optional[str] = None):
    super(Transformer, self).__init__()
    self.config_keys = ['max_seq_length']
    self.max_seq_length = max_seq_length
    model_args = model_args or {}  # avoid a mutable default argument
    config = AutoConfig.from_pretrained(model_name_or_path, **model_args,
                                        cache_dir=cache_dir)
    model_type = config.model_type if hasattr(config, 'model_type') else ''
    if model_type == 'albert':
        self.model = AlbertForMaskedLM.from_pretrained(
            model_name_or_path, config=config, cache_dir=cache_dir)
        self.tokenizer = BertTokenizer.from_pretrained(
            model_name_or_path, cache_dir=cache_dir)
    else:
        self.model = AutoModel.from_pretrained(
            model_name_or_path, config=config, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, cache_dir=cache_dir)
def _from_pretrained(self, pretrain_name: str):
    r"""Load a different model and tokenizer depending on the model name."""
    if 'albert' in pretrain_name:
        model = AlbertForMaskedLM.from_pretrained(pretrain_name)
        tokenizer = BertTokenizer.from_pretrained(pretrain_name)
    elif 'bert' in pretrain_name:
        tokenizer = AutoTokenizer.from_pretrained(pretrain_name)
        model = AutoModelWithLMHead.from_pretrained(pretrain_name)
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrain_name}")
    self.model = model
    self.tokenizer = tokenizer
def __init__(self, device):
    self.device = device
    self.bert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/bert-kor-base')
    self.bert_model = BertForMaskedLM.from_pretrained(
        'kykim/bert-kor-base').eval()
    # .device is an attribute, not a method; move the model with .to()
    self.bert_model.to(device)
    self.albert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained(
        'kykim/albert-kor-base').eval()
    self.albert_model.to(device)
def __init__(
    self,
    model=None,
    tokenizer=None,
    model_name="bert-large-uncased",
    mask_token="***mask***",
    disable_gpu=False,
):
    self.mask_token = mask_token
    self.delemmatizer = Delemmatizer()
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu")
    print("using model:", model_name)
    print("device:", self.device)
    if not model:
        if "distilbert" in model_name:
            self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
        # Hugging Face model names are lowercase, so match "albert", not "Albert".
        elif "albert" in model_name:
            self.bert = AlbertForMaskedLM.from_pretrained(model_name)
        else:
            self.bert = BertForMaskedLM.from_pretrained(model_name)
        self.bert.to(self.device)
    else:
        self.bert = model
    if not tokenizer:
        if "distilbert" in model_name:
            self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        elif "albert" in model_name:
            self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
    else:
        self.tokenizer = tokenizer
    self.bert.eval()
def _contextual_model_init(self):
    """
    Initialize context-based word-similarity computation: load the
    vocabulary and the model.
    :return: None
    """
    pretrain_name = self.model_path + self.model_params[
        'pre_train_model_path']
    logging.info('pretrain_name: %s', pretrain_name)
    if 'albert' in pretrain_name:
        self._contextual_model = AlbertForMaskedLM.from_pretrained(
            pretrain_name)
        self._contextual_tokenizer = BertTokenizer.from_pretrained(
            pretrain_name)
    elif 'ernie' in pretrain_name or 'roberta' in pretrain_name:
        self._contextual_tokenizer = BertTokenizer.from_pretrained(
            pretrain_name)
        self._contextual_model = BertModel.from_pretrained(pretrain_name)
    else:  # e.g. 'bert' and other AutoModel-compatible checkpoints
        self._contextual_tokenizer = AutoTokenizer.from_pretrained(
            pretrain_name)
        model_config = AutoConfig.from_pretrained(pretrain_name)
        self._contextual_model = AutoModel.from_pretrained(
            pretrain_name, config=model_config)
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10
    tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')
    names = proc.generate_pairs_of_random_names(number_of_pairs=100)
    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)
    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)
    logger.info("finished reading in social data")
    output_df = run_pipeline(model=model,
                             tokenizer=tokenizer,
                             fictitious_entities=names,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)
    output_df.to_csv(
        "../data/masked_word_result_data/albert_w_name/alberta_social_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)
    logger.info("finished saving social results")
from transformers import AlbertForMaskedLM, AlbertTokenizer
import torch

tokenizer = AlbertTokenizer.from_pretrained("albert-large-v2")
model = AlbertForMaskedLM.from_pretrained("albert-large-v2")

sequence = (
    f"Distilled models are smaller than the models they mimic. Using them "
    f"instead of the large versions would help {tokenizer.mask_token} our "
    f"carbon footprint."
)
input_ids = tokenizer.encode(sequence, return_tensors="pt")
# Position of the masked token.
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
# Logits for every position: [batch_size, seq_length, vocab_size].
token_logits = model(input_ids)[0]
# Keep only the logits at the masked position.
mask_token_logits = token_logits[0, mask_token_index, :]
# Take the 5 most likely candidates out of the whole vocabulary.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# Print the top-5 completions.
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
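# The same lookup can be done with the fill-mask pipeline, which wraps the
# tokenize/forward/top-k steps above (a minimal sketch reusing the model and
# tokenizer just loaded):
from transformers import pipeline

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
for candidate in fill_mask(sequence, top_k=5):
    print(candidate["sequence"], candidate["score"])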
def generate_embedding(objectives, model_name, batch_size=100,
                       output_attention=False):
    """
    Takes pre-tokenized sentences and generates embeddings using the
    Hugging Face implementations of the models below.
    - Inputs:
        objectives (list): token-id sequences, one per sentence
        model_name (str): name of the model used for generating embeddings
        batch_size (int): batch size to use when generating embeddings
        output_attention (bool): whether to also return attention weights
    - Output:
        sentence_embedding (list): n embeddings, one per sentence
        attention_layers (np.ndarray or None): attention weights, if requested
    """
    if model_name == "bert":
        # Load pre-trained BERT model (weights)
        model = BertModel.from_pretrained("bert-base-uncased",
                                          output_attentions=output_attention)
    elif model_name == "xlnet":
        # Load pre-trained XLNet model (weights)
        model = XLNetModel.from_pretrained("xlnet-base-cased",
                                           output_attentions=output_attention)
    elif model_name == "xlm":
        # Load pre-trained XLM model (weights)
        model = XLMModel.from_pretrained("xlm-mlm-en-2048",
                                         output_attentions=output_attention)
    elif model_name == "electra":
        # Load pre-trained ELECTRA model (weights)
        model = ElectraModel.from_pretrained(
            "google/electra-small-discriminator",
            output_attentions=output_attention)
    elif model_name == "albert":
        # Load pre-trained ALBERT model (weights)
        model = AlbertForMaskedLM.from_pretrained(
            "albert-base-v2", output_attentions=output_attention)
    else:
        print("Please select an implemented model name. {} doesn't exist".
              format(model_name))
        return

    sentences_per_batch = batch_size
    # Set up the device.
    dev = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(dev)
    print("using ", device)
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    model.to(device)
    num_sentences = len(objectives)
    sentence_embedding = []
    attention_layers = None
    if num_sentences > sentences_per_batch:
        num_batches = num_sentences // sentences_per_batch
        for i in range(num_batches):
            start = i * sentences_per_batch
            end = (i + 1) * sentences_per_batch
            if i == num_batches - 1:
                end = num_sentences
            mini_objective = list(objectives[start:end])
            # Convert inputs to PyTorch tensors.
            tokens_tensor = torch.tensor([mini_objective]).squeeze()
            tokens_tensor = tokens_tensor.to(device)
            # Predict hidden-state features for each layer.
            with torch.no_grad():
                encoded_layers = model(tokens_tensor)
            # Take the last layer's output:
            # `token_vecs` is a tensor with shape [n x k x hidden_size].
            token_vecs = encoded_layers[0]
            # Take the vector corresponding to the [CLS] token when there is one.
            if model_name in ["bert", "albert", "electra"]:
                sentence_embedding += token_vecs[:, 0, :].tolist()
            # For models without a CLS token, average all k token vectors.
            else:
                sentence_embedding += torch.mean(token_vecs, dim=1).tolist()
            if output_attention is True:
                attention_layer = [al.tolist() for al in encoded_layers[-1]]
                attention_layer = np.array(attention_layer)
                # len(None) would raise, so test for the first batch explicitly.
                if attention_layers is None:
                    attention_layers = attention_layer
                else:
                    attention_layers = np.concatenate(
                        (attention_layers, attention_layer), axis=1)
            print("Embedding for batch {} out of {} batches completed.".format(
                i, num_batches))
    else:
        # Convert inputs to PyTorch tensors.
        tokens_tensor = torch.tensor([objectives]).squeeze()
        tokens_tensor = tokens_tensor.to(device)
        # Predict hidden-state features for each layer.
        with torch.no_grad():
            encoded_layers = model(tokens_tensor)
        # Take the last layer's output:
        # `token_vecs` is a tensor with shape [n x k x hidden_size].
        token_vecs = encoded_layers[0]
        # Take the vector corresponding to the [CLS] token when there is one.
        if model_name in ["bert", "albert", "electra"]:
            sentence_embedding = token_vecs[:, 0, :].tolist()
        # For models without a CLS token, average all k token vectors.
        else:
            sentence_embedding = torch.mean(token_vecs, dim=1).tolist()
        if output_attention is True:
            attention_layers = [al.tolist() for al in encoded_layers[-1]]
            attention_layers = np.array(attention_layers)
    print(
        "Our final sentence embedding vector of shape:",
        len(sentence_embedding),
        len(sentence_embedding[0]),
    )
    if output_attention:
        print("And the corresponding attention vector of shape:",
              attention_layers.shape)
    return sentence_embedding, attention_layers
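# A hedged usage sketch: generate_embedding expects pre-tokenized, equal-length
# id sequences. The tokenizer choice, texts, and padding length below are
# illustrative assumptions.
from transformers import AlbertTokenizer

tok = AlbertTokenizer.from_pretrained("albert-base-v2")
texts = ["first objective", "second objective"]
ids = [tok.encode(t, padding="max_length", max_length=16, truncation=True)
       for t in texts]
emb, att = generate_embedding(ids, "albert", batch_size=100,
                              output_attention=False)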
def __init__(self) -> None:
    self.lists = {}

    # M-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual-cased')
    self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
        'bert-base-multilingual-cased').eval()
    self.lists["M-BERT"] = {
        "Tokenizer": self.bert_multilingual_tokenizer,
        "Model": self.bert_multilingual_model
    }
    print("====================================")
    print("[BERT] Google Multilingual BERT loaded")
    print("====================================")

    # KR-BERT
    self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
        'snunlp/KR-Medium')
    self.krbert_model = BertForMaskedLM.from_pretrained(
        'snunlp/KR-Medium').eval()
    self.lists["KR-Medium"] = {
        "Tokenizer": self.krbert_tokenizer,
        "Model": self.krbert_model
    }
    print("====================================")
    print("[BERT] KR-BERT loaded")
    print("====================================")

    # BERT
    self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/bert-kor-base')
    self.bert_kor_model = BertForMaskedLM.from_pretrained(
        'kykim/bert-kor-base').eval()
    self.lists["bert-kor-base"] = {
        "Tokenizer": self.bert_kor_tokenizer,
        "Model": self.bert_kor_model
    }
    print("====================================")
    print("[BERT] BERT-kor-base loaded")
    print("====================================")

    # ALBERT (kykim's Korean ALBERT shares the BERT-style tokenizer)
    from transformers import AlbertForMaskedLM
    self.albert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained(
        'kykim/albert-kor-base').eval()
    self.lists["albert-kor-base"] = {
        "Tokenizer": self.albert_tokenizer,
        "Model": self.albert_model
    }
    print("====================================")
    print("[BERT] ALBERT-kor-base loaded")
    print("====================================")

    # XLM-RoBERTa
    from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
    self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
        'xlm-roberta-base')
    self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
        'xlm-roberta-base').eval()
    self.lists["xlm-roberta-base"] = {
        "Tokenizer": self.xlmroberta_tokenizer,
        "Model": self.xlmroberta_model
    }
    print("====================================")
    print("[XLM-R] XLM-RoBERTa base loaded")
    print("====================================")

    # bertshared-kor-base (seq2seq)
    from transformers import EncoderDecoderModel
    self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
        "kykim/bertshared-kor-base")
    self.bertshared_model = EncoderDecoderModel.from_pretrained(
        "kykim/bertshared-kor-base")
    self.lists["bertshared-kor-base"] = {
        "Tokenizer": self.tokenizer_bertshared,
        "Model": self.bertshared_model
    }
    print("====================================")
    print("[Seq2seq + BERT] bertshared-kor-base loaded")
    print("====================================")

    # gpt3-kor-small_based_on_gpt2
    from transformers import GPT2LMHeadModel
    self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.lists["gpt3-kor-small_based_on_gpt2"] = {
        "Tokenizer": self.tokenizer_gpt3,
        "Model": self.model_gpt3
    }
    print("====================================")
    print("[GPT3] gpt3-small-based-on-gpt2 loaded")
    print("====================================")

    # electra-base-kor
    from transformers import ElectraTokenizerFast, ElectraModel
    self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
        "kykim/electra-kor-base")
    self.electra_model = ElectraModel.from_pretrained(
        "kykim/electra-kor-base")
    self.lists["electra-kor-base"] = {
        "Tokenizer": self.tokenizer_electra,
        "Model": self.electra_model
    }
    print("====================================")
    print("[ELECTRA] electra-kor-base loaded")
    print("====================================")

    # koelectra finetuned on KorQuAD for question answering
    from transformers import ElectraForQuestionAnswering
    self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.lists["electra-kor-QA"] = {
        "Tokenizer": self.electra_tokenizer_QA,
        "Model": self.electra_model_QA
    }
    print("====================================")
    print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
    print("====================================")
def __init__(self, args) -> None:
    """Use ELM with a finetuned language model for sentiment classification.

    Args:
        args (dict): contains all the arguments needed.
            - model_name (str): the name of the transformer model
            - bsz (int): batch size
            - epoch: epochs to train
            - type (str): finetuning type
                - base: train only the ELM
                - finetune_elm: train transformers with ELM directly
                - finetune_classifier: train transformers with a classifier
                - finetune_classifier_elm: train transformers with a
                  classifier, then use an ELM to replace the classifier
                - finetune_classifier_beta: train transformers with a
                  classifier, and use pinv to calculate beta in the classifier
            - learning_rate (float): learning rate for finetuning
    """
    # load configuration
    self.model_name = args.get('model_name', 'bert-base-uncased')
    self.bsz = args.get('batch_size', 10)
    self.epoch = args.get('epoch_num', 2)
    self.learning_rate = args.get('learning_rate', 0.001)
    self.training_type = args.get('training_type', 'base')
    self.debug = args.get('debug', True)
    self.eval_epoch = args.get('eval_epoch', 1)
    self.lr_decay = args.get('learning_rate_decay', 0.99)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.device = device
    self.n_gpu = torch.cuda.device_count()
    # load pretrained model
    if self.model_name in ('bert-base-uncased', 'distilbert-base-uncased',
                           'albert-base-v2'):
        self.pretrained_model = AutoModel.from_pretrained(self.model_name)
        self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
            self.model_name)
        input_shape = 768
        output_shape = 256
    elif self.model_name == 'prajjwal1/bert-tiny':
        self.pretrained_model = AutoModel.from_pretrained(self.model_name)
        self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, model_max_length=512)
        input_shape = 128
        output_shape = 64
    elif self.model_name == 'voidful/albert_chinese_xxlarge':
        self.pretrained_model = AlbertForMaskedLM.from_pretrained(
            self.model_name)
        self.pretrained_tokenizer = BertTokenizer.from_pretrained(
            self.model_name)
        input_shape = 768
        output_shape = 256
    else:
        raise TypeError("Unsupported model name")
    self.pretrained_model.to(device)
    device_ids = None
    if self.n_gpu > 1:
        device_ids = range(torch.cuda.device_count())
        self.pretrained_model = DP(self.pretrained_model,
                                   device_ids=device_ids)
    # load specific model
    if self.training_type in ('finetune_classifier',
                              'finetune_classifier_elm'):
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 2))
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.classifier.to(device)
        if self.n_gpu > 1:
            self.classifier = DP(self.classifier, device_ids=device_ids)
    if self.training_type in ('base', 'finetune_classifier_elm'):
        self.elm = classic_ELM(input_shape, output_shape)
    if self.training_type == 'finetune_classifier_linear':
        self.elm = classic_ELM(None, None)
        self.classifier = torch.nn.Sequential(
            OrderedDict([
                ('w', torch.nn.Linear(input_shape, output_shape)),
                ('act', torch.nn.Sigmoid()),
                ('beta', torch.nn.Linear(output_shape, 2)),
            ]))
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.classifier.to(device)
        if self.n_gpu > 1:
            self.classifier = DP(self.classifier, device_ids=device_ids)
    # load processor, trainer, evaluator, inferer
    processors = {
        'base': self.__processor_base__,
        'finetune_classifier': self.__processor_base__,
        'finetune_classifier_elm': self.__processor_base__,
        'finetune_classifier_linear': self.__processor_base__,
    }
    trainers = {
        'base': self.__train_base__,
        'finetune_classifier': self.__train_finetune_classifier__,
        'finetune_classifier_elm': self.__train_finetune_classifier_elm__,
        'finetune_classifier_linear':
            self.__train_finetune_classifier_linear__,
    }
    evaluators = {
        'base': self.__eval_base__,
        'finetune_classifier': self.__eval_finetune_classifier__,
        'finetune_classifier_elm': self.__eval_base__,
        'finetune_classifier_linear':
            self.__eval_finetune_classifier_linear__,
    }
    inferers = {
        'base': self.__infer_base__,
        'finetune_classifier': self.__infer_finetune_classifier__,
        'finetune_classifier_elm': self.__infer_finetune_classifier_elm__,
        'finetune_classifier_linear': self.__infer_base__
    }
    self.processor = processors[self.training_type]
    self.trainer = trainers[self.training_type]
    self.evaluator = evaluators[self.training_type]
    self.inferer = inferers[self.training_type]
def __init__(self, pretrained_model_name_or_path, config, device):
    super(MyAlbertForMaskedLM, self).__init__()
    self.model = AlbertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path, config=config)
    self.device = device
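# A minimal usage sketch for the wrapper above (the checkpoint name, sentence,
# and forward call are illustrative assumptions, not part of the original class):
from transformers import AlbertConfig, AlbertTokenizer

config = AlbertConfig.from_pretrained("albert-base-v2")
wrapper = MyAlbertForMaskedLM("albert-base-v2", config, "cpu")
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
inputs = tokenizer(f"Paris is the {tokenizer.mask_token} of France.",
                   return_tensors="pt")
logits = wrapper.model(**inputs).logits  # [1, seq_len, vocab_size]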
# %%
import torch
import string
from transformers import \
    AlbertTokenizer, AlbertForMaskedLM, \
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained(
    'albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained(
    'distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    # The loop body below is a minimal assumed reconstruction (the original is
    # truncated here): skip punctuation/[PAD] tokens and strip subword markers.
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
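# Example: top-k predictions from albert_model passed through the decode helper
# above (the masked sentence is an illustrative assumption):
text = f"The capital of France is {albert_tokenizer.mask_token}."
ids = albert_tokenizer.encode(text, return_tensors="pt")
mask_pos = (ids[0] == albert_tokenizer.mask_token_id).nonzero(
    as_tuple=True)[0]
with torch.no_grad():
    logits = albert_model(ids)[0]
pred_idx = torch.topk(logits[0, mask_pos[0]], top_k).indices
print(decode(albert_tokenizer, pred_idx, top_clean=5))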
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             predictions_file=None,
             layer_freeze_regexes: List[str] = None,
             probe_type: str = None,
             loss_on_all_vocab: bool = False,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    self._loss_on_all_vocab = loss_on_all_vocab
    self._predictions_file = predictions_file
    # TODO move to predict
    if predictions_file is not None and os.path.isfile(predictions_file):
        os.remove(predictions_file)
    self._pretrained_model = pretrained_model
    if 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        if loss_on_all_vocab:
            self._transformer_model = RobertaForMaskedLM.from_pretrained(
                pretrained_model)
        else:
            self._transformer_model = \
                RobertaForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetLMHeadModel.from_pretrained(
            pretrained_model)
    elif 'albert' in pretrained_model:
        if loss_on_all_vocab:
            self._transformer_model = AlbertForMaskedLM.from_pretrained(
                pretrained_model)
        else:
            self._transformer_model = \
                BertForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
    elif 'bert' in pretrained_model:
        if loss_on_all_vocab:
            self._transformer_model = BertForMaskedLM.from_pretrained(
                pretrained_model)
        else:
            self._transformer_model = \
                BertForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrained_model}")
    if probe_type == 'MLP':
        layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
    elif probe_type == 'linear':
        layer_freeze_regexes = [
            "embeddings", "encoder", "pooler", "dense", "LayerNorm",
            "layer_norm"
        ]
    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            grad = not any(
                bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            grad = requires_grad
        param.requires_grad = grad
    # Make sure decoder gradients are on.
    if 'roberta' in pretrained_model:
        self._transformer_model.lm_head.decoder.weight.requires_grad = True
        self._transformer_model.lm_head.bias.requires_grad = True
    elif 'albert' in pretrained_model:
        pass
    elif 'bert' in pretrained_model:
        self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
        self._transformer_model.cls.predictions.bias.requires_grad = True
    transformer_config = self._transformer_model.config
    transformer_config.num_labels = 1
    self._output_dim = self._transformer_model.config.hidden_size
    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = 2
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--type_of_model", default='albert',
                    help="pretrained LM type")
parser.add_argument("-p", "--path_to_pytorch_models",
                    help="path to pytorch_model")
parser.add_argument("--config_and_vocab",
                    help="path to config.json and vocab.model")
parser.add_argument("-s", "--step", type=str, help="pretrained step")
parser.add_argument("-d", "--data",
                    help="path where you put your processed ontonotes data")
parser.add_argument("-o", "--output", help="output file")
args = parser.parse_args()
print("Reconstruction. step = ", args.step)

if args.type_of_model == 'albert':
    tokenizer = AlbertTokenizer(
        os.path.join(args.config_and_vocab, args.type_of_model,
                     'vocab.model'))
    config = AlbertConfig.from_json_file(
        os.path.join(args.config_and_vocab, args.type_of_model,
                     'config.json'))
    config.output_hidden_states = True
    model = AlbertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(
            os.path.join(args.path_to_pytorch_models, args.type_of_model,
                         'pytorch_model_' + args.step + '.bin')))
elif args.type_of_model == 'bert':
    tokenizer = BertTokenizer(
        os.path.join(args.config_and_vocab, args.type_of_model,
                     'vocab.model'))
    config = BertConfig.from_json_file(
        os.path.join(args.config_and_vocab, args.type_of_model,
                     'config.json'))
    config.output_hidden_states = True
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(
            os.path.join(args.path_to_pytorch_models, args.type_of_model,
                         'pytorch_model_' + args.step + '.bin')))
else:
    raise NotImplementedError("The given model type %s is not supported" %
                              args.type_of_model)

# Note the call parentheses: without them the bare function object is always
# truthy and the device would silently default to 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval().to(device)
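# With config.output_hidden_states = True the forward pass also returns the
# per-layer hidden states; a minimal probe sketch (the sentence is an
# illustrative assumption):
ids = tokenizer.encode("John saw the dog .", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(ids)
# The hidden-states tuple (embeddings plus one tensor per layer) is the last
# element of the output when attentions are not requested.
hidden_states = outputs[-1]
print(len(hidden_states), hidden_states[0].shape)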
from transformers import TFAlbertForMaskedLM, TFAlbertModel, \
    TFAlbertForSequenceClassification, AlbertForMaskedLM
import os

checkpoint = "albert-base-v1"
# os.path.exists / os.makedirs do not expand "~", so do it explicitly.
save_dir = os.path.expanduser("~/saved/" + checkpoint)
model = AlbertForMaskedLM.from_pretrained(checkpoint)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
model.save_pretrained(save_dir)
# Convert the PyTorch checkpoint to TensorFlow and save it back.
model = TFAlbertForMaskedLM.from_pretrained(save_dir, from_pt=True)
model.save_pretrained(save_dir)
# Reload through the various TF heads to check the conversion round-trips.
model = TFAlbertModel.from_pretrained(save_dir)
model = TFAlbertForMaskedLM.from_pretrained(save_dir)
model = TFAlbertForSequenceClassification.from_pretrained(save_dir)
print("nice model")
def __init__(self, args, random_init='none'):
    assert random_init in ['none', 'all', 'embedding']
    super().__init__()
    self._model_device = 'cpu'
    model_name = args.model_name
    vocab_name = model_name
    if args.model_dir is not None:
        # load bert model from file
        model_name = str(args.model_dir) + "/"
        vocab_name = model_name
        logger.info("loading BERT model from {}".format(model_name))
    # Load pre-trained model tokenizer (vocabulary)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if torch.cuda.device_count() > 1:
        torch.cuda.manual_seed_all(args.seed)
    config = AutoConfig.from_pretrained(model_name)
    if isinstance(config, AlbertConfig):
        self.model_type = 'albert'
        self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
        self.mlm_model = AlbertForMaskedLM.from_pretrained(model_name)
        if random_init == 'all':
            logger.info('Random initialize model...')
            self.mlm_model = AlbertForMaskedLM(self.mlm_model.config)
        self.base_model = self.mlm_model.albert
    elif isinstance(config, RobertaConfig):
        self.model_type = 'roberta'
        self.tokenizer = RobertaTokenizer.from_pretrained(vocab_name)
        self.mlm_model = RobertaForMaskedLM.from_pretrained(model_name)
        if random_init == 'all':
            logger.info('Random initialize model...')
            self.mlm_model = RobertaForMaskedLM(self.mlm_model.config)
        self.base_model = self.mlm_model.roberta
    elif isinstance(config, BertConfig):
        self.model_type = 'bert'
        self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
        self.mlm_model = BertForMaskedLM.from_pretrained(model_name)
        if random_init == 'all':
            logger.info('Random initialize model...')
            self.mlm_model = BertForMaskedLM(self.mlm_model.config)
        self.base_model = self.mlm_model.bert
    else:
        raise ValueError('Model %s not supported yet!' % model_name)
    self.mlm_model.eval()
    if random_init == 'embedding':
        logger.info('Random initialize embedding layer...')
        self.mlm_model._init_weights(
            self.base_model.embeddings.word_embeddings)
    # original vocab
    self.map_indices = None
    self.vocab = list(self.tokenizer.get_vocab().keys())
    logger.info('Vocab size: %d' % len(self.vocab))
    self._init_inverse_vocab()
    self.MASK = self.tokenizer.mask_token
    self.EOS = self.tokenizer.eos_token
    self.CLS = self.tokenizer.cls_token
    self.SEP = self.tokenizer.sep_token
    self.UNK = self.tokenizer.unk_token
    # print(self.MASK, self.EOS, self.CLS, self.SEP, self.UNK)
    self.pad_id = self.inverse_vocab[self.tokenizer.pad_token]
    self.unk_index = self.inverse_vocab[self.tokenizer.unk_token]
    # used to output top-k predictions
    self.k = args.k
# inspired by https://github.com/renatoviolin/next_word_prediction
import torch
import string
import transformers
transformers.logging.set_verbosity_error()

from transformers import BertTokenizerFast, BertForMaskedLM
bert_tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
bert_model = BertForMaskedLM.from_pretrained('kykim/bert-kor-base').eval()

from transformers import AlbertForMaskedLM
albert_tokenizer = BertTokenizerFast.from_pretrained('kykim/albert-kor-base')
albert_model = AlbertForMaskedLM.from_pretrained(
    'kykim/albert-kor-base').eval()

# from transformers import BartForConditionalGeneration
# roberta_tokenizer = BertTokenizerFast.from_pretrained('kykim/bart-kor-base')
# roberta_model = BartForConditionalGeneration.from_pretrained('kykim/bart-kor-base').eval()

bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-multilingual-cased')
bert_multilingual_model = BertForMaskedLM.from_pretrained(
    'bert-base-multilingual-cased').eval()

from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    'xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()
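# A minimal top-k query against one of the models above (the sentence is an
# illustrative assumption):
text = f"대한민국의 수도는 {bert_tokenizer.mask_token}이다."
inputs = bert_tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = bert_model(**inputs).logits
mask_pos = (inputs["input_ids"][0] == bert_tokenizer.mask_token_id).nonzero(
    as_tuple=True)[0]
top_ids = logits[0, mask_pos[0]].topk(5).indices.tolist()
print([bert_tokenizer.decode([i]).strip() for i in top_ids])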
from transformers import BertTokenizer, AlbertForMaskedLM
import os

# pretrained = 'voidful/albert_chinese_xlarge'
pretrained = 'voidful/albert_chinese_large'
tokenizer = BertTokenizer.from_pretrained(pretrained)
model = AlbertForMaskedLM.from_pretrained(pretrained)
model.save_pretrained('albert_model')
tokenizer.save_pretrained('albert_model')
# Remove the saved tokenizer metadata files before moving the directory.
os.remove("albert_model/special_tokens_map.json")
os.remove("albert_model/tokenizer_config.json")
os.system("mv albert_model ../")
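# A quick reload check (a minimal sketch; the path assumes the directory
# layout produced by the script above):
reloaded_model = AlbertForMaskedLM.from_pretrained('../albert_model')
reloaded_tokenizer = BertTokenizer.from_pretrained('../albert_model')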
def evaluate(args):
    """Evaluate a masked language model using the CrowS-Pairs dataset."""
    print("Evaluating:")
    print("Input:", args.input_file)
    print("Model:", args.lm_model)
    print("=" * 100)
    logging.basicConfig(level=logging.INFO)

    # load data into a pandas DataFrame
    df_data = read_data(args.input_file)

    # supported masked language models
    if args.lm_model == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        uncased = True
    elif args.lm_model == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMaskedLM.from_pretrained('roberta-large')
        uncased = False
    elif args.lm_model == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')
        uncased = True

    model.eval()
    if torch.cuda.is_available():
        model.to('cuda')

    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open(args.lm_model + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {
        "model": model,
        "tokenizer": tokenizer,
        "mask_token": mask_token,
        "log_softmax": log_softmax,
        "uncased": uncased
    }

    # Score each sentence: each row in the dataframe has the sent id and the
    # score for the pro- and anti-stereotypical sentence.
    df_score = pd.DataFrame(columns=[
        'sent_more', 'sent_less', 'sent_more_score', 'sent_less_score',
        'score', 'stereo_antistereo', 'bias_type'
    ])

    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0
    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)
            for stype in score.keys():
                score[stype] = round(score[stype], 3)
            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1
            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']
            df_score = df_score.append(
                {
                    'sent_more': sent_more,
                    'sent_less': sent_less,
                    'sent_more_score': sent_more_score,
                    'sent_less_score': sent_less_score,
                    'score': pair_score,
                    'stereo_antistereo': direction,
                    'bias_type': bias
                },
                ignore_index=True)

    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:',
          round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:',
              round(antistereo_score / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()
Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":
    sentences = [sample_sentences("sentences4lara.txt") for i in range(11)]
    sent_dict = dict(zip([str(x) for x in range(1, 11)], sentences))
    sentence = sent_dict[sys.argv[2]]
    batch_size = 100
def run_benchmark(model_name, benchmark_file, results_file, logging_file):
    with open(benchmark_file, "r") as f:
        benchmark = json.load(f)
    model = AlbertForMaskedLM.from_pretrained(model_name)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    # Each pattern stores its own statistics.
    results = []
    for pattern in patterns:
        result = {}
        result["false_positives"] = 0
        result["false_negatives"] = 0
        result["total_questions"] = 0
        result["correct"] = 0
        result["accuracy"] = 0.0
        result["pattern"] = pattern["prompt"]
        results.append(result)

    with open(logging_file, "w") as log:
        for benchmark_question in benchmark:
            output = fill_mask(benchmark_question["question"])
            output_str = output[0]["sequence"] + "\n"
            for o in output:
                # Drop the leading SentencePiece marker from the token string.
                output_str += str(o["token_str"][1:]) + " " + \
                    str(o["score"]) + "\n"
            print(output_str)
            log.write(output_str)
            # Update the matching pattern's stats.
            for result in results:
                if result["pattern"] == benchmark_question["pattern"]:
                    result["total_questions"] += 1
                    if is_correct(output, benchmark_question["answer"]):
                        result["correct"] += 1
                        print("correct")
                        log.write("correct\n")
                    else:
                        print("incorrect")
                        log.write("incorrect\n")
                        if benchmark_question["answer"] == True:
                            result["false_negatives"] += 1
                        else:
                            result["false_positives"] += 1
                    break

    # Calculate each pattern's accuracy (skip patterns with no questions).
    for result in results:
        if result["total_questions"]:
            result["accuracy"] = \
                float(result["correct"]) / result["total_questions"]

    # Calculate and append the overall statistics.
    results.append(compute_overall_results(results))
    results.append({
        "model_name": model_name,
        "datetime": str(datetime.datetime.now())
    })

    # Store the results -- downside of no results until the end.
    with open(results_file, "w") as f:
        json.dump(results, f, indent=3)
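# An illustrative invocation (the file names are placeholders; this assumes
# the module-level `patterns`, `is_correct`, and `compute_overall_results`
# defined elsewhere in the file):
run_benchmark("albert-base-v2", "benchmark.json", "results.json", "log.txt")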