Example #1
    def init_model(self, device):
        """Initialize the language model and send it to the given device
        Note: On Transformers v4 and higher, return_dict defaults to True.
        Args:
            device (str): torch device (usually "cpu" or "cuda")

        Returns:
            model: a torch model for masked language modeling
        """
        model = None
        if self.model_name.lower().find('albert') >= 0:
            try:
                model = AlbertForMaskedLM.from_pretrained(
                    self.model_name, return_dict=False).to(device)
            except Exception:  # fall back for Transformers versions without return_dict support
                model = AlbertForMaskedLM.from_pretrained(
                    self.model_name).to(device)
        else:
            try:
                model = BertForMaskedLM.from_pretrained(
                    self.model_name, return_dict=False).to(device)
            except Exception:  # fall back for Transformers versions without return_dict support
                model = BertForMaskedLM.from_pretrained(
                    self.model_name).to(device)
        model.eval()
        return model
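A minimal standalone sketch of the return_dict behavior mentioned in the docstring above (assumes the public albert-base-v2 checkpoint is available; on Transformers v4+ tuple outputs can still be requested per call):

import torch
from transformers import AlbertTokenizer, AlbertForMaskedLM

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForMaskedLM.from_pretrained("albert-base-v2").eval()
inputs = tokenizer(f"The capital of France is {tokenizer.mask_token}.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, return_dict=False)  # plain tuple instead of a ModelOutput
logits = outputs[0]
print(logits.shape)  # (1, seq_len, vocab_size)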
Example #2
    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str = "bert-base",
                 multi_choice: bool = False):
        super().__init__(vocab)
        self._model = None
        self._loss = CrossEntropyLoss()
        self.is_multi_choice = multi_choice

        if model_name.startswith('bert'):
            if self.is_multi_choice:
                self._model = BertMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = BertForMaskedLM.from_pretrained(model_name)
        elif 'roberta' in model_name:
            if self.is_multi_choice:
                self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = RobertaForMaskedLM.from_pretrained(model_name)

        elif 'albert' in model_name:
            self._model = AlbertForMaskedLM.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self._model = XLNetLMHeadModel.from_pretrained(model_name)
        else:
            raise ("Riquiered model is not supported.")
Example #3
 def __init__(self, config):
     super(LMDecodingModel, self).__init__()
     self.config = config
     self.dep_tree_baseline = config[MODEL_TYPE] == DEP_TREETRAIN_BASELINE
     self.albert = AlbertForMaskedLM.from_pretrained('albert-base-v2')
     self.albert_tokenizer = AlbertTokenizer.from_pretrained(
         'albert-base-v2')
Example #4
 def __init__(self, transformer_model, is_train):
     super(LMNER, self).__init__()
     config = AlbertConfig.from_pretrained(transformer_model)
     self.transformer_model = AlbertForMaskedLM.from_pretrained(
         transformer_model, config=config)
      # whether to train the transformer (sets requires_grad on all parameters)
     for name, param in self.transformer_model.named_parameters():
         param.requires_grad = is_train
Example #5
    def setUp(self):
        super(TestAlbertMaskModel, self).setUp()

        albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_base_hf"
        # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_large_hf"
        # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_xlarge_hf"
        self.tokenizer = BertTokenizer.from_pretrained(albert_pre_train)

        self.mask_model = AlbertForMaskedLM.from_pretrained(albert_pre_train)
        self.mask_token = self.tokenizer.mask_token
        self.mask_id = self.tokenizer.mask_token_id
Example #6
 def load_HFpretrained_weights(self):
     hf_state_dict = AlbertForMaskedLM.from_pretrained(
         FLAGS.hf_model_handle).state_dict()
     repl = {
         "albert.embeddings": 'embedder',
         'word_embeddings': 'idx_to_embedding',
         'albert.encoder.embedding_hidden_mapping_in':
         'embedder.embedding_to_hidden',
         'albert.encoder.albert_layer_groups.0.albert_layers.0':
         'shared_encoder_block',
         'attention.dense': 'multihead_attention.project_o',
         'attention': 'multihead_attention',
         'full_layer_layer_norm': 'feedforward.LayerNorm',
         'query': 'project_q',
         'key': 'project_k',
         'value': 'project_v',
         'ffn.': 'feedforward.linear_in.',
         'ffn_output': 'feedforward.linear_out',
         'predictions': 'lm_head',
     }
     # use these three lines to do the replacement
     repl = dict((re.escape(k), v) for k, v in repl.items())
     pattern = re.compile("|".join(repl.keys()))
     updated_hf_state_dict = OrderedDict(
         (pattern.sub(lambda m: repl[re.escape(m.group(0))], k), v)
         for k, v in hf_state_dict.items())
     # Allow for cutting the sequence length short
     updated_hf_state_dict[
         'embedder.position_embeddings.weight'] = updated_hf_state_dict[
             'embedder.position_embeddings.weight'][:FLAGS.
                                                    max_seq_length, :].clone(
                                                    )
     missing, unexpected = self.load_state_dict(updated_hf_state_dict,
                                                strict=False)
      # Allowed discrepancies: we don't care about the pooler, the relative attention bias is optional, and 'lm_head.bias' is only used to zero the LM head decoder bias, so it is currently ignored.
     ignored_hf_parameters = [
         'pooler', 'position_embeddings', 'lm_head.bias'
     ]
     allowed_from_scratch_params = [
         'relative_attention_bias', 'top_down_regressor', 'combiner',
         'shared_top_down_predictor', 'shared_from_left_predictor',
         'shared_from_right_predictor'
     ]
     for m in missing:
         if not any([s in m for s in allowed_from_scratch_params]):
             raise ValueError(
                 f'Unexpected mismatch in loading state dict: {m} not present in pretrained.'
             )
     for u in unexpected:
         if not any([s in u for s in ignored_hf_parameters]):
             raise ValueError(
                 f'Unexpected mismatch in loading state dict: {u} in pretrained but not in current model.'
             )
     log.info(f"Loaded pretrained weights from {FLAGS.hf_model_handle}")
Example #7
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, model_args: Dict = {}, cache_dir: Optional[str] = None ):
        super(Transformer, self).__init__()
        self.config_keys = ['max_seq_length']
        self.max_seq_length = max_seq_length

        config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
        model_type = config.model_type if hasattr(config, 'model_type') else ''
        if model_type == 'albert':
            self.model = AlbertForMaskedLM.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
            self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        else:
            self.model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
Example #8
    def _from_pretrained(self, pretrain_name: str):
        r"""
        根据模型名字,加载不同的模型.
    """
        if 'albert' in pretrain_name:
            model = AlbertForMaskedLM.from_pretrained(pretrain_name)
            tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        elif 'bert' in pretrain_name:
            tokenizer = AutoTokenizer.from_pretrained(pretrain_name)
            model = AutoModelWithLMHead.from_pretrained(pretrain_name)
        else:
            raise ValueError(f"Unsupported pretrained model name: {pretrain_name}")

        self.model = model
        self.tokenizer = tokenizer
Example #9
    def __init__(self, device):
        self.device = device

        self.bert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.bert_model.to(device)

        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.albert_model.to(device)
Example #10
    def __init__(
        self,
        model=None,
        tokenizer=None,
        model_name="bert-large-uncased",
        mask_token="***mask***",
        disable_gpu=False,
    ):
        self.mask_token = mask_token
        self.delemmatizer = Delemmatizer()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu"
        )
        print("using model:", model_name)
        print("device:", self.device)

        if not model:
            if "distilbert" in model_name:
                self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
            elif "Albert" in model_name:
                self.bert = AlbertForMaskedLM.from_pretrained(model_name)
            else:
                self.bert = BertForMaskedLM.from_pretrained(model_name)
            self.bert.to(self.device)
        else:
            self.bert = model

        if not tokenizer:
            if "distilbert" in model_name:
                self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
            elif "Albert" in model_name:
                self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
            else:
                self.tokenizer = BertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = tokenizer

        self.bert.eval()
Example #11
 def _contextual_model_init(self):
     """  基于上个下文的词相似计算初始化,加载词典,模型
     :return: 无
     """
     pretrain_name = self.model_path + self.model_params[
         'pre_train_model_path']
      logging.info('pretrain_name: %s', pretrain_name)
     if 'albert' in pretrain_name:
         self._contextual_model = AlbertForMaskedLM.from_pretrained(
             pretrain_name)
         self._contextual_tokenizer = BertTokenizer.from_pretrained(
             pretrain_name)
     elif 'ernie' in pretrain_name or 'roberta' in pretrain_name:
         self._contextual_tokenizer = BertTokenizer.from_pretrained(
             pretrain_name)
         self._contextual_model = BertModel.from_pretrained(pretrain_name)
     else:
         # elif 'bert' in pretrain_name:
         self._contextual_tokenizer = AutoTokenizer.from_pretrained(
             pretrain_name)
         model_config = AutoConfig.from_pretrained(pretrain_name)
         self._contextual_model = AutoModel.from_pretrained(
             pretrain_name, config=model_config)
Example #12
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')

    names = proc.generate_pairs_of_random_names(number_of_pairs=100)

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=model,
                             tokenizer=tokenizer,
                             fictitious_entities=names,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/albert_w_name/alberta_social_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving social results")
Example #13
from transformers import AlbertForMaskedLM, AlbertTokenizer
import torch
tokenizer = AlbertTokenizer.from_pretrained("albert-large-v2")
model = AlbertForMaskedLM.from_pretrained("albert-large-v2")
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
input = tokenizer.encode(sequence, return_tensors="pt")
# Position of the masked token
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
# Get the logits for every position: shape [batch_size, seq_length, vocab_size]
token_logits = model(input)[0]
# Keep only the logits at the masked position
mask_token_logits = token_logits[0, mask_token_index, :]
# Take only the top 5 candidates out of the whole vocabulary
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# Print the sequence with each of the top 5 predictions filled in
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
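For comparison, the fill-mask pipeline (also used in Example #27 below) wraps the same steps; a minimal sketch assuming a recent Transformers version where the pipeline call accepts top_k:

from transformers import pipeline

fill_mask = pipeline("fill-mask", model="albert-large-v2")
masked = ("Distilled models are smaller than the models they mimic. Using them instead of the "
          f"large versions would help {fill_mask.tokenizer.mask_token} our carbon footprint.")
for prediction in fill_mask(masked, top_k=5):
    print(prediction["sequence"], round(prediction["score"], 4))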
Example #14
def generate_embedding(objectives,
                       model_name,
                       batch_size=100,
                       output_attention=False):
    """
    Takes in tokenized sentences and generates embeddings for them using the Hugging Face models implemented below.
    - Inputs:
        objectives: the tokenized sentences (token ids) to embed
        model_name (str): name of the model to be used for generating embeddings
        batch_size (int): batch size to use when generating embeddings for sentences
        output_attention (bool): whether to also return the attention weights

    - Output:
        sentence_embedding (list): n embedding vectors, where n is the number of sentences

    """

    if model_name == "bert":
        # Load pre-trained bert model (weights)
        model = BertModel.from_pretrained("bert-base-uncased",
                                          output_attentions=output_attention)
    elif model_name == "xlnet":
        # Load pre-trained xlnet model (weights)
        model = XLNetModel.from_pretrained("xlnet-base-cased",
                                           output_attentions=output_attention)
    elif model_name == "xlm":
        # Load pre-trained xlm model (weights)
        model = XLMModel.from_pretrained("xlm-mlm-en-2048",
                                         output_attentions=output_attention)
    elif model_name == "electra":
        # Load pre-trained electra model (weights)
        model = ElectraModel.from_pretrained(
            "google/electra-small-discriminator",
            output_attentions=output_attention)
    elif model_name == "albert":
        # Load pre-trained albert model (weights)
        model = AlbertForMaskedLM.from_pretrained(
            "albert-base-v2", output_attentions=output_attention)
    else:
        print(
            "Please select an implemented model name. {} doesn't exist".format(
                model_name))
        return

    sentences_per_batch = batch_size

    # setting up the device
    if torch.cuda.is_available():
        dev = "cuda:0"
    else:
        dev = "cpu"
    device = torch.device(dev)
    print("using ", device)

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    model.to(device)
    num_sentences = len(objectives)
    sentence_embedding = []
    attention_layers = None

    if num_sentences > sentences_per_batch:
        num_batches = num_sentences // sentences_per_batch

        for i in range(num_batches):
            start = i * sentences_per_batch
            end = (i + 1) * sentences_per_batch
            if i == num_batches - 1:
                end = num_sentences
            mini_objective = list(objectives[start:end])

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([mini_objective]).squeeze()
            tokens_tensor = tokens_tensor.to(device)

            # Predict hidden states features for each layer
            with torch.no_grad():
                encoded_layers = model(tokens_tensor)

            # taking embeddings of the last layer.
            # `token_vecs` is a tensor with shape [n x k x hidden_size]
            token_vecs = encoded_layers[0]

            # take the vector corresponding to the [CLS] token if the model has one.
            if model_name in ["bert", "albert", "electra"]:
                sentence_embedding += token_vecs[:, 0, :].tolist()
            # for models without a CLS token, take the average of all k token vectors and add it to the main list
            else:
                sentence_embedding += torch.mean(token_vecs, dim=1).tolist()
            if output_attention is True:
                attention_layer = [al.tolist() for al in encoded_layers[-1]]
                attention_layer = np.array(attention_layer)
                if attention_layers is None:
                    attention_layers = attention_layer
                else:
                    attention_layers = np.concatenate(
                        (attention_layers, attention_layer), axis=1)

            print("Embedding for batch {} out of {} batches Completed.".format(
                i, num_batches))
    else:
        # Convert inputs to PyTorch tensors

        tokens_tensor = torch.tensor([objectives]).squeeze()
        tokens_tensor = tokens_tensor.to(device)

        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers = model(tokens_tensor)

        # taking embeddings of the last layer.
        # `token_vecs` is a tensor with shape [n x k x hidden_size]
        token_vecs = encoded_layers[0]

        # take the vector corresponding to the [CLS] token if the model has one.
        if model_name in ["bert", "albert", "electra"]:
            sentence_embedding = token_vecs[:, 0, :].tolist()
        # for models without a CLS token, take the average of all k token vectors and add it to the main list
        else:
            sentence_embedding = torch.mean(token_vecs, dim=1).tolist()

        if output_attention is True:
            attention_layers = [al.tolist() for al in encoded_layers[-1]]
            attention_layers = np.array(attention_layers)

    print(
        "Our final sentence embedding vector of shape:",
        len(sentence_embedding),
        len(sentence_embedding[0]),
    )
    if output_attention:
        print("And the corresponding attention vector of shape:",
              attention_layers.shape)
    return sentence_embedding, attention_layers
Example #15
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor loaded")
        print("====================================")

        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
Example #16
    def __init__(self, args) -> None:
        """Use ELM with fintuned language model for sentiment classification

        Args:
            args (dict): contains all the arguments needed.
                - model_name(str): the name of the transformer model
                - bsz(int): batch size
                - epoch: epochs to train
                - type(str): finetuning type
                  - base: train only ELM
                  - finetune_elm: train transformers with ELM directly
                  - finetune_classifier: train transformers with classifier
                  - finetune_classifier_elm: train transformers with classifier,
                    and use elm replace the classifier
                  - finetune_classifier_beta: train transformers with classifier,
                    and use pinv to calculate beta in classifier
                - learning_rate(float): learning_rate for finetuning
        """
        # load configuration
        self.model_name = args.get('model_name', 'bert-base-uncased')
        self.bsz = args.get('batch_size', 10)
        self.epoch = args.get('epoch_num', 2)
        self.learning_rate = args.get('learning_rate', 0.001)
        self.training_type = args.get('training_type', 'base')
        self.debug = args.get('debug', True)
        self.eval_epoch = args.get('eval_epoch', 1)
        self.lr_decay = args.get('learning_rate_decay', 0.99)
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        self.device = device
        self.n_gpu = torch.cuda.device_count()

        # load pretrained model
        if (self.model_name == 'bert-base-uncased') or \
                (self.model_name == 'distilbert-base-uncased') or \
                (self.model_name == 'albert-base-v2'):
            self.pretrained_model = AutoModel.from_pretrained(self.model_name)
            self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
                self.model_name)
            input_shape = 768
            output_shape = 256
        elif (self.model_name == 'prajjwal1/bert-tiny'):
            self.pretrained_model = AutoModel.from_pretrained(self.model_name)
            self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, model_max_length=512)
            input_shape = 128
            output_shape = 64
        elif self.model_name == 'voidful/albert_chinese_xxlarge':
            self.pretrained_model = AlbertForMaskedLM.from_pretrained(
                self.model_name)
            self.pretrained_tokenizer = BertTokenizer.from_pretrained(
                self.model_name)
            input_shape = 768
            output_shape = 256
        else:
            raise TypeError("Unsupported model name")
        self.pretrained_model.to(device)
        device_ids = None
        if self.n_gpu > 1:
            device_ids = range(torch.cuda.device_count())
            self.pretrained_model = DP(self.pretrained_model,
                                       device_ids=device_ids)

        # load specific model
        if (self.training_type == 'finetune_classifier') or \
            (self.training_type == 'finetune_classifier_elm'):
            self.classifier = torch.nn.Sequential(
                torch.nn.Linear(input_shape, 2))
            self.loss_func = torch.nn.CrossEntropyLoss()
            self.classifier.to(device)
            if self.n_gpu > 1:
                self.classifier = DP(self.classifier, device_ids=device_ids)
        if (self.training_type == 'base') or \
            (self.training_type =='finetune_classifier_elm'):
            self.elm = classic_ELM(input_shape, output_shape)
        if (self.training_type == 'finetune_classifier_linear'):
            self.elm = classic_ELM(None, None)
            self.classifier = torch.nn.Sequential(
                OrderedDict([
                    ('w', torch.nn.Linear(input_shape, output_shape)),
                    ('act', torch.nn.Sigmoid()),
                    ('beta', torch.nn.Linear(output_shape, 2)),
                ]))
            self.loss_func = torch.nn.CrossEntropyLoss()
            self.classifier.to(device)
            if self.n_gpu > 1:
                self.classifier = DP(self.classifier, device_ids=device_ids)

        # load processor, trainer, evaluator, inferer.
        processors = {
            'base': self.__processor_base__,
            'finetune_classifier': self.__processor_base__,
            'finetune_classifier_elm': self.__processor_base__,
            'finetune_classifier_linear': self.__processor_base__,
        }
        trainers = {
            'base':
            self.__train_base__,
            'finetune_classifier':
            self.__train_finetune_classifier__,
            'finetune_classifier_elm':
            self.__train_finetune_classifier_elm__,
            'finetune_classifier_linear':
            self.__train_finetune_classifier_linear__,
        }
        evaluators = {
            'base': self.__eval_base__,
            'finetune_classifier': self.__eval_finetune_classifier__,
            'finetune_classifier_elm': self.__eval_base__,
            'finetune_classifier_linear':
            self.__eval_finetune_classifier_linear__,
        }
        inferers = {
            'base': self.__infer_base__,
            'finetune_classifier': self.__infer_finetune_classifier__,
            'finetune_classifier_elm': self.__infer_finetune_classifier_elm__,
            'finetune_classifier_linear': self.__infer_base__
        }
        self.processor = processors[self.training_type]
        self.trainer = trainers[self.training_type]
        self.evaluator = evaluators[self.training_type]
        self.inferer = inferers[self.training_type]
Example #17
 def __init__(self, pretrained_model_name_or_path, config, device):
     super(MyAlbertForMaskedLM, self).__init__()
     self.model = AlbertForMaskedLM.from_pretrained(
         pretrained_model_name_or_path, config=config)
     self.device = device
Example #18
# %%
import torch
import string

from transformers import \
    AlbertTokenizer, AlbertForMaskedLM,\
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained(
    'albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained(
    'distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
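The decode helper above is cut off in this listing. A minimal sketch of what such a top-k decode loop typically does (a hypothetical reconstruction, not the original code), assuming pred_idx holds predicted token ids and top_clean caps how many predictions are kept:

def decode_sketch(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode([w]).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))  # strip WordPiece continuation markers
    return '\n'.join(tokens[:top_clean])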
Example #19
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 predictions_file=None,
                 layer_freeze_regexes: List[str] = None,
                 probe_type: str = None,
                 loss_on_all_vocab: bool = False,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._loss_on_all_vocab = loss_on_all_vocab

        self._predictions_file = predictions_file

        # TODO move to predict
        if predictions_file is not None and os.path.isfile(predictions_file):
            os.remove(predictions_file)

        self._pretrained_model = pretrained_model
        if 'roberta' in pretrained_model:
            self._padding_value = 1  # The index of the RoBERTa padding token
            if loss_on_all_vocab:
                self._transformer_model = RobertaForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = RobertaForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
        elif 'xlnet' in pretrained_model:
            self._padding_value = 5  # The index of the XLNet padding token
            self._transformer_model = XLNetLMHeadModel.from_pretrained(
                pretrained_model)
        elif 'albert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = AlbertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        elif 'bert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = BertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        else:
            raise ValueError(f'Unsupported pretrained model: {pretrained_model}')

        if probe_type == 'MLP':
            layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
        elif probe_type == 'linear':
            layer_freeze_regexes = [
                "embeddings", "encoder", "pooler", "dense", "LayerNorm",
                "layer_norm"
            ]

        for name, param in self._transformer_model.named_parameters():
            if layer_freeze_regexes and requires_grad:
                grad = not any(
                    [bool(re.search(r, name)) for r in layer_freeze_regexes])
            else:
                grad = requires_grad
            param.requires_grad = grad

        # make sure decoder gradients are on.
        if 'roberta' in pretrained_model:
            self._transformer_model.lm_head.decoder.weight.requires_grad = True
            self._transformer_model.lm_head.bias.requires_grad = True
        elif 'albert' in pretrained_model:
            pass
        elif 'bert' in pretrained_model:
            self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
            self._transformer_model.cls.predictions.bias.requires_grad = True

        transformer_config = self._transformer_model.config
        transformer_config.num_labels = 1
        self._output_dim = self._transformer_model.config.hidden_size

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
        self._debug = 2
Example #20
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--type_of_model", default = 'albert', help = "pretrained LM type")
parser.add_argument("-p", "--path_to_pytorch_models", help = "path to pytorch_model")
parser.add_argument("--config_and_vocab", help = "path to config.json and vocab.model")
parser.add_argument("-s", "--step", type = str, help = "pretrained step")
parser.add_argument("-d", "--data", help = "path where you put your processed ontonotes data")
parser.add_argument("-o", "--output", help = "output file")
args = parser.parse_args()
print("Reconstruction. step = ", args.step)
if args.type_of_model == 'albert':
  tokenizer = AlbertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
  config = AlbertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
  config.output_hidden_states = True
  model = AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path = None,
    config = config,
    state_dict = torch.load(os.path.join(
      args.path_to_pytorch_models, args.type_of_model, 'pytorch_model_' + args.step + '.bin')))
elif args.type_of_model == 'bert':
  tokenizer = BertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
  config = BertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
  config.output_hidden_states = True
  model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path = None,
    config = config,
    state_dict = torch.load(os.path.join(
      args.path_to_pytorch_models, args.type_of_model, 'pytorch_model_' + args.step + '.bin')))
else:
  raise NotImplementedError("The given model type %s is not supported" % args.type_of_model)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval().to(device)
Example #21
from transformers import TFAlbertForMaskedLM, TFAlbertModel, TFAlbertForSequenceClassification, AlbertForMaskedLM
import os

checkpoint = "albert-base-v1"

model = AlbertForMaskedLM.from_pretrained(checkpoint)

if not os.path.exists("~/saved/" + checkpoint):
    os.makedirs("~/saved/" + checkpoint)
    

model.save_pretrained("~/saved/" + checkpoint)
model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint, from_pt=True)
model.save_pretrained("~/saved/" + checkpoint)
model = TFAlbertModel.from_pretrained('~/saved/' + checkpoint)
model = TFAlbertForMaskedLM.from_pretrained('~/saved/' + checkpoint)
model = TFAlbertForSequenceClassification.from_pretrained('~/saved/' + checkpoint)


print("nice model") 
Example #22
    def __init__(self, args, random_init='none'):
        assert (random_init in ['none', 'all', 'embedding'])

        super().__init__()

        self._model_device = 'cpu'

        model_name = args.model_name
        vocab_name = model_name

        if args.model_dir is not None:
            # load bert model from file
            model_name = str(args.model_dir) + "/"
            vocab_name = model_name
            logger.info("loading BERT model from {}".format(model_name))

        # Load pre-trained model tokenizer (vocabulary)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        if torch.cuda.device_count() > 1:
            torch.cuda.manual_seed_all(args.seed)

        config = AutoConfig.from_pretrained(model_name)
        if isinstance(config, AlbertConfig):
            self.model_type = 'albert'
            self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
            self.mlm_model = AlbertForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = AlbertForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.albert
        elif isinstance(config, RobertaConfig):
            self.model_type = 'roberta'
            self.tokenizer = RobertaTokenizer.from_pretrained(vocab_name)
            self.mlm_model = RobertaForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = RobertaForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.roberta
        elif isinstance(config, BertConfig):
            self.model_type = 'bert'
            self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
            self.mlm_model = BertForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = BertForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.bert
        else:
            raise ValueError('Model %s not supported yet!' % (model_name))

        self.mlm_model.eval()

        if random_init == 'embedding':
            logger.info('Random initialize embedding layer...')
            self.mlm_model._init_weights(
                self.base_model.embeddings.word_embeddings)

        # original vocab
        self.map_indices = None
        self.vocab = list(self.tokenizer.get_vocab().keys())
        logger.info('Vocab size: %d' % len(self.vocab))
        self._init_inverse_vocab()

        self.MASK = self.tokenizer.mask_token
        self.EOS = self.tokenizer.eos_token
        self.CLS = self.tokenizer.cls_token
        self.SEP = self.tokenizer.sep_token
        self.UNK = self.tokenizer.unk_token
        # print(self.MASK, self.EOS, self.CLS, self.SEP, self.UNK)

        self.pad_id = self.inverse_vocab[self.tokenizer.pad_token]
        self.unk_index = self.inverse_vocab[self.tokenizer.unk_token]

        # used to output top-k predictions
        self.k = args.k
Example #23
# inspired by https://github.com/renatoviolin/next_word_prediction

import torch
import string
import transformers

transformers.logging.set_verbosity_error()

from transformers import BertTokenizerFast, BertForMaskedLM
bert_tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
bert_model = BertForMaskedLM.from_pretrained('kykim/bert-kor-base').eval()

from transformers import AlbertForMaskedLM
albert_tokenizer = BertTokenizerFast.from_pretrained('kykim/albert-kor-base')
albert_model = AlbertForMaskedLM.from_pretrained(
    'kykim/albert-kor-base').eval()

# from transformers import BartForConditionalGeneration
# roberta_tokenizer = BertTokenizerFast.from_pretrained('kykim/bart-kor-base')
# roberta_model = BartForConditionalGeneration.from_pretrained('kykim/bart-kor-basee').eval()

from transformers import BertTokenizerFast, BertForMaskedLM
bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-multilingual-cased')
bert_multilingual_model = BertForMaskedLM.from_pretrained(
    'bert-base-multilingual-cased').eval()

from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    'xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()
Example #24
from transformers import BertTokenizer, AlbertForMaskedLM
import os
# pretrained = 'voidful/albert_chinese_xlarge'
pretrained = 'voidful/albert_chinese_large'
tokenizer = BertTokenizer.from_pretrained(pretrained)
model = AlbertForMaskedLM.from_pretrained(pretrained)

model.save_pretrained('albert_model')
tokenizer.save_pretrained('albert_model')
os.remove("albert_model/special_tokens_map.json")
os.remove("albert_model/tokenizer_config.json")
os.system("mv albert_model ../")
Example #25
def evaluate(args):
    """
    Evaluate a masked language model using CrowS-Pairs dataset.
    """

    print("Evaluating:")
    print("Input:", args.input_file)
    print("Model:", args.lm_model)
    print("=" * 100)

    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data(args.input_file)

    # supported masked language models
    if args.lm_model == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        uncased = True
    elif args.lm_model == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMaskedLM.from_pretrained('roberta-large')
        uncased = False
    elif args.lm_model == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')
        uncased = True
    else:
        raise ValueError("Unsupported model: {}".format(args.lm_model))

    model.eval()
    if torch.cuda.is_available():
        model.to('cuda')

    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open(args.lm_model + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": uncased
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'])


    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)


    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score  / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:', round(antistereo_score  / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()
Example #26
Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":

    sentences = [sample_sentences("sentences4lara.txt") for i in range(11)]

    sent_dict = dict(zip([str(x) for x in range(1, 11)], sentences))

    sentence = sent_dict[sys.argv[2]]

    batch_size = 100
Example #27
def run_benchmark(model_name, benchmark_file, results_file, logging_file):
    with open(benchmark_file, "r") as f:
        benchmark = json.load(f)

    model = AlbertForMaskedLM.from_pretrained(model_name)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    # Each pattern will store its own statistics
    results = []
    for pattern in patterns:
        result = {}
        result["false_positives"] = 0
        result["false_negatives"] = 0
        result["total_questions"] = 0
        result["correct"] = 0
        pattern["accuracy"] = 0.0
        result["pattern"] = pattern["prompt"]
        results.append(result)


    with open(logging_file, "w") as log:

        for benchmark_question in benchmark:

            output = fill_mask(benchmark_question["question"])

            output_str = output[0]["sequence"] + "\n"
            for o in output:
                output_str += str(o["token_str"][1:]) + " " + str(o["score"]) + "\n"
            print(output_str)
            log.write(output_str)

            # Update the correct patterns stats
            for result in results:
                if result["pattern"] == benchmark_question["pattern"]:
                    result["total_questions"] += 1
                    if is_correct(output, benchmark_question["answer"]):
                        result["correct"] += 1
                        print("correct")
                        log.write("correct\n")
                    else:
                        print("incorrect")
                        log.write("incorrect\n")
                        if benchmark_question["answer"] == True:
                            result["false_negatives"] += 1
                        else:
                            result["false_positives"] += 1
                    break

    # Calculate each pattern's accuracy
    for result in results:
        result["accuracy"] = float(result["correct"])/result["total_questions"]

    # Calculate and append the overall statistics
    results.append(compute_overall_results(results))
    results.append({"model_name": model_name, "datetime": str(datetime.datetime.now())})

    # Store the results -- downside of no results until the end.
    with open(results_file, "w") as f:
        json.dump(results, f, indent=3)