def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
Example 2
    def init_model(self, device):
        """Initialize the language model and send it to the given device
        Note: Transformers v.4 and higher made default return_dict=True.
        Args:
            device (str): torch device (usually "cpu" or "cuda")

        Returns:
            model: a model for masked language modeling torch model
        """
        model = None
        if self.model_name.lower().find('albert') >= 0:
            try:
                model = AlbertForMaskedLM.from_pretrained(
                    self.model_name, return_dict=False).to(device)
            except Exception:  # fall back if this Transformers version does not accept return_dict
                model = AlbertForMaskedLM.from_pretrained(
                    self.model_name).to(device)
        else:
            try:
                model = BertForMaskedLM.from_pretrained(
                    self.model_name, return_dict=False).to(device)
            except Exception:  # fall back if this Transformers version does not accept return_dict
                model = BertForMaskedLM.from_pretrained(
                    self.model_name).to(device)
        model.eval()
        return model
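
A minimal, self-contained sketch of what the return_dict fallback above affects downstream (the checkpoint name, sentence, and device are illustrative assumptions, not part of the original class): with return_dict=False the forward pass returns a plain tuple whose first element is the prediction logits.

import torch
from transformers import AlbertForMaskedLM, AlbertTokenizer

# assumed public checkpoint, for illustration only
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForMaskedLM.from_pretrained(
    "albert-base-v2", return_dict=False).to("cpu").eval()

text = f"The capital of France is {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # a plain tuple when return_dict=False
logits = outputs[0]            # shape: [batch_size, seq_length, vocab_size]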
Example 3
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(
        model, config, args.checkpoint)
    model.save_pretrained(args.output)

    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
Example 5
def albert_convert_tf_checkpoint_to_pytorch(tf_checkpoint_path,
                                            albert_config_file,
                                            pytorch_dump_path):
    from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
Example 6
    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str = "bert-base",
                 multi_choice: bool = False):
        super().__init__(vocab)
        self._model = None
        self._loss = CrossEntropyLoss()
        self.is_multi_choice = multi_choice

        if model_name.startswith('bert'):
            if self.is_multi_choice:
                self._model = BertMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = BertForMaskedLM.from_pretrained(model_name)
        elif 'roberta' in model_name:
            if self.is_multi_choice:
                self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = RobertaForMaskedLM.from_pretrained(model_name)

        elif 'albert' in model_name:
            self._model = AlbertForMaskedLM.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self._model = XLNetLMHeadModel.from_pretrained(model_name)
        else:
            raise ("Riquiered model is not supported.")
Example 7
 def __init__(self, config):
     super(LMDecodingModel, self).__init__()
     self.config = config
     self.dep_tree_baseline = config[MODEL_TYPE] == DEP_TREETRAIN_BASELINE
     self.albert = AlbertForMaskedLM.from_pretrained('albert-base-v2')
     self.albert_tokenizer = AlbertTokenizer.from_pretrained(
         'albert-base-v2')
Example 8
 def __init__(self, transformer_model, is_train):
     super(LMNER, self).__init__()
     config = AlbertConfig.from_pretrained(transformer_model)
     self.transformer_model = AlbertForMaskedLM.from_pretrained(
         transformer_model, config=config)
     # whether to train (fine-tune) the BERT weights
     for name, param in self.transformer_model.named_parameters():
         param.requires_grad = is_train
 def create_and_check_albert_for_masked_lm(self, config, input_ids,
                                           token_type_ids, input_mask,
                                           sequence_labels,
                                           token_labels, choice_labels):
     model = AlbertForMaskedLM(config=config)
     model.eval()
     loss, prediction_scores = model(input_ids,
                                     attention_mask=input_mask,
                                     token_type_ids=token_type_ids,
                                     masked_lm_labels=token_labels)
     result = {
         "loss": loss,
         "prediction_scores": prediction_scores,
     }
     self.parent.assertListEqual(
         list(result["prediction_scores"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
Example 10
    def setUp(self):
        super(TestAlbertMaskModel, self).setUp()

        albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_base_hf"
        # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_large_hf"
        # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_xlarge_hf"
        self.tokenizer = BertTokenizer.from_pretrained(albert_pre_train)

        self.mask_model = AlbertForMaskedLM.from_pretrained(albert_pre_train)
        self.mask_token = self.tokenizer.mask_token
        self.mask_id = self.tokenizer.mask_token_id
Example 11
 def load_HFpretrained_weights(self):
     hf_state_dict = AlbertForMaskedLM.from_pretrained(
         FLAGS.hf_model_handle).state_dict()
     repl = {
         "albert.embeddings": 'embedder',
         'word_embeddings': 'idx_to_embedding',
         'albert.encoder.embedding_hidden_mapping_in':
         'embedder.embedding_to_hidden',
         'albert.encoder.albert_layer_groups.0.albert_layers.0':
         'shared_encoder_block',
         'attention.dense': 'multihead_attention.project_o',
         'attention': 'multihead_attention',
         'full_layer_layer_norm': 'feedforward.LayerNorm',
         'query': 'project_q',
         'key': 'project_k',
         'value': 'project_v',
         'ffn.': 'feedforward.linear_in.',
         'ffn_output': 'feedforward.linear_out',
         'predictions': 'lm_head',
     }
     # use these three lines to do the replacement
     repl = dict((re.escape(k), v) for k, v in repl.items())
     pattern = re.compile("|".join(repl.keys()))
     updated_hf_state_dict = OrderedDict(
         (pattern.sub(lambda m: repl[re.escape(m.group(0))], k), v)
         for k, v in hf_state_dict.items())
     # Allow for cutting the sequence length short
     updated_hf_state_dict['embedder.position_embeddings.weight'] = \
         updated_hf_state_dict['embedder.position_embeddings.weight'][:FLAGS.max_seq_length, :].clone()
     missing, unexpected = self.load_state_dict(updated_hf_state_dict,
                                                strict=False)
     # Allowed discrepancies: the pooler is unused, the relative attention bias is optional,
     # and 'lm_head.bias' is only used to zero the LM head decoder bias, so it is ignored here.
     ignored_hf_parameters = [
         'pooler', 'position_embeddings', 'lm_head.bias'
     ]
     allowed_from_scratch_params = [
         'relative_attention_bias', 'top_down_regressor', 'combiner',
         'shared_top_down_predictor', 'shared_from_left_predictor',
         'shared_from_right_predictor'
     ]
     for m in missing:
         if not any([s in m for s in allowed_from_scratch_params]):
             raise ValueError(
                 f'Unexpected mismatch in loading state dict: {m} not present in pretrained.'
             )
     for u in unexpected:
         if not any([s in u for s in ignored_hf_parameters]):
             raise ValueError(
                 f'Unexpected mismatch in loading state dict: {u} in pretrained but not in current model.'
             )
     log.info(f"Loaded pretrained weights from {FLAGS.hf_model_handle}")
Example 12
    def _from_pretrained(self, pretrain_name: str):
        r"""
        根据模型名字,加载不同的模型.
    """
        if 'albert' in pretrain_name:
            model = AlbertForMaskedLM.from_pretrained(pretrain_name)
            tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        elif 'bert' in pretrain_name:
            tokenizer = AutoTokenizer.from_pretrained(pretrain_name)
            model = AutoModelWithLMHead.from_pretrained(pretrain_name)

        self.model = model
        self.tokenizer = tokenizer
Example 13
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, model_args: Dict = {}, cache_dir: Optional[str] = None ):
        super(Transformer, self).__init__()
        self.config_keys = ['max_seq_length']
        self.max_seq_length = max_seq_length

        config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
        model_type = config.model_type if hasattr(config, 'model_type') else ''
        if model_type == 'albert':
            self.model = AlbertForMaskedLM.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
            self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        else:
            self.model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
 def create_and_check_for_masked_lm(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = AlbertForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
Example 15
    def __init__(self, device):
        self.device = device

        self.bert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.bert_model.to(device)

        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.albert_model.to(device)
Example 16
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2

    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12

    else:
        raise ValueError('Unknown model_size; expected one of: debug, tiny, small, base')

    config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )

    model = AlbertForMaskedLM(config)
    return model
Example 17
 def create_and_check_albert_for_masked_lm(self, config, input_ids,
                                           token_type_ids, input_mask,
                                           sequence_labels, token_labels,
                                           choice_labels):
     model = AlbertForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids,
                    labels=token_labels)
     self.parent.assertListEqual(
         list(result["logits"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
Example 18
    def __init__(
        self,
        model=None,
        tokenizer=None,
        model_name="bert-large-uncased",
        mask_token="***mask***",
        disable_gpu=False,
    ):
        self.mask_token = mask_token
        self.delemmatizer = Delemmatizer()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu"
        )
        print("using model:", model_name)
        print("device:", self.device)

        if not model:
            if "distilbert" in model_name:
                self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
            elif "Albert" in model_name:
                self.bert = AlbertForMaskedLM.from_pretrained(model_name)
            else:
                self.bert = BertForMaskedLM.from_pretrained(model_name)
            self.bert.to(self.device)
        else:
            self.bert = model

        if not tokenizer:
            if "distilbert" in model_name:
                self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
            elif "Albert" in model_name:
                self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
            else:
                self.tokenizer = BertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = tokenizer

        self.bert.eval()
Example 19
 def _contextual_model_init(self):
     """  基于上个下文的词相似计算初始化,加载词典,模型
     :return: 无
     """
     pretrain_name = self.model_path + self.model_params[
         'pre_train_model_path']
     logging.info('pretrain_name: %s', pretrain_name)
     if 'albert' in pretrain_name:
         self._contextual_model = AlbertForMaskedLM.from_pretrained(
             pretrain_name)
         self._contextual_tokenizer = BertTokenizer.from_pretrained(
             pretrain_name)
     elif 'ernie' in pretrain_name or 'roberta' in pretrain_name:
         self._contextual_tokenizer = BertTokenizer.from_pretrained(
             pretrain_name)
         self._contextual_model = BertModel.from_pretrained(pretrain_name)
     else:
         # elif 'bert' in pretrain_name:
         self._contextual_tokenizer = AutoTokenizer.from_pretrained(
             pretrain_name)
         model_config = AutoConfig.from_pretrained(pretrain_name)
         self._contextual_model = AutoModel.from_pretrained(
             pretrain_name, config=model_config)
Example 20
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')

    names = proc.generate_pairs_of_random_names(number_of_pairs=100)

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=model,
                             tokenizer=tokenizer,
                             fictitious_entities=names,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/albert_w_name/alberta_social_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving social results")
Example 21
from transformers import BertTokenizer, AlbertForMaskedLM
import os
# pretrained = 'voidful/albert_chinese_xlarge'
pretrained = 'voidful/albert_chinese_large'
tokenizer = BertTokenizer.from_pretrained(pretrained)
model = AlbertForMaskedLM.from_pretrained(pretrained)

model.save_pretrained('albert_model')
tokenizer.save_pretrained('albert_model')
os.remove("albert_model/special_tokens_map.json")
os.remove("albert_model/tokenizer_config.json")
os.system("mv albert_model ../")
Example 22
# %%
import torch
import string

from transformers import \
    AlbertTokenizer, AlbertForMaskedLM,\
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained(
    'albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained(
    'distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
def main(tokenizer_path,
         dataset_path,
         save_path='alectra-small',
         max_steps=1e6,
         accumulate_grad_batches=1,
         gpus=None,
         num_tpu_cores=None,
         distributed_backend=None,
         val_check_interval=0.25,
         val_check_percent=0.25,
         generator_type='albert',
         num_hidden_groups=1,
         d_loss_weight=50,
         mlm_prob=0.15,
         learning_rate=5e-4,
         warmup_steps=10000,
         batch_size=128,
         num_workers=2,
         tie_embedding_proj=False,
         tie_encoder=True,
         shuffle=True,
         lr_schedule='linear',
         resume_from_checkpoint=None,
         use_polyaxon=False):
    # init tokenizer.  only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init generator.
    if generator_type == 'albert':
        generator_config = AlbertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=256,
            embedding_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            num_hidden_groups=num_hidden_groups,
            intermediate_size=1024,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            classifier_dropout_prob=0.1,
            max_position_embeddings=128)
        generator = AlbertForMaskedLM(generator_config)
    elif generator_type == 'bert':
        generator_config = BertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            intermediate_size=256,
            max_position_embeddings=128)
        generator = BertForMaskedLM(generator_config)
        tie_weights(generator.cls.predictions.decoder,
                    generator.bert.embeddings.word_embeddings)
    else:
        raise Exception(f"invalid generator type: {generator_type}")

    # init discriminator.
    discriminator_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        num_hidden_groups=num_hidden_groups,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        classifier_dropout_prob=0.1,
        max_position_embeddings=128)
    discriminator = AlbertForTokenClassification(discriminator_config)

    # tie the embedding weights.
    tie_weights(discriminator.base_model.embeddings.word_embeddings,
                generator.base_model.embeddings.word_embeddings)
    tie_weights(discriminator.base_model.embeddings.position_embeddings,
                generator.base_model.embeddings.position_embeddings)
    tie_weights(discriminator.base_model.embeddings.token_type_embeddings,
                generator.base_model.embeddings.token_type_embeddings)

    if generator_type == 'albert' and tie_encoder:
        print('tying albert encoder layers')
        discriminator.albert.encoder.albert_layer_groups = generator.albert.encoder.albert_layer_groups
    if generator_type == 'albert' and tie_embedding_proj:
        print('tying embedding projection layers')
        discriminator.albert.encoder.embedding_hidden_mapping_in = generator.albert.encoder.embedding_hidden_mapping_in

    # init training module.
    training_config = DiscLMTrainingModuleConfig(max_steps,
                                                 d_loss_weight=d_loss_weight,
                                                 save_path=save_path,
                                                 weight_decay=0.01,
                                                 learning_rate=learning_rate,
                                                 epsilon=1e-6,
                                                 lr_schedule=lr_schedule,
                                                 warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = DiscLMTrainingModule(generator,
                                            discriminator,
                                            training_config,
                                            checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      resume_from_checkpoint=resume_from_checkpoint,
                      val_check_percent=val_check_percent,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path,
                                                  trainer, mlm_prob,
                                                  batch_size, num_workers,
                                                  shuffle)

    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'discriminator', 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.discriminator.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
Example 24
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor loaded")
        print("====================================")

        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
Example 25
from transformers import AlbertForMaskedLM, AlbertTokenizer
import torch
tokenizer = AlbertTokenizer.from_pretrained("albert-large-v2")
model = AlbertForMaskedLM.from_pretrained("albert-large-v2")
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
input = tokenizer.encode(sequence, return_tensors="pt")
# position of the masked token in the input
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
# get the logits at every position: [batch_size, seq_length, vocab_size], e.g. torch.Size([1, 28, 30522])
token_logits = model(input)[0]
# keep only the logits at the masked position
mask_token_logits = token_logits[0, mask_token_index, :]
# take only the top 5 candidates out of the vocab_size possibilities
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# print the top 5 completions
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
Example 26
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--type_of_model", default = 'albert', help = "pretrained LM type")
parser.add_argument("-p", "--path_to_pytorch_models", help = "path to pytorch_model")
parser.add_argument("--config_and_vocab", help = "path to config.json and vocab.model")
parser.add_argument("-s", "--step", type = str, help = "pretrained step")
parser.add_argument("-d", "--data", help = "path where you put your processed ontonotes data")
parser.add_argument("-o", "--output", help = "output file")
args = parser.parse_args()
print("Reconstruction. step = ", args.step)
if args.type_of_model == 'albert':
  tokenizer = AlbertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
  config = AlbertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
  config.output_hidden_states = True
  model = AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path = None,
    config = config,
    state_dict = torch.load(os.path.join(
      args.path_to_pytorch_models, args.type_of_model, 'pytorch_model_' + args.step + '.bin')))
elif args.type_of_model == 'bert':
  tokenizer = BertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
  config = BertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
  config.output_hidden_states = True
  model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path = None,
    config = config,
    state_dict = torch.load(os.path.join(
      args.path_to_pytorch_models, args.type_of_model, 'pytorch_model_' + args.step + '.bin')))
else:
  raise NotImplementedError("The given model type %s is not supported" % args.type_of_model)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval().to(device)
Example 27
def init_process(local_rank, backend, config, albert_config, logger):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    torch.cuda.set_device(local_rank)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if local_rank != 0:
        logger.setLevel(logging.WARNING)
    
    if local_rank == 0:
        writer = SummaryWriter()
        if not os.path.exists("save"):
            os.mkdir("save")
        save_path = "save/model_{}.pt".format(re.sub("\s+", "_", time.asctime()))

    reader = Reader(config)
    start = time.time()
    logger.info("Loading data...")
    reader.load_data()
    end = time.time()
    logger.info("Loaded. {} secs".format(end-start))

    model = AlbertForMaskedLM(albert_config).cuda()
    optimizer = Adam(model.parameters(), lr=config.lr)

    if config.save_path is not None:
        load(model, optimizer, config.save_path, local_rank)

    train.global_step = 0
    train.max_iter = len(list(reader.make_batch("train")))
    validate.max_iter = len(list(reader.make_batch("dev")))

    min_loss = 1e+10
    early_stop_count = config.early_stop_count

    # logger.info("Validate...")
    # loss = validate(model, reader, config, local_rank)
    # logger.info("loss: {:.4f}".format(loss))

    for epoch in range(config.max_epochs):
        logger.info("Train...")
        start = time.time()

        if local_rank == 0:
            train_test(model, reader, optimizer, config, local_rank, writer)
        else:
            train_test(model, reader, optimizer, config, local_rank)
        
        exit(0)

        end = time.time()
        logger.info("epoch: {}, {:.4f} secs".format(epoch+1, end-start))

        logger.info("Validate...")
        loss = validate(model, reader, config, local_rank)
        logger.info("loss: {:.4f}".format(loss))
        
        if local_rank == 0:
            writer.add_scalar("Val/loss", loss, epoch+1)

        if loss < min_loss:  # save model
            if local_rank == 0:
                save(model, optimizer, save_path)
                logger.info("Saved to {}.".format(os.path.abspath(save_path)))
            
            min_loss = loss
            early_stop_count = config.early_stop_count
        else:  # early stopping
            if early_stop_count == 0:
                if epoch < config.min_epochs:
                    early_stop_count += 1
                    logger.info("Too early to stop training.")
                    logger.info("early stop count: {}".format(early_stop_count))
                else:
                    logger.info("Early stopped.")
                    break
            elif early_stop_count == 2:
                lr = optimizer.param_groups[0]["lr"] / 2
                logger.info("learning rate schedule: {}".format(lr))
                for param in optimizer.param_groups:
                    param["lr"] = lr
            early_stop_count -= 1
            logger.info("early stop count: {}".format(early_stop_count))
    logger.info("Training finished.")
Example 28
def run_benchmark(model_name, benchmark_file, results_file, logging_file):
    with open(benchmark_file, "r") as f:
        benchmark = json.load(f)

    model = AlbertForMaskedLM.from_pretrained(model_name)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    # Each pattern will store its own statistics
    results = []
    for pattern in patterns:
        result = {}
        result["false_positives"] = 0
        result["false_negatives"] = 0
        result["total_questions"] = 0
        result["correct"] = 0
        pattern["accuracy"] = 0.0
        result["pattern"] = pattern["prompt"]
        results.append(result)


    with open(logging_file, "w") as log:

        for benchmark_question in benchmark:

            output = fill_mask(benchmark_question["question"])

            output_str = output[0]["sequence"] + "\n"
            for o in output:
                output_str += str(o["token_str"][1:]) + " " + str(o["score"]) + "\n"
            print(output_str)
            log.write(output_str)

            # Update the correct patterns stats
            for result in results:
                if result["pattern"] == benchmark_question["pattern"]:
                    result["total_questions"] += 1
                    if is_correct(output, benchmark_question["answer"]):
                        result["correct"] += 1
                        print("correct")
                        log.write("correct\n")
                    else:
                        print("incorrect")
                        log.write("incorrect\n")
                        if benchmark_question["answer"] == True:
                            result["false_negatives"] += 1
                        else:
                            result["false_positives"] += 1
                    break

    # Calculate each pattern's accuracy
    for result in results:
        result["accuracy"] = float(result["correct"])/result["total_questions"]

    # Calculate and append the overall statistics
    results.append(compute_overall_results(results))
    results.append({"model_name": model_name, "datetime": str(datetime.datetime.now())})

    # Store the results -- downside of no results until the end.
    with open(results_file, "w") as f:
        json.dump(results, f, indent=3)
Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":

    sentences = [sample_sentences("sentences4lara.txt") for i in range(11)]

    sent_dict = dict(zip([str(x) for x in range(1, 11)], sentences))

    sentence = sent_dict[sys.argv[2]]

    batch_size = 100
Example 30
def evaluate(args):
    """
    Evaluate a masked language model using CrowS-Pairs dataset.
    """

    print("Evaluating:")
    print("Input:", args.input_file)
    print("Model:", args.lm_model)
    print("=" * 100)

    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data(args.input_file)

    # supported masked language models
    if args.lm_model == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        uncased = True
    elif args.lm_model == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMaskedLM.from_pretrained('roberta-large')
        uncased = False
    elif args.lm_model == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')
        uncased = True

    model.eval()
    if torch.cuda.is_available():
        model.to('cuda')

    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open(args.lm_model + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": uncased
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'])


    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)


    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score  / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:', round(antistereo_score  / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()