Example #1
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
import string

from transformers import BertTokenizer, BertForMaskedLM

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()

bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
bart_model = BartForConditionalGeneration.from_pretrained(
    'facebook/bart-large').eval()

from transformers import ElectraTokenizer, ElectraForMaskedLM
electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained(
    'google/electra-small-generator').eval()

from transformers import RobertaTokenizer, RobertaForMaskedLM
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10
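
A minimal sketch of how these pairs can be used for top-k masked-token prediction, shown with the BERT pair (the sample sentence is illustrative; each tokenizer exposes its own mask_token, so the same pattern applies to the other masked-LM pairs):

text = f'The capital of France is {bert_tokenizer.mask_token}.'
inputs = bert_tokenizer(text, return_tensors='pt')
with torch.no_grad():
    logits = bert_model(**inputs).logits
# Locate the [MASK] position, then take its top_k most likely fillers
mask_pos = (inputs.input_ids == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
top_ids = logits[0, mask_pos].topk(top_k, dim=-1).indices[0]
print(bert_tokenizer.convert_ids_to_tokens(top_ids.tolist()))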

Example #2
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT-kor-base
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[XLM-R] XLM-Roberta loaded")
        print("====================================")

        # bertshared-kor-base (BERT seq2seq)
        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        # koelectra-base-v3, fine-tuned on KorQuAD (QA)
        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
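
Every entry in self.lists shares the {"Tokenizer": ..., "Model": ...} shape, so downstream code can stay model-agnostic. A minimal usage sketch for the masked-LM entries (fill_mask is a hypothetical method, not part of the original class; the seq2seq, GPT, and QA entries need their own heads):

    def fill_mask(self, name: str, text: str, top_k: int = 5):
        import torch
        tokenizer = self.lists[name]["Tokenizer"]
        model = self.lists[name]["Model"]
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # Position of the mask token, then its top-k candidate replacements
        mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        top_ids = logits[0, mask_pos].topk(top_k, dim=-1).indices[0]
        return tokenizer.convert_ids_to_tokens(top_ids.tolist())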
Example #3
from transformers import (XLMRobertaTokenizer, XLMRobertaForMaskedLM,
                          LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          TrainingArguments, Trainer)

if __name__ == '__main__':

    line_by_line_f = '/rwproject/kdd-db/20-rayw1/data/line_by_line_post.txt'

    model_in = 'xlm-roberta-base'
    config_tag = '-post'
    model_out = '/rwproject/kdd-db/20-rayw1/language_models/' + model_in + config_tag
    output_dir = '/rwproject/kdd-db/20-rayw1/language_models/output' + config_tag

    convert_weibo_text_into_line_by_line(
        weibo_dir='/rwproject/kdd-db/20-rayw1/rumdect/weibo_json', line_by_line_f=line_by_line_f)

    print('Loading models...')
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaForMaskedLM.from_pretrained(model_in, return_dict=True)

    print('Loading dataset...')
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=line_by_line_f,
        block_size=128,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
    )

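The excerpt breaks off inside the TrainingArguments call; closing it as above leaves only the standard wiring to run. A minimal sketch of that wiring (Trainer is standard transformers API; saving to model_out is an assumption based on the variable defined above):

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model(model_out)
    tokenizer.save_pretrained(model_out)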

Example #4
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import XLMRobertaForMaskedLM

if __name__ == "__main__":

    batch_size = 10
    # TODO min_num_words = 3, should we use this?
    # tokens_file = "/home/layer6/recsys/unsupervised_data/xlmr_trainvalsubmit_only_tokens.p"  # don't compute them if not in train/val/submit set
    # output_file = "/home/layer6/recsys/embeddings/xlmr/"
    # model_checkpoint = "/home/layer6/recsys/xlm-r/checkpoints/run/checkpoint-1"
    tokens_file = "/home/kevin/Projects/xlm-r/data/xlmr_all_tweet_tokens_leaderboard.p"  # don't compute them if not in train/val/submit set
    output_dir = "/media/kevin/datahdd/data/embeddings"
    model_checkpoint = "/home/kevin/Projects/xlm-r/out/checkpoint-500"


    model = XLMRobertaForMaskedLM.from_pretrained(model_checkpoint)
    model = model.roberta

    tokenizer = FakeTokenizer()

    def collate(batch):
        tokens = [b[0] for b in batch]
        lens = [len(x) for x in tokens]

        tokens = pad_sequence(tokens, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (tokens != tokenizer.pad_token_id).int()

        return tokens, attention_mask, [b[1] for b in batch], torch.tensor(lens).unsqueeze(1)

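    # Hypothetical usage sketch for collate: `dataset` stands in for the
    # (token_ids, tweet_id) pairs loaded from tokens_file, which this
    # excerpt elides.
    # from torch.utils.data import DataLoader
    # loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)
    # for tokens, attention_mask, ids, lens in loader:
    #     with torch.no_grad():
    #         H = model(input_ids=tokens, attention_mask=attention_mask)[0]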
    def mean_emb_no_pad(H, L):
        mask = torch.arange(H.shape[1]).repeat(H.shape[0], 1)
Example #5
        '--per_gpu_train_batch_size',
        '2',  # 32GB gpu with fp32
        '--gradient_accumulation_steps',
        '32',
        '--evaluate_during_training',
        '--do_train',
        '--do_eval',
    ])
training_args.val_datapath = 'wikitext-103-raw/wiki.valid.raw'
training_args.train_datapath = 'wikitext-103-raw/wiki.train.raw'

# Choose GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

xlmr_base = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base')
xlmr_base_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
logger.info('Evaluating xlm-roberta-base (seqlen: 512) for reference ...')
pretrain_and_evaluate(training_args,
                      xlmr_base,
                      xlmr_base_tokenizer,
                      eval_only=True,
                      model_path=None)
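
The helper pretrain_and_evaluate is defined outside this excerpt. A hypothetical sketch of what such a helper typically wraps, assuming it runs a standard transformers Trainer over the raw wikitext files (TextDataset, the block size, and the eval-only short-circuit are assumptions, not the author's confirmed implementation):

def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    from transformers import (DataCollatorForLanguageModeling, TextDataset,
                              Trainer)
    # Tokenize the raw wikitext files into fixed-size blocks
    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=args.val_datapath, block_size=512)
    train_dataset = val_dataset if eval_only else TextDataset(
        tokenizer=tokenizer, file_path=args.train_datapath, block_size=512)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                               mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=collator,
                      train_dataset=train_dataset, eval_dataset=val_dataset)
    # Report masked-LM eval loss before (and, if training, after) pretraining
    logger.info(f"Eval loss: {trainer.evaluate()['eval_loss']}")
    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()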

model_path = f'{training_args.output_dir}/roberta-base-{model_args.max_pos}'
if not os.path.exists(model_path):
    os.makedirs(model_path)

logger.info(f'Converting roberta-base into roberta-base-{model_args.max_pos}')
model, tokenizer = create_long_model(
    save_model_to=model_path,