# Eagerly load every tokenizer / masked-LM pair used for mask filling.
# NOTE: all weights are fetched from the Hugging Face hub on first run.
import string

import torch
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    BertForMaskedLM,
    BertTokenizer,
    ElectraForMaskedLM,
    ElectraTokenizer,
    RobertaForMaskedLM,
    RobertaTokenizer,
    XLMRobertaForMaskedLM,
    XLMRobertaTokenizer,
    XLNetLMHeadModel,
    XLNetTokenizer,
)

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()

# BUG FIX: the hub id is 'facebook/bart-large'; bare 'bart-large' does not
# resolve on the Hugging Face hub and raises at load time.
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
bart_model = BartForConditionalGeneration.from_pretrained(
    'facebook/bart-large').eval()

# BUG FIX: the Electra* and Roberta* classes were used below without being
# imported, which raised NameError; they are now imported above.
electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained(
    'google/electra-small-generator').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

# Number of candidate tokens kept per masked position.
top_k = 10
def __init__(self) -> None:
    """Eagerly load all tokenizer/model pairs into ``self.lists``.

    ``self.lists`` maps a model tag (e.g. ``"M-BERT"``) to a dict
    ``{"Tokenizer": ..., "Model": ...}``.  Each pair is also kept on a
    dedicated attribute (e.g. ``self.krbert_tokenizer``) for backward
    compatibility with existing callers.

    NOTE: loading is eager and downloads every checkpoint from the
    Hugging Face hub on first use.
    """
    # One consolidated import replaces the four duplicated
    # ``from transformers import ...`` statements of the original.
    from transformers import (
        AlbertForMaskedLM,
        BertForMaskedLM,
        BertTokenizerFast,
        ElectraForQuestionAnswering,
        ElectraModel,
        ElectraTokenizerFast,
        EncoderDecoderModel,
        GPT2LMHeadModel,
        XLMRobertaForMaskedLM,
        XLMRobertaTokenizerFast,
    )

    self.lists = {}

    def _register(tag, tokenizer, model, banner):
        # Store the pair and print the load banner exactly as the
        # original copy-pasted sections did.
        self.lists[tag] = {"Tokenizer": tokenizer, "Model": model}
        print("====================================")
        print(banner)
        print("====================================")

    # M-BERT (multilingual)
    self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual-cased')
    self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
        'bert-base-multilingual-cased').eval()
    _register("M-BERT", self.bert_multilingual_tokenizer,
              self.bert_multilingual_model,
              "[BERT] Google Multilingual BERT loaded")

    # KR-BERT (registered under the tag "KR-Medium", matching the checkpoint)
    self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
        'snunlp/KR-Medium')
    self.krbert_model = BertForMaskedLM.from_pretrained(
        'snunlp/KR-Medium').eval()
    _register("KR-Medium", self.krbert_tokenizer, self.krbert_model,
              "[BERT] KR-BERT loaded")

    # BERT-kor-base
    self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/bert-kor-base')
    self.bert_kor_model = BertForMaskedLM.from_pretrained(
        'kykim/bert-kor-base').eval()
    _register("bert-kor-base", self.bert_kor_tokenizer, self.bert_kor_model,
              "[BERT] BERT-kor-base loaded")

    # ALBERT-kor-base (the original pairs it with BertTokenizerFast — kept)
    self.albert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained(
        'kykim/albert-kor-base').eval()
    _register("albert-kor-base", self.albert_tokenizer, self.albert_model,
              "[BERT] ALBERT-kor-base loaded")

    # XLM-RoBERTa
    self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
        'xlm-roberta-base')
    self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
        'xlm-roberta-base').eval()
    _register("xlm-roberta-base", self.xlmroberta_tokenizer,
              self.xlmroberta_model, "[BERT] XLM-Roberta-kor loaded")

    # Seq2seq BERT-shared.
    # NOTE(review): the original did not call .eval() on this or any of the
    # following models — preserved as-is; confirm whether that was intended.
    self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
        "kykim/bertshared-kor-base")
    self.bertshared_model = EncoderDecoderModel.from_pretrained(
        "kykim/bertshared-kor-base")
    _register("bertshared-kor-base", self.tokenizer_bertshared,
              self.bertshared_model,
              "[Seq2seq + BERT] bertshared-kor-base loaded")

    # gpt3-kor-small_based_on_gpt2 (GPT-2 architecture, BERT tokenizer)
    self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    _register("gpt3-kor-small_based_on_gpt2", self.tokenizer_gpt3,
              self.model_gpt3, "[GPT3] gpt3-small-based-on-gpt2 loaded")

    # electra-kor-base (bare encoder, no task head)
    self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
        "kykim/electra-kor-base")
    self.electra_model = ElectraModel.from_pretrained(
        "kykim/electra-kor-base")
    _register("electra-kor-base", self.tokenizer_electra, self.electra_model,
              "[ELECTRA] electra-kor-base loaded")

    # KoELECTRA fine-tuned for extractive QA (KorQuAD)
    self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    _register("electra-kor-QA", self.electra_tokenizer_QA,
              self.electra_model_QA,
              "[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
# Fine-tune XLM-RoBERTa with masked-language-modeling on Weibo posts
# converted to one-post-per-line text.
# NOTE(review): this chunk is truncated inside the TrainingArguments(...)
# call — the remaining arguments and the Trainer setup are not in view.
if __name__ == '__main__':
    # Input corpus and output locations (cluster-specific absolute paths).
    line_by_line_f = '/rwproject/kdd-db/20-rayw1/data/line_by_line_post.txt'
    model_in = 'xlm-roberta-base'
    config_tag = '-post'
    model_out = '/rwproject/kdd-db/20-rayw1/language_models/' + model_in + config_tag
    output_dir = '/rwproject/kdd-db/20-rayw1/language_models/output' + config_tag
    # Flatten the Weibo JSON dump into the line-by-line text file consumed
    # below (helper defined elsewhere in this project).
    convert_weibo_text_into_line_by_line(
        weibo_dir='/rwproject/kdd-db/20-rayw1/rumdect/weibo_json',
        line_by_line_f=line_by_line_f)
    print('Loading models...')
    # Tokenizer is always the base checkpoint; the model may be a
    # previously fine-tuned checkpoint (model_in).
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaForMaskedLM.from_pretrained(model_in, return_dict=True)
    print('Loading dataset...')
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=line_by_line_f,
        block_size=128,  # max tokens per training example
    )
    # Dynamic masking: 15% of tokens masked per batch (BERT-style default).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        # NOTE(review): fragment ends here mid-call; the closing parenthesis
        # and subsequent Trainer code are outside this chunk.
# NOTE(review): the bare `return len(self.data)` below is the tail of a
# method (presumably a Dataset __len__) whose `def` line is outside this
# chunk — kept verbatim.
        return len(self.data)

# Extract tweet embeddings from a fine-tuned XLM-R checkpoint (encoder only).
if __name__ == "__main__":
    batch_size = 10
    # TODO min_num_words = 3, should we use this?
    # tokens_file = "/home/layer6/recsys/unsupervised_data/xlmr_trainvalsubmit_only_tokens.p"  # don't compute them if not in train/val/submit set
    # output_file = "/home/layer6/recsys/embeddings/xlmr/"
    # model_checkpoint = "/home/layer6/recsys/xlm-r/checkpoints/run/checkpoint-1"
    tokens_file = "/home/kevin/Projects/xlm-r/data/xlmr_all_tweet_tokens_leaderboard.p"  # don't compute them if not in train/val/submit set
    output_dir = "/media/kevin/datahdd/data/embeddings"
    model_checkpoint = "/home/kevin/Projects/xlm-r/out/checkpoint-500"
    model = XLMRobertaForMaskedLM.from_pretrained(model_checkpoint)
    # Keep only the encoder submodule — embeddings are taken from hidden
    # states, so the MLM head is dropped.
    model = model.roberta
    tokenizer = FakeTokenizer()  # project-local stand-in; defined elsewhere

    def collate(batch):
        # Each batch item looks like (token_tensor, id); pad token tensors
        # to the longest sequence in the batch and build the attention mask
        # from non-pad positions.  Also returns original lengths as a
        # (batch, 1) tensor.
        tokens = [b[0] for b in batch]
        lens = [len(x) for x in tokens]
        tokens = pad_sequence(tokens, batch_first=True,
                              padding_value=tokenizer.pad_token_id)
        attention_mask = (tokens != tokenizer.pad_token_id).int()
        return tokens, attention_mask, [b[1] for b in batch], torch.tensor(lens).unsqueeze(1)

    def mean_emb_no_pad(H, L):
        # Mean-pool hidden states H excluding padding (L holds lengths).
        # NOTE(review): assumes H is (batch, seq, dim) — TODO confirm; the
        # function body is truncated here in this chunk.
        mask = torch.arange(H.shape[1]).repeat(H.shape[0], 1)
'--per_gpu_train_batch_size', '2', # 32GB gpu with fp32 '--gradient_accumulation_steps', '32', '--evaluate_during_training', '--do_train', '--do_eval', ]) training_args.val_datapath = 'wikitext-103-raw/wiki.valid.raw' training_args.train_datapath = 'wikitext-103-raw/wiki.train.raw' # Choose GPU import os os.environ["CUDA_VISIBLE_DEVICES"] = "0" xlmr_base = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base') xlmr_base_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base') logger.info('Evaluating xlm-roberta-base (seqlen: 512) for refernece ...') pretrain_and_evaluate(training_args, xlmr_base, xlmr_base_tokenizer, eval_only=True, model_path=None) model_path = f'{training_args.output_dir}/roberta-base-{model_args.max_pos}' if not os.path.exists(model_path): os.makedirs(model_path) logger.info(f'Converting roberta-base into roberta-base-{model_args.max_pos}') model, tokenizer = create_long_model( save_model_to=model_path,