def from_pretrained(cls, model_name: str):
    return cls(
        RobertaForMaskedLM.from_pretrained(
            model_name,
            output_attentions=True,
            output_hidden_states=True,
            output_additional_info=True,
        ),
        RobertaAligner.from_pretrained(model_name),
    )
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                            sequence_labels, token_labels, choice_labels):
    model = RobertaForMaskedLM(config=config)
    model.eval()
    loss, prediction_scores = model(input_ids, attention_mask=input_mask,
                                    token_type_ids=token_type_ids, masked_lm_labels=token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
def main():
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForMaskedLM.from_pretrained('roberta-base')

    # train_data_file = "D:\\Work\\sandbox\\data\\train_citation.jsonl"
    train_data_file = "D:\\Work\\sandbox\\data\\train_scierc.jsonl"
    # train_data_file = "D:\\Work\\sandbox\\data\\train_chemprot.jsonl"
    train_data = datasets.load_dataset("json", data_files=train_data_file)["train"]

    def tokenization(batched_text):
        tokenized_batch = tokenizer(batched_text['text'], padding=True, truncation=True,
                                    return_special_tokens_mask=True)
        return tokenized_batch

    train_data = train_data.map(tokenization, batched=True, batch_size=len(train_data),
                                remove_columns=["text", "label"])
    train_data.set_format('torch', columns=['input_ids'])

    # static masking. Possible TODO: use dynamic masking
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="./tapt_scierc",
        overwrite_output_dir=True,
        num_train_epochs=100,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        learning_rate=1e-4,
        # lr_scheduler_type="constant",
        adam_epsilon=1e-6,
        adam_beta1=0.9,
        adam_beta2=0.98,
        weight_decay=0.01,
        warmup_ratio=0.06,
        fp16=True,
        eval_accumulation_steps=20,
        save_steps=5000,
        save_total_limit=2,
        seed=2)

    trainer = Trainer(model=model, args=training_args, data_collator=data_collator,
                      train_dataset=train_data)
    trainer.train()
    trainer.save_model("./tapt_scierc")
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    fvoc = open(vocf)
    vlen = len(fvoc.readlines())
    fvoc.close()
    config = RobertaConfig(vocab_size=vlen, max_position_embeddings=12, num_attention_heads=12,
                           num_hidden_layers=6, type_vocab_size=1, hidden_size=768)
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()
    dataset = tokDataset(data_df, ttk)
    # Data = DataLoader(dataset, batch_size=bs, shuffle=True, drop_last=False, num_workers=0, collate_fn=collate_fn)
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=ttk, mlm=True, mlm_probability=0.15
    # )
    data_collator = collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=outmodel,  # embedding model path
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=bs,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done: ')
def test_inference_masked_lm(self):
    model = RobertaForMaskedLM.from_pretrained("roberta-base")

    input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11, 50265))
    self.assertEqual(output.shape, expected_shape)
    # compare the actual values for a slice.
    expected_slice = torch.Tensor(
        [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
def __init__(self, args):
    # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    if args.model_path is not None:
        print("Testing CoLAKE...")
        print('loading model parameters from {}...'.format(args.model_path))
        config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
        self.model = RobertaForMaskedLM(config=config)
        states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
        self.model.load_state_dict(states_dict, strict=False)
    else:
        print("Testing RoBERTa baseline...")
        self.model = RobertaForMaskedLM.from_pretrained('roberta-base')

    self._build_vocab()
    self._init_inverse_vocab()
    self._model_device = 'cpu'
    self.max_sentence_length = args.max_sentence_length
def __init__(self, cfg, device):
    super().__init__()
    tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
    _config = RobertaConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=512,
        num_hidden_layers=4,
        num_attention_heads=8,
        max_position_embeddings=256,
        pad_token_id=1,
        eos_token_id=0,
        bos_token_id=2,
        output_attentions=False,
        output_hidden_states=False
    )
    _model = RobertaForMaskedLM(_config)
    _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
    _model.eval()

    self.tokenizer = tokenizer
    self._model = _model
    self.device = device
    self.pad_token = 0
    self.batch_size = cfg.batch_size
    self.proj = None
    if cfg.proj_lang:
        self.proj = nn.Sequential(*[EqualisedLinearLayer(512, cfg.latent_dim, weight_scaling=cfg.weight_scaling),
                                    nn.Tanh()])
def __init__(self, train_file_path: str, dev_file_path: str, test_file_path: str, lm_file_path: str,
             train_batch_size: int, test_batch_size: int, lr: float, lm_weights_file_path: str,
             epochs: int, lm_pretrain: str, task: int, train_scratch: str, model_path: str,
             joke_classification_path: str, add_joke_model: str, word2vec: str):
    '''
    :param train_file_path: Path to the train file
    :param test_file_path: Path to the test file
    :param train_batch_size: Size of the batch during training
    :param test_batch_size: Size of the batch during testing
    :param lr: learning rate
    '''
    super(RBERT, self).__init__()
    self.bert_model = RobertaForMaskedLM.from_pretrained('roberta-base', output_hidden_states=True)
    if lm_pretrain != 'true':
        pass
        # self.load_joke_lm_weights(lm_weights_file_path)
    self.train_batch_size = train_batch_size
    self.test_batch_size = test_batch_size
    self.train_file_path = train_file_path
    self.lm_file_path = lm_file_path
    # self.lstm = nn.LSTM(768*2, 768*2, bidirectional=False)
    self.attention = nn_nlp.Attention(768 * 2)
    self.word2vec = word2vec
    self.dev_file_path = dev_file_path
    self.test_file_path = test_file_path
    self.joke_classification_path = joke_classification_path
    self.lr = lr
    self.task = task
    if word2vec == 'true':
        self.gensim_model = gensim.models.KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin.gz', binary=True)
    else:
        self.gensim_model = None
    self.prelu = nn.PReLU()
    self.add_joke_model = add_joke_model
    self.epochs = epochs
    self.linear_joke = nn.Sequential(nn.Dropout(0.3), nn.Linear(768, 2))
    self.linear_reg1 = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(768 * 8, 1024))
    if self.task:
        self.final_linear = nn.Sequential(nn.Dropout(0.3), nn.Linear(1024, 1))
    else:
        self.final_linear = nn.Sequential(nn.Dropout(0.3), nn.Linear(100, 2))
    if train_scratch == 'true':
        self.load_state_dict(torch.load(model_path))
def make_model_and_tok(gpuid):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    _ = model.eval()
    _ = model.to(gpuid)
    for param in model.parameters():
        param.requires_grad = False
    pred_model = model.roberta
    enco_model = pred_model.embeddings.word_embeddings
    return (model, enco_model, pred_model, tokenizer)
def create_and_check_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_long_model(save_model_to, attention_window, max_pos):
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=max_pos)
    config = model.config
    # pdb.set_trace()

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)
    model.roberta.embeddings.register_buffer(
        "position_ids",
        torch.arange(config.max_position_embeddings).expand((1, -1)),
    )

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
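# A minimal usage sketch for create_long_model above (not part of the original snippet);
# the output path, attention window, and maximum length below are assumed values for illustration.
long_model, long_tokenizer = create_long_model(
    save_model_to='roberta-base-4096',  # assumed output directory
    attention_window=512,               # assumed per-layer sliding-window size
    max_pos=4096,                       # assumed new maximum sequence length
)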
def init_model(model_name: str, device: torch.device, cache_dir):
    """
    Initialize a pre-trained LM
    :param model_name: from MODEL_CLASSES
    :param device: CUDA / CPU device
    :return: the model and tokenizer
    """
    logger.info(f'Initializing {model_name}')
    tokenizer = RobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = RobertaForMaskedLM.from_pretrained(model_name, cache_dir=cache_dir)
    model.to(device)
    model.eval()
    return model, tokenizer
def main(args):
    data = np.load(args.data, allow_pickle=True)

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path, max_len=512,
                                        mask_token="<mask>", pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
def __init__(self, config):
    super().__init__()
    self.train_config = config
    self.roberta = RobertaForMaskedLM.from_pretrained('roberta-base')
    _ = self.roberta.eval()
    for param in self.roberta.parameters():
        param.requires_grad = False
    self.pred_model = self.roberta.roberta
    self.enc_model = self.pred_model.embeddings.word_embeddings
    self.proj_head = DVProjectionHead_EmbActi()
    self.lossfunc = nn.BCEWithLogitsLoss()
    self.acc = Accuracy(threshold=0.0)
    self.f1 = F1(threshold=0.0)
def __init__(self, model_path='roberta-base', temperature=1.0, top_k=None, top_p=None, device='cuda'):
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    self.model_path = model_path

    self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
    self.model = RobertaForMaskedLM.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
def test_inference_masked_lm(self):
    model = RobertaForMaskedLM.from_pretrained("roberta-base")

    input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11, 50265))
    self.assertEqual(output.shape, expected_shape)
    # compare the actual values for a slice.
    expected_slice = torch.tensor(
        [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
    )

    # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
    # roberta.eval()
    # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()

    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def __init__(
    self,
    tokenizer: HfRoBERTaTextEncoder,
    hparams: HyperOptArgumentParser,
    lm_head: bool = False,
) -> None:
    super().__init__(768 if "base" in hparams.pretrained_model else 1024, tokenizer)
    self._n_layers = 13 if "base" in hparams.pretrained_model else 25
    self.padding_idx = self.tokenizer.padding_index

    if not lm_head:
        self.model = RobertaModel.from_pretrained(
            hparams.pretrained_model, output_hidden_states=True
        )
    else:
        mlm_model = RobertaForMaskedLM.from_pretrained(
            hparams.pretrained_model, output_hidden_states=True
        )
        self.model = mlm_model.roberta
        self.lm_head = mlm_model.lm_head
def main(args):
    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                            sequence_labels, token_labels, choice_labels):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertListEqual(
        list(result["logits"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
def test_tokenize(self):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    vocab_path = os.path.join(current_dir, 'data', 'vocab.txt')
    tokenized_smiles = [
        12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16, 18, 23,
        181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 38, 23, 18, 17, 33, 17,
        19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23, 20, 23, 22, 16, 23,
        22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 13
    ]
    model = RobertaForMaskedLM.from_pretrained('seyonec/SMILES_tokenized_PubChem_shard00_50k')
    model.num_parameters()
    tokenizer = SmilesTokenizer(vocab_path, max_len=model.config.max_position_embeddings)
    assert tokenized_smiles == tokenizer.encode(
        "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
    )
def test_sequence():
    result_dir = "../results/"
    tok_dir = "tokenizer_model/"
    tokenizer = train.get_tok(tok_dir)
    csv_path = "../utils/test.csv"
    txt_name = "test.txt"
    utils.make_train_txt(csv_path, txt_name)
    dp = DP("../utils/test.csv")
    param = dp.param.to_numpy()[-7:]
    param = param.tolist()
    param.append("probability")
    mod_dir = "transformer_model/checkpoint-33000/"
    model = RobertaForMaskedLM.from_pretrained(mod_dir)
    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=0)
    PD, PB, ill, count = start(fill_mask)
    denorm = dp.denormalize(ill)
    l = []
    for x in range(len(PD)):
        tmp = []
        for j in range(7):
            tmp.append(random.randrange(denorm[x][j][0], denorm[x][j][1] + 1))
        tmp.append(PB[x])
        l.append(tmp)
    df = pd.DataFrame(l, columns=param)
    df.to_csv(result_dir + "result.csv", index=False, encoding="cp949")
def train_mod(txt_dir, tokenizer, model_dir):
    config = RobertaConfig(
        vocab_size=3305,
        max_position_embeddings=1024,
        num_attention_heads=12,
        num_hidden_layers=6,
        output_attentions=True,
        type_vocab_size=1,
    )
    dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=txt_dir, block_size=1024)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    model = RobertaForMaskedLM(config=config)
    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=1000,
        per_gpu_train_batch_size=16,
        save_steps=1000,
        save_total_limit=37,
        prediction_loss_only=True,
    )
    trainer = Trainer(model=model, args=training_args, data_collator=data_collator, train_dataset=dataset)
    trainer.train()
    trainer.save_model(model_dir)
def train(self):
    if self.has_started():
        last_checkpoint = self.get_latest_checkpoint()
        logger.info(f"Resuming training from: {last_checkpoint}")
        model = AutoModelForMaskedLM.from_pretrained(last_checkpoint, config=self.config)
    else:
        model = RobertaForMaskedLM(config=self.config)

    trainer = Trainer(
        model=model,
        args=self.training_args,
        data_collator=self.data_collator,
        train_dataset=self.dataset,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model(f"{self.model_dir}")
    self.upload()
def evaluate(args):
    """
    Args:
        ckpt: model checkpoints.
        hparams_file: the string should end with "hparams.yaml"
    """
    trainer = Trainer(gpus=args.gpus, distributed_backend=args.distributed_backend, deterministic=True)

    # reload test dataloader
    # print(trainer.test())
    print("path_to_model_checkpoint", args.path_to_model_checkpoint)
    # print(BertForQA)
    model = BertForQA.load_from_checkpoint(
        checkpoint_path=args.path_to_model_checkpoint,
        hparams_file=args.path_to_model_hparams_file,
        map_location=None,
        batch_size=args.eval_batch_size,
    )

    mlm_model = RobertaForMaskedLM.from_pretrained(
        './cached_models/roberta_squad1_covidmlm(train_and_dev)_3epoch/')
    model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())
    # mlm_model = RobertaForMaskedLM.from_pretrained('./cached_models/roberta_squad1_2epoch_covidmlm_3epoch/')
    # model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())

    # # evaluate ner
    # model = BertForNERTask.load_from_checkpoint(
    #     checkpoint_path=args.path_to_model_checkpoint,
    #     hparams_file=args.path_to_model_hparams_file,
    #     map_location=None,
    #     batch_size=args.eval_batch_size
    # )

    trainer.test(model=model)
# %%
import torch
import string

from transformers import RobertaTokenizer, RobertaForMaskedLM

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])


def encode(tokenizer, text_sentence, add_special_tokens=True):
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    # if <mask> is the last token, append a "." so that models don't predict punctuation.
    if tokenizer.mask_token == text_sentence.split()[-1]:
        text_sentence += ' .'

    input_ids = torch.tensor([
        tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)
    ])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
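# A minimal sketch (not part of the original snippet) showing how the decode() helper above
# could be used to list top-k candidates for a masked position; `predict_masked` is a
# hypothetical name and the example sentence is an assumption.
def predict_masked(text_sentence, top_clean=5):
    text_sentence = text_sentence.replace('<mask>', roberta_tokenizer.mask_token)
    input_ids = torch.tensor([roberta_tokenizer.encode(text_sentence, add_special_tokens=True)])
    mask_idx = torch.where(input_ids == roberta_tokenizer.mask_token_id)[1].tolist()[0]
    with torch.no_grad():
        logits = roberta_model(input_ids)[0]
    top_ids = logits[0, mask_idx, :].topk(top_k).indices.tolist()
    return decode(roberta_tokenizer, top_ids, top_clean)

# e.g. predict_masked('The capital of France is <mask>.')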
file_in = sys.argv[1]
file_out = sys.argv[2]

all_data_dict = dict()
max_length = 100
tail_hidd_list = list()
# device = "cpu"
device = "cuda"
pretrained_weights = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)
fine_tuned_weight = 'roberta-base'
model = RobertaForMaskedLM.from_pretrained(pretrained_weights, output_hidden_states=True, return_dict=True)
# model.load_state_dict(torch.load(fine_tuned_weight), strict=False)
# model.to(device).half()
model.to(device)
model.eval()

num_samples = 1000000
old = torch.FloatTensor(768)
with open(file_in) as f:
    # data = json.load(f)
    for index, d in tqdm(enumerate(f)):
        if index == 1000000:
            break
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    # model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.
elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM
    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture
elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel
    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    raise ValueError(
        "Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# Prepare model
class tempmodel(nn.Module):

    def __init__(self, roberta, insert_net, delete_net):
        super().__init__()
        self.roberta = roberta
        self.insert_net = insert_net
        self.delete_net = delete_net

roberta = RobertaForMaskedLM.from_pretrained("roberta-base")
insert_net = nn.Linear(768, 3)
delete_net = nn.Linear(768, 3)
roberta.load_state_dict(
    torch.load(os.path.join(args.from_dir, 'bert_model.bin')))
# if args.delete:
#     insert_net.load_state_dict(torch.load(os.path.join(args.from_dir, 'insert_model.bin')))
# else:
from weight_init import weight_init
insert_net.apply(weight_init)
delete_net.apply(weight_init)

# init CTRL code
roberta.roberta.embeddings.word_embeddings.weight[
    50261, :].data = roberta.roberta.embeddings.word_embeddings.weight[
        0, :].data
roberta.roberta.embeddings.word_embeddings.weight[
("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) print(tokenizer.encode("For it is in reality vain to profess")) config = RobertaConfig( vocab_size=52_000, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=6, type_vocab_size=1, ) tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512) model = RobertaForMaskedLM(config=config) print(model.num_parameters()) dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path=PATH + "/kant.txt", block_size=128, ) data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) training_args = TrainingArguments( output_dir=SAVE_MODEL, overwrite_output_dir=True,
def main():
    random.seed(1012)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta = RobertaForMaskedLM.from_pretrained('roberta-base', state_dict=state_dict)

    # # Initializing a RoBERTa configuration
    # config = RobertaConfig.from_pretrained('roberta-base')
    # # Initializing a model from the configuration
    # roberta = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta.load_state_dict(state_dict)

    roberta = HappyROBERTA('roberta-large')

    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/save_step_230400/checkpoint.pt'
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/roberta_base_best_sample_from_sets/checkpoint.pt'
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()
    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(number_of_pairs=100,
                                                                min_length=3,
                                                                max_length=12,
                                                                character_set=chars)

    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    with open("../data/finetune_data/sample_from_sets/test_keys.json", "r") as f:
        test_keys = json.load(f)

    phy_filtered = {}
    for key in test_keys['phy']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in phy_filtered.keys():
            phy_filtered[index] = {}
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][ling_pert][asym_pert]
        elif ling_pert not in phy_filtered[index].keys():
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][ling_pert][asym_pert]
        else:
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][ling_pert][asym_pert]

    # physical_sents = {k: physical_sents[k] for k in ('11', '16')}
    # physical_config = {k: physical_config[k] for k in ('11', '16')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    mat_filtered = {}
    for key in test_keys['mat']:
index = key.split("-")[0] ling_pert = key.split("-")[1] asym_pert = key.split("-")[2] if index not in mat_filtered.keys(): mat_filtered[index] = {} mat_filtered[index][ling_pert] = {} mat_filtered[index][ling_pert][asym_pert] = material_sents[index][ ling_pert][asym_pert] elif ling_pert not in mat_filtered[index].keys(): mat_filtered[index][ling_pert] = {} mat_filtered[index][ling_pert][asym_pert] = material_sents[index][ ling_pert][asym_pert] else: mat_filtered[index][ling_pert][asym_pert] = material_sents[index][ ling_pert][asym_pert] logger.info("finished reading in material data") output_df = run_pipeline(model=roberta, tokenizer=tokenizer, fictitious_entities=fictitious_entities, sentences=mat_filtered, config=material_config, number_of_entity_trials=number_of_entity_trials, logger=logger) output_df.to_csv( "../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv" .format(number_of_entity_trials), index=False) logger.info("finished saving physical material results") with open("../data/truism_data/social_data_sentences_2.json", "r") as f: social_sents = json.load(f) with open("../data/truism_data/social_data_2.json", "r") as f: social_config = json.load(f) soc_filtered = {} for key in test_keys['soc']: index = key.split("-")[0] ling_pert = key.split("-")[1] asym_pert = key.split("-")[2] if index not in soc_filtered.keys(): soc_filtered[index] = {} soc_filtered[index][ling_pert] = {} soc_filtered[index][ling_pert][asym_pert] = social_sents[index][ ling_pert][asym_pert] elif ling_pert not in soc_filtered[index].keys(): soc_filtered[index][ling_pert] = {} soc_filtered[index][ling_pert][asym_pert] = social_sents[index][ ling_pert][asym_pert] else: soc_filtered[index][ling_pert][asym_pert] = social_sents[index][ ling_pert][asym_pert] logger.info("finished reading in social data") output_df = run_pipeline(model=roberta, tokenizer=tokenizer, fictitious_entities=fictitious_entities, sentences=soc_filtered, config=social_config, number_of_entity_trials=number_of_entity_trials, logger=logger) output_df.to_csv( "../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv" .format(number_of_entity_trials), index=False) logger.info("finished saving physical social results")