def fit_and_train(self, train_df, val_df, val_train_df, require_grad):
    NUM_LABELS = 2
    max_value = 0
    best_model = None
    tokenizer = RobertaTokenizer.from_pretrained(pre_trained_model_name, do_lower_case=True)

    trainset = DialogueDataset(train_df, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size=self.batch_size, collate_fn=self.create_mini_batch)
    val_trainset = DialogueDataset(val_train_df, "train", tokenizer=tokenizer)
    val_trainloader = DataLoader(val_trainset, batch_size=self.batch_size, collate_fn=self.create_mini_batch)
    valset = DialogueDataset(val_df, 'test', tokenizer=tokenizer)
    valloader = DataLoader(valset, batch_size=val_batch_size, collate_fn=self.create_mini_batch)

    config = RobertaConfig.from_pretrained(pre_trained_model_name)
    config.num_labels = NUM_LABELS
    config.type_vocab_size = 2  # allow segment (token type) ids for the sentence-pair input
    model = RobertaForSequenceClassification(config)
    # Alternatives kept for reference:
    # model = CustomRobertatModel()
    # model = BertForSequenceClassification.from_pretrained(pre_trained_model_name, num_labels=NUM_LABELS)
    # model = BertForNextSentencePrediction.from_pretrained(pre_trained_model_name)
    if require_grad:
        for param in model.parameters():
            param.requires_grad = True

    model.train()
    if self.gpu:
        model = model.cuda(device)

    for epo in range(self.epoch):
        total = 0
        total_loss = 0
        # optimizer = AdamW(model.parameters(),
        #                   lr=self.lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        #                   eps=1e-8)    # args.adam_epsilon - default is 1e-8.
        optimizer = optim.Adam(model.parameters(), lr=self.lr, betas=(0.9, 0.98),
                               weight_decay=0.01, eps=1e-6)
        # Total number of training steps is number of batches * number of epochs.
        total_steps = len(trainloader) * self.epoch
        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=1000,
                                                    num_training_steps=total_steps)

        for data in trainloader:
            if self.gpu:
                tokens_tensors, segments_tensors, \
                    masks_tensors, labels = [x.type(torch.LongTensor).cuda(device) for x in data]
            else:
                tokens_tensors, segments_tensors, \
                    masks_tensors, labels = [x for x in data]

            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)
            # e.g. (tensor(0.6968, grad_fn=<NllLossBackward>), tensor([[-0.0359, -0.0432]], grad_fn=<AddmmBackward>))
            loss = outputs[0]
            loss.backward()  # compute gradients

            # Clip the norm of the gradients to 1.0.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters using the computed gradients.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # Clear out the gradients (by default they accumulate).
            model.zero_grad()

            total += len(tokens_tensors)
            total_loss += loss.item() * len(tokens_tensors)

            del data, tokens_tensors, segments_tensors, masks_tensors, labels
            print(f'Epoch : {epo+1}/{self.epoch} , Training Loss : {loss}', end='\r')

        self.loss_list.append(total_loss / total)
        print(f'Epoch : {epo+1}/{self.epoch} , Training Loss : {self.loss_list[epo]}', end=',')
        with open(f'./train_loss_{model_type}.txt', 'w') as f:
            for i in self.loss_list:
                f.write(str(i) + '\n')

        # Validation: score every candidate and pick the most probable one per question.
        model.eval()
        number = 0
        ans = []
        with torch.no_grad():
            for data in valloader:
                if self.gpu:
                    tokens_tensors, segments_tensors, masks_tensors, _ = [
                        x.type(torch.LongTensor).cuda(device) if x is not None else None for x in data
                    ]
                else:
                    tokens_tensors, segments_tensors, masks_tensors, _ = [x for x in data]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)
                # e.g. (tensor([[-0.0359, -0.0432]], grad_fn=<AddmmBackward>),)
                values = outputs[0].data[:, 1].tolist()
                ans += values
                print(f'count : {number}', end='\r')
                number += val_batch_size

        count = 0
        val_len = 0
        val_df['prob'] = ans
        groups = val_df.groupby('question')
        for index, data in groups:
            val_len += 1
            if 'candidate_id' in val_df.columns:
                pred_id = data.loc[data['prob'].idxmax(), 'candidate_id']
                if data.loc[data['prob'].idxmax(), 'ans'] == pred_id:
                    count += 1
        val_accu = count / val_len

        if val_accu >= max_value:
            max_value = val_accu
            self.model = model
            best_model = model
            torch.save(model.state_dict(), f'./model/{model_name}_torch_dict')

        self.val_accu_list.append(val_accu)
        print(f'Epoch : {epo+1}/{self.epoch}, Validation Accuracy : {self.val_accu_list[epo]}', end=',')
        with open(f'./val_accu_{model_type}.txt', 'w') as f:
            for i in self.val_accu_list:
                f.write(str(i) + '\n')

    ## Eventually fine-tune the best model with the validation data.
    for epo in range(val_fine_tuned_epo):
        total = 0
        total_loss = 0
        optimizer = AdamW(best_model.parameters(),
                          lr=self.lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                          eps=1e-8)    # args.adam_epsilon - default is 1e-8.
        total_steps = len(val_trainloader) * 1
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=10,
                                                    num_training_steps=total_steps)
        for data in val_trainloader:
            if self.gpu:
                tokens_tensors, segments_tensors, \
                    masks_tensors, labels = [x.type(torch.LongTensor).cuda(device) for x in data]
            else:
                tokens_tensors, segments_tensors, \
                    masks_tensors, labels = [x for x in data]

            outputs = best_model(input_ids=tokens_tensors,
                                 token_type_ids=segments_tensors,
                                 attention_mask=masks_tensors,
                                 labels=labels)
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(best_model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            best_model.zero_grad()

            total += len(tokens_tensors)
            total_loss += loss.item() * len(tokens_tensors)
            del data, tokens_tensors, segments_tensors, masks_tensors, labels

    # Check whether fine-tuning on the validation data helps.
    torch.save(best_model.state_dict(), f'./model/{model_name}_torch_dict_tuned_val')
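# --- Hedged usage sketch (not part of the original code) ---
# The class name and constructor arguments below are assumptions for illustration only.
# fit_and_train() itself only requires that val_df holds one row per (question, candidate)
# pair, so that grouping on 'question' and taking the argmax of 'prob' can be compared
# against the gold 'ans' / 'candidate_id' columns.
#
#   selector = ResponseSelector(batch_size=8, epoch=3, lr=2e-5, gpu=True)  # hypothetical class
#   selector.fit_and_train(train_df, val_df, val_train_df, require_grad=True)
#   # Best checkpoint is written to ./model/{model_name}_torch_dict and the
#   # validation-tuned variant to ./model/{model_name}_torch_dict_tuned_val.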
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        choices=["vlsp_2018_single", "vlsp_2018_NLI_M", "vlsp_2018_QA_M",
                                 "vlsp_2018_NLI_B", "vlsp_2018_QA_B"],
                        help="The name of the task to train.")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--bpe-codes",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the fastBPE codes file.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--do_save_model",
                        default=False,
                        action='store_true',
                        help="Whether to save checkpoints.")
    parser.add_argument("--eval_test",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate).")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare dataloaders.
    processors = {
        "vlsp_2018_single": VLSP_2018_single_Processor,
        "vlsp_2018_NLI_M": VLSP_2018_NLI_M_Processor,
        "vlsp_2018_QA_M": VLSP_2018_QA_M_Processor,
        "vlsp_2018_NLI_B": VLSP_2018_NLI_B_Processor,
        "vlsp_2018_QA_B": VLSP_2018_QA_B_Processor,
    }

    processor = processors[args.task_name]()
    label_list = processor.get_labels()

    bert_config = RobertaConfig.from_pretrained(args.bert_config_file)
    bert_config.num_labels = len(label_list)
    label2id = {}
    id2label = {}
    for (i, label) in enumerate(label_list):
        label2id[label] = i
        id2label[str(i)] = label
    bert_config.label2id = label2id
    bert_config.id2label = id2label

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    bpe = fastBPE(args)
    vocab = Dictionary()
    vocab.add_from_file(args.vocab_file)

    # Training set
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, bpe, vocab)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # Dev set
    dev_examples = processor.get_dev_examples(args.data_dir)
    dev_features = convert_examples_to_features(
        dev_examples, label_list, args.max_seq_length, bpe, vocab)

    all_dev_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_dev_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    all_dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

    dev_data = TensorDataset(all_dev_input_ids, all_dev_input_mask, all_dev_segment_ids, all_dev_label_ids)
    dev_dataloader = DataLoader(dev_data, batch_size=args.eval_batch_size, shuffle=False)

    # Test set
    if args.eval_test:
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, bpe, vocab)

        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False)

    # Model and optimizer
    model = RobertaForSequenceClassification(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # Train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write("epoch\tglobal_step\tloss\tdev_loss\tdev_accuracy\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # RoBERTa does not use token_type_ids.
            loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

        if args.do_save_model:
            if n_gpu > 1:
                torch.save(model.module.state_dict(), os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))
            else:
                torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))

        # Dev eval
        model.eval()
        dev_loss, dev_accuracy = 0, 0
        nb_dev_steps, nb_dev_examples = 0, 0
        with open(os.path.join(args.output_dir, "dev_ep_" + str(epoch) + ".txt"), "w") as f_dev:
            for input_ids, input_mask, segment_ids, label_ids in dev_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_dev_test_loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)

                logits = F.softmax(logits, dim=-1)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                outputs = np.argmax(logits, axis=1)
                for output_i in range(len(outputs)):
                    f_dev.write(str(outputs[output_i]))
                    for ou in logits[output_i]:
                        f_dev.write(" " + str(ou))
                    f_dev.write("\n")
                tmp_dev_accuracy = np.sum(outputs == label_ids)

                dev_loss += tmp_dev_test_loss.mean().item()
                dev_accuracy += tmp_dev_accuracy

                nb_dev_examples += input_ids.size(0)
                nb_dev_steps += 1

        dev_loss = dev_loss / nb_dev_steps
        dev_accuracy = dev_accuracy / nb_dev_examples

        # Test eval
        if args.eval_test:
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(os.path.join(args.output_dir, "test_ep_" + str(epoch) + ".txt"), "w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_test_loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)

                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == label_ids)

                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps,
                      'dev_loss': dev_loss,
                      'dev_accuracy': dev_accuracy,
                      'test_loss': test_loss,
                      'test_accuracy': test_accuracy}
        else:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps}

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")
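# --- Hedged example invocation (not part of the original code) ---
# The script filename and all paths below are placeholders; the flags themselves are the
# ones defined by the argparse block in main() above.
#
#   python run_vlsp_classifier.py \
#       --task_name vlsp_2018_NLI_M \
#       --data_dir ./data \
#       --vocab_file ./pretrained/dict.txt \
#       --bert_config_file ./pretrained/config.json \
#       --bpe-codes ./pretrained/bpe.codes \
#       --init_checkpoint ./pretrained/model_state.bin \
#       --output_dir ./output \
#       --max_seq_length 128 --train_batch_size 32 --learning_rate 5e-5 \
#       --num_train_epochs 3 --eval_test --do_save_model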
def convert_roberta_checkpoint_to_pytorch(
    roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    roberta_sent_encoder = roberta.model.encoder.sentence_encoder
    config = RobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (
            roberta_layer.self_attn.k_proj.weight.data.shape
            == roberta_layer.self_attn.q_proj.weight.data.shape
            == roberta_layer.self_attn.v_proj.weight.data.shape
            == torch.Size((config.hidden_size, config.hidden_size))
        )

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        # end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
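# --- Hedged usage sketch (not part of the original code) ---
# Paths are placeholders; SAMPLE_TEXT must be defined at module level for the sanity check
# inside the function, and classification_head=True assumes the fairseq checkpoint actually
# carries an "mnli" classification head.
#
#   convert_roberta_checkpoint_to_pytorch(
#       roberta_checkpoint_path="./roberta.base",             # fairseq checkpoint directory
#       pytorch_dump_folder_path="./roberta-base-converted",  # written via save_pretrained()
#       classification_head=False,
#   )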