def evaluate(classifier_model: BertForMultipleChoice, dataloader: DataLoader, device: torch.device):
    """
    Evaluate the model over every batch produced by ``dataloader``.

    Each batch is read positionally: element 0 is passed to the model as
    ``input_ids``, element 1 as ``attention_mask``, element 2 as
    ``token_type_ids`` and element 3 as ``labels``. All four are moved to
    ``device`` before the forward pass.

    Returns:
        A dict with the keys
        ``"pred_labels"``    -- argmax of the logits along axis 1,
        ``"correct_labels"`` -- the labels collected from the batches,
        ``"logits"``         -- all logits concatenated along axis 0,
        ``"accuracy"``       -- the value returned by ``calc_accuracy``,
        ``"eval_loss"``      -- mean of the per-batch losses.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    classifier_model.eval()

    total_loss = 0
    # Per-batch results are collected in lists and concatenated once at the
    # end; growing a NumPy array inside the loop copies it on every batch.
    logits_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            bert_inputs = {
                "input_ids": batch[0].to(device),
                "attention_mask": batch[1].to(device),
                "token_type_ids": batch[2].to(device),
                "labels": batch[3].to(device),
            }

            # With labels supplied, the first two outputs are (loss, logits).
            loss, logits = classifier_model(**bert_inputs)[:2]

            total_loss += loss.item()
            logits_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(bert_inputs["labels"].detach().cpu().numpy())

    if not logits_chunks:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    preds = np.concatenate(logits_chunks, axis=0)
    correct_labels = np.concatenate(label_chunks, axis=0)

    pred_labels = np.argmax(preds, axis=1)
    accuracy = calc_accuracy(pred_labels, correct_labels)
    eval_loss = total_loss / len(logits_chunks)

    return {
        "pred_labels": pred_labels,
        "correct_labels": correct_labels,
        "logits": preds,
        "accuracy": accuracy,
        "eval_loss": eval_loss,
    }
def test(test_x, test_y):
    """
    Run each of the five fold checkpoints over the test set and collect logits.

    Checkpoints are read from ``../save/<model-name>_fold_<k>.pt`` for
    ``k`` in 0..4, where ``<model-name>`` is the last path component of
    ``conf['model']``.

    Args:
        test_x: inputs handed to ``MyDataset``.
        test_y: labels handed to ``MyDataset``; they are only needed to build
            the dataset and are not used for prediction.

    Returns:
        A list with one entry per fold; each entry is a list holding the
        model's logits (NumPy arrays) for every test example, in loader order.
    """
    test_set = MyDataset(test_x, test_y)
    test_loader = DataLoader(test_set, batch_size=conf['valid_bs'], collate_fn=collate_fn,
                             shuffle=False, num_workers=conf['num_workers'])

    model = BertForMultipleChoice.from_pretrained(conf['model']).to(conf['device'])
    # Make inference mode explicit so dropout is disabled regardless of how
    # the model was constructed.
    model.eval()

    model_name = conf['model'].split('/')[-1]
    predictions = []
    for fold in [0, 1, 2, 3, 4]:  # predict with each of the five trained models in turn
        checkpoint_path = '../save/{}_fold_{}.pt'.format(model_name, fold)
        # map_location lets a checkpoint saved on one device be loaded on another
        # (e.g. trained on GPU, tested on CPU).
        model.load_state_dict(torch.load(checkpoint_path, map_location=conf['device']))

        y_pred = []
        with torch.no_grad():
            tk = tqdm(test_loader, total=len(test_loader), position=0, leave=True, ncols=50)
            # The label element of each batch is not needed for prediction.
            for input_ids, attention_mask, token_type_ids, _ in tk:
                input_ids = input_ids.to(conf['device'])
                attention_mask = attention_mask.to(conf['device'])
                token_type_ids = token_type_ids.to(conf['device'])

                output = model(input_ids, attention_mask, token_type_ids).logits.cpu().numpy()
                y_pred.extend(output)

        predictions.append(y_pred)
    return predictions
def get_bert_model_and_tokenizer(ifModel=True):
    """
    Return a ``(model, tokenizer)`` pair for the 'bert-base-uncased' checkpoint.

    Args:
        ifModel: when truthy, load ``BertForMultipleChoice``; otherwise the
            model slot of the returned pair is ``None``.

    Returns:
        Tuple ``(model_or_None, BertTokenizer)``.
    """
    # Imported lazily so that transformers is only required when this is called.
    from transformers import BertTokenizer, BertForMultipleChoice

    model = BertForMultipleChoice.from_pretrained('bert-base-uncased') if ifModel else None
    return model, BertTokenizer.from_pretrained('bert-base-uncased')
def __init__(self, args):
    """
    Load the pretrained multiple-choice model and the three dataloaders.

    Args:
        args: object exposing ``data_dir``, which is passed to
            ``get_dataloader`` to obtain the train/val/test dataloaders.
    """
    super(Model, self).__init__()

    self.model = BertForMultipleChoice.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

    # get_dataloader returns the loaders in (train, val, test) order.
    (
        self._train_dataloader,
        self._val_dataloader,
        self._test_dataloader,
    ) = get_dataloader(args.data_dir)
def main(batch_size,num_epochs,lr,train_input_dir,dev1_input_dir,result_save_dir):
    """
    Fine-tune a BertForMultipleChoice model, evaluating on dev1 after every epoch.

    For each epoch the model's state dict is saved as ``checkpoint_<epoch>.pt``
    and the dev1 accuracy and the predicted/correct label pairs are written to
    ``result_eval_<epoch>.txt`` and ``labels_eval_<epoch>.txt``. All files go to
    ``result_save_dir``, which is created if it does not exist.

    Args:
        batch_size: training batch size.
        num_epochs: number of passes over the training set.
        lr: learning rate given to AdamW.
        train_input_dir: directory passed to ``create_dataset`` for training data.
        dev1_input_dir: directory passed to ``create_dataset`` for dev1 data.
        result_save_dir: output directory for checkpoints and result files.
    """
    logger.info("seed: {}".format(SEED))
    logger.info("batch_size: {} num_epochs: {} lr: {}".format(batch_size,num_epochs,lr))

    #Create dataloaders.
    logger.info("Create train dataset from {}.".format(train_input_dir))
    train_dataset=create_dataset(train_input_dir,num_examples=-1,num_options=4)

    logger.info("Create dev1 dataloader from {}.".format(dev1_input_dir))
    dev1_dataset=create_dataset(dev1_input_dir,num_examples=-1,num_options=20)
    dev1_dataloader=DataLoader(dev1_dataset,batch_size=4,shuffle=False,drop_last=False)

    #Create a classifier model.
    logger.info("Create a classifier model.")
    classifier_model=BertForMultipleChoice.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
    classifier_model.to(device)

    #Create an optimizer and a scheduler.
    # The training DataLoader below keeps the final partial batch
    # (drop_last=False), so an epoch has ceil(len/batch_size) batches. Floor
    # division undercounts and lets the linear schedule hit lr=0 early.
    # NOTE(review): assumes train() steps the scheduler once per batch -- confirm.
    num_iterations=(len(train_dataset)+batch_size-1)//batch_size
    total_steps=num_iterations*num_epochs

    optimizer=AdamW(classifier_model.parameters(),lr=lr,eps=1e-8)
    scheduler=get_linear_schedule_with_warmup(
        optimizer,num_warmup_steps=0,num_training_steps=total_steps
    )

    #Create a directory to save the results in.
    os.makedirs(result_save_dir,exist_ok=True)

    logger.info("Start model training.")
    for epoch in range(num_epochs):
        logger.info("===== Epoch {}/{} =====".format(epoch+1,num_epochs))

        # A fresh shuffled loader is built for every epoch.
        train_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True,drop_last=False)
        mean_loss=train(classifier_model,optimizer,scheduler,train_dataloader)
        logger.info("Mean loss: {}".format(mean_loss))

        #Save model parameters.
        checkpoint_filepath=os.path.join(result_save_dir,"checkpoint_{}.pt".format(epoch+1))
        torch.save(classifier_model.state_dict(),checkpoint_filepath)

        pred_labels,correct_labels,accuracy=evaluate(classifier_model,dev1_dataloader)
        logger.info("Accuracy: {}".format(accuracy))

        #Save results as text files.
        res_filepath=os.path.join(result_save_dir,"result_eval_{}.txt".format(epoch+1))
        labels_filepath=os.path.join(result_save_dir,"labels_eval_{}.txt".format(epoch+1))

        with open(res_filepath,"w") as w:
            w.write("Accuracy: {}\n".format(accuracy))

        with open(labels_filepath,"w") as w:
            for pred_label,correct_label in zip(pred_labels,correct_labels):
                w.write("{} {}\n".format(pred_label,correct_label))

    logger.info("Finished model training.")
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask,
                                              sequence_labels, token_labels, choice_labels):
    """
    Build a BertForMultipleChoice from ``config`` and check its output shape.

    The inputs get a choices axis inserted at dim 1, repeated
    ``self.num_choices`` times; the logits must then have size
    ``[self.batch_size, self.num_choices]``. The result dict is also handed to
    ``self.check_loss_output``. ``sequence_labels`` and ``token_labels`` are
    accepted but not used here.
    """
    config.num_choices = self.num_choices
    model = BertForMultipleChoice(config=config)
    model.eval()

    def repeat_per_choice(tensor):
        # Insert a new axis at dim 1 and repeat the tensor along it.
        return tensor.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

    loss, logits = model(
        repeat_per_choice(input_ids),
        attention_mask=repeat_per_choice(input_mask),
        token_type_ids=repeat_per_choice(token_type_ids),
        labels=choice_labels,
    )
    result = {"loss": loss, "logits": logits}

    expected_size = [self.batch_size, self.num_choices]
    self.parent.assertListEqual(list(result["logits"].size()), expected_size)
    self.check_loss_output(result)
def train(classifier_model: BertForMultipleChoice, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler.LambdaLR, dataloader: TensorDataset, device: torch.device, logger: logging.Logger, logging_steps: int) -> float:
    """
    Train the model for one pass over ``dataloader``.

    Each batch is read positionally (input_ids, attention_mask,
    token_type_ids, labels) and moved to ``device``. After the backward pass
    the gradient norm is clipped to 1.0, then the optimizer and the scheduler
    each take one step. The step index, loss and current learning rate are
    logged every ``logging_steps`` batches.

    Returns:
        The mean of the per-batch losses.
    """
    input_names = ("input_ids", "attention_mask", "token_type_ids", "labels")

    classifier_model.train()

    steps_done = 0
    loss_sum = 0

    for step, batch in enumerate(dataloader):
        batch = tuple(batch)
        bert_inputs = {
            name: batch[position].to(device)
            for position, name in enumerate(input_names)
        }

        classifier_model.zero_grad()

        # Forward pass; the loss is the first model output.
        loss = classifier_model(**bert_inputs)[0]

        # Backward pass with gradient clipping.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(classifier_model.parameters(), 1.0)

        # Parameter and learning-rate update.
        optimizer.step()
        scheduler.step()

        loss_value = loss.item()
        steps_done += 1
        loss_sum += loss_value

        if step % logging_steps == 0:
            current_lr = optimizer.param_groups[0]["lr"]
            logger.info("Step: {}\tLoss: {}\tlr: {}".format(step, loss_value, current_lr))

    return loss_sum / steps_done
def main(test_input_dir, model_filepath, result_save_dir):
    """
    Evaluate one saved checkpoint on the test set and write the results.

    The accuracy is written to ``result_eval.txt`` and the predicted/correct
    label pairs to ``labels_eval.txt`` inside ``result_save_dir`` (created if
    missing).

    Args:
        test_input_dir: directory passed to ``create_dataset`` for the test data.
        model_filepath: path of the saved state dict to load into the model.
        result_save_dir: output directory for the result files.
    """
    #Create a dataloader.
    test_dataset = create_dataset(test_input_dir, num_examples=-1, num_options=20)
    # drop_last=False so that every test example is scored, including the
    # final batch when the dataset size is not a multiple of the batch size.
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, drop_last=False)

    #Create a classifier model.
    logger.info("Load model parameters from {}.".format(model_filepath))
    classifier_model = BertForMultipleChoice.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    classifier_model.to(device)

    # Map the saved tensors straight onto the device the model lives on, so a
    # checkpoint written on a GPU also loads on a CPU-only host.
    parameters = torch.load(model_filepath, map_location=device)
    classifier_model.load_state_dict(parameters)

    #Create a directory to save the results in.
    os.makedirs(result_save_dir, exist_ok=True)

    logger.info("Start model evaluation.")
    pred_labels, correct_labels, accuracy = evaluate(classifier_model, test_dataloader)
    logger.info("Accuracy: {}".format(accuracy))

    #Save results as text files.
    res_filepath = os.path.join(result_save_dir, "result_eval.txt")
    labels_filepath = os.path.join(result_save_dir, "labels_eval.txt")

    with open(res_filepath, "w") as w:
        w.write("Accuracy: {}\n".format(accuracy))

    with open(labels_filepath, "w") as w:
        for pred_label, correct_label in zip(pred_labels, correct_labels):
            w.write("{} {}\n".format(pred_label, correct_label))

    logger.info("Finished model evaluation.")
def main(test_input_dir,model_dir,test_upper_bound,result_save_dir):
    """
    Evaluate checkpoints 1..``test_upper_bound`` from ``model_dir`` on the test set.

    For checkpoint number ``k`` the file ``checkpoint_<k>.pt`` is loaded, and
    the accuracy and the predicted/correct label pairs are written to
    ``result_test_<k>.txt`` and ``labels_test_<k>.txt`` inside
    ``result_save_dir`` (created if missing).
    """
    logger.info("Seed: {}".format(SEED))

    # Build the test dataloader.
    logger.info("Create test dataloader from {}.".format(test_input_dir))
    test_dataloader = DataLoader(
        create_dataset(test_input_dir, num_examples=-1, num_options=20),
        batch_size=4,
        shuffle=False,
        drop_last=False,
    )

    # Build the classifier; its weights are replaced per checkpoint below.
    logger.info("Create a classifier model.")
    classifier_model = BertForMultipleChoice.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
    classifier_model.to(device)

    # Make sure the output directory exists.
    logger.info("Results will be saved in {}.".format(result_save_dir))
    os.makedirs(result_save_dir, exist_ok=True)

    logger.info("Start model evaluation.")
    for checkpoint_no in range(1, test_upper_bound + 1):
        model_filepath = os.path.join(model_dir, "checkpoint_{}.pt".format(checkpoint_no))
        logger.info("Load model parameters from {}.".format(model_filepath))
        classifier_model.load_state_dict(torch.load(model_filepath, map_location=device))

        pred_labels, correct_labels, accuracy = evaluate(classifier_model, test_dataloader)
        logger.info("Accuracy: {}".format(accuracy))

        # Write the accuracy and the label pairs as text files.
        res_filepath = os.path.join(result_save_dir, "result_test_{}.txt".format(checkpoint_no))
        labels_filepath = os.path.join(result_save_dir, "labels_test_{}.txt".format(checkpoint_no))

        with open(res_filepath, "w") as res_file:
            res_file.write("Accuracy: {}\n".format(accuracy))

        with open(labels_filepath, "w") as labels_file:
            for predicted, expected in zip(pred_labels, correct_labels):
                labels_file.write("{} {}\n".format(predicted, expected))

    logger.info("Finished model evaluation.")
def _prepare_model(self, freeze, task_name='default'):
    """Build the multiple-choice model and place it on the target device.

    Arguments:
        freeze {bool} -- When true, gradients are disabled for every
            parameter under ``model.bert``.

    Keyword Arguments:
        task_name {str} -- Stored in the config as ``finetuning_task``
            (default: {'default'})

    Returns:
        [BertForMultipleChoice] -- The model to train, wrapped in ``DDP`` when
            ``self.local_rank != -1`` or in ``DataParallel`` when
            ``self.n_gpu > 1``.
    """
    # Config and weights share one cache directory keyed by local rank.
    rank_cache_dir = os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE),
        'distributed_{}'.format(self.local_rank),
    )

    config = BertConfig.from_pretrained(
        self.bert_model,
        num_labels=self.num_choices,
        finetuning_task=task_name,
        cache_dir=rank_cache_dir,
    )
    loads_tf_checkpoint = bool(".ckpt" in self.bert_model)
    model = BertForMultipleChoice.from_pretrained(
        self.bert_model,
        from_tf=loads_tf_checkpoint,
        config=config,
        cache_dir=rank_cache_dir,
    )

    if self.fp16:
        model.half()
    model.to(self.device)

    if freeze:
        for encoder_param in model.bert.parameters():
            encoder_param.requires_grad = False

    # Distributed wrapping takes precedence over single-process multi-GPU.
    if self.local_rank != -1:
        return DDP(model)
    if self.n_gpu > 1:
        return torch.nn.DataParallel(model)
    return model
def test_save_load(trained_model, mcqa_dataset, tmpdir):
    """
    Check that a saved model reloads with identical weights and stays usable.

    The trained model is saved into ``tmpdir``, reloaded into a fresh
    ``Model``, compared parameter by parameter, and then the clone is fitted
    for one epoch and asked for probabilities as a smoke test.
    """
    save_dir = str(tmpdir)
    trained_model.save_model(save_dir)

    clone = Model(bert_model="bert-base-uncased", device="cpu")
    saved_config = BertConfig.from_pretrained(save_dir, num_labels=4)
    clone.model = BertForMultipleChoice.from_pretrained(save_dir, config=saved_config)

    # Every reloaded parameter must match the one that was saved.
    parameter_pairs = zip(clone.model.parameters(), trained_model.model.parameters())
    for reloaded, saved in parameter_pairs:
        assert reloaded.data.allclose(saved.data)

    # The reloaded model must still be trainable and able to predict.
    clone.fit(mcqa_dataset.get_dataset(), train_batch_size=1, num_train_epochs=1)
    _ = clone.predict_proba(mcqa_dataset.get_dataset(), eval_batch_size=1)
def create_and_check_for_multiple_choice(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    """
    Build a BertForMultipleChoice from ``config`` and check its logits shape.

    The inputs get a choices axis inserted at dim 1, repeated
    ``self.num_choices`` times; the returned logits must have shape
    ``(self.batch_size, self.num_choices)``. ``sequence_labels`` and
    ``token_labels`` are accepted but not used here.
    """
    config.num_choices = self.num_choices
    model = BertForMultipleChoice(config=config)
    model.to(torch_device)
    model.eval()

    def repeat_per_choice(tensor):
        # Insert a new axis at dim 1 and repeat the tensor along it.
        return tensor.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

    result = model(
        repeat_per_choice(input_ids),
        attention_mask=repeat_per_choice(input_mask),
        token_type_ids=repeat_per_choice(token_type_ids),
        labels=choice_labels,
    )

    expected_shape = (self.batch_size, self.num_choices)
    self.parent.assertEqual(result.logits.shape, expected_shape)
def _get_model(self):
    """
    Load the BERT head matching ``self.task`` into ``self.bert``.

    Supported tasks are 'ner', 'rel_ex', 'seq_clf', 'mc' and 'qa'. Every head
    is loaded from ``self.weight_path`` with ``num_labels=self.num_labels``.
    For 'rel_ex' the token embeddings are additionally resized to
    ``len(self.tokenizer)``.

    Raises:
        NotImplementedError: if ``self.task`` is not one of the supported tasks.
    """
    task = self.task

    if task == 'ner':
        head_class = BertForTokenClassification
    elif task in ('rel_ex', 'seq_clf'):
        # TODO: Add special rel_ex for entity extraction
        head_class = BertForSequenceClassification
    elif task == 'mc':
        head_class = BertForMultipleChoice
    elif task == 'qa':
        head_class = BertForQuestionAnswering
    else:
        raise NotImplementedError(
            f"{self.task} is not an implemented task, use ['ner', 'rel_ex', 'seq_clf', 'mc', 'qa']"
        )

    self.bert = head_class.from_pretrained(self.weight_path, num_labels=self.num_labels)

    if task == 'rel_ex':
        # Resize to account for added {'<e1>', '</e1>', '<e2>', '</e2>'}
        self.bert.resize_token_embeddings(len(self.tokenizer))
def main(args):
    """
    Evaluate a range of saved checkpoints on the test set.

    For every index ``i`` in ``[args.test_index_lower_bound,
    args.test_index_upper_bound)`` the file ``checkpoint_<i>.pt`` is read from
    ``args.result_save_dir``, loaded into the model, and
    ``mf.evaluate_and_save_result`` is called with the paths
    ``result_test_<i>.txt``, ``labels_test_<i>.txt`` and ``logits_test_<i>.txt``
    in that same directory.

    Raises:
        RuntimeError: if a checkpoint file in the range does not exist.
    """
    test_input_dir: str = args.test_input_dir
    bert_model_dir: str = args.bert_model_dir
    result_save_dir: str = args.result_save_dir
    first_index: int = args.test_index_lower_bound
    stop_index: int = args.test_index_upper_bound

    # Build the test dataloader.
    logger.info("{}からテスト用データローダを作成します。".format(test_input_dir))
    test_dataloader = DataLoader(
        mf.create_dataset(test_input_dir, num_examples=-1, num_options=20),
        batch_size=4,
        shuffle=False,
    )

    # Load the pretrained weights; checkpoints overwrite them below.
    logger.info("{}から事前学習済みの重みを読み込みます。".format(bert_model_dir))
    classifier_model = BertForMultipleChoice.from_pretrained(bert_model_dir)
    classifier_model.to(device)

    for index in range(first_index, stop_index):
        checkpoint_filepath = os.path.join(result_save_dir, "checkpoint_{}.pt".format(index))
        logger.info("{}からチェックポイントを読み込みます。".format(checkpoint_filepath))
        if not os.path.exists(checkpoint_filepath):
            raise RuntimeError("チェックポイントが存在しません。")

        classifier_model.load_state_dict(torch.load(checkpoint_filepath, map_location=device))

        mf.evaluate_and_save_result(
            classifier_model,
            test_dataloader,
            os.path.join(result_save_dir, "result_test_{}.txt".format(index)),
            os.path.join(result_save_dir, "labels_test_{}.txt".format(index)),
            os.path.join(result_save_dir, "logits_test_{}.txt".format(index)),
            device,
            logger,
        )
def __init__(self, args, tokenizer):
    """
    Copy the meta-learning hyperparameters from ``args`` and build the model.

    :param args: object exposing ``outer_batch_size``, ``inner_batch_size``,
        ``outer_update_lr``, ``inner_update_lr``, ``inner_update_step``,
        ``inner_update_step_eval`` and ``bert_model``.
    :param tokenizer: tokenizer stored on the instance as ``self.tokenizer``.
    """
    super(MetaLearner, self).__init__()

    # Hyperparameters are copied verbatim from args, in this order.
    copied_fields = (
        "outer_batch_size",
        "inner_batch_size",
        "outer_update_lr",
        "inner_update_lr",
        "inner_update_step",
        "inner_update_step_eval",
        "bert_model",
    )
    for field in copied_fields:
        setattr(self, field, getattr(args, field))

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)

    self.model = BertForMultipleChoice.from_pretrained(self.bert_model)
    self.model.to(self.device)

    # The outer-loop optimizer updates all model parameters.
    self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
    self.tokenizer = tokenizer
    self.model.train()
# Training setup: device flags, tokenizer, data loaders, model, optimizer and
# learning-rate schedule. The names model_path, model_name, batch_size,
# max_length, lr, epochs and accumulation_steps are read here but defined
# outside this excerpt.
use_gpu = torch.cuda.is_available()
# As written this is always False, so the DataParallel branch below never runs.
use_multi_gpu = False and use_gpu
device_ids = [0, 1, 2, 3, 4, 5, 6, 7]

tokenizer = BertTokenizerFast.from_pretrained(model_path)
print('train loader')
train_loader = process('train', tokenizer, batch_size, max_length=max_length)
print('valid loader')
valid_loader = process('valid', tokenizer, batch_size, max_length=max_length)

# Resume from '<model_name>.bin' when it exists; otherwise start from the
# pretrained weights at model_path.
if os.path.exists(f'{model_name}.bin'):
    print('load model')
    # The loaded object is used directly as the model.
    # NOTE(review): presumably written with torch.save(model, ...) -- confirm.
    model = torch.load(f'{model_name}.bin')
else:
    model = BertForMultipleChoice.from_pretrained(model_path)

if use_multi_gpu:
    model = torch.nn.DataParallel(model, device_ids=device_ids)
if use_gpu:
    model.cuda()

optim = AdamW(model.parameters(), lr=lr)
# Scheduler steps: total batches over all epochs divided by the accumulation factor.
num_training_steps = len(train_loader) * epochs // accumulation_steps
# num_warmup_steps = num_training_steps * 0.1 // accumulation_steps
num_warmup_steps = 0
warm_up = get_cosine_schedule_with_warmup(
    optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
def main(config, model_filename):
    """
    Train (optionally) and evaluate a BertForMultipleChoice model from './new_bert'.

    Datasets are loaded from cache files in ``config.cache_dir`` when both the
    train and dev caches exist; otherwise they are built with ``load_data``
    and the caches are written. When ``config.do_train`` is set the model is
    trained first. The weights in ``config.output_dir/model_filename`` are
    then loaded and evaluated on the dev dataloader, and the loss, accuracy
    and F1 scores are printed.

    Args:
        config: settings object; reads ``output_dir``, ``cache_dir``, ``seed``,
            ``data_path``, ``train_batch_size``, ``dev_batch_size``, ``lr``,
            ``do_train``, ``epoch_num``, ``log_dir``, ``print_step`` and ``clip``.
        model_filename: file name of the model weights inside ``config.output_dir``.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(config.output_dir, exist_ok=True)
    os.makedirs(config.cache_dir, exist_ok=True)

    model_file = os.path.join(config.output_dir, model_filename)

    # Prepare the device
    # NOTE(review): GPU index 3 is hard-coded and config.gpu_ids is not used -- confirm.
    gpu_ids = [3]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    # Set Random Seeds
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True

    tokenizer = BertTokenizer.from_pretrained('./new_bert')
    model = BertForMultipleChoice.from_pretrained('./new_bert')

    cache_train_dataset = "cached_dataset_train_Bert_class"
    cache_dev_dataset = "cached_dataset_dev_Bert_class"
    train_cache_path = config.cache_dir + '/' + cache_train_dataset
    dev_cache_path = config.cache_dir + '/' + cache_dev_dataset

    # Both cache files must exist before taking the cached path; checking only
    # the train cache would crash on a missing dev cache.
    if os.path.exists(train_cache_path) and os.path.exists(dev_cache_path):
        logger.info("Loading features from cached file %s", train_cache_path)
        train_dataset = torch.load(train_cache_path)
        dev_dataset = torch.load(dev_cache_path)
    else:
        # load_data also returns a test split, which this function does not use.
        train_dataset, dev_dataset, _ = load_data(
            config.data_path, device, tokenizer, config.cache_dir, 32, 480)
        logger.info("save cached file in %s", config.cache_dir)
        torch.save(train_dataset, train_cache_path)
        torch.save(dev_dataset, dev_cache_path)

    train_sampler = RandomSampler(train_dataset)
    dev_sampler = RandomSampler(dev_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=config.train_batch_size, num_workers=8, pin_memory=False)
    dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler,
                                batch_size=config.dev_batch_size, num_workers=8, pin_memory=False)

    # Parameter groups: names without 'bert' (the task head) get lr 3e-4,
    # names containing 'bert' use the optimizer default (config.lr). Within
    # each, bias and LayerNorm parameters get no weight decay.
    # NOTE(review): names containing 'pooler' are filtered out, so those
    # parameters are never updated by this optimizer -- confirm intended.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and 'bert' not in n],
         'weight_decay': 0.01, 'lr': 3e-4},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay) and 'bert' not in n],
         'weight_decay': 0.0, 'lr': 3e-4},
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and 'bert' in n],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay) and 'bert' in n],
         'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=config.lr, eps=1e-8)
    # Linear schedule with 16000 warmup steps out of 200000 total steps.
    scheduler = get_linear_schedule_with_warmup(optimizer, 16000, 200000)
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_dataloader, dev_dataloader, optimizer, criterion,
              ['0', '1', '2', '3', '4'], model_file, config.log_dir, config.print_step,
              config.clip, device, scheduler)

    # map_location lets weights saved on another device load onto this one.
    model.load_state_dict(torch.load(model_file, map_location=device))

    # The reported "Test" numbers are computed on the dev dataloader.
    test_loss, test_acc, test_report = evaluate(
        model, dev_dataloader, criterion, ['0', '1', '2', '3', '4'], device)
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".format(
        test_loss, test_acc, test_report['macro avg']['f1-score'], test_report['weighted avg']['f1-score']))
def main():
    """
    Run the CommonsenseQA train / eval / predict stages selected by ``FLAGS``.

    NOTE(review): this looks like a partially completed port from a TensorFlow
    Estimator script to a Hugging Face ``BertForMultipleChoice`` model.
    ``bert_config`` is passed to ``model_fn_builder`` although its assignment
    is commented out below, ``run_config`` is never assigned in this function,
    and ``FLAGS`` is accessed both as ``FLAGS["key"]`` and ``FLAGS.key``.
    Confirm these names resolve before relying on this entry point.
    """
    # bert_config = modeling.BertConfig.from_json_file(FLAGS["bert_config_file"])

    # if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    #     raise ValueError(
    #         "Cannot use sequence length %d because the BERT model "
    #         "was only trained up to sequence length %d" %
    #         (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    processor = CommonsenseQAProcessor(split=FLAGS["split"])
    label_list = processor.get_labels()

    # The tokenizer is constructed with no arguments; the original ones are
    # commented out.
    tokenizer = tokenization.FullTokenizer(
        # vocab_file=FLAGS["vocab_file"], do_lower_case=FLAGS["do_lower_case"]
    )

    # The model is loaded here but not referenced again in this function.
    model = BertForMultipleChoice.from_pretrained('bert-base-uncased', return_dict=True)
    # model.train()

    # TODO TPU handling

    train_examples = None
    # Both step counts stay None; they are passed to model_fn_builder as-is.
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS["do_train"]:
        train_examples = processor.get_train_examples(FLAGS["data_dir"])

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS["do_train"]:
        print("train started")
        # train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        # The training examples are converted to features, but the call that
        # would actually train (estimator.train) is commented out below.
        input_ids_batch, token_type_ids_batch, labels_batch = file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS["max_seq_length"], tokenizer)

        # train_input_fn = file_based_input_fn_builder(
        #     input_file=train_file,
        #     seq_length=FLAGS.max_seq_length,
        #     is_training=True,
        #     drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS["do_eval"]:
        print("eval started")
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        # NOTE(review): this call passes five arguments and binds one result,
        # while the training call above passes four and unpacks three -- confirm
        # the helper's signature.
        eval_seq_length = file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            eval_file)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Each metric is logged and written as "key = value", sorted by key.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS["do_predict"]:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        predict_seq_length = file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        # One CSV row per example: the question id, the label at the argmax of
        # the prediction, then every value of the prediction itself.
        test_predictions_file = os.path.join(
            FLAGS.output_dir, "test_results.csv")
        with tf.gfile.GFile(test_predictions_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            for example, prediction in zip(predict_examples, result):
                output_line = ",".join([
                    str(example.qid),
                    str(CommonsenseQAProcessor.LABELS[np.argmax(prediction)])
                ] + [
                    str(class_probability)
                    for class_probability in prediction
                ]) + "\n"
                writer.write(output_line)
def __len__(self):
    """Return the number of entries in ``self.labels``."""
    # NOTE(review): presumably a method of the dataset class whose header
    # precedes this excerpt -- confirm.
    return len(self.labels)


# Wrap the training and validation splits in TextDataset objects.
train_dataset = TextDataset(train, train_label)
test_dataset = TextDataset(val, val_label)


# In[ ]:

# Notebook cell: fetch a single item (index 100) from the training dataset.
train_dataset[100]


# In[ ]:

import torch
from transformers import BertForMultipleChoice, AdamW, get_linear_schedule_with_warmup

# Load the pretrained Chinese BERT with a multiple-choice head and move it to
# the GPU when one is available, otherwise to the CPU.
model = BertForMultipleChoice.from_pretrained('bert-base-chinese')
# device = 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# In[ ]:

# Both loaders use batches of 8 and the shared collate_fn.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
# NOTE(review): the validation loader is shuffled as well -- confirm this is intended.
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
tk = tqdm(test_loader, total=len(test_loader), position=0, leave=True, ncols=50) for idx, (input_ids, attention_mask, token_type_ids, y) in enumerate(tk): input_ids, attention_mask, token_type_ids, y = input_ids.to(conf['device']), attention_mask.to( conf['device']), token_type_ids.to(conf['device']), y.to(conf['device']).long() output = model(input_ids, attention_mask, token_type_ids).logits.cpu().numpy() y_pred.extend(output) predictions += [y_pred] return predictions if __name__ == '__main__': init_seeds(conf['seed']) model = BertForMultipleChoice.from_pretrained(conf['model']).to(conf['device']) # 模型 optimizer = AdamW(model.parameters(), lr=conf['lr'], weight_decay=conf['weight_decay']) # AdamW优化器 if os.path.exists("../save/chinese_wwm_ext_L-12_H-768_A-12_fold_0.pt"): ## test_y全为0 test_x, test_y, q_id = read_valid() predictions = test(test_x, test_y) else: X, y = read_data() # train_X, train_y, test_X, test_y = train_test_split(X, y, test_size=0.3, random_state=44) tokenizer = BertTokenizer.from_pretrained(conf['model']) # 加载bert的分词器 # 交叉验证 folds = StratifiedKFold(n_splits=conf['fold_num'], shuffle=True, random_state=conf['seed']).split(np.arange(len(X)), y) train(folds, model, optimizer) ## test_y全为0
def _dev2_cache_paths(suffix=""):
    """Return the cache file paths of the four feature tensors, in dataset order."""
    names = ("input_ids", "attention_mask", "token_type_ids", "labels")
    return [DEV2_FEATURES_CACHE_DIR + name + suffix + ".pt" for name in names]


def main2(model_filename, model2_filename, result_save_dir):
    """
    Main function

    Conducts test with two models.
    Assumes that the first model is trained with image features
    and the second model is trained with text features only.

    Features for each model are loaded from cache files under
    DEV2_FEATURES_CACHE_DIR when present; otherwise they are built from
    DEV2_JSON_FILENAME and the cache files are written.

    Parameters
    ----------
    model_filename: str
        Filename of the first saved model
    model2_filename: str
        Filename of the second saved model
    result_save_dir: str
        Directory to save the test result in. "result.txt" and "labels.txt"
        are appended to this string directly, so it should end with a path
        separator.
    """
    #Load contexts.
    logger.info("Start loading contexts.")
    context_dict = load_contexts(CANDIDATE_ENTITIES_FILENAME)
    logger.info("Finished loading contexts.")
    logger.info("Number of contexts: {}".format(len(context_dict)))

    #Create models.
    model = BertForMultipleChoice.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    model2 = BertForMultipleChoice.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    if torch.cuda.is_available():
        model.cuda()
        model2.cuda()

    # Without CUDA the models stay on the CPU, so checkpoints written on a GPU
    # must be remapped while loading.
    map_location = None if torch.cuda.is_available() else torch.device("cpu")

    #If there exist cached files for the model parameters, then load them.
    if os.path.exists(model_filename):
        logger.info("Load parameters from {}.".format(model_filename))
        model.load_state_dict(torch.load(model_filename, map_location=map_location))
    if os.path.exists(model2_filename):
        logger.info("Load parameters from {}.".format(model2_filename))
        # The second checkpoint belongs to model2. Loading it into `model`
        # would overwrite the first model and leave model2 untrained.
        model2.load_state_dict(torch.load(model2_filename, map_location=map_location))

    #Test
    #Features with image information, for the first model.
    cache_paths = _dev2_cache_paths()
    #Load cached features if cache files exist.
    if os.path.exists(cache_paths[0]):
        logger.info("Load features from cached files.")
        features = [torch.load(path) for path in cache_paths]
    else:
        logger.info("Start loading examples.")
        logger.info("JSON filename: {}".format(DEV2_JSON_FILENAME))
        examples = load_examples(DEV2_JSON_FILENAME, option_num=20, use_fixed_label=False)
        logger.info("Finished loading examples.")
        logger.info("Number of examples: {}".format(len(examples)))

        logger.info("Start converting examples to features.")
        # NOTE(review): article_dict is not defined in this function;
        # presumably a module-level global -- confirm.
        input_ids, attention_mask, token_type_ids, labels = convert_examples_to_features(
            examples, context_dict, article_dict,
            option_num=20, max_seq_length=512, image_features_length=50)
        logger.info("Finished converting examples to features.")
        features = [input_ids, attention_mask, token_type_ids, labels]

        os.makedirs(DEV2_FEATURES_CACHE_DIR, exist_ok=True)
        for tensor, path in zip(features, cache_paths):
            torch.save(tensor, path)
        logger.info("Saved cache files in {}.".format(DEV2_FEATURES_CACHE_DIR))

    test_dataset = torch.utils.data.TensorDataset(*features)

    #Text-only features, for the second model.
    cache_paths = _dev2_cache_paths("_text_only")
    #Load cached features if cache files exist.
    if os.path.exists(cache_paths[0]):
        logger.info("Load text-only features from cached files.")
        features = [torch.load(path) for path in cache_paths]
    else:
        logger.info("Start loading examples.")
        logger.info("JSON filename: {}".format(DEV2_JSON_FILENAME))
        examples = load_examples(DEV2_JSON_FILENAME, option_num=20, use_fixed_label=False)
        logger.info("Finished loading examples.")
        logger.info("Number of examples: {}".format(len(examples)))

        logger.info("Start converting examples to text-only features.")
        input_ids, attention_mask, token_type_ids, labels = convert_examples_to_features_text_only(
            examples, context_dict, option_num=20, max_seq_length=512)
        logger.info("Finished converting examples to text-only features.")
        features = [input_ids, attention_mask, token_type_ids, labels]

        # The cache directory already exists here: the block above either
        # found files in it or created it.
        for tensor, path in zip(features, cache_paths):
            torch.save(tensor, path)
        logger.info("Saved cache files in {}.".format(DEV2_FEATURES_CACHE_DIR))

    test_dataset2 = torch.utils.data.TensorDataset(*features)

    test_with_two_models(model, model2, test_dataset, test_dataset2,
                         batch_size=4,
                         result_filename=result_save_dir + "result.txt",
                         labels_filename=result_save_dir + "labels.txt")
def main():
    """Fine-tune a multiple-choice BERT model and write a submission CSV.

    Everything is driven by the hard-coded constants at the top of the
    function:

    1. ``fix_seed(SEED)`` is called, then the ``train`` and ``validation``
       splits are loaded with ``load_data``.
    2. ``process`` builds the train/validation dataloaders (called with
       ``threshold=0.8`` -- presumably the split ratio, confirm in
       ``process``) and the test dataloader.
    3. ``BertForMultipleChoice`` is trained with AdamW and a linear schedule
       without warm-up; validation runs after every epoch.
    4. The test set is predicted and the result is written to ``sub.csv``.

    ``NAME`` may be replaced by any checkpoint that
    ``BertForMultipleChoice.from_pretrained`` accepts.  A CUDA device is
    required because the model and all tensors are moved with ``.cuda()``.

    Batches are indexed positionally: 0 = example ids, 1 = input ids,
    2 = token type ids, 3 = attention masks, 4 = labels.
    """
    PATH = 'drive/MyDrive/drive/haihua/data/'
    SEED = 2020
    EPOCHS = 5
    BATCH_SIZE = 16
    MAX_LENGTH = 128
    LEARNING_RATE = 1e-5
    NAME = 'hfl/chinese-bert-wwm'

    fix_seed(SEED)

    train = load_data(PATH, train_test='train')
    test = load_data(PATH, train_test='validation')
    print('train example: context={}, pair={}, label={}'.format(
        train[0].context, train[0].pair, train[0].label))
    print('test example: context={}, pair={}, label={}'.format(
        test[0].context, test[0].pair, test[0].label))
    print('Data loaded!!')
    print('***************************')

    train_dataloader, valid_dataloader = process(
        train, NAME, BATCH_SIZE, MAX_LENGTH, threshold=0.8)
    del train  # the raw examples are no longer needed once batched
    print('train data process done !!')
    print('###########################')

    test_dataloader = process(test, NAME, BATCH_SIZE, MAX_LENGTH)
    del test
    print('test data process done !!')
    print('###########################')

    bert = BertForMultipleChoice.from_pretrained(NAME)
    optimizer = AdamW(bert.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_dataloader) * EPOCHS
    # The learning rate decays linearly over all training steps (no warm-up).
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    bert.cuda()

    for epoch in range(EPOCHS):
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
        print('Training...')
        bert.train()
        start_train = time.time()
        total_train_loss = 0
        # fgm = FGM(bert)  # adversarial training hook, currently disabled

        for step, batch in enumerate(train_dataloader):
            if step % 200 == 0:
                elapsed = format_time(time.time() - start_train)
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            batch_input_ids = batch[1].cuda()
            batch_token_type_ids = batch[2].cuda()
            batch_attention_masks = batch[3].cuda()
            batch_labels = batch[4].cuda()

            outputs = bert(batch_input_ids, batch_attention_masks,
                           batch_token_type_ids, labels=batch_labels)
            # Gradients are cleared after the forward pass and before
            # backward(), so every step uses the current batch only.
            bert.zero_grad()
            outputs.loss.backward()
            torch.nn.utils.clip_grad_norm_(bert.parameters(), 1.0)

            # Disabled FGM adversarial step (original remark: "score down"):
            # fgm.attack()
            # outputs = bert(batch_input_ids, batch_attention_masks,
            #                batch_token_type_ids, labels=batch_labels)
            # loss_adv = outputs.loss
            # loss_adv.backward()
            # fgm.restore()

            del batch_input_ids, batch_token_type_ids, batch_attention_masks, batch_labels
            optimizer.step()
            scheduler.step()
            total_train_loss += outputs.loss.item()

        average_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - start_train)
        print(" Average training CrossEntropyLoss: {0:.2f}".format(
            average_train_loss))
        print(" Training epcoh took: {:}".format(training_time))

        print('Running Validation...')
        bert.eval()
        start_eval = time.time()
        total_eval_loss = 0
        total_eval_f1 = 0

        for step, batch in enumerate(valid_dataloader):
            if step % 200 == 0:
                # Elapsed time is measured from the start of validation; the
                # previous code used start_train and so reported the training
                # time on top of the validation time.
                elapsed = format_time(time.time() - start_eval)
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(valid_dataloader), elapsed))

            batch_input_ids = batch[1].cuda()
            batch_token_type_ids = batch[2].cuda()
            batch_attention_masks = batch[3].cuda()
            batch_labels = batch[4].cuda()

            with torch.no_grad():
                outputs = bert(batch_input_ids, batch_attention_masks,
                               batch_token_type_ids, labels=batch_labels)

            total_eval_loss += outputs.loss.item()
            # NOTE(review): this accumulates flat_accuracy over batches and
            # the total is printed below as "auc score" without averaging --
            # confirm what flat_accuracy returns and whether that is intended.
            total_eval_f1 += flat_accuracy(outputs.logits, batch_labels)
            del batch_input_ids, batch_token_type_ids, batch_attention_masks, batch_labels

        # Computed once after the loop instead of on every batch.
        average_eval_loss = total_eval_loss / len(valid_dataloader)
        validation_time = format_time(time.time() - start_eval)
        print(" Average eval CrossEntropyLoss: {0:.2f}".format(
            average_eval_loss))
        print(" Eval auc score: {0:.2f}".format(total_eval_f1))
        print(' Validation took: {:}'.format(validation_time))

    print('Start predict ...')
    sub_id = []
    predictions = []
    for step, batch in enumerate(test_dataloader):
        batch_ids = batch[0]
        batch_input_ids = batch[1].cuda()
        batch_token_type_ids = batch[2].cuda()
        batch_attention_masks = batch[3].cuda()

        with torch.no_grad():
            outputs = bert(batch_input_ids, batch_attention_masks,
                           batch_token_type_ids)

        ids = batch_ids.tolist()
        logits = outputs.logits.detach().cpu().numpy()
        flat_predictions = np.argmax(logits, axis=1).flatten().tolist()
        sub_id += ids
        predictions += flat_predictions

    def convert_id(x):
        """Return the id as a string left-padded with zeros to six characters."""
        return str(x).rjust(6, '0')

    def convert_label(x):
        """Map a predicted choice index (0-3) to its answer letter."""
        res = ['A', 'B', 'C', 'D']
        return res[x]

    sub = pd.DataFrame()
    sub['id'] = sub_id
    sub['label'] = predictions
    sub['label'] = sub['label'].apply(convert_label)
    # Sort on the numeric ids before they are turned into padded strings.
    sub.sort_values('id', inplace=True)
    sub['id'] = sub['id'].apply(convert_id)
    sub.to_csv('/content/drive/MyDrive/drive/haihua/output/sub.csv',
               index=False)
    print('Everything Done !!')
def main(test_input_dir, im_features_dir, test_upper_bound, result_save_dir):
    """Test every saved checkpoint and write one result file pair per checkpoint.

    Parameters
    ----------
    test_input_dir:
        Directory passed to ``create_dataloader``; it must also contain
        ``options_list.txt``.
    im_features_dir:
        Forwarded unchanged to ``test``.
    test_upper_bound:
        Number of checkpoints to test; ``checkpoint_1.pt`` through
        ``checkpoint_<test_upper_bound>.pt`` are loaded.
    result_save_dir:
        Directory the checkpoints are read from and the
        ``result_test_<n>.txt`` / ``labels_test_<n>.txt`` files are written to.
        It is not created here, so it must already exist.
    """
    pretrained_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

    # Options list.
    logger.info("Load a list of options.")
    options_path = os.path.join(test_input_dir, "options_list.txt")
    test_options = load_options_list(options_path)

    # Dataloader (batch size 4, 20 options per example, original order kept).
    logger.info("Create a test dataloader from {}.".format(test_input_dir))
    test_dataloader = create_dataloader(
        test_input_dir, 4, num_options=20, shuffle=False, drop_last=False)

    # Plain pre-trained BERT encoder.
    logger.info("Load a pre-trained BERT model.")
    bert_model = BertModel.from_pretrained(pretrained_name)
    bert_model.to(device)

    # Multiple-choice model whose weights are replaced per checkpoint below.
    logger.info("Create a BertForMultipleChoice model.")
    bfmc_model = BertForMultipleChoice.from_pretrained(pretrained_name)
    bfmc_model.to(device)

    logger.info("Start test.")
    for checkpoint_no in range(1, test_upper_bound + 1):
        parameters_filepath = os.path.join(
            result_save_dir, "checkpoint_{}.pt".format(checkpoint_no))
        logger.info(
            "Load model parameters from {}.".format(parameters_filepath))

        # Without CUDA the tensors have to be remapped onto the CPU.
        if torch.cuda.is_available():
            state_dict = torch.load(parameters_filepath)
        else:
            state_dict = torch.load(parameters_filepath,
                                    map_location=torch.device("cpu"))
        bfmc_model.load_state_dict(state_dict)

        pred_labels, correct_labels, accuracy = test(
            bert_model, bfmc_model, test_options, im_features_dir,
            test_dataloader)
        logger.info("Accuracy: {}".format(accuracy))

        # Persist the accuracy and the predicted/correct label pairs.
        res_filepath = os.path.join(
            result_save_dir, "result_test_{}.txt".format(checkpoint_no))
        labels_filepath = os.path.join(
            result_save_dir, "labels_test_{}.txt".format(checkpoint_no))
        with open(res_filepath, "w") as w:
            w.write("Accuracy: {}\n".format(accuracy))
        with open(labels_filepath, "w") as w:
            w.writelines(
                "{} {}\n".format(pred_label, correct_label)
                for pred_label, correct_label in zip(pred_labels,
                                                     correct_labels))

    logger.info("Finished model test.")
def train():
    """Fine-tune ``bert-large-uncased`` for multiple choice with early stopping.

    All settings come from the module-level ``hyperparameters`` mapping
    (``seed``, ``max_sent_length``, ``gradient_accumulation_steps``,
    ``num_epoch``, ``train_batch_size``, ``eval_batch_size``,
    ``learning_rate``, ``patience``).  After each epoch the model is scored
    on the training and dev dataloaders with ``evaluate``; training stops
    once the dev accuracy has failed to improve for ``patience`` epochs in a
    row.
    """
    # Check the environment and read the hyperparameters.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device:{} n_gpu:{}".format(device, n_gpu))

    seed = hyperparameters["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    max_seq_length = hyperparameters["max_sent_length"]
    gradient_accumulation_steps = hyperparameters["gradient_accumulation_steps"]
    num_epochs = hyperparameters["num_epoch"]
    # Per-step batch size: the configured size divided by the number of
    # accumulation steps.
    train_batch_size = (hyperparameters["train_batch_size"]
                        // hyperparameters["gradient_accumulation_steps"])

    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased",
                                              do_lower_case=True)
    model = BertForMultipleChoice.from_pretrained("bert-large-uncased")
    model.to(device)

    # Optimizer parameter groups: pooler parameters are left out, and biases
    # and LayerNorm parameters get no weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    named_params = [
        item for item in model.named_parameters() if 'pooler' not in item[0]
    ]
    decayed, undecayed = [], []
    for param_name, param in named_params:
        if any(marker in param_name for marker in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)
    optimizer_grouped_parameters = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': undecayed, 'weight_decay': 0.0},
    ]

    # Load the data.
    train_examples = read_examples('../dataset/train_bert.txt')
    dev_examples = read_examples('../dataset/test_bert.txt')
    nTrain = len(train_examples)
    nDev = len(dev_examples)

    steps_per_epoch = int(nTrain / train_batch_size / gradient_accumulation_steps)
    num_train_optimization_steps = steps_per_epoch * num_epochs

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=hyperparameters["learning_rate"])
    # Linear schedule with the first 10% of the steps used as warm-up.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * num_train_optimization_steps),
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    train_features = convert_examples_to_features(train_examples, tokenizer,
                                                  max_seq_length)
    dev_features = convert_examples_to_features(dev_examples, tokenizer,
                                                max_seq_length)
    train_dataloader = get_train_dataloader(train_features, train_batch_size)
    dev_dataloader = get_eval_dataloader(dev_features,
                                         hyperparameters["eval_batch_size"])
    print("Num of train features:{}".format(nTrain))
    print("Num of dev features:{}".format(nDev))

    best_dev_accuracy = 0
    best_dev_epoch = 0
    no_up = 0  # consecutive epochs without a dev-accuracy improvement

    epoch_tqdm = trange(int(num_epochs), desc="Epoch")
    for epoch in epoch_tqdm:
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            input_ids, label_ids = tuple(t.to(device) for t in batch)
            model_outputs = model(input_ids=input_ids, labels=label_ids)
            loss, logits = model_outputs[:2]
            if gradient_accumulation_steps > 1:
                # Scale so the accumulated gradient matches one full batch.
                loss = loss / gradient_accumulation_steps

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            # Update the weights only once enough micro-batches are in.
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

        train_loss, train_accuracy = evaluate(model, device, train_dataloader,
                                              "Train")
        dev_loss, dev_accuracy = evaluate(model, device, dev_dataloader, "Dev")

        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy
            best_dev_epoch = epoch + 1
            no_up = 0
        else:
            no_up += 1

        report_lines = (
            "\t ***** Eval results (Epoch %s) *****" % str(epoch + 1),
            "\t train_accuracy = %s" % str(train_accuracy),
            "\t dev_accuracy = %s" % str(dev_accuracy),
            "",
            "\t best_dev_accuracy = %s" % str(best_dev_accuracy),
            "\t best_dev_epoch = %s" % str(best_dev_epoch),
            "\t no_up = %s" % str(no_up),
            "",
        )
        for report_line in report_lines:
            tqdm.write(report_line)

        # Early stopping.
        if no_up >= hyperparameters["patience"]:
            epoch_tqdm.close()
            break
def main(test_input_dir, model_dir, example_filepath, count_dir,
         nqis_filepath, ignores_filepath, test_upper_bound, result_save_dir):
    """Evaluate every saved checkpoint and write one result file pair per checkpoint.

    Parameters
    ----------
    test_input_dir:
        Directory the test dataset is created from.
    model_dir:
        Directory holding ``checkpoint_1.pt`` ... ``checkpoint_<test_upper_bound>.pt``.
    example_filepath:
        File passed to ``load_examples``.
    count_dir:
        Forwarded unchanged to ``evaluate``.
    nqis_filepath:
        UTF-8 file with one ``word<TAB>count`` entry per line.
    ignores_filepath:
        UTF-8 file with one entry per line.
    test_upper_bound:
        Number of checkpoints to evaluate.
    result_save_dir:
        Output directory; created if missing.
    """
    logger.info("Seed: {}".format(SEED))

    # Dataloader.
    # NOTE(review): drop_last=True discards a final incomplete batch, so up
    # to three test examples may not be scored -- confirm this is intended.
    logger.info("Create test dataloader from {}.".format(test_input_dir))
    test_dataset = create_dataset(test_input_dir, num_examples=-1,
                                  num_options=20)
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False,
                                 drop_last=True)

    # Classifier model; its weights are replaced per checkpoint below.
    logger.info("Create a classifier model.")
    classifier_model = BertForMultipleChoice.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    classifier_model.to(device)

    # Examples.
    examples = load_examples(example_filepath)

    # nqis: first tab-separated field is the word, second its integer count.
    with open(nqis_filepath, "r", encoding="utf_8") as r:
        nqis_rows = [line.split("\t") for line in r.read().splitlines()]
    nqis = {}
    for fields in nqis_rows:
        nqis[fields[0]] = int(fields[1])

    # ignores: every line of the file, in order.
    with open(ignores_filepath, "r", encoding="utf_8") as r:
        ignores = r.read().splitlines()

    mecab = MeCab.Tagger()

    # Output directory.
    logger.info("Results will be saved in {}.".format(result_save_dir))
    os.makedirs(result_save_dir, exist_ok=True)

    logger.info("Start model evaluation.")
    for checkpoint_no in range(1, test_upper_bound + 1):
        model_filepath = os.path.join(
            model_dir, "checkpoint_{}.pt".format(checkpoint_no))
        logger.info("Load model parameters from {}.".format(model_filepath))
        classifier_model.load_state_dict(
            torch.load(model_filepath, map_location=device))

        pred_labels, correct_labels, accuracy = evaluate(
            classifier_model, test_dataloader, examples, mecab, count_dir,
            nqis, ignores)
        logger.info("Accuracy: {}".format(accuracy))

        # Persist the accuracy and the predicted/correct label pairs.
        res_filepath = os.path.join(
            result_save_dir, "result_test_{}.txt".format(checkpoint_no))
        labels_filepath = os.path.join(
            result_save_dir, "labels_test_{}.txt".format(checkpoint_no))
        with open(res_filepath, "w") as w:
            w.write("Accuracy: {}\n".format(accuracy))
        with open(labels_filepath, "w") as w:
            w.writelines(
                "{} {}\n".format(pred_label, correct_label)
                for pred_label, correct_label in zip(pred_labels,
                                                     correct_labels))

    logger.info("Finished model evaluation.")
# Minimal BertForMultipleChoice example: score two candidate sentences for a
# single question and print the loss and the per-choice scores.
import torch
from transformers import BertTokenizer, BertForMultipleChoice
import logging

logging.basicConfig(level=logging.INFO)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')

choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]

# Wrapping the per-choice encodings in one more list adds the batch
# dimension: batch size 1, 2 choices.
encoded_choices = [tokenizer.encode(sentence) for sentence in choices]
input_ids = torch.tensor([encoded_choices])
# One label per batch entry: index 1 marks the second choice as correct.
labels = torch.tensor([1])

outputs = model(input_ids, labels=labels)
loss = outputs[0]
classification_scores = outputs[1]

print(loss)
print(classification_scores)
def main():
    """Fine-tune and/or evaluate a multiple-choice model with ``Trainer``.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments``
    and ``TrainingArguments`` either from the command line or from a single
    ``.json`` file given as the only argument.  The data comes from
    ``--train_file`` / ``--validation_file`` when given, otherwise from the
    ``swag`` ``regular`` dataset.  Each example is expanded into four
    (context, header + ending) pairs, tokenized, and regrouped per example.

    Raises
    ------
    ValueError
        If the output directory already exists, is not empty, holds no
        checkpoint and ``--overwrite_output_dir`` was not passed; or if
        ``--do_train`` / ``--do_eval`` is requested without the matching
        dataset split.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank)
                    else logging.WARN)

    # Log on each process the small summary.  The two halves used to be
    # concatenated without a separator, gluing the n_gpu value to the next
    # field name.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training
    # and evaluation files (see below) or just provide the name of one of the
    # public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the
    # first column if no column called 'text' is found. You can easily tweak
    # this behavior (see below).
    # In distributed training, the load_dataset function guarantee that only
    # one local process can concurrently download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Take the loader type from the training file when there is one and
        # from the validation file otherwise; reading it unconditionally from
        # train_file crashed on None for evaluation-only runs.
        reference_file = (data_args.train_file
                          if data_args.train_file is not None
                          else data_args.validation_file)
        extension = reference_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    else:
        # Downloading and loading the swag dataset from the hub.
        datasets = load_dataset("swag", "regular")
    # See more about loading any type of standard or custom dataset (from
    # files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # When using your own dataset or a different dataset from swag, you will
    # probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    num_choices = len(ending_names)

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            # A space was missing between "the" and "model" in this message.
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    # Preprocessing the datasets.
    def preprocess_function(examples):
        """Tokenize a batch and regroup the encodings per example."""
        first_sentences = [[context] * num_choices
                           for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names]
            for i, header in enumerate(question_headers)
        ]

        # Flatten out (linear-time; sum(list_of_lists, []) is quadratic).
        first_sentences = [s for group in first_sentences for s in group]
        second_sentences = [s for group in second_sentences for s in group]

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            truncation=True,
            max_length=max_seq_length,
            padding="max_length" if data_args.pad_to_max_length else False,
        )
        # Un-flatten
        return {
            k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
            for k, v in tokenized_examples.items()
        }

    if training_args.do_train:
        # Check before indexing so a missing split yields the explanatory
        # ValueError rather than a bare KeyError.
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator: inputs already padded to max length need no dynamic
    # padding; otherwise pad per batch (to a multiple of 8 under fp16).
    data_collator = (
        default_data_collator
        if data_args.pad_to_max_length
        else DataCollatorForMultipleChoice(
            tokenizer=tokenizer,
            pad_to_multiple_of=8 if training_args.fp16 else None)
    )

    # Metric
    def compute_metrics(eval_predictions):
        """Return the accuracy of the arg-max predictions."""
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {
            "accuracy": (preds == label_ids).astype(np.float32).mean().item()
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None
                             else len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()
        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None
                           else len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
def main(batch_size, num_epochs, lr, train_input_dir, dev1_input_dir,
         im_features_dir, result_save_dir):
    """Train a BertForMultipleChoice model and evaluate it after every epoch.

    Parameters
    ----------
    batch_size:
        Batch size of the training dataloader.
    num_epochs:
        Number of training epochs.
    lr:
        Learning rate for AdamW.
    train_input_dir, dev1_input_dir:
        Input directories; each must also contain ``options_list.txt``.
    im_features_dir:
        Forwarded unchanged to ``train`` and ``evaluate``.
    result_save_dir:
        Output directory (created if missing).  Each epoch ``n`` writes
        ``checkpoint_n.pt``, ``result_eval_n.txt`` and ``labels_eval_n.txt``.
    """
    pretrained_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

    logger.info("batch_size: {} num_epochs: {} lr: {}".format(
        batch_size, num_epochs, lr))

    # Option lists.
    logger.info("Load lists of options.")
    train_options = load_options_list(
        os.path.join(train_input_dir, "options_list.txt"))
    dev1_options = load_options_list(
        os.path.join(dev1_input_dir, "options_list.txt"))

    # Dataloaders: training uses 4 options per example, dev1 uses 20.
    logger.info("Create a training dataloader from {}.".format(train_input_dir))
    train_dataloader = create_dataloader(
        train_input_dir, batch_size, num_options=4, shuffle=True,
        drop_last=True)
    logger.info("Create a dev1 dataloader from {}.".format(dev1_input_dir))
    dev1_dataloader = create_dataloader(
        dev1_input_dir, 4, num_options=20, shuffle=False, drop_last=False)

    # Plain pre-trained BERT encoder.
    logger.info("Load a pre-trained BERT model.")
    bert_model = BertModel.from_pretrained(pretrained_name)
    bert_model.to(device)

    # Multiple-choice model; only its parameters are given to the optimizer.
    logger.info("Create a BertForMultipleChoice model.")
    bfmc_model = BertForMultipleChoice.from_pretrained(pretrained_name)
    bfmc_model.to(device)

    # Optimizer and linear schedule without warm-up.
    optimizer = AdamW(bfmc_model.parameters(), lr=lr, eps=1e-8)
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Output directory.
    os.makedirs(result_save_dir, exist_ok=True)

    logger.info("Start model training.")
    for epoch_no in range(1, num_epochs + 1):
        logger.info("===== Epoch {}/{} =====".format(epoch_no, num_epochs))

        train(bert_model, bfmc_model, train_options, im_features_dir,
              optimizer, scheduler, train_dataloader)
        pred_labels, correct_labels, accuracy = evaluate(
            bert_model, bfmc_model, dev1_options, im_features_dir,
            dev1_dataloader)
        logger.info("Accuracy: {}".format(accuracy))

        # Save the model parameters of this epoch.
        checkpoint_filepath = os.path.join(
            result_save_dir, "checkpoint_{}.pt".format(epoch_no))
        torch.save(bfmc_model.state_dict(), checkpoint_filepath)

        # Save the accuracy and the predicted/correct label pairs.
        res_filepath = os.path.join(
            result_save_dir, "result_eval_{}.txt".format(epoch_no))
        labels_filepath = os.path.join(
            result_save_dir, "labels_eval_{}.txt".format(epoch_no))
        with open(res_filepath, "w") as w:
            w.write("Accuracy: {}\n".format(accuracy))
        with open(labels_filepath, "w") as w:
            w.writelines(
                "{} {}\n".format(pred_label, correct_label)
                for pred_label, correct_label in zip(pred_labels,
                                                     correct_labels))

    logger.info("Finished model training.")
def main(args):
    """Train a BertForMultipleChoice classifier and evaluate it after every epoch.

    ``args`` must provide ``train_input_dir``, ``dev_input_dir``,
    ``bert_model_dir``, ``train_batch_size``, ``num_epochs``, ``lr``,
    ``result_save_dir`` and ``train_logging_steps``.  For each epoch ``e``
    (counted from 0) the function saves ``checkpoint_e.pt`` and passes the
    paths ``result_eval_e.txt``, ``labels_eval_e.txt`` and
    ``logits_eval_e.txt`` to ``mf.evaluate_and_save_result``.
    """
    train_input_dir: str = args.train_input_dir
    dev_input_dir: str = args.dev_input_dir
    bert_model_dir: str = args.bert_model_dir
    train_batch_size: int = args.train_batch_size
    num_epochs: int = args.num_epochs
    lr: float = args.lr
    result_save_dir: str = args.result_save_dir
    train_logging_steps: int = args.train_logging_steps

    for message, value in (
        ("バッチサイズ: {}", train_batch_size),
        ("エポック数: {}", num_epochs),
        ("学習率: {}", lr),
    ):
        logger.info(message.format(value))

    # Training dataset (4 options per example).
    logger.info("{}から訓練用データセットを作成します。".format(train_input_dir))
    train_dataset = mf.create_dataset(train_input_dir, num_examples=-1,
                                      num_options=4)

    # Dev dataset and dataloader (20 options per example).
    logger.info("{}からDev用データローダを作成します。".format(dev_input_dir))
    dev_dataset = mf.create_dataset(dev_input_dir, num_examples=-1,
                                    num_options=20)
    dev_dataloader = DataLoader(dev_dataset, batch_size=4, shuffle=False)

    # Pre-trained weights.
    logger.info("{}から事前学習済みの重みを読み込みます。".format(bert_model_dir))
    classifier_model = BertForMultipleChoice.from_pretrained(bert_model_dir)
    classifier_model.to(device)

    # Create the directory the results are saved in.
    logger.info("結果は{}に保存されます。".format(result_save_dir))
    os.makedirs(result_save_dir, exist_ok=True)

    # NOTE(review): the floor division ignores a final partial batch, while
    # the training DataLoader below does not drop it -- confirm the scheduler
    # step count matches the number of steps mf.train performs.
    num_iterations = len(train_dataset) // train_batch_size
    total_steps = num_iterations * num_epochs

    optimizer = AdamW(classifier_model.parameters(), lr=lr, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    def epoch_path(prefix, epoch_index):
        """Build the path of a per-epoch output file inside result_save_dir."""
        return os.path.join(result_save_dir, prefix.format(epoch_index))

    # Training loop.
    last_epoch = num_epochs - 1
    for epoch in range(num_epochs):
        logger.info("===== Epoch {}/{} =====".format(epoch, last_epoch))

        # Training: a fresh shuffled dataloader is built for every epoch.
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=train_batch_size,
                                      shuffle=True)
        mean_loss = mf.train(
            classifier_model,
            optimizer,
            scheduler,
            train_dataloader,
            device,
            logger,
            train_logging_steps,
        )
        logger.info("訓練時の損失平均値: {}".format(mean_loss))

        # Save a checkpoint.
        checkpoint_filepath = epoch_path("checkpoint_{}.pt", epoch)
        torch.save(classifier_model.state_dict(), checkpoint_filepath)

        # Evaluation.
        mf.evaluate_and_save_result(
            classifier_model,
            dev_dataloader,
            epoch_path("result_eval_{}.txt", epoch),
            epoch_path("labels_eval_{}.txt", epoch),
            epoch_path("logits_eval_{}.txt", epoch),
            device,
            logger,
        )
def main(do_train, train_batch_size, train_epoch_num, model_filename,
         result_save_dir):
    """
    Main function: optionally train the model, then test it on the DEV2 set.

    Features are cached as ``input_ids.pt``, ``attention_mask.pt``,
    ``token_type_ids.pt`` and ``labels.pt`` inside the train / DEV2 cache
    directories; when ``input_ids.pt`` exists the cached tensors are loaded
    instead of being rebuilt from the JSON examples.

    Parameters
    ----------
    do_train: bool
        Runs model training if true.
    train_batch_size: int
        Batch size for model training
    train_epoch_num: int
        Number of epochs for model training
    model_filename: str
        Filename of the saved model.  If the file exists its parameters are
        loaded before training/testing.
    result_save_dir: str
        Directory to save the test result in.  It is concatenated directly
        with ``"result.txt"`` / ``"labels.txt"``, so it has to end with a
        path separator.
    """
    # Load the list of articles.
    logger.info("Start loading the article list.")
    df = pd.read_table(ARTICLE_LIST_FILENAME, header=None)
    logger.info("Finished loading the article list.")

    # Make a dict of articles: article name -> image directory.
    logger.info("Start creating a dict of articles.")
    article_dict = {}
    for row in df.itertuples(name=None):
        # row[0] is the DataFrame index; the file columns start at row[1].
        article_name = row[1]
        dir_1 = row[2]
        dir_2 = row[3]
        image_dir = IMAGE_BASE_DIR + str(dir_1) + "/" + str(dir_2) + "/"
        article_dict[article_name] = image_dir
    logger.info("Finished creating a dict of articles.")

    # Load contexts.
    logger.info("Start loading contexts.")
    context_dict = load_contexts(CANDIDATE_ENTITIES_FILENAME)
    logger.info("Finished loading contexts.")
    logger.info("Number of contexts: {}".format(len(context_dict)))

    # Create a model.
    model = BertForMultipleChoice.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    model_device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    model.to(model_device)

    # If there exists a cached file for the model parameters, then load it.
    if os.path.exists(model_filename):
        logger.info("Load parameters from {}.".format(model_filename))
        # map_location lets a checkpoint saved on a GPU be loaded on a
        # CPU-only machine; without it torch.load fails there.
        model.load_state_dict(
            torch.load(model_filename, map_location=model_device))

    # Load COCO labels: line index -> stripped label text.
    logger.info("Load labels for the COCO dataset.")
    logger.info("Filename: {}".format(COCO_LABEL_LIST_FILENAME))
    label_dict = {}
    with open(COCO_LABEL_LIST_FILENAME, mode="r", encoding="utf-8") as r:
        for index, label in enumerate(r):
            stripped_label = label.strip()
            label_dict[index] = stripped_label
            logger.info("{} {}".format(index, stripped_label))

    if do_train:
        # Train
        # Load cached features if cache files exist.
        if os.path.exists(TRAIN_FEATURES_CACHE_DIR + "input_ids.pt"):
            logger.info("Load features from cached files.")
            input_ids = torch.load(TRAIN_FEATURES_CACHE_DIR + "input_ids.pt")
            attention_mask = torch.load(TRAIN_FEATURES_CACHE_DIR +
                                        "attention_mask.pt")
            token_type_ids = torch.load(TRAIN_FEATURES_CACHE_DIR +
                                        "token_type_ids.pt")
            labels = torch.load(TRAIN_FEATURES_CACHE_DIR + "labels.pt")
        else:
            logger.info("Start loading examples.")
            logger.info("JSON filename: {}".format(TRAIN_JSON_FILENAME))
            examples = load_examples(TRAIN_JSON_FILENAME,
                                     option_num=TRAIN_OPTION_NUM,
                                     use_fixed_label=True)
            logger.info("Finished loading examples.")
            logger.info("Number of examples: {}".format(len(examples)))

            logger.info("Start converting examples to features.")
            # The earlier extra call to convert_examples_to_features was
            # dropped: its result was overwritten by this call before being
            # used, so it only cost time.
            # NOTE(review): option_num is hard-coded to 4 here while the
            # examples are loaded with TRAIN_OPTION_NUM -- confirm they agree.
            input_ids, attention_mask, token_type_ids, labels = convert_examples_to_features_pred_labels(
                examples,
                context_dict,
                article_dict,
                label_dict,
                option_num=4,
                max_seq_length=512)
            logger.info("Finished converting examples to features.")

            os.makedirs(TRAIN_FEATURES_CACHE_DIR, exist_ok=True)
            torch.save(input_ids, TRAIN_FEATURES_CACHE_DIR + "input_ids.pt")
            torch.save(attention_mask,
                       TRAIN_FEATURES_CACHE_DIR + "attention_mask.pt")
            torch.save(token_type_ids,
                       TRAIN_FEATURES_CACHE_DIR + "token_type_ids.pt")
            torch.save(labels, TRAIN_FEATURES_CACHE_DIR + "labels.pt")
            logger.info(
                "Saved cache files in {}.".format(TRAIN_FEATURES_CACHE_DIR))

        train_dataset = torch.utils.data.TensorDataset(
            input_ids, attention_mask, token_type_ids, labels)
        train(model,
              train_dataset,
              batch_size=train_batch_size,
              epoch_num=train_epoch_num,
              model_filename=model_filename)

    # Test
    # Load cached features if cache files exist.
    if os.path.exists(DEV2_FEATURES_CACHE_DIR + "input_ids.pt"):
        logger.info("Load features from cached files.")
        input_ids = torch.load(DEV2_FEATURES_CACHE_DIR + "input_ids.pt")
        attention_mask = torch.load(DEV2_FEATURES_CACHE_DIR +
                                    "attention_mask.pt")
        token_type_ids = torch.load(DEV2_FEATURES_CACHE_DIR +
                                    "token_type_ids.pt")
        labels = torch.load(DEV2_FEATURES_CACHE_DIR + "labels.pt")
    else:
        logger.info("Start loading examples.")
        logger.info("JSON filename: {}".format(DEV2_JSON_FILENAME))
        examples = load_examples(DEV2_JSON_FILENAME,
                                 option_num=20,
                                 use_fixed_label=False)
        logger.info("Finished loading examples.")
        logger.info("Number of examples: {}".format(len(examples)))

        logger.info("Start converting examples to features.")
        input_ids, attention_mask, token_type_ids, labels = convert_examples_to_features_pred_labels(
            examples,
            context_dict,
            article_dict,
            label_dict,
            option_num=20,
            max_seq_length=512)
        logger.info("Finished converting examples to features.")

        os.makedirs(DEV2_FEATURES_CACHE_DIR, exist_ok=True)
        torch.save(input_ids, DEV2_FEATURES_CACHE_DIR + "input_ids.pt")
        torch.save(attention_mask,
                   DEV2_FEATURES_CACHE_DIR + "attention_mask.pt")
        torch.save(token_type_ids,
                   DEV2_FEATURES_CACHE_DIR + "token_type_ids.pt")
        torch.save(labels, DEV2_FEATURES_CACHE_DIR + "labels.pt")
        logger.info("Saved cache files in {}.".format(DEV2_FEATURES_CACHE_DIR))

    test_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask,
                                                  token_type_ids, labels)
    test(model,
         test_dataset,
         batch_size=4,
         result_filename=result_save_dir + "result.txt",
         labels_filename=result_save_dir + "labels.txt")