def build_dataloader(file: str, task: int, source2idx: Callable, target2idx: Callable,
                     batch_size: int, firstline: bool, collate: Callable) -> Tuple[DataLoader, int]:
    """
    @param file: path to the data file
    @param task: choose the respective task
    @param source2idx: source tokenizer
    @param target2idx: target tokenizer
    @param batch_size: batch size
    @param firstline: whether the first line is a header
    @param collate: collate function for sequence conversion
    @return: dataloader and the file size (number of lines)
    """
    iterdata, num_lines = Tokenizer.prepare_iter(file, firstline=firstline, task=task)
    dataset = IterDataset(iterdata, source2idx=source2idx, target2idx=target2idx, num_lines=num_lines)
    dataloader = DataLoader(dataset, pin_memory=True, batch_size=batch_size,
                            collate_fn=collate, num_workers=8)
    return dataloader, num_lines
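# Usage sketch for build_dataloader (hedged: the path and the `nl2ids`, `tg2ids`,
# and `collate_fn` converters are placeholders for objects built elsewhere in
# this repo, e.g. via Tokenizer/BPE below):
#
#   train_loader, n_lines = build_dataloader("data/train.tsv", task=2,
#                                            source2idx=nl2ids, target2idx=tg2ids,
#                                            batch_size=32, firstline=True,
#                                            collate=collate_fn)
#   for src_batch, tgt_batch in train_loader:
#       ...  # one padded (source, target) mini-batch per iteration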
def train(self):
    # The training result is returned to inform the calling code of the outcome.
    # Values: matching threshold reached (success): 0; otherwise: 1
    # training_result = 1
    train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file, firstline=self.args.firstline, task=2)
    dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file, firstline=self.args.firstline, task=2)
    test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file, firstline=self.args.firstline, task=2)

    saved_epoch = 0
    nepoch_no_imprv = 0
    epoch_start = time.time()
    max_epochs = self.args.max_epochs
    best_dev = -np.inf if self.args.metric == "bleu" else np.inf

    if self.args.tl:
        # 1. Load the pre-trained model from the previous model_dir
        print("INFO: - Load transfer learning models")
        self.load_transferlearning(epoch=-1)
        # 2. Update model_dir to the new one
        if self.args.timestamped_subdir:
            self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
            os.mkdir(os.path.join(self.args.model_dir, sub_folder))
        self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
        # 3. Update the logfile dir
        self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")
        # 4. Save the updated arguments and log file to the new folder
        print("INFO: - Save new argument file")
        SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))

        dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines,
                                                                              self.args.pred_dev_file)
        best_dev = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
        print("INFO: - Transfer learning performance")
        print("         - Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; "
              "Dev speed: %.2f(tokens/s)" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
        self.appendfile("\t- Transfer learning performance")
        self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; "
                        "Dev speed: %.2f(tokens/s)\n" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
        # print("INFO: - Save transfer learning models")
        # self.save_parameters(epoch=0)
        # suppose the transferred model is the best one and save it in the main dir
        self.save_parameters(epoch=-1)
    else:
        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")

    print('Dev metric:', self.args.metric)
    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        stime = time.time()
        train_loss = self.train_batch(train_data, train_numlines)
        print("BONUS: Training time of %.4f" % (time.time() - stime))
        # Save the model
        # print("INFO: - Frequently save models to checkpoint folders")
        # self.save_parameters(epoch=epoch)
        # set the first model as the best one and save it to the main dir
        # Evaluate on the development data
        dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines,
                                                                              self.args.pred_dev_file)
        dev_metric = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
        cond = dev_metric > best_dev if self.args.metric == "bleu" else dev_loss < best_dev
        if cond:
            nepoch_no_imprv = 0
            saved_epoch = epoch
            best_dev = dev_metric
            print("UPDATES: - New improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
            self.appendfile("\t\t- Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; "
                            "Dev speed: %.2f(tokens/s)\n" % (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            print("INFO: - Save best models")
            self.save_parameters(epoch=-1)
            # if dev_string_match >= self.args.matching_threshold:
            #     # TODO: automatically load models to gcp
            #     training_result = 0
            #     break
        else:
            print("UPDATES: - No improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            nepoch_no_imprv += 1
            # Decay learning_rate if no improvement
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            if nepoch_no_imprv >= self.args.patience:
                # Load the current best models
                print("INFO: - Load best models")
                self.load_parameters(epoch=-1)
                test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines,
                                                                                          self.args.pred_test_file)
                print("SUMMARY: - Early stopping after %d epochs without improvement" % nepoch_no_imprv)
                print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; "
                      "Test speed: %.2f(tokens/s)" % (test_loss, test_bleu[0], test_string_match, test_speed))
                self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; "
                                "Test speed: %.2f(tokens/s)\n" %
                                (test_loss, test_bleu[0], test_string_match, test_speed))
                return test_bleu[0]
        epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
        print("INFO: - Training time for %d epochs: %s" % (epoch, epoch_finish))
        print("\t- Remaining time for %d epochs (est.): %s\n" % (max_epochs - epoch, epoch_remain))

    # print("INFO: - Save best models")
    # self.save_parameters(epoch=-1)
    print("INFO: - Load best models")
    self.load_parameters(epoch=-1)
    test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines,
                                                                              self.args.pred_test_file)
    print("SUMMARY: - Completed %d epochs" % max_epochs)
    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
    print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" %
          (test_loss, test_bleu[0], test_string_match, test_speed))
    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
    self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; "
                    "Test speed: %.2f(tokens/s)\n" % (test_loss, test_bleu[0], test_string_match, test_speed))
    return test_bleu[0]
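# Sketch only: the metric-direction logic above ("bleu" is maximized, anything
# else falls back to minimizing dev loss) is repeated for both best_dev and
# cond; a hypothetical helper like this would make the intent explicit:
#
#   def _improved(metric, dev_metric, dev_loss, best_dev):
#       return dev_metric > best_dev if metric == "bleu" else dev_loss < best_dev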
def train(self):
    train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file, firstline=self.args.firstline, task=2)
    dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file, firstline=self.args.firstline, task=2)
    test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file, firstline=self.args.firstline, task=2)

    saved_epoch = 0
    nepoch_no_imprv = 0
    epoch_start = time.time()
    max_epochs = self.args.max_epochs
    # best_dev = -np.inf if self.args.metric == "f1" else np.inf
    best_dev = np.inf if self.args.metric == "loss" else -np.inf

    with open(self.args.log_file, "w") as f:
        f.write("START TRAINING\n")

    if self.args.tl:
        # 1. Load the pre-trained model from the previous model_dir
        print("INFO: - Load transfer learning models")
        self.load_transferlearning(epoch=-1)
        # 2. Update model_dir to the new one
        if self.args.timestamped_subdir:
            self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
            os.mkdir(os.path.join(self.args.model_dir, sub_folder))
        self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
        # 3. Update the logfile dir
        self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
        # 4. Save the updated arguments and log file to the new folder
        print("INFO: - Save new argument file")
        SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))

        dev_loss, dev_metrics, dev_speed = self.evaluate_batch(dev_data, dev_numlines)
        # best_dev = dev_metrics[2] if self.args.metric == "f1" else dev_loss
        best_dev = dev_loss if self.args.metric == "loss" else dev_metrics[2]
        print("INFO: - Transfer learning performance")
        print("         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
              "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
              (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
        print("         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
              (dev_metrics[3], dev_metrics[4]))
        self.appendfile("\t- Transfer learning performance")
        self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                        "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)\n" %
                        (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
        self.appendfile("\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f\n" %
                        (dev_metrics[3], dev_metrics[4]))
        # print("INFO: - Save transfer learning models")
        # self.save_parameters(epoch=0)
        # suppose the transferred model is the best one and save it in the main dir
        self.save_parameters(epoch=-1)

    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        stime = time.time()
        train_loss = self.train_batch(train_data, train_numlines)
        print("BONUS: Training time of %.4f" % (time.time() - stime))
        # Save the model
        # print("INFO: - Frequently save models to checkpoint folders")
        # self.save_parameters(epoch=epoch)
        # Evaluate on the development data
        dev_loss, dev_metrics, dev_speed = self.evaluate_batch(dev_data, dev_numlines)
        # dev_metric = dev_metrics[2] if self.args.metric == "f1" else dev_loss
        dev_metric = dev_loss if self.args.metric == "loss" else dev_metrics[2]
        # cond = dev_metric > best_dev if self.args.metric == "f1" else dev_loss < best_dev
        cond = dev_loss < best_dev if self.args.metric == "loss" else dev_metric > best_dev
        if cond:
            nepoch_no_imprv = 0
            saved_epoch = epoch
            best_dev = dev_metric
            print("UPDATES: - New improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                  "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
            print("         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                  (dev_metrics[3], dev_metrics[4]))
            self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
            self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                            "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)\n" %
                            (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
            self.appendfile("\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f\n" %
                            (dev_metrics[3], dev_metrics[4]))
            print("INFO: - Save best models")
            self.save_parameters(epoch=-1)
        else:
            print("UPDATES: - No improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                  "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
            print("         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                  (dev_metrics[3], dev_metrics[4]))
            nepoch_no_imprv += 1
            # Decay learning_rate if no improvement
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            if nepoch_no_imprv >= self.args.patience:
                # Load the current best models
                print("INFO: - Load best models")
                self.load_parameters(epoch=-1)
                test_loss, test_metrics, test_speed = self.evaluate_batch(test_data, test_numlines)
                print("SUMMARY: - Early stopping after %d epochs without improvement" % nepoch_no_imprv)
                print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                      "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
                      (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
                print("         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
                      (test_metrics[3], test_metrics[4]))
                self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                                "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                                (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
                self.appendfile("\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f\n" %
                                (test_metrics[3], test_metrics[4]))
                return test_metrics
        epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
        print("INFO: - Training time for %d epochs: %s" % (epoch, epoch_finish))
        print("\t- Remaining time for %d epochs (est.): %s\n" % (max_epochs - epoch, epoch_remain))

    print("INFO: - Load best models")
    self.load_parameters(epoch=-1)
    test_loss, test_metrics, test_speed = self.evaluate_batch(test_data, test_numlines)
    print("SUMMARY: - Completed %d epochs" % max_epochs)
    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
    print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
          "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
          (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
    print("         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
          (test_metrics[3], test_metrics[4]))
    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
    self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                    "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                    (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
    self.appendfile("\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f\n" %
                    (test_metrics[3], test_metrics[4]))
    return test_metrics
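# Note (inferred from the print statements above): evaluate_batch returns the
# metrics as a 5-tuple (precision, recall, f1, sep_acc, full_acc); unpacking
# them by name would make the indexing self-documenting, e.g.:
#
#   dev_p, dev_r, dev_f1, dev_sep_acc, dev_full_acc = dev_metrics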
                                    num_labels=num_labels, id2label=tokenizer.i2tw, label2id=tokenizer.tw2i)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           from_tf=bool(".ckpt" in model_name),
                                                           config=config)
# data = JSON.read(args.train_data_file)
# train_dataset = MapDataset(data, source2idx=source2idx, target2idx=source2idx)
# # train_sampler = RandomSampler(train_dataset)
# train_sampler = SequentialSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, pin_memory=True,
#                               batch_size=16, collate_fn=collate_fn)

data_file = "/media/data/review_response/Dev.json"
iterdata, num_lines = Tokenizer.prepare_iter(data_file, firstline=True, task=1)
train_iterdataset = IterDataset(iterdata, source2idx=source2idx, target2idx=lb2ids, num_lines=num_lines)
# bpe=True, special_tokens_func=build_inputs_with_special_tokens)

# avg_scores = {'f': 0., 'p': 0., 'r': 0.}
# rouge_lf = []
# rouge = Rouge()
# for line in iterdata(0, num_lines):
#     scores = rouge.get_scores(line[0], line[1])[0]
#     rouge_lf.append(scores['rouge-l']['f'])
#     break
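# A runnable version of the commented-out ROUGE estimate above, assuming the
# `rouge` pip package and that iterdata yields (hypothesis, reference) pairs:
#
#   from rouge import Rouge
#
#   rouge = Rouge()
#   rouge_lf = []
#   for hyp, ref in iterdata(0, num_lines):
#       scores = rouge.get_scores(hyp, ref)[0]
#       rouge_lf.append(scores['rouge-l']['f'])
#   avg_rouge_lf = sum(rouge_lf) / max(len(rouge_lf), 1)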
    sw_size = len(tokenizer.sw2i)
    tw_size = len(tokenizer.tw2i)
    collate_fn = Tokenizer.collate_fn(pad_id, True)
else:
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    tokenizer = BPE.load(vocab_file)
    tokenizer.add_tokens(sys_tokens)
    nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) is not None else tokenizer.token_to_id(PAD)
    sw_size = tokenizer.get_vocab_size()
    tw_size = tokenizer.get_vocab_size()
    collate_fn = BPE.collate_fn(pad_id, True)

train_data, num_lines = Tokenizer.prepare_iter(filename, firstline=False, task=1)
train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=lb2ids, num_lines=num_lines)
train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

for i, batch in enumerate(train_dataloader):
    # inputs, outputs = batch[0], batch[1]
    nl_tensor, lb_tensor = batch
    nl_len_tensor = (nl_tensor != pad_id).sum(dim=1)
    break

use_selfatt = True
if use_selfatt:
    # use a maximum length 5 times larger than the input length
    nlemb_HPs = [sw_size, 50, None, 0.5, True, True, 1000]
    # nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask
    enc_HPs = ["self_attention", 50, 200, 6, 10, 0.5, "relu", None, False]
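# Quick shape check for the first batch above (sketch; batch_size=8 as
# configured, L = longest sequence in the batch after padding):
#
#   nl_tensor.shape       # torch.Size([8, L]) padded token ids
#   nl_len_tensor.shape   # torch.Size([8])    true lengths (non-pad counts)
#   lb_tensor.shape       # label tensor; layout depends on lb2ids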