def save_parameters(self, num): """ Save the trained parameters :param num: :return: """ logger.info('saving parameters for doc-rep module on step=%d' % num) save_model(self, num, model_weight_path=self.out_pt_weight_path + '-' + str(num), checkpoint_path=self.out_pt_checkpoint_path)
def save_parameters(self, num): """ Save the trained parameters :param num: :return: """ logger.info('saving parameters for {} module on steps={}'.format( self.model.__class__.__name__, num)) save_model(self, num, model_weight_path=self.out_weight_path + '-' + str(num), checkpoint_path=self.out_checkpoint_path)
def save_parameters(self, num): """ Save the trained parameters :param num: :return: """ logger.info('Saving agent parameters for state module on step=%d' % num) save_model(self, num, model_weight_path=self.out_state_weight_path + '-' + str(num), checkpoint_path=self.out_state_checkpoint_path)
def train(self, model, vocab, train_loader, valid_loader_list, loss_type,
          start_epoch, num_epochs, args, evaluate_every=1000, last_metrics=None,
          early_stop="loss,10", opt_name="adam"):
    """
    Training
    args:
        model: Model object
        train_loader: DataLoader object of the training set
        valid_loader_list: a list of Validation DataLoader objects
        start_epoch: start epoch (> 0 if you resume the process)
        num_epochs: last epoch
        early_stop: "criteria,patience" string, e.g. "loss,10" or "cer,10"
        opt_name: "adam" or "sgd"
    """
    history = []
    best_valid_val = 1000000000
    smoothing = args.label_smoothing
    early_stop_criteria, early_stop_val = early_stop.split(",")[0], int(early_stop.split(",")[1])
    count_stop = 0

    logging.info("name " + args.name)

    if opt_name == "adam":
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    elif opt_name == "sgd":
        opt = torch.optim.SGD(model.parameters(), lr=args.lr)
    else:
        raise ValueError("Unknown optimizer: {}".format(opt_name))

    for epoch in range(start_epoch, num_epochs):
        total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0
        total_time = 0
        start_iter = 0
        final_train_losses = []
        final_train_cers = []

        logging.info("TRAIN")
        print("TRAIN")
        model.train()
        pbar = tqdm(iter(train_loader), leave=True, total=len(train_loader))
        max_len = 0
        for i, (data) in enumerate(pbar, start=start_iter):
            torch.cuda.empty_cache()
            src, trg, src_percentages, src_lengths, trg_lengths = data
            max_len = max(max_len, src.size(-1))
            opt.zero_grad()

            try:
                if args.cuda:
                    src = src.cuda()
                    trg = trg.cuda()

                start_time = time.time()
                loss, cer, num_char = self.train_one_batch(
                    model, vocab, src, trg, src_percentages, src_lengths,
                    trg_lengths, smoothing, loss_type)
                total_cer += cer
                total_char += num_char
                loss.backward()

                if args.clip:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
                opt.step()

                total_loss += loss.item()
                end_time = time.time()
                diff_time = end_time - start_time
                total_time += diff_time

                pbar.set_description(
                    "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f} TOTAL TIME:{:.7f}".format(
                        (epoch + 1), total_loss / (i + 1),
                        total_cer * 100 / total_char, self.get_lr(opt), total_time))
            except Exception as e:
                print(e)
                # del loss
                try:
                    # probably OOM: retry the batch in smaller chunks
                    torch.cuda.empty_cache()
                    src = src.cpu()
                    trg = trg.cpu()
                    src_splits, src_lengths_splits, trg_lengths_splits, trg_splits, src_percentages_splits = \
                        iter(src.split(2, dim=0)), iter(src_lengths.split(2, dim=0)), \
                        iter(trg_lengths.split(2, dim=0)), iter(trg.split(2, dim=0)), \
                        iter(src_percentages.split(2, dim=0))

                    j = 0
                    start_time = time.time()
                    for src, trg, src_lengths, trg_lengths, src_percentages in zip(
                            src_splits, trg_splits, src_lengths_splits,
                            trg_lengths_splits, src_percentages_splits):
                        opt.zero_grad()
                        torch.cuda.empty_cache()
                        if args.cuda:
                            src = src.cuda()
                            trg = trg.cuda()

                        start_time = time.time()
                        loss, cer, num_char = self.train_one_batch(
                            model, vocab, src, trg, src_percentages, src_lengths,
                            trg_lengths, smoothing, loss_type)
                        total_cer += cer
                        total_char += num_char
                        loss.backward()

                        if args.clip:
                            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
                        opt.step()

                        total_loss += loss.item()
                        j += 1

                    end_time = time.time()
                    diff_time = end_time - start_time
                    total_time += diff_time

                    logging.info("probably OOM, autosplit batch. succeeded")
                    print("probably OOM, autosplit batch. succeeded")
                except:
                    logging.info("probably OOM, autosplit batch. skip batch")
                    print("probably OOM, autosplit batch. skip batch")
                    continue

            pbar.set_description(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f} TOTAL TIME:{:.7f}".format(
                    (epoch + 1), total_loss / (i + 1),
                    total_cer * 100 / total_char, self.get_lr(opt), total_time))

        final_train_loss = total_loss / (len(train_loader))
        final_train_cer = total_cer * 100 / total_char
        final_train_losses.append(final_train_loss)
        final_train_cers.append(final_train_cer)

        logging.info(
            "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".format(
                (epoch + 1), final_train_loss, final_train_cer, self.get_lr(opt)))

        # evaluate
        if (epoch + 1) % evaluate_every == 0:
            print("")
            logging.info("VALID")
            model.eval()

            final_valid_losses = []
            final_valid_cers = []
            for ind in range(len(valid_loader_list)):
                valid_loader = valid_loader_list[ind]

                total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
                valid_pbar = tqdm(iter(valid_loader), leave=True, total=len(valid_loader))
                for i, (data) in enumerate(valid_pbar):
                    torch.cuda.empty_cache()
                    src, trg, src_percentages, src_lengths, trg_lengths = data

                    try:
                        if args.cuda:
                            src = src.cuda()
                            trg = trg.cuda()

                        loss, cer, num_char = self.train_one_batch(
                            model, vocab, src, trg, src_percentages, src_lengths,
                            trg_lengths, smoothing, loss_type)
                        total_valid_cer += cer
                        total_valid_char += num_char

                        total_valid_loss += loss.item()
                        valid_pbar.set_description(
                            "VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                                ind, total_valid_loss / (i + 1),
                                total_valid_cer * 100 / total_valid_char))
                        # valid_pbar.set_description("(Epoch {}) VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                        #     (epoch+1), total_valid_loss/(i+1), total_valid_cer*100/total_valid_char, total_valid_wer*100/total_valid_word))
                    except:
                        try:
                            # probably OOM: retry the batch in smaller chunks
                            torch.cuda.empty_cache()
                            src = src.cpu()
                            trg = trg.cpu()
                            src_splits, src_lengths_splits, trg_lengths_splits, trg_splits, src_percentages_splits = \
                                iter(src.split(2, dim=0)), iter(src_lengths.split(2, dim=0)), \
                                iter(trg_lengths.split(2, dim=0)), iter(trg.split(2, dim=0)), \
                                iter(src_percentages.split(2, dim=0))

                            j = 0
                            for src, trg, src_lengths, trg_lengths, src_percentages in zip(
                                    src_splits, trg_splits, src_lengths_splits,
                                    trg_lengths_splits, src_percentages_splits):
                                opt.zero_grad()
                                torch.cuda.empty_cache()
                                if args.cuda:
                                    src = src.cuda()
                                    trg = trg.cuda()

                                loss, cer, num_char = self.train_one_batch(
                                    model, vocab, src, trg, src_percentages,
                                    src_lengths, trg_lengths, smoothing, loss_type)
                                total_valid_cer += cer
                                total_valid_char += num_char

                                if args.clip:
                                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

                                total_valid_loss += loss.item()
                                j += 1
                                valid_pbar.set_description(
                                    "VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                                        ind, total_valid_loss / (i + 1),
                                        total_valid_cer * 100 / total_valid_char))

                            logging.info("probably OOM, autosplit batch. succeeded")
                            print("probably OOM, autosplit batch. succeeded")
                        except:
                            logging.info("probably OOM, autosplit batch. skip batch")
                            print("probably OOM, autosplit batch. skip batch")
                            continue

                final_valid_loss = total_valid_loss / (len(valid_loader))
                final_valid_cer = total_valid_cer * 100 / total_valid_char
                final_valid_losses.append(final_valid_loss)
                final_valid_cers.append(final_valid_cer)

                print("VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                    ind, final_valid_loss, final_valid_cer))
                logging.info("VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                    ind, final_valid_loss, final_valid_cer))

            metrics = {}
            avg_valid_loss = sum(final_valid_losses) / len(final_valid_losses)
            avg_valid_cer = sum(final_valid_cers) / len(final_valid_cers)
            metrics["avg_train_loss"] = sum(final_train_losses) / len(final_train_losses)
            metrics["avg_valid_loss"] = sum(final_valid_losses) / len(final_valid_losses)
            metrics["avg_train_cer"] = sum(final_train_cers) / len(final_train_cers)
            metrics["avg_valid_cer"] = sum(final_valid_cers) / len(final_valid_cers)
            metrics["train_loss"] = final_train_losses
            metrics["valid_loss"] = final_valid_losses
            metrics["train_cer"] = final_train_cers
            metrics["valid_cer"] = final_valid_cers
            metrics["history"] = history
            history.append(metrics)

            print("AVG VALID LOSS:{:.4f} AVG CER:{:.2f}%".format(
                sum(final_valid_losses) / len(final_valid_losses),
                sum(final_valid_cers) / len(final_valid_cers)))
            logging.info("AVG VALID LOSS:{:.4f} AVG CER:{:.2f}%".format(
                sum(final_valid_losses) / len(final_valid_losses),
                sum(final_valid_cers) / len(final_valid_cers)))

            if epoch % args.save_every == 0:
                save_model(model, vocab, (epoch + 1), opt, metrics, args, best_model=False)

            # save the best model and track early stopping
            if early_stop_criteria == "cer":
                print("CRITERIA: CER")
                if best_valid_val > avg_valid_cer:
                    count_stop = 0
                    best_valid_val = avg_valid_cer
                    save_model(model, vocab, (epoch + 1), opt, metrics, args, best_model=True)
                else:
                    print("count_stop:", count_stop)
                    count_stop += 1
            else:
                print("CRITERIA: LOSS")
                if best_valid_val > avg_valid_loss:
                    count_stop = 0
                    best_valid_val = avg_valid_loss
                    save_model(model, vocab, (epoch + 1), opt, metrics, args, best_model=True)
                else:
                    count_stop += 1
                    print("count_stop:", count_stop)

            if count_stop >= early_stop_val:
                logging.info("EARLY STOP")
                print("EARLY STOP\n")
                break
def train(self, model, train_loader, train_sampler, valid_loader_list, opt,
          loss_type, start_epoch, num_epochs, label2id, id2label, last_metrics=None):
    """
    Training
    args:
        model: Model object
        train_loader: DataLoader object of the training set
        valid_loader_list: a list of Validation DataLoader objects
        opt: Optimizer object
        start_epoch: start epoch (> 0 if you resume the process)
        num_epochs: last epoch
        last_metrics: (if resume)
    """
    history = []
    start_time = time.time()
    best_valid_loss = 1000000000 if last_metrics is None else last_metrics['valid_loss']
    smoothing = constant.args.label_smoothing

    logging.info("name " + constant.args.name)

    for epoch in range(start_epoch, num_epochs):
        sys.stdout.flush()
        total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0
        start_iter = 0

        logging.info("TRAIN")
        model.train()
        pbar = tqdm(iter(train_loader), leave=True, total=len(train_loader))
        for i, (data) in enumerate(pbar, start=start_iter):
            src, tgt, src_percentages, src_lengths, tgt_lengths = data

            if constant.USE_CUDA:
                src = src.cuda()
                tgt = tgt.cuda()

            opt.zero_grad()
            pred, gold, hyp_seq, gold_seq = model(src, src_lengths, tgt, verbose=False)

            try:  # handle case for CTC
                strs_gold, strs_hyps = [], []
                for ut_gold in gold_seq:
                    str_gold = ""
                    for x in ut_gold:
                        if int(x) == constant.PAD_TOKEN:
                            break
                        str_gold = str_gold + id2label[int(x)]
                    strs_gold.append(str_gold)
                for ut_hyp in hyp_seq:
                    str_hyp = ""
                    for x in ut_hyp:
                        if int(x) == constant.PAD_TOKEN:
                            break
                        str_hyp = str_hyp + id2label[int(x)]
                    strs_hyps.append(str_hyp)
            except Exception as e:
                print(e)
                logging.info("NaN predictions")
                continue

            seq_length = pred.size(1)
            sizes = Variable(src_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            loss, num_correct = calculate_metrics(
                pred, gold, input_lengths=sizes, target_lengths=tgt_lengths,
                smoothing=smoothing, loss_type=loss_type)

            if loss.item() == float('Inf'):
                logging.info("Found infinity loss, masking")
                loss = torch.where(loss != loss, torch.zeros_like(loss), loss)  # NaN masking
                continue

            # if constant.args.verbose:
            #     logging.info("GOLD", strs_gold)
            #     logging.info("HYP", strs_hyps)

            for j in range(len(strs_hyps)):
                strs_hyps[j] = strs_hyps[j].replace(constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                strs_gold[j] = strs_gold[j].replace(constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                cer = calculate_cer(strs_hyps[j].replace(' ', ''), strs_gold[j].replace(' ', ''))
                wer = calculate_wer(strs_hyps[j], strs_gold[j])
                total_cer += cer
                total_wer += wer
                total_char += len(strs_gold[j].replace(' ', ''))
                total_word += len(strs_gold[j].split(" "))

            loss.backward()

            if constant.args.clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), constant.args.max_norm)
            opt.step()

            total_loss += loss.item()
            non_pad_mask = gold.ne(constant.PAD_TOKEN)
            num_word = non_pad_mask.sum().item()

            pbar.set_description(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".format(
                    (epoch + 1), total_loss / (i + 1),
                    total_cer * 100 / total_char, opt._rate))

        logging.info(
            "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".format(
                (epoch + 1), total_loss / (len(train_loader)),
                total_cer * 100 / total_char, opt._rate))

        # evaluate
        print("")
        logging.info("VALID")
        model.eval()
        for ind in range(len(valid_loader_list)):
            valid_loader = valid_loader_list[ind]

            total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
            valid_pbar = tqdm(iter(valid_loader), leave=True, total=len(valid_loader))
            for i, (data) in enumerate(valid_pbar):
                src, tgt, src_percentages, src_lengths, tgt_lengths = data

                if constant.USE_CUDA:
                    src = src.cuda()
                    tgt = tgt.cuda()

                pred, gold, hyp_seq, gold_seq = model(src, src_lengths, tgt, verbose=False)

                seq_length = pred.size(1)
                sizes = Variable(src_percentages.mul_(int(seq_length)).int(), requires_grad=False)

                loss, num_correct = calculate_metrics(
                    pred, gold, input_lengths=sizes, target_lengths=tgt_lengths,
                    smoothing=smoothing, loss_type=loss_type)

                if loss.item() == float('Inf'):
                    logging.info("Found infinity loss, masking")
                    loss = torch.where(loss != loss, torch.zeros_like(loss), loss)  # NaN masking
                    continue

                try:  # handle case for CTC
                    strs_gold, strs_hyps = [], []
                    for ut_gold in gold_seq:
                        str_gold = ""
                        for x in ut_gold:
                            if int(x) == constant.PAD_TOKEN:
                                break
                            str_gold = str_gold + id2label[int(x)]
                        strs_gold.append(str_gold)
                    for ut_hyp in hyp_seq:
                        str_hyp = ""
                        for x in ut_hyp:
                            if int(x) == constant.PAD_TOKEN:
                                break
                            str_hyp = str_hyp + id2label[int(x)]
                        strs_hyps.append(str_hyp)
                except Exception as e:
                    print(e)
                    logging.info("NaN predictions")
                    continue

                for j in range(len(strs_hyps)):
                    strs_hyps[j] = strs_hyps[j].replace(constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                    strs_gold[j] = strs_gold[j].replace(constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                    cer = calculate_cer(strs_hyps[j].replace(' ', ''), strs_gold[j].replace(' ', ''))
                    wer = calculate_wer(strs_hyps[j], strs_gold[j])
                    total_valid_cer += cer
                    total_valid_wer += wer
                    total_valid_char += len(strs_gold[j].replace(' ', ''))
                    total_valid_word += len(strs_gold[j].split(" "))

                total_valid_loss += loss.item()
                valid_pbar.set_description(
                    "VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                        ind, total_valid_loss / (i + 1),
                        total_valid_cer * 100 / total_valid_char))

            logging.info("VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                ind, total_valid_loss / (len(valid_loader)),
                total_valid_cer * 100 / total_valid_char))

        metrics = {}
        metrics["train_loss"] = total_loss / len(train_loader)
        metrics["valid_loss"] = total_valid_loss / (len(valid_loader))
        metrics["train_cer"] = total_cer
        metrics["train_wer"] = total_wer
        metrics["valid_cer"] = total_valid_cer
        metrics["valid_wer"] = total_valid_wer
        metrics["history"] = history
        history.append(metrics)

        if epoch % constant.args.save_every == 0:
            save_model(model, (epoch + 1), opt, metrics, label2id, id2label, best_model=False)

        # save the best model
        if best_valid_loss > total_valid_loss / len(valid_loader):
            best_valid_loss = total_valid_loss / len(valid_loader)
            save_model(model, (epoch + 1), opt, metrics, label2id, id2label, best_model=True)

        if constant.args.shuffle:
            logging.info("SHUFFLE")
            print("SHUFFLE")
            train_sampler.shuffle(epoch)
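# The CER/WER accumulation above relies on calculate_cer and calculate_wer, which are
# not shown in this excerpt. A minimal sketch of what such helpers typically compute:
# a raw Levenshtein edit distance at the character or word level, which the trainer
# then normalises by the total number of reference characters/words. The real project
# helpers may differ; the names below are illustrative only.
def edit_distance(ref, hyp):
    """Levenshtein distance between two sequences (strings or lists)."""
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, start=1):
            cur = min(dp[j] + 1,        # deletion
                      dp[j - 1] + 1,    # insertion
                      prev + (r != h))  # substitution
            prev, dp[j] = dp[j], cur
    return dp[len(hyp)]

def calculate_cer_sketch(hyp, ref):
    """Unnormalised character edit distance (divide by len(ref) for CER)."""
    return edit_distance(list(ref), list(hyp))

def calculate_wer_sketch(hyp, ref):
    """Unnormalised word edit distance (divide by number of reference words for WER)."""
    return edit_distance(ref.split(), hyp.split())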
def train(self, model, train_loader, train_sampler, valid_loaders, opt, loss_type,
          start_epoch, num_epochs, label2id, id2label, last_metrics=None, logger=None):
    """
    Training
    args:
        model: Model object
        train_loader: DataLoader object of the training set
        valid_loaders: list of DataLoader objects of the validation sets
        opt: Optimizer object
        start_epoch: start epoch (> 0 if you resume the process)
        num_epochs: last epoch
        last_metrics: (if resume)
    """
    if logger is not None:
        sys.out = logger
    else:
        sys.out = sys.stdout

    start_time = time.time()
    best_valid_loss = 1000000000 if last_metrics is None else last_metrics['valid_loss']
    smoothing = constant.args.label_smoothing

    history = []
    for epoch in range(start_epoch, num_epochs):
        sys.out.flush()
        total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0
        start_iter = 0

        print("TRAIN")
        model.train()
        pbar = tqdm(iter(train_loader), leave=True, total=len(train_loader))
        for i, (data) in enumerate(pbar, start=start_iter):
            src, tgt, src_percentages, src_lengths, tgt_lengths = data

            if constant.USE_CUDA:
                src = src.cuda()
                tgt = tgt.cuda()

            opt.optimizer.zero_grad()
            pred, gold, hyp_seq, gold_seq = model(
                src, input_lengths=src_lengths, padded_target=tgt,
                verbose=constant.args.verbose)

            strs_gold = ["".join([id2label[int(x)] for x in gold_ut]) for gold_ut in gold_seq]
            strs_hyps = ["".join([id2label[int(x)] for x in hyp_ut]) for hyp_ut in hyp_seq]

            loss, num_correct = calculate_metrics(
                pred, gold, smoothing=smoothing, loss_type=loss_type,
                input_lengths=src_lengths, target_lengths=tgt_lengths)

            if constant.args.verbose:
                print("GOLD", strs_gold)
                print("HYP", strs_hyps)

            for j in range(len(strs_hyps)):
                cer = calculate_cer(strs_hyps[j], strs_gold[j])
                wer = calculate_wer(strs_hyps[j], strs_gold[j])
                total_cer += cer
                total_wer += wer
                total_char += len(strs_gold[j])
                total_word += len(strs_gold[j].split(" "))

            loss.backward()
            opt.optimizer.step()

            total_loss += loss.detach().item()
            non_pad_mask = gold.ne(constant.PAD_TOKEN)
            num_word = non_pad_mask.sum().item()

            pbar.set_description(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                    (epoch + 1), total_loss / (i + 1),
                    total_cer * 100 / total_char,
                    total_wer * 100 / total_word))

        print(
            "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                (epoch + 1), total_loss / (len(train_loader)),
                total_cer * 100 / total_char,
                total_wer * 100 / total_word))

        print("VALID")
        all_valid_loss = []
        for valid_task_id in range(len(valid_loaders)):
            model.eval()
            sys.out.flush()

            valid_loader = valid_loaders[valid_task_id]

            total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
            valid_pbar = tqdm(iter(valid_loader), leave=True, total=len(valid_loader))
            for i, (data) in enumerate(valid_pbar):
                src, tgt, src_percentages, src_lengths, tgt_lengths = data

                if constant.USE_CUDA:
                    src = src.cuda()
                    tgt = tgt.cuda()

                pred, gold, hyp_seq, gold_seq = model(
                    src, input_lengths=src_lengths, padded_target=tgt,
                    verbose=constant.args.verbose)
                loss, num_correct = calculate_metrics(
                    pred, gold, smoothing=smoothing, loss_type=loss_type,
                    input_lengths=src_lengths, target_lengths=tgt_lengths)

                strs_gold = ["".join([id2label[int(x)] for x in gold_ut]) for gold_ut in gold_seq]
                strs_hyps = ["".join([id2label[int(x)] for x in hyp_ut]) for hyp_ut in hyp_seq]

                for j in range(len(strs_hyps)):
                    cer = calculate_cer(strs_hyps[j], strs_gold[j])
                    wer = calculate_wer(strs_hyps[j], strs_gold[j])
                    total_valid_cer += cer
                    total_valid_wer += wer
                    total_valid_char += len(strs_gold[j])
                    total_valid_word += len(strs_gold[j].split(" "))

                total_valid_loss += loss.detach().item()
                valid_pbar.set_description(
                    "(Epoch {}) TASK:{} VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                        (epoch + 1), valid_task_id, total_valid_loss / (i + 1),
                        total_valid_cer * 100 / total_valid_char,
                        total_valid_wer * 100 / total_valid_word))

            all_valid_loss.append(total_valid_loss / len(valid_pbar))
            print(
                "(Epoch {}) TASK:{} VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                    (epoch + 1), valid_task_id,
                    total_valid_loss / (len(valid_loader)),
                    total_valid_cer * 100 / total_valid_char,
                    total_valid_wer * 100 / total_valid_word))

        metrics = {}
        metrics["train_loss"] = total_loss / len(train_loader)
        metrics["valid_loss"] = np.mean(np.array(all_valid_loss))
        metrics["valid_losses"] = all_valid_loss
        metrics["train_cer"] = total_cer
        metrics["train_wer"] = total_wer
        metrics["valid_cer"] = total_valid_cer
        metrics["valid_wer"] = total_valid_wer
        metrics["history"] = history
        history.append(metrics)

        if epoch % constant.args.save_every == 0:
            save_model(model, (epoch + 1), opt, metrics, label2id, id2label, best_model=False)

        # save the best model
        if best_valid_loss > total_valid_loss / len(valid_loader):
            best_valid_loss = total_valid_loss / len(valid_loader)
            save_model(model, (epoch + 1), opt, metrics, label2id, id2label, best_model=True)

        if constant.args.shuffle:
            print("SHUFFLE")
            train_sampler.shuffle(epoch)
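# The two train() variants above access opt._rate and opt.optimizer, which suggests the
# optimizer is wrapped in a Transformer-style warmup scheduler (in the spirit of the
# Annotated Transformer's NoamOpt). A minimal sketch of such a wrapper, under that
# assumption; the project's real wrapper may use different attribute names or a
# different schedule:
class WarmupOpt:
    def __init__(self, optimizer, dim_model, warmup_steps=4000, factor=1.0):
        self.optimizer = optimizer      # the underlying torch optimizer
        self.dim_model = dim_model
        self.warmup_steps = warmup_steps
        self.factor = factor
        self._step = 0
        self._rate = 0.0                # last learning rate applied

    def rate(self, step):
        # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        return self.factor * (self.dim_model ** -0.5) * min(
            step ** -0.5, step * self.warmup_steps ** -1.5)

    def step(self):
        self._step += 1
        self._rate = self.rate(self._step)
        for group in self.optimizer.param_groups:
            group['lr'] = self._rate
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()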
# per-iteration logging (inside the batch loop)
trainer.writer.add_scalars('iter_loss/loss_d', {'train': loss_d.item()}, trainer.num_updates)
trainer.writer.add_scalars('iter_loss/loss_g', {'train': loss_g.item()}, trainer.num_updates)

# free memory
torch.cuda.empty_cache()
i += 1

# log train stats (after the batch loop)
train_loss_gen /= num_batches
train_loss_dis /= num_batches
trainer.writer.add_scalars('epoch_loss/loss_d', {'train': train_loss_dis}, trainer.num_updates)
trainer.writer.add_scalars('epoch_loss/loss_g', {'train': train_loss_gen}, trainer.num_updates)
print('Epoch {}: Loss D - {:.5f}, Loss G - {:.5f}'.format(
    current_epoch, train_loss_dis, train_loss_gen))

# save model
save_model(trainer.gen, trainer.g_optimizer, current_epoch, trainer.num_updates, chkpdir, 'gen')
save_model(trainer.dis, trainer.d_optimizer, current_epoch, trainer.num_updates, chkpdir, 'dis')
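# save_model is called above with (module, optimizer, epoch, num_updates, chkpdir, tag)
# but is not defined in this excerpt. A minimal sketch of what such a checkpoint helper
# could look like, assuming a plain torch.save checkpoint per tag; the real helper may
# store additional state or use a different file layout:
import os
import torch

def save_model_sketch(model, optimizer, epoch, num_updates, chkpdir, tag):
    """Write model and optimizer state to <chkpdir>/<tag>_epoch<epoch>.pt."""
    os.makedirs(chkpdir, exist_ok=True)
    checkpoint = {
        'epoch': epoch,
        'num_updates': num_updates,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, os.path.join(chkpdir, '{}_epoch{}.pt'.format(tag, epoch)))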