def trainIters(self, n_iters, model_file_path=None):
    print("trainIters started, model_file_path is:", model_file_path)
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    print("Max iteration: n_iters =", n_iters)
    print("Starting from iteration:", iter)
    print("\n******************************\n")
    while iter < n_iters:
        print("\n###################################\n")
        print("iter:", iter)
        batch = self.batcher.next_batch()
        print("batch data loaded:", batch)
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        print("running_avg_loss:", running_avg_loss)
        iter += 1

        if iter % 100 == 0:  ##100
            self.summary_writer.flush()
        print_interval = 100  #1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 500 == 0:  ##5000
            self.save_model(running_avg_loss, iter)
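Every variant in this file funnels its per-batch loss through calc_running_avg_loss, though they disagree on its signature (some pass a summary_writer, some pass only the step index). A minimal sketch of the helper, assuming the exponential-moving-average form common to pointer-generator training code; the 0.99 decay, the cap at 12, and the torch.utils.tensorboard-style writer are assumptions, not taken from these snippets:

def calc_running_avg_loss(loss, running_avg_loss, summary_writer=None, step=0, decay=0.99):
    # First batch: seed the average with the raw loss.
    if running_avg_loss == 0:
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    # Cap the value so early loss spikes do not flatten TensorBoard plots.
    running_avg_loss = min(running_avg_loss, 12)
    if summary_writer is not None:
        summary_writer.add_scalar('running_avg_loss', running_avg_loss, step)
    return running_avg_loss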
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if math.isnan(running_avg_loss):
            print('Found a nan loss return. Restarting the training at {}'
                  .format(self.last_good_model_save_path))
            iter, running_avg_loss = self.setup_train(self.last_good_model_save_path)
            start = time.time()

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 1000 == 0:
            self.save_model(running_avg_loss, iter)
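The NaN-recovery loop above only works if save_model writes a checkpoint that setup_train can restore and records its path in self.last_good_model_save_path. A sketch of a matching save_model, assuming PyTorch state dicts and an encoder/decoder/reduce_state model split; the attribute names and file-name scheme are illustrative, not the authors':

def save_model(self, running_avg_loss, iter):
    # Bundle everything needed to resume: weights, optimizer state, step and loss.
    state = {
        'iter': iter,
        'encoder_state_dict': self.model.encoder.state_dict(),
        'decoder_state_dict': self.model.decoder.state_dict(),
        'reduce_state_dict': self.model.reduce_state.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'current_loss': running_avg_loss,
    }
    save_path = os.path.join(self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
    torch.save(state, save_path)
    # Remember this checkpoint so the NaN branch above has somewhere to restart from.
    self.last_good_model_save_path = save_path
    return save_path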
def run_eval(self):
    running_avg_loss, iter = 0, 0
    start = time.time()
    batch = self.batcher.next_batch()
    print("-----------------------------------------STARTING EVALUATION---------------------------------------")
    with open(config.eval_log, 'a+', encoding='utf-8') as f:
        f.write("-----------------------------------------STARTING EVALUATION---------------------------------------" + "\n")
    while batch is not None:
        loss = self.eval_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 20 == 0:
            self.summary_writer.flush()
        print_interval = 100
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, running_avg_loss))
            start = time.time()
            with open(config.eval_log, 'a+', encoding='utf-8') as f:
                f.write("Steps: " + str(iter) + " loss: " + str(running_avg_loss) + "\n")
        if (iter + 1) % config.max_iterations_eval == 0:
            break
        batch = self.batcher.next_batch()
    return running_avg_loss
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        # print("iteration", iter)
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 10000 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            logging.info('steps %d, seconds for %d batch: %.2f , loss: %f' %
                         (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 10000 == 0:
            self.save_model(running_avg_loss, iter)
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    best_loss = 20
    best_iter = 0
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch, iter)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1
        # is_new_best = (running_avg_loss < best_loss) and (iter - best_iter >= 100)
        # best_loss = min(running_avg_loss, best_loss)

        if iter % 20 == 0:
            self.summary_writer.flush()
        print_interval = 100
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 2500 == 0:
            self.save_model(running_avg_loss, iter)
        if loss < 2.0:
            self.save_best_so_far(running_avg_loss, iter)
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    sys.stdout.flush()
    # data_path = "lib/data/batches_train.vocab50000.batch16.pk.bin"
    # with open(data_path, 'rb') as f:
    #     stored_batches = pickle.load(f, encoding="bytes")
    # print("loaded data: {}".format(data_path))
    # num_batches = len(stored_batches)
    while iter < n_iters:
        batch = self.batcher.next_batch()
        # batch_id = iter % num_batches
        # batch = stored_batches[batch_id]
        loss = self.train_one_batch(batch)
        # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        iter += 1

        # if iter % 100 == 0:
        #     self.summary_writer.flush()
        if iter % self.print_interval == 0:
            print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
            sys.stdout.flush()
        if iter % config.save_every == 0:
            self.save_model(running_avg_loss, iter)
    print("Finished training!")
def run_eval(self):
    running_avg_loss, iter = 0, 0
    batch_losses = []
    # while batch is not None:
    for _ in range(835):
        batch = self.batcher.next_batch()
        loss = self.eval_one_batch(batch)
        batch_losses.append(loss)
        # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        iter += 1
        # if iter % 100 == 0:
        #     self.summary_writer.flush()
        print_interval = 10
        if iter % print_interval == 0:
            print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
    avg_loss = sum(batch_losses) / len(batch_losses)
    print("Finished Eval for Model {}: Avg Loss = {:.5f}".format(self.model_file_path, avg_loss))
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1
        # print("Iteration:", iter)  # CM - debugging

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 5000 == 0:
            print("Iteration:", iter)  # CM - debugging
            self.save_model(running_avg_loss, iter)
        # CM - debugging - if we reach the end before hitting a multiple of 5000, write the model out
        elif iter == n_iters:
            self.save_model(running_avg_loss, iter)
def trainIters(self, n_iters, model_file_path=None):
    start_iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    best_val_loss = None
    # range() already begins at start_iter, so the loop variable is the global
    # step; the original `iter = start_iter + it` double-counted start_iter
    # whenever training resumed from a checkpoint.
    for iter in tqdm(range(start_iter, n_iters)):
        self.model.train()
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)

        print_interval = 1000
        if iter != 0 and iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter != 0 and iter % 5000 == 0:
            loss = self.run_eval()
            if best_val_loss is None or loss < best_val_loss:
                best_val_loss = loss
                self.save_model(running_avg_loss, iter)
                print("Saving best model")
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    start_iter = iter
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss, tau = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1
        if config.DEBUG:
            debug('iter', iter)
            if iter - start_iter > config.BREAK_POINT:
                break

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 100
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            if config.adaptive_sparsemax:
                print('tau + eps',
                      [round(e[0], 4)
                       for e in (tau + config.eps).detach().cpu().numpy().tolist()])
            start = time.time()
        if iter % 5000 == 0:
            self.save_model(running_avg_loss, iter)
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch, iter)
        val_loss = None
        if iter % 100 == 0:
            val_batch = self.val_batcher.next_batch()
            val_loss = self.eval_one_batch(val_batch)
            # print("val_loss", val_loss)
            self.scheduler.step()
            print("lr", self.scheduler.get_lr())
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 1
        if iter % print_interval == 0:
            if val_loss is not None:
                print('steps %d, seconds for %d batch: %.2f , loss: %f , eval_loss: %f' %
                      (iter, print_interval, time.time() - start, loss, val_loss))
            else:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 1000 == 0:
            self.save_model(running_avg_loss, iter)
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    sys.stdout.flush()

    ami_data = load_ami_data('train')
    valid_data = load_ami_data('valid')
    # make the training data 100
    random.shuffle(valid_data)
    ami_data.extend(valid_data[:6])
    valid_data = valid_data[6:]

    num_batches = len(ami_data)
    idx = 0

    # validation & stopping
    best_valid_loss = 1000000000
    stop_counter = 0

    while iter < n_iters:
        if idx == 0:
            print("shuffle training data")
            random.shuffle(ami_data)
        loss = self.train_one_batch(ami_data, idx)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        iter += 1
        idx += config.batch_size
        # '>=' instead of '==': an exact match is missed whenever num_batches
        # is not a multiple of batch_size, which would run idx out of range
        if idx >= num_batches:
            idx = 0

        if iter % self.print_interval == 0:
            print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
            sys.stdout.flush()
        if iter % config.save_every == 0:
            self.save_model(running_avg_loss, iter)
        if iter % config.eval_every == 0:
            valid_loss = self.run_eval(valid_data)
            print("valid_loss = {:.5f}".format(valid_loss))
            if valid_loss < best_valid_loss:
                stop_counter = 0
                best_valid_loss = valid_loss
                print("VALID better")
            else:
                stop_counter += 1
                print("VALID NOT better, counter = {}".format(stop_counter))
                if stop_counter == config.stop_after:
                    print("Stop training")
                    return
    print("Finished training!")
def run_eval(self):
    running_avg_loss, iter = 0, 0
    self.model.eval()
    self.eval_batcher._finished_reading = False
    self.eval_batcher.setup_queues()
    batch = self.eval_batcher.next_batch()
    while batch is not None:
        # Guard before calling .item(): the original called .item() first, so
        # its `if loss is not None` check could never trigger.
        loss = self.get_loss(batch)
        if loss is not None:
            running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss, iter)
            iter += 1
        batch = self.eval_batcher.next_batch()
    msg = 'Eval: loss: %f' % running_avg_loss
    print(msg)
    return running_avg_loss
def run_eval(self, eval_data):
    running_avg_loss, iter = 0, 0
    batch_losses = []
    num_batches = len(eval_data)
    print("valid data size = {}".format(num_batches))
    for idx in range(num_batches):
        loss = self.eval_one_batch(eval_data, idx)
        batch_losses.append(loss)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        print("#", end="")
        sys.stdout.flush()
    print()
    avg_loss = sum(batch_losses) / len(batch_losses)
    return avg_loss
def trainIters(self, n_src_vocab, n_tgt_vocab, n_iters, model_file_path=None):
    print("Setting up the model...")
    iter, running_avg_loss = self.setup_train(n_src_vocab, n_tgt_vocab, model_file_path)
    print("Starting training...")
    print("Data for this model will be stored in", self.model_dir)
    start = time.time()
    # only_batch = None
    losses = []
    iters = []
    save_name = os.path.join(self.model_dir, "loss_lists")
    while iter < n_iters:
        batch = self.batcher.next_batch()
        # if iter == 0:
        #     only_batch = batch
        # else:
        #     batch = only_batch
        loss = self.train_one_batch(batch, iter)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 50
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
            iters.append(iter)
            losses.append(loss)
            with open(save_name, 'wb') as f:
                pickle.dump((losses, iters), f)
        if iter % 5000 == 0:
            path = self.save_model(running_avg_loss, iter)
            print("Saving Checkpoint at {}".format(path))
def run_eval(self, model_dir, train_iter_id):
    dataloader = DataLoader(self.dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=1,
                            collate_fn=create_batch_collate(self.vocab, config.batch_size))
    running_avg_loss, iter = 0, 0
    start = time.time()
    # batch = self.batcher.next_batch()
    pg_losses = []
    run_avg_losses = []
    for batch in dataloader:
        loss = self.eval_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        print("Iteration:", iter, " loss:", loss, " Running avg loss:", running_avg_loss)
        iter += 1

        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, running_avg_loss))
            start = time.time()

        pg_losses.append(loss)
        run_avg_losses.append(running_avg_loss)

    # Dump val losses
    pickle.dump(pg_losses,
                open(os.path.join(model_dir, 'val_pg_losses_{}.p'.format(train_iter_id)), 'wb'))
    pickle.dump(run_avg_losses,
                open(os.path.join(model_dir, 'val_run_avg_losses_{}.p'.format(train_iter_id)), 'wb'))

    return run_avg_losses
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 100 == 0:
            self.summary_writer.flush()
        if iter % self.print_interval == 0:
            print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
        if iter % 5000 == 0:
            self.save_model(running_avg_loss, iter)
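setup_train is the other shared entry point: it builds the optimizer and, when a checkpoint path is given, restores the step counter and running loss so the while-loops above resume where they left off. A sketch matching the state-dict layout assumed in the save_model sketch earlier; the Adagrad choice and config field names are assumptions:

def setup_train(self, model_file_path=None):
    params = (list(self.model.encoder.parameters()) +
              list(self.model.decoder.parameters()) +
              list(self.model.reduce_state.parameters()))
    self.optimizer = torch.optim.Adagrad(params, lr=config.lr,
                                         initial_accumulator_value=config.adagrad_init_acc)
    start_iter, start_loss = 0, 0
    if model_file_path is not None:
        # Load onto CPU first; the caller moves the model to its device.
        state = torch.load(model_file_path, map_location=lambda storage, location: storage)
        start_iter = state['iter']
        start_loss = state['current_loss']
        self.model.encoder.load_state_dict(state['encoder_state_dict'])
        self.model.decoder.load_state_dict(state['decoder_state_dict'])
        self.model.reduce_state.load_state_dict(state['reduce_state_dict'])
        self.optimizer.load_state_dict(state['optimizer'])
    return start_iter, start_loss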
def run_eval(self):
    running_avg_loss, iter = 0, 0
    start = time.time()
    batch = self.batcher.next_batch()
    while batch is not None:
        loss = self.eval_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, running_avg_loss))
            start = time.time()
        batch = self.batcher.next_batch()
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        print("Iteration:", iter, " loss:", loss, " Running avg loss:", running_avg_loss)
        iter += 1

        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 1000 == 0:
            self.save_model(running_avg_loss, iter)
def run_eval(self):
    running_avg_loss, iter = 0, 0
    start = time.time()
    batch = self.batcher.next_batch()
    while batch is not None:
        loss = self.eval_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 10 == 0:
            self.summary_writer.flush()
        print_interval = 1
        if iter % print_interval == 0:
            print('iters = %d, time = %s , loss: %f' %
                  (iter, time_since(start), running_avg_loss))
            start = time.time()
        batch = self.batcher.next_batch()
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 10 == 0:
            self.summary_writer.flush()
        print_interval = 10
        if iter % print_interval == 0:
            print('iters = %d, time = %s, loss: %f' % (iter, time_since(start), loss))
        if iter % 10 == 0:
            self.save_model(running_avg_loss, iter)
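The two variants above format elapsed time through a time_since helper that the snippets never define. A minimal sketch, assuming it renders wall-clock seconds as a minutes-and-seconds string:

def time_since(start):
    # Elapsed wall-clock time since `start`, e.g. '3m 42s'.
    s = time.time() - start
    m = int(s // 60)
    return '%dm %ds' % (m, s - m * 60)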
def on_backward_begin(self, loss):
    """
    :param loss: scalar training loss tensor for the current step
    """
    print("|epoch: %d step: %d loss: %.4f|" % (self.epoch, self.step, loss.item()))
    if not np.isfinite(loss.item()):
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss.item())
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")
    self.running_avg_loss = calc_running_avg_loss(loss.item(), self.running_avg_loss,
                                                  self.summary_writer, self.step)
def run_eval(self):
    running_avg_loss, iter = 0, 0
    start = time.time()
    batch = self.batcher.next_batch()
    while batch is not None:
        loss = self.eval_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
        print("Iteration:", iter, " loss:", loss, " Running avg loss:", running_avg_loss)
        iter += 1

        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, running_avg_loss))
            start = time.time()
        batch = self.batcher.next_batch()
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    # Resume from the checkpoint's step. The original looped over range(n_iters)
    # and bumped `iter` inside the body, which both ignored the restored step and
    # was ineffective (the for-loop reassigns `iter` on every pass); iterating
    # from iter + 1 to n_iters preserves the original check points 1..n_iters.
    for iter in tqdm(range(iter + 1, n_iters + 1), desc='Training'):
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if iter % print_interval == 0:
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 5000 == 0:
            self.save_model(running_avg_loss, iter)
def run_eval(self):
    self.model.eval()
    batch = self.batcher_eval.next_batch()
    iter = 0
    start = time.time()
    running_avg_loss = 0
    with torch.no_grad():
        while batch is not None:
            loss, _ = self.model_batch_step(batch, False)
            loss = loss.item()
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss)
            batch = self.batcher_eval.next_batch()
            iter += 1

            if iter % config.print_interval == 0:
                print('Eval steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, config.print_interval, time.time() - start, running_avg_loss))
                start = time.time()
    return running_avg_loss
def trainIters(self, n_iters, model_file_path=None):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        batch_ds = self.ds_batcher.next_batch()
        loss = self.train_one_batch(batch, iter, batch_ds)
        loss = loss.cpu()
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 5
        if iter % print_interval == 0:
            print('steps %d , loss: %f' % (iter, loss))
            start = time.time()
        if iter % 50000 == 0:
            self.save_model(running_avg_loss, iter)
def on_backward_begin(self, loss):
    self.loss_update_every.append(loss.item())
    if isinstance(loss, tuple) and not np.isfinite(loss[0].item()):
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss[0].item())
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")

    if self.step % self.update_every == 0:
        assert len(self.loss_update_every) == self.update_every
        loss_batch = sum(self.loss_update_every)
        self.loss_update_every = []
        # report the loss
        if self.step < 10 or self.step % 1000 == 0:
            logger.info("|epoch: %d step: %d log_loss: %.4f |" %
                        (self.epoch, self.step / self.update_every, loss_batch))
        self.running_avg_loss = calc_running_avg_loss(loss_batch, self.running_avg_loss,
                                                      self.step / self.update_every)
def trainIters(self, n_iters, model_file_path=None):
    min_loss = float(100000)
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    while iter < n_iters:
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 20 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if config.fix_bug:
            print_interval = 100  #100
        if iter % print_interval == 0:
            with open(config.train_log, 'a+', encoding='utf-8') as f:
                f.write("steps: " + str(iter) + " loss: " + str(loss) + "\n")
            print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                  (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 5000 == 0:  #5000
            model_save_path = self.save_model(running_avg_loss, iter)
            eval_process = eval.Evaluate(model_save_path, self.vocab)
            curr_loss = eval_process.run_eval()
            if curr_loss < min_loss:
                min_loss = curr_loss
                print("Loss update: ", min_loss)
                with open(config.best_model_log, 'a+', encoding='utf-8') as f:
                    f.write(model_save_path + "\nLoss in eval data: " + str(min_loss) + "\n\n")
def trainIters(self, n_iters, model_file_path=None, evaluate=False):
    iter, running_avg_loss = self.setup_train(model_file_path)
    start = time.time()
    LOGGER.info('Starting training for {} iterations'.format(n_iters))
    while iter < n_iters:
        iter_start = get_time()
        LOGGER.debug('Starting iteration {} at time {}'.format(iter + 1, iter_start))
        batch = self.batcher.next_batch()
        loss = self.train_one_batch(batch)
        running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                 self.summary_writer, iter)
        iter += 1

        if iter % 100 == 0:
            self.summary_writer.flush()
        print_interval = 1000
        if iter % print_interval == 0:
            LOGGER.info('steps %d, seconds for %d batch: %.2f , loss: %f' %
                        (iter, print_interval, time.time() - start, loss))
            start = time.time()
        if iter % 5000 == 0:
            # `iter` has already been incremented, so it names the iteration
            # that just ran; the original's `iter + 1` was off by one here
            LOGGER.info('Saving model at iteration = {}'.format(iter))
            model_path = self.save_model(running_avg_loss, iter)
            if evaluate:
                beam_search_processor = BeamSearch(model_path)
                beam_search_processor.decode()
        iter_end = get_time()
        LOGGER.debug('Iteration {} ended at time {}'.format(iter, iter_end))
        LOGGER.debug('Time taken for iteration {} = {}'.format(
            iter, time_diff_as_minutes(iter_start, iter_end)))
def train(self, model_path=None):
    train_iter = self.get_train_dataloader()
    iter, running_avg_loss = 0, 0
    start = time.time()
    for epoch in range(self.args.epoches):
        print(f"Epoch: {epoch+1}")
        self.model.train()
        for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
            # print(batch.source[0].size())
            # exit()
            batch_size = batch.batch_size

            # encoder part
            enc_padding_mask = self.get_mask(batch.source)
            enc_batch = batch.source[0]
            enc_lens = batch.source[1]
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state(encoder_hidden)
            coverage = Variable(torch.zeros(batch.source[0].size())).to(self.args.device)
            c_t_1 = Variable(torch.zeros((batch_size, 2 * self.args.hidden_dim))).to(self.args.device)
            extra_zeros, enc_batch_extend_vocab, max_art_oovs = self.get_extra_features(batch.source[0])
            extra_zeros = extra_zeros.to(self.args.device)
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(self.args.device)

            # decoder part
            dec_batch = batch.target[0][:, :-1]
            # print(dec_batch.size())
            target_batch = batch.target[0][:, 0:]
            dec_lens_var = batch.target[1]
            dec_padding_mask = self.get_mask(batch.target)
            max_dec_len = max(dec_lens_var)

            step_losses = []
            for di in range(min(max_dec_len, self.args.max_dec_steps) - 1):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + self.args.eps)
                if self.args.is_coverage:
                    step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + self.args.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage
                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

            # Clear gradients from the previous batch; the original omitted this,
            # so gradients silently accumulated across steps.
            self.optimizer.zero_grad()
            loss.backward()
            norm = clip_grad_norm_(self.model.encoder.parameters(), self.args.max_grad_norm)
            clip_grad_norm_(self.model.decoder.parameters(), self.args.max_grad_norm)
            clip_grad_norm_(self.model.reduce_state.parameters(), self.args.max_grad_norm)
            self.optimizer.step()

            running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss,
                                                     summary_writer, iter, 'Train')
            iter += 1
            # '== 0' added: the original `if iter % self.args.flush:` flushed on
            # every step *except* multiples of the flush interval
            if iter % self.args.flush == 0:
                # print('flush')
                summary_writer.flush()
            # print_interval = 10
            # if iter % print_interval == 0:
            #     print(f'steps {iter}, batch number: {i} with {time.time() - start} seconds, loss: {loss}')
            #     start = time.time()

        # if iter % 300 == 0:
        self.save_model(running_avg_loss, iter, model_dir)
        self.evaluate(self.eval_dataset, epoch)
        self.evaluate(self.test_dataset, epoch, True)