def train(self):
    """Train the encoder/decoder model, validating and checkpointing per epoch.

    Runs `config.num_epochs` epochs; each batch goes through a forward step,
    gradient clipping, and an optimizer update, with in-place progress
    printing. After every epoch the validation loss is computed and a
    checkpoint is saved whenever it matches or beats the best seen so far.
    """
    total_batches = len(self.train_loader)
    best_loss = 1e10
    for epoch in range(1, config.num_epochs + 1):
        # Both halves of the model must be put into training mode.
        self.model.encoder.train()
        self.model.decoder.train()
        print("epoch {}/{} :".format(epoch, config.num_epochs), end="\r")
        start = time.time()
        for idx, data in enumerate(self.train_loader, start=1):
            loss_tensor = self.step(data)
            self.optim.zero_grad()
            loss_tensor.backward()
            # gradient clipping
            nn.utils.clip_grad_norm_(self.model.parameters(), config.max_grad_norm)
            self.optim.step()
            batch_loss = loss_tensor.detach().item()
            msg = "{}/{} {} - ETA : {} - loss : {:.4f}".format(
                idx,
                total_batches,
                progress_bar(idx, total_batches),
                eta(start, idx, total_batches),
                batch_loss,
            )
            print(msg, end="\r")
        # compute validation loss for every epoch
        val_loss = self.evaluate(msg)
        if val_loss <= best_loss:
            best_loss = val_loss
            self.save_model(val_loss, epoch)
        print(
            "Epoch {} took {} - final loss : {:.4f} - val loss :{:.4f}".format(
                epoch, user_friendly_time(time_since(start)), batch_loss, val_loss
            )
        )
def train(self):
    # Joint training loop for the QA model and the ca2q (context+answer ->
    # question) generation model, with gradient accumulation: parameters are
    # updated and the progress line refreshed only every
    # config.gradient_accumulation_steps micro-batches.
    global_step = 1
    batch_num = len(self.train_loader)
    best_loss = 1e10  # best validation QG loss seen so far (lower is better)
    qa_loss_lst = []
    qg_loss_lst = []
    for epoch in range(1, config.num_epochs + 1):
        start = time.time()
        for step, batch in enumerate(self.train_loader, start=1):
            qa_loss, ca2q_loss = self.model(batch)
            # mean() to average across multiple gpu and back-propagation;
            # pre-dividing by the accumulation count makes the summed
            # micro-losses equal the window's mean loss.
            qa_loss = qa_loss.mean() / config.gradient_accumulation_steps
            ca2q_loss = ca2q_loss.mean() / config.gradient_accumulation_steps
            # retain_graph=True keeps the forward graph alive for the second
            # backward call — presumably because both losses come from the
            # same forward pass and share graph parts; verify against model.
            qa_loss.backward(retain_graph=True)
            ca2q_loss.backward()
            qa_loss_lst.append(qa_loss.detach().item())
            qg_loss_lst.append(ca2q_loss.detach().item())
            # clip gradient
            # NOTE(review): only the ca2q sub-model's gradients are clipped;
            # the QA model's gradients are left unclipped — confirm intended.
            nn.utils.clip_grad_norm_(self.model.module.ca2q_model.parameters(),
                                     config.max_grad_norm)
            # update params
            if step % config.gradient_accumulation_steps == 0:
                self.qa_opt.step()
                self.qg_opt.step()
                # zero grad
                self.qa_opt.zero_grad()
                self.qg_opt.zero_grad()
                global_step += 1
                # Sum of pre-divided micro-losses == mean loss of the window.
                avg_qa_loss = sum(qa_loss_lst)
                avg_qg_loss = sum(qg_loss_lst)
                # empty list
                qa_loss_lst = []
                qg_loss_lst = []
                msg = "{}/{} {} - ETA : {} - qa_loss: {:.2f}, ca2q_loss :{:.2f}" \
                    .format(step, batch_num,
                            progress_bar(step, batch_num),
                            eta(start, step, batch_num),
                            avg_qa_loss, avg_qg_loss)
                print(msg, end="\r")
        # Validate once per epoch; checkpoint selection keys on QG loss only.
        # NOTE(review): `msg` is undefined here if no accumulation boundary
        # was reached in the epoch — confirm loader length vs. accumulation.
        val_qa_loss, val_qg_loss = self.evaluate(msg)
        if val_qg_loss <= best_loss:
            best_loss = val_qg_loss
            self.save_model(val_qg_loss, epoch)
        # NOTE(review): `ca2q_loss` here is the last micro-batch's scaled loss
        # tensor, not an epoch aggregate — confirm that is the intended value.
        print("Epoch {} took {} - final loss : {:.4f} - qa_loss :{:.4f}, qg_loss :{:.4f}"
              .format(epoch, user_friendly_time(time_since(start)),
                      ca2q_loss, val_qa_loss, val_qg_loss))
def train(self):
    """Fine-tune the QA model with gradient accumulation.

    Each batch is trimmed to its longest non-padded sequence before the
    forward pass to save compute. Parameters are updated (and the progress
    line refreshed) every `config.gradient_accumulation_steps` micro-batches.
    Validation runs once per epoch and a checkpoint is saved whenever the
    validation loss matches or beats the best seen so far.
    """
    global_step = 1
    batch_num = len(self.train_loader)
    best_loss = 1e10
    qa_loss_lst = []
    self.model.train()
    # NOTE: epoch count is hard-coded to 3 rather than read from config.
    for epoch in range(1, 4):
        start = time.time()
        for step, batch in enumerate(self.train_loader, start=1):
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            # Trim all sequence tensors to the longest non-padded length in
            # the batch (sign() marks non-zero, i.e. non-pad, token ids).
            seq_len = torch.sum(torch.sign(input_ids), 1)
            max_len = torch.max(seq_len)
            input_ids = input_ids[:, :max_len].to(config.device)
            input_mask = input_mask[:, :max_len].to(config.device)
            segment_ids = segment_ids[:, :max_len].to(config.device)
            start_positions = start_positions.to(config.device)
            end_positions = end_positions.to(config.device)
            loss = self.model(input_ids, segment_ids, input_mask,
                              start_positions, end_positions)
            # mean() to average across multiple gpu and back-propagation;
            # pre-dividing makes the summed micro-losses the window mean.
            loss /= config.gradient_accumulation_steps
            loss.backward()
            # FIX: buffer a detached Python float, not the live loss tensor —
            # appending the tensor kept the whole autograd graph alive across
            # the accumulation window (memory leak).
            qa_loss_lst.append(loss.detach().item())
            # update params
            if step % config.gradient_accumulation_steps == 0:
                self.qa_opt.step()
                # zero grad
                self.qa_opt.zero_grad()
                global_step += 1
                # Sum of pre-divided micro-losses == mean loss of the window.
                avg_qa_loss = sum(qa_loss_lst)
                # empty list
                qa_loss_lst = []
                msg = "{}/{} {} - ETA : {} - qa_loss: {:.2f}" \
                    .format(step, batch_num, progress_bar(step, batch_num),
                            eta(start, step, batch_num), avg_qa_loss)
                print(msg, end="\r")
        # Validate once per epoch; keep only improving checkpoints.
        val_loss = self.evaluate(msg)
        if val_loss <= best_loss:
            best_loss = val_loss
            self.save_model(val_loss, epoch)
        print("Epoch {} took {} - final loss : {:.4f} - val_loss :{:.4f}"
              .format(epoch, user_friendly_time(time_since(start)),
                      loss.detach().item(), val_loss))
def train(self):
    """Train the model with a stepped learning-rate decay schedule.

    Runs `config.num_epochs` epochs; from epoch 8 onward the learning rate
    is halved on every even epoch. Each batch gets gradient clipping and an
    optimizer step, with in-place progress printing. After every epoch the
    validation loss is computed and a checkpoint is saved whenever it
    matches or beats the best seen so far.
    """
    batch_num = len(self.train_loader)
    best_loss = 1e10
    for epoch in range(1, config.num_epochs + 1):
        self.model.train()
        print("epoch {}/{} :".format(epoch, config.num_epochs), end="\r")
        start = time.time()
        # halving the learning rate after epoch 8
        if epoch >= 8 and epoch % 2 == 0:
            self.lr *= 0.5
            # FIX: mutate the optimizer's live param_groups directly — the
            # documented way to change LR — instead of editing a state_dict()
            # copy and reloading it, which needlessly round-trips the whole
            # optimizer state.
            for param_group in self.optim.param_groups:
                param_group["lr"] = self.lr
        for batch_idx, train_data in enumerate(self.train_loader, start=1):
            batch_loss = self.step(train_data)
            self.model.zero_grad()
            batch_loss.backward()
            # gradient clipping
            nn.utils.clip_grad_norm_(self.model.parameters(), config.max_grad_norm)
            self.optim.step()
            batch_loss = batch_loss.detach().item()
            msg = "{}/{} {} - ETA : {} - loss : {:.4f}" \
                .format(batch_idx, batch_num,
                        progress_bar(batch_idx, batch_num),
                        eta(start, batch_idx, batch_num),
                        batch_loss)
            print(msg, end="\r")
        val_loss = self.evaluate(msg)
        if val_loss <= best_loss:
            best_loss = val_loss
            self.save_model(val_loss, epoch)  # the condition of saving new checkpoints
        print("Epoch {} took {} - final loss : {:.4f} - val loss :{:.4f}".
              format(epoch, user_friendly_time(time_since(start)),
                     batch_loss, val_loss))