import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# get_time() and the _save_state()/_train_batch_data()/_load_batch_data()
# helpers are project-local and defined elsewhere in this module/class.


def _train_stage(self):
    self.model.train()
    running_loss = 0.
    running_acc = 0.
    running_loss_cls = 0.
    running_loss_ft = 0.
    is_first = True
    for e in range(self.start_epoch, self.conf.epochs):
        # Create the TensorBoard writer lazily on the first epoch.
        if is_first:
            self.writer = SummaryWriter(self.conf.log_path)
            is_first = False
        print('epoch {} started'.format(e))
        print("lr: ", self.schedule_lr.get_lr())
        for sample, ft_sample, target in tqdm(iter(self.train_loader)):
            imgs = [sample, ft_sample]
            labels = target
            loss, acc, loss_cls, loss_ft = self._train_batch_data(imgs, labels)
            running_loss_cls += loss_cls
            running_loss_ft += loss_ft
            running_loss += loss
            running_acc += acc
            self.step += 1
            # Log running averages every board_loss_every steps.
            if self.step % self.board_loss_every == 0 and self.step != 0:
                loss_board = running_loss / self.board_loss_every
                self.writer.add_scalar('Training/Loss', loss_board, self.step)
                acc_board = running_acc / self.board_loss_every
                self.writer.add_scalar('Training/Acc', acc_board, self.step)
                lr = self.optimizer.param_groups[0]['lr']
                self.writer.add_scalar('Training/Learning_rate', lr, self.step)
                loss_cls_board = running_loss_cls / self.board_loss_every
                self.writer.add_scalar('Training/Loss_cls', loss_cls_board, self.step)
                loss_ft_board = running_loss_ft / self.board_loss_every
                self.writer.add_scalar('Training/Loss_ft', loss_ft_board, self.step)
                running_loss = 0.
                running_acc = 0.
                running_loss_cls = 0.
                running_loss_ft = 0.
            # Periodic checkpoint.
            if self.step % self.save_every == 0 and self.step != 0:
                time_stamp = get_time()
                self._save_state(time_stamp, extra=self.conf.job_name)
        self.schedule_lr.step()
    # Final checkpoint after the last epoch.
    time_stamp = get_time()
    self._save_state(time_stamp, extra=self.conf.job_name)
    self.writer.close()
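
# NOTE: _train_batch_data() is not shown in this file. The sketch below is a
# minimal reconstruction from the call site above, under the assumption that
# the model returns a classification head plus a Fourier-transform feature
# map and that the two losses are combined with an equal 0.5/0.5 weighting.
# cls_criterion, ft_criterion, and _get_accuracy are assumed names, not
# confirmed from the repository.
def _train_batch_data(self, imgs, labels):
    self.optimizer.zero_grad()
    labels = labels.to(self.conf.device)
    # imgs[0] is the input sample, imgs[1] the FT target (see the loop above).
    embeddings, feature_map = self.model.forward(imgs[0].to(self.conf.device))
    loss_cls = self.cls_criterion(embeddings, labels)
    loss_ft = self.ft_criterion(feature_map, imgs[1].to(self.conf.device))
    loss = 0.5 * loss_cls + 0.5 * loss_ft
    acc = self._get_accuracy(embeddings, labels)[0]
    loss.backward()
    self.optimizer.step()
    return loss.item(), acc, loss_cls.item(), loss_ft.item()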
# NOTE: this second definition shadows the _train_stage above if both are
# kept in the same class; it is an alternate variant that also runs
# validation and checkpoints on new best losses. Its _train_batch_data takes
# an extra train/eval flag and returns only (loss, acc).
def _train_stage(self):
    val_loss = None
    train_loss = None
    is_first = True
    for e in range(self.start_epoch, self.conf.epochs):
        if is_first:
            self.writer = SummaryWriter(self.conf.log_path)
            is_first = False
        print('epoch {} started'.format(e))
        print("lr: ", self.schedule_lr.get_lr())
        self.model.train()
        for sample, ft_sample, target in tqdm(iter(self.train_loader)):
            imgs = [sample, ft_sample]
            labels = target
            loss, acc = self._train_batch_data(imgs, labels, True)
            # Checkpoint whenever a new best (lowest) batch loss is seen.
            if train_loss is None or loss < train_loss:
                train_loss = loss
                time_stamp = get_time()
                self._save_state(str(time_stamp) + "_train",
                                 extra=self.conf.job_name)
                print("\nBest train loss", train_loss)
            # Per-batch logging; no global_step is passed to add_scalar here.
            self.writer.add_scalar('Training/Loss', loss)
            print('\nTraining/Loss', loss)
            self.writer.add_scalar('Training/Acc', acc.item())
            print('Training/Acc', acc.item())
            lr = self.optimizer.param_groups[0]['lr']
            self.writer.add_scalar('Training/Learning_rate', lr)
            print('Training/Learning_rate', lr)
        self.schedule_lr.step()
        # Validation pass at the end of each epoch.
        self.model.eval()
        for sample, ft_sample, target in tqdm(iter(self.val_loader)):
            imgs = [sample, ft_sample]
            labels = target
            loss, acc = self._train_batch_data(imgs, labels, False)
            if val_loss is None or loss < val_loss:
                val_loss = loss
                time_stamp = get_time()
                self._save_state(str(time_stamp) + "_val",
                                 extra=self.conf.job_name)
                print("\nBest val loss", val_loss)
            self.writer.add_scalar('Valid/Loss', loss)
            print('\nValid/Loss', loss)
            self.writer.add_scalar('Valid/Acc', acc.item())
            print('Valid/Acc', acc.item())
    self.writer.close()
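
# NOTE: a minimal sketch of the _save_state() checkpoint helper called by
# both training stages, assuming conf.model_path holds the checkpoint
# directory and self.step is the global step counter maintained by the
# caller; the exact file-naming scheme is an assumption for illustration.
def _save_state(self, time_stamp, extra=None):
    save_path = self.conf.model_path
    torch.save(
        self.model.state_dict(),
        save_path + '/{}_{}_model_iter-{}.pth'.format(time_stamp, extra,
                                                      self.step))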
def _train_eval_stage(self):
    running_loss = 0.
    running_acc = 0.
    running_loss_cls = 0.
    running_loss_ft = 0.
    is_first = True
    self.total_iter = len(self.train_loader) * self.conf.epochs
    for e in range(self.start_epoch, self.conf.epochs):
        self.model.train()
        is_eval = False  # flag passed to _load_batch_data; False while training
        if is_first:
            self.writer = SummaryWriter(self.conf.log_path, self.conf.time)
            is_first = False
        print('epoch {} started'.format(e))
        print("lr: ", self.schedule_lr.optimizer.param_groups[0]['lr'])
        self.ratio = self.step / self.total_iter
        for sample, ft_sample, target in tqdm(iter(self.train_loader)):
            imgs = [sample, ft_sample]
            labels = target
            loss, acc, loss_cls, loss_ft = self._load_batch_data(
                imgs, labels, is_eval)
            running_loss_cls += loss_cls
            running_loss_ft += loss_ft
            running_loss += loss
            running_acc += acc
            self.step += 1
            # _load_batch_data() computes the gradients; the optimizer and
            # the per-iteration LR schedule are stepped here.
            self.optimizer.step()
            self.schedule_lr.step_iter(self.step)
            if self.step % self.board_loss_every == 0 and self.step != 0:
                loss_board = running_loss / self.board_loss_every
                self.writer.add_scalar('Training/Loss', loss_board, self.step)
                acc_board = running_acc / self.board_loss_every
                self.writer.add_scalar('Training/Acc', acc_board, self.step)
                lr = self.optimizer.param_groups[0]['lr']
                self.writer.add_scalar('Training/Learning_rate', lr, self.step)
                loss_cls_board = running_loss_cls / self.board_loss_every
                self.writer.add_scalar('Training/Loss_cls', loss_cls_board, self.step)
                loss_ft_board = running_loss_ft / self.board_loss_every
                self.writer.add_scalar('Training/Loss_ft', loss_ft_board, self.step)
                print("Step:{} Training/lr:{:.4f} Loss:{:.2f} Loss_cls:{:.2f} "
                      "Loss_ft:{:.4f} Acc:{:.2f}".format(
                          self.step, lr, loss_board, loss_cls_board,
                          loss_ft_board, acc_board))
                running_loss = 0.
                running_acc = 0.
                running_loss_cls = 0.
                running_loss_ft = 0.
        torch.save(self.model.state_dict(),
                   self.conf.model_path + "/epoch_{}.pth".format(e))
        # Run evaluation after every epoch.
        is_eval = True
        self.model.eval()
        total_val_iter = 0
        eval_loss = 0.
        eval_acc = 0.
        eval_loss_cls = 0.
        eval_loss_ft = 0.
        with torch.no_grad():
            for sample, ft_sample, target in tqdm(iter(self.eval_loader)):
                total_val_iter += 1
                imgs = [sample, ft_sample]
                labels = target
                loss, acc, loss_cls, loss_ft = self._load_batch_data(
                    imgs, labels, is_eval)
                eval_loss_cls += loss_cls
                eval_loss_ft += loss_ft
                eval_loss += loss
                eval_acc += acc
        loss_board = eval_loss / total_val_iter
        self.writer.add_scalar('Eval/Loss', loss_board, self.step)
        acc_board = eval_acc / total_val_iter
        self.writer.add_scalar('Eval/Acc', acc_board, self.step)
        loss_cls_board = eval_loss_cls / total_val_iter
        self.writer.add_scalar('Eval/Loss_cls', loss_cls_board, self.step)
        loss_ft_board = eval_loss_ft / total_val_iter
        self.writer.add_scalar('Eval/Loss_ft', loss_ft_board, self.step)
        self.schedule_lr.step_epoch()
    self.writer.close()
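
# NOTE: _load_batch_data() is not shown here. This is a minimal sketch
# inferred from the call sites in _train_eval_stage: it runs the forward
# pass and loss computation, calls backward() only in training mode, and
# leaves optimizer.step() to the caller. cls_criterion, ft_criterion,
# _get_accuracy, and the equal 0.5/0.5 loss weighting are assumptions,
# not confirmed code.
def _load_batch_data(self, imgs, labels, is_eval):
    labels = labels.to(self.conf.device)
    embeddings, feature_map = self.model.forward(imgs[0].to(self.conf.device))
    loss_cls = self.cls_criterion(embeddings, labels)
    loss_ft = self.ft_criterion(feature_map, imgs[1].to(self.conf.device))
    loss = 0.5 * loss_cls + 0.5 * loss_ft
    acc = self._get_accuracy(embeddings, labels)[0]
    if not is_eval:
        self.optimizer.zero_grad()
        loss.backward()  # the caller performs optimizer.step()
    return loss.item(), acc, loss_cls.item(), loss_ft.item()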