def _train_epoch(self, epoch):
    """Run one training epoch over ``self.train_loader``.

    For each batch (at most ``self.train_loader_len`` of them) this moves the
    batch tensors to ``self.device``, runs the model and criterion, performs
    an optimizer step followed by a scheduler step, updates the running
    text metric and, every ``self.log_iter`` global steps, logs the current
    losses, accuracy, IoU, learning rate and throughput.

    Args:
        epoch: Number of the current epoch; used in the log line and echoed
            back in the returned dict.

    Returns:
        dict with the keys ``train_loss`` (accumulated per-batch ``'loss'``
        divided by ``self.train_loader_len``), ``lr`` (most recently read
        learning rate of the first parameter group), ``time`` (seconds spent
        in this call) and ``epoch``.
    """
    self.model.train()
    epoch_start = time.time()
    batch_start = time.time()
    train_loss = 0.
    running_metric_text = runningScore(2)
    lr = self.optimizer.param_groups[0]['lr']

    for i, batch in enumerate(self.train_loader):
        if i >= self.train_loader_len:
            break
        self.global_step += 1
        lr = self.optimizer.param_groups[0]['lr']

        # Move every tensor of the batch to the training device; non-tensor
        # entries (including None) are left untouched.
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                batch[key] = value.to(self.device)
        cur_batch_size = batch['img'].size()[0]

        preds = self.model(batch['img'])
        loss_dict = self.criterion(preds, batch)

        # backward
        self.optimizer.zero_grad()
        loss_dict['loss'].backward()
        self.optimizer.step()
        self.scheduler.step()

        # acc iou
        score_shrink_map = cal_text_score(
            preds[:, 0, :, :], batch['shrink_map'], batch['shrink_mask'],
            running_metric_text,
            thred=self.config['post_processing']['args']['thresh'])

        # Record loss and acc for the log: turn every loss into a Python
        # float first (keys are only reassigned, so the dict size is stable).
        for key in loss_dict:
            loss_dict[key] = loss_dict[key].item()

        # Total loss first, then the remaining terms in dict order. Joining
        # keeps the separators correct regardless of where the 'loss' key
        # sits inside the dict.
        loss_parts = ['loss: {:.4f}'.format(loss_dict['loss'])]
        loss_parts.extend(
            '{}: {:.4f}'.format(key, value)
            for key, value in loss_dict.items() if key != 'loss')
        loss_str = ', '.join(loss_parts)

        train_loss += loss_dict['loss']

        acc = score_shrink_map['Mean Acc']
        iou_shrink_map = score_shrink_map['Mean IoU']

        if self.global_step % self.log_iter == 0:
            batch_time = time.time() - batch_start
            self.logger_info(
                '[{}/{}], [{}/{}], global_step: {}, speed: {:.1f} samples/sec, acc: {:.4f}, iou_shrink_map: {:.4f}, {}, lr:{:.6}, time:{:.2f}'.format(
                    epoch, self.epochs, i + 1, self.train_loader_len,
                    self.global_step,
                    self.log_iter * cur_batch_size / batch_time,
                    acc, iou_shrink_map, loss_str, lr, batch_time))
            # Restart the timing window used for the throughput figure.
            batch_start = time.time()

    return {'train_loss': train_loss / self.train_loader_len,
            'lr': lr,
            'time': time.time() - epoch_start,
            'epoch': epoch}
def _train_epoch(self, epoch):
    """Run one training epoch over ``self.train_loader``.

    Each batch is unpacked as ``(images, labels, training_masks)``, moved to
    ``self.device`` and pushed through the model and criterion, which returns
    five losses (total, text, kernel, aggregation, discrimination by name).
    After the optimizer step the scheduler is stepped only when the
    configured scheduler type is ``'PolynomialLR'``. Text and kernel scores
    are updated every batch; a log line is emitted every
    ``self.display_interval`` batches and, when ``self.tensorboard_enable``
    is set, scalars and (periodically) images are written to ``self.writer``.

    Args:
        epoch: Number of the current epoch; used in the log line and echoed
            back in the returned dict.

    Returns:
        dict with the keys ``train_loss`` (accumulated total loss divided by
        ``self.train_loader_len``), ``lr`` (most recently read learning rate
        of the first parameter group), ``time`` (seconds spent in this call)
        and ``epoch``.
    """
    self.model.train()
    epoch_start = time.time()
    batch_start = time.time()
    train_loss = 0.
    running_metric_text = runningScore(2)
    running_metric_kernel = runningScore(2)
    lr = self.optimizer.param_groups[0]['lr']

    for i, (images, labels, training_masks) in enumerate(self.train_loader):
        if i >= self.train_loader_len:
            break
        self.global_step += 1
        lr = self.optimizer.param_groups[0]['lr']

        # Convert the data and move it to the training device.
        cur_batch_size = images.size()[0]
        images, labels, training_masks = images.to(self.device), labels.to(
            self.device), training_masks.to(self.device)

        preds = self.model(images)
        loss_all, loss_tex, loss_ker, loss_agg, loss_dis = self.criterion(
            preds, labels, training_masks)

        # backward
        self.optimizer.zero_grad()
        loss_all.backward()
        self.optimizer.step()
        if self.config['lr_scheduler']['type'] == 'PolynomialLR':
            self.scheduler.step()

        # acc iou
        score_text = cal_text_score(preds[:, 0, :, :], labels[:, 0, :, :],
                                    training_masks, running_metric_text)
        score_kernel = cal_kernel_score(preds[:, 1, :, :],
                                        labels[:, 1, :, :],
                                        labels[:, 0, :, :], training_masks,
                                        running_metric_kernel)

        # Record loss and acc for the log (as Python floats).
        loss_all = loss_all.item()
        loss_tex = loss_tex.item()
        loss_ker = loss_ker.item()
        loss_agg = loss_agg.item()
        loss_dis = loss_dis.item()
        train_loss += loss_all
        acc = score_text['Mean Acc']
        iou_text = score_text['Mean IoU']
        iou_kernel = score_kernel['Mean IoU']

        if (i + 1) % self.display_interval == 0:
            batch_time = time.time() - batch_start
            self.logger.info(
                '[{}/{}], [{}/{}], global_step: {}, Speed: {:.1f} samples/sec, acc: {:.4f}, iou_text: {:.4f}, iou_kernel: {:.4f}, loss_all: {:.4f}, loss_tex: {:.4f}, loss_ker: {:.4f}, loss_agg: {:.4f}, loss_dis: {:.4f}, lr:{:.6}, time:{:.2f}'
                .format(
                    epoch, self.epochs, i + 1, self.train_loader_len,
                    self.global_step,
                    self.display_interval * cur_batch_size / batch_time,
                    acc, iou_text, iou_kernel, loss_all, loss_tex, loss_ker,
                    loss_agg, loss_dis, lr, batch_time))
            # Restart the timing window used for the throughput figure.
            batch_start = time.time()

        # NOTE(review): assumed to run on every iteration rather than only on
        # log iterations — confirm against the intended nesting.
        if self.tensorboard_enable:
            # write tensorboard
            self.writer.add_scalar('TRAIN/LOSS/loss_all', loss_all,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/LOSS/loss_tex', loss_tex,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/LOSS/loss_ker', loss_ker,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/LOSS/loss_agg', loss_agg,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/LOSS/loss_dis', loss_dis,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/ACC_IOU/acc', acc,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/ACC_IOU/iou_text', iou_text,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/ACC_IOU/iou_kernel', iou_kernel,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/lr', lr, self.global_step)

            if i % self.show_images_interval == 0:
                # show images on tensorboard
                self.writer.add_images('TRAIN/imgs', images,
                                       self.global_step)

                # text kernel and training_masks; the labels are binarised
                # in place, which is safe here because they are not read
                # again during this iteration.
                gt_texts, gt_kernels = labels[:, 0, :, :], labels[:, 1, :, :]
                gt_texts[gt_texts <= 0.5] = 0
                gt_texts[gt_texts > 0.5] = 1
                gt_kernels[gt_kernels <= 0.5] = 0
                gt_kernels[gt_kernels > 0.5] = 1
                show_label = torch.cat(
                    [gt_texts, gt_kernels, training_masks.float()])
                show_label = vutils.make_grid(show_label.unsqueeze(1),
                                              nrow=cur_batch_size,
                                              normalize=False, padding=20,
                                              pad_value=1)
                self.writer.add_image('TRAIN/gt', show_label,
                                      self.global_step)

                # model output: apply the sigmoid to a detached slice so the
                # visualisation neither writes into the graph-attached
                # ``preds`` nor records extra autograd nodes.
                pred_maps = torch.sigmoid(preds[:, :2, :, :].detach())
                show_pred = torch.cat(
                    [pred_maps[:, 0, :, :], pred_maps[:, 1, :, :]])
                show_pred = vutils.make_grid(show_pred.unsqueeze(1),
                                             nrow=cur_batch_size,
                                             normalize=False, padding=20,
                                             pad_value=1)
                self.writer.add_image('TRAIN/preds', show_pred,
                                      self.global_step)

    return {
        'train_loss': train_loss / self.train_loader_len,
        'lr': lr,
        'time': time.time() - epoch_start,
        'epoch': epoch
    }
def _train_epoch(self, epoch):
    """Run one training epoch over ``self.train_loader``.

    For each batch (at most ``self.train_loader_len`` of them) this moves the
    batch tensors to ``self.device``, runs the model and criterion, and
    performs an optimizer step; the scheduler is stepped only when the
    configured scheduler type is ``'WarmupPolyLR'``. The running text metric
    is updated every batch, a log line is emitted every ``self.log_iter``
    global steps and, when TensorBoard is enabled and
    ``self.config['local_rank'] == 0``, scalars and (every
    ``self.show_images_iter`` global steps) images are written to
    ``self.writer``.

    Args:
        epoch: Number of the current epoch; used in the log line and echoed
            back in the returned dict.

    Returns:
        dict with the keys ``train_loss`` (accumulated per-batch ``'loss'``
        divided by ``self.train_loader_len``), ``lr`` (most recently read
        learning rate of the first parameter group), ``time`` (seconds spent
        in this call) and ``epoch``.
    """
    self.model.train()
    epoch_start = time.time()
    batch_start = time.time()
    train_loss = 0.
    running_metric_text = runningScore(2)
    lr = self.optimizer.param_groups[0]['lr']

    for i, batch in enumerate(self.train_loader):
        if i >= self.train_loader_len:
            break
        self.global_step += 1
        lr = self.optimizer.param_groups[0]['lr']

        # Move every tensor of the batch to the training device; non-tensor
        # entries (including None) are left untouched.
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                batch[key] = value.to(self.device)
        cur_batch_size = batch['img'].size()[0]

        preds = self.model(batch['img'])
        loss_dict = self.criterion(preds, batch)

        # backward
        self.optimizer.zero_grad()
        loss_dict['loss'].backward()
        self.optimizer.step()
        if self.config['lr_scheduler']['type'] == 'WarmupPolyLR':
            self.scheduler.step()

        # acc iou
        score_shrink_map = cal_text_score(
            preds[:, 0, :, :], batch['shrink_map'], batch['shrink_mask'],
            running_metric_text,
            thred=self.config['post_processing']['args']['thresh'])

        # Record loss and acc for the log: turn every loss into a Python
        # float first (keys are only reassigned, so the dict size is stable).
        for key in loss_dict:
            loss_dict[key] = loss_dict[key].item()

        # Total loss first, then the remaining terms in dict order. Joining
        # avoids a doubled or missing separator before "lr:" regardless of
        # where the 'loss' key sits inside the dict.
        loss_parts = ['loss: {:.4f}'.format(loss_dict['loss'])]
        loss_parts.extend(
            '{}: {:.4f}'.format(key, value)
            for key, value in loss_dict.items() if key != 'loss')
        loss_str = ', '.join(loss_parts)

        train_loss += loss_dict['loss']

        acc = score_shrink_map['Mean Acc']
        iou_shrink_map = score_shrink_map['Mean IoU']

        if self.global_step % self.log_iter == 0:
            batch_time = time.time() - batch_start
            self.logger_info(
                '[{}/{}], [{}/{}], global_step: {}, speed: {:.1f} samples/sec, acc: {:.4f}, iou_shrink_map: {:.4f}, {}, lr:{:.6}, time:{:.2f}'
                .format(epoch, self.epochs, i + 1, self.train_loader_len,
                        self.global_step,
                        self.log_iter * cur_batch_size / batch_time, acc,
                        iou_shrink_map, loss_str, lr, batch_time))
            # Restart the timing window used for the throughput figure.
            batch_start = time.time()

        # NOTE(review): assumed to run on every iteration rather than only on
        # log iterations — confirm against the intended nesting.
        if self.tensorboard_enable and self.config['local_rank'] == 0:
            # write tensorboard
            for key, value in loss_dict.items():
                self.writer.add_scalar('TRAIN/LOSS/{}'.format(key), value,
                                       self.global_step)
            self.writer.add_scalar('TRAIN/ACC_IOU/acc', acc,
                                   self.global_step)
            self.writer.add_scalar('TRAIN/ACC_IOU/iou_shrink_map',
                                   iou_shrink_map, self.global_step)
            self.writer.add_scalar('TRAIN/lr', lr, self.global_step)

            if self.global_step % self.show_images_iter == 0:
                # show images on tensorboard
                # The return value is ignored; presumably this undoes the
                # input normalisation in place — TODO confirm.
                self.inverse_normalize(batch['img'])
                self.writer.add_images('TRAIN/imgs', batch['img'],
                                       self.global_step)

                # shrink_labels and threshold_labels; the shrink map is
                # binarised in place, which is safe here because it is not
                # read again during this iteration.
                shrink_labels = batch['shrink_map']
                threshold_labels = batch['threshold_map']
                shrink_labels[shrink_labels <= 0.5] = 0
                shrink_labels[shrink_labels > 0.5] = 1
                show_label = torch.cat([shrink_labels, threshold_labels])
                show_label = vutils.make_grid(show_label.unsqueeze(1),
                                              nrow=cur_batch_size,
                                              normalize=False, padding=20,
                                              pad_value=1)
                self.writer.add_image('TRAIN/gt', show_label,
                                      self.global_step)

                # model output: stack every prediction channel along the
                # batch axis so each channel is drawn in the grid.
                show_pred = torch.cat(
                    [preds[:, kk, :, :] for kk in range(preds.shape[1])])
                show_pred = vutils.make_grid(show_pred.unsqueeze(1),
                                             nrow=cur_batch_size,
                                             normalize=False, padding=20,
                                             pad_value=1)
                self.writer.add_image('TRAIN/preds', show_pred,
                                      self.global_step)

    return {
        'train_loss': train_loss / self.train_loader_len,
        'lr': lr,
        'time': time.time() - epoch_start,
        'epoch': epoch
    }
def _train_epoch(self, epoch):
    """Train the model for a single epoch and summarise the run.

    Batches are unpacked as ``(images, shrink_labels, threshold_labels)``.
    Every batch is moved to ``self.device``, fed through the model and the
    criterion (which yields four losses), and followed by an optimizer step;
    the scheduler only steps when the configured type is ``'WarmupPolyLR'``.
    A log line is written every ``self.display_interval`` batches, and when
    ``self.tensorboard_enable`` is set scalars plus periodic images go to
    ``self.writer``.

    Args:
        epoch: Current epoch number, used for logging and returned unchanged.

    Returns:
        dict with ``train_loss`` (summed total loss divided by
        ``self.train_loader_len``), ``lr`` (last learning rate read from the
        first parameter group), ``time`` (elapsed seconds) and ``epoch``.
    """
    self.model.train()
    started_at = time.time()
    window_start = time.time()
    loss_sum = 0.
    text_metric = runningScore(2)
    lr = self.optimizer.param_groups[0]['lr']

    for i, (images, shrink_labels, threshold_labels) in enumerate(
            self.train_loader):
        if i >= self.train_loader_len:
            break
        self.global_step += 1
        lr = self.optimizer.param_groups[0]['lr']

        # Convert the data and move it to the training device.
        cur_batch_size = images.size()[0]
        images = images.to(self.device)
        shrink_labels = shrink_labels.to(self.device)
        threshold_labels = threshold_labels.to(self.device)

        preds = self.model(images)
        (loss_all, loss_shrink_map, loss_binary_map,
         loss_threshold_map) = self.criterion(preds, shrink_labels,
                                              threshold_labels)

        # Backward pass and parameter update.
        self.optimizer.zero_grad()
        loss_all.backward()
        self.optimizer.step()
        scheduler_type = self.config['lr_scheduler']['type']
        if scheduler_type == 'WarmupPolyLR':
            self.scheduler.step()

        # Accuracy / IoU of the first prediction channel.
        score_shrink_map = cal_text_score(preds[:, 0, :, :], shrink_labels,
                                          text_metric, thred=0.5)

        # Record loss and acc for the log (as Python floats).
        loss_all = loss_all.item()
        loss_shrink_map = loss_shrink_map.item()
        loss_binary_map = loss_binary_map.item()
        loss_threshold_map = loss_threshold_map.item()
        loss_sum += loss_all
        acc = score_shrink_map['Mean Acc']
        iou_shrink_map = score_shrink_map['Mean IoU']

        if (i + 1) % self.display_interval == 0:
            batch_time = time.time() - window_start
            samples_per_sec = (self.display_interval * cur_batch_size
                               / batch_time)
            message = '[{}/{}], [{}/{}], global_step: {}, Speed: {:.1f} samples/sec, acc: {:.4f}, iou_shrink_map: {:.4f}, loss_all: {:.4f}, loss_shrink_map: {:.4f}, loss_binary_map: {:.4f}, loss_threshold_map: {:.4f}, lr:{:.6}, time:{:.2f}'.format(
                epoch, self.epochs, i + 1, self.train_loader_len,
                self.global_step, samples_per_sec, acc, iou_shrink_map,
                loss_all, loss_shrink_map, loss_binary_map,
                loss_threshold_map, lr, batch_time)
            self.logger.info(message)
            # Restart the timing window used for the throughput figure.
            window_start = time.time()

        # NOTE(review): assumed to run on every iteration rather than only on
        # log iterations — confirm against the intended nesting.
        if not self.tensorboard_enable:
            continue

        # Scalars, written in a fixed order.
        scalars = (
            ('TRAIN/LOSS/loss_all', loss_all),
            ('TRAIN/LOSS/loss_shrink_map', loss_shrink_map),
            ('TRAIN/LOSS/loss_binary_map', loss_binary_map),
            ('TRAIN/LOSS/loss_threshold_map', loss_threshold_map),
            ('TRAIN/ACC_IOU/acc', acc),
            ('TRAIN/ACC_IOU/iou_shrink_map', iou_shrink_map),
            ('TRAIN/lr', lr),
        )
        for tag, value in scalars:
            self.writer.add_scalar(tag, value, self.global_step)

        if i % self.show_images_interval != 0:
            continue

        # Input images.
        self.writer.add_images('TRAIN/imgs', images, self.global_step)
        grid_options = dict(nrow=cur_batch_size, normalize=False, padding=20,
                            pad_value=1)

        # Ground truth: the shrink labels are binarised in place before being
        # stacked with the threshold labels.
        shrink_labels[shrink_labels <= 0.5] = 0
        shrink_labels[shrink_labels > 0.5] = 1
        gt_stack = torch.cat([shrink_labels, threshold_labels])
        gt_grid = vutils.make_grid(gt_stack.unsqueeze(1), **grid_options)
        self.writer.add_image('TRAIN/gt', gt_grid, self.global_step)

        # Model output: the first three prediction channels.
        pred_stack = torch.cat([preds[:, ch, :, :] for ch in range(3)])
        pred_grid = vutils.make_grid(pred_stack.unsqueeze(1), **grid_options)
        self.writer.add_image('TRAIN/preds', pred_grid, self.global_step)

    elapsed = time.time() - started_at
    return {
        'train_loss': loss_sum / self.train_loader_len,
        'lr': lr,
        'time': elapsed,
        'epoch': epoch
    }