Code example #1
File: base_trainer.py  Project: insightcs/PAN-Pytorch

import os
import pathlib
import shutil
from pprint import pformat

import torch
from torch import nn

# Project-local helpers used below (setup_logger, PolynomialLR,
# patch_replication_callback) are assumed to be imported from
# elsewhere in the repository.
class BaseTrainer:
    def __init__(self, config, model, criterion, train_loader, weights_init):
        # resolve output_dir relative to the current working directory
        config['trainer']['output_dir'] = os.path.join(
            str(pathlib.Path(os.path.abspath(__name__)).parent),
            config['trainer']['output_dir'])
        config['name'] = config['name'] + '_' + model.name
        self.save_dir = os.path.join(config['trainer']['output_dir'],
                                     config['name'])
        self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')

        if config['trainer']['resume_checkpoint'] == '' and config['trainer'][
                'finetune_checkpoint'] == '':
            shutil.rmtree(self.save_dir, ignore_errors=True)
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        self.global_step = 0
        self.start_epoch = 1
        self.config = config

        self.model = model
        self.criterion = criterion
        self.train_loader = train_loader
        # logger and tensorboard
        self.tensorboard_enable = self.config['trainer']['tensorboard']
        self.epochs = self.config['trainer']['epochs']
        self.display_interval = self.config['trainer']['display_interval']
        if self.tensorboard_enable:
            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(self.save_dir)

        self.logger = setup_logger(os.path.join(self.save_dir, 'train_log'))
        self.logger.info(pformat(self.config))

        # device
        torch.manual_seed(self.config['trainer']['seed'])  # seed the CPU RNG
        if len(self.config['trainer']['gpus']) > 0 and torch.cuda.is_available():
            self.with_cuda = True
            torch.backends.cudnn.benchmark = True
            self.logger.info('train with gpu {} and pytorch {}'.format(
                self.config['trainer']['gpus'], torch.__version__))
            self.gpus = {
                i: item
                for i, item in enumerate(self.config['trainer']['gpus'])
            }
            self.device = torch.device("cuda:0")
            torch.cuda.manual_seed(
                self.config['trainer']['seed'])  # seed the current GPU
            torch.cuda.manual_seed_all(
                self.config['trainer']['seed'])  # seed all GPUs
        else:
            self.with_cuda = False
            self.logger.info('train with cpu and pytorch {}'.format(
                torch.__version__))
            self.device = torch.device("cpu")
        self.logger.info('device {}'.format(self.device))
        self.metrics = {
            'recall': 0,
            'precision': 0,
            'hmean': 0,
            'train_loss': float('inf'),
            'best_model': ''
        }

        self.optimizer = self._initialize('optimizer', torch.optim,
                                          self.model.parameters())

        if self.config['lr_scheduler']['type'] != 'PolynomialLR':
            self.scheduler = self._initialize('lr_scheduler',
                                              torch.optim.lr_scheduler,
                                              self.optimizer)
        else:
            self.scheduler = PolynomialLR(self.optimizer,
                                          self.epochs * len(self.train_loader))

        if self.config['trainer']['resume_checkpoint'] != '':
            self._load_checkpoint(self.config['trainer']['resume_checkpoint'],
                                  resume=True)
        elif self.config['trainer']['finetune_checkpoint'] != '':
            self._load_checkpoint(
                self.config['trainer']['finetune_checkpoint'], resume=False)
        else:
            if weights_init is not None:
                self.model.apply(weights_init)

        # single machine, multiple GPUs
        num_gpus = torch.cuda.device_count()
        if num_gpus > 1:
            self.model = nn.DataParallel(self.model)
            # For sync bn
            patch_replication_callback(self.model)

        self.model.to(self.device)

        if self.tensorboard_enable:
            try:
                # add graph
                dummy_input = torch.zeros(
                    1,
                    self.config['data_loader']['args']['dataset']['img_channel'],
                    self.config['data_loader']['args']['dataset']['input_size'],
                    self.config['data_loader']['args']['dataset']['input_size'],
                ).to(self.device)
                self.writer.add_graph(self.model, dummy_input)
            except Exception as e:
                self.logger.warning(
                    'add graph to tensorboard failed, error [{}]'.format(e))

    def train(self):
        """
        Full training logic
        """
        for epoch in range(self.start_epoch, self.epochs + 1):
            try:
                self.epoch_result = self._train_epoch(epoch)
                if self.config['lr_scheduler']['type'] != 'PolynomialLR':
                    self.scheduler.step()
                self._on_epoch_finish()
            except torch.cuda.CudaError:
                # on a CUDA error (e.g. OOM), log per-device memory usage and continue
                self._log_memory_usage()
        if self.tensorboard_enable:
            self.writer.close()
        self._on_train_finish()

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _eval(self):
        """
        Eval logic, run after a training epoch
        """
        raise NotImplementedError

    def _on_epoch_finish(self):
        raise NotImplementedError

    def _on_train_finish(self):
        raise NotImplementedError

    def _log_memory_usage(self):
        if not self.with_cuda:
            return

        template = """Memory Usage: \n{}"""
        usage = []
        for deviceID, device in self.gpus.items():
            deviceID = int(deviceID)
            allocated = torch.cuda.memory_allocated(deviceID) / (1024 * 1024)
            # memory_cached was renamed to memory_reserved in newer PyTorch versions
            cached = torch.cuda.memory_cached(deviceID) / (1024 * 1024)

            usage.append(
                '    CUDA: {}  Allocated: {} MB Cached: {} MB \n'.format(
                    device, allocated, cached))

        content = ''.join(usage)
        content = template.format(content)

        self.logger.debug(content)

    def _save_checkpoint(self, epoch, file_name, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param file_name: checkpoint file name (an absolute path also works)
        :param save_best: if True, copy the saved checkpoint to 'model_best.pth'
        """
        state = {
            'epoch': epoch,
            'global_step': self.global_step,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'config': self.config,
            'metrics': self.metrics
        }
        # os.path.join returns file_name unchanged when it is already absolute
        filename = os.path.join(self.checkpoint_dir, file_name)
        torch.save(state, filename)
        if save_best:
            shutil.copy(filename,
                        os.path.join(self.checkpoint_dir, 'model_best.pth'))
            self.logger.info("Saving current best: {}".format(file_name))
        else:
            self.logger.info("Saving checkpoint: {}".format(filename))

    def _load_checkpoint(self, checkpoint_path, resume):
        """
        Resume from saved checkpoints
        :param checkpoint_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        # strip the 'module.' prefix that nn.DataParallel adds to parameter names
        state_dict = {
            k.replace('module.', ''): v
            for k, v in checkpoint['state_dict'].items()
        }
        self.model.load_state_dict(state_dict)
        if resume:
            self.global_step = checkpoint['global_step']
            self.start_epoch = checkpoint['epoch'] + 1
            self.config['lr_scheduler']['args']['last_epoch'] = self.start_epoch
            # restore optimizer/scheduler/metrics state; the loop below then moves
            # the restored optimizer tensors onto the active device
            self.scheduler.load_state_dict(checkpoint['scheduler'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            if 'metrics' in checkpoint:
                self.metrics = checkpoint['metrics']
            if self.with_cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(self.device)
            self.logger.info("resume from checkpoint {} (epoch {})".format(
                checkpoint_path, self.start_epoch))
        else:
            self.logger.info(
                "finetune from checkpoint {}".format(checkpoint_path))

    def _initialize(self, name, module, *args, **kwargs):
        module_name = self.config[name]['type']
        module_args = self.config[name]['args']
        assert all(k not in module_args for k in kwargs), \
            'Overwriting kwargs given in config file is not allowed'
        module_args.update(kwargs)
        return getattr(module, module_name)(*args, **module_args)
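
For illustration, here is a minimal, self-contained sketch of what _initialize does with a config entry; the config values below are assumptions for the example, not taken from the project:

import torch
from torch import nn

config = {'optimizer': {'type': 'Adam', 'args': {'lr': 1e-3, 'weight_decay': 5e-4}}}
model = nn.Linear(4, 2)  # stand-in model

# Mirrors _initialize('optimizer', torch.optim, model.parameters()):
module_name = config['optimizer']['type']        # 'Adam'
module_args = dict(config['optimizer']['args'])  # copy so the config stays untouched
optimizer = getattr(torch.optim, module_name)(model.parameters(), **module_args)
# optimizer is now torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
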
Code example #2
import os
import shutil
import time

import cv2
import numpy as np
import torch
import torchvision.utils as vutils
from torchvision import transforms
from tqdm import tqdm

# BaseTrainer and the project-local helpers used below (PolynomialLR,
# runningScore, cal_text_score, cal_kernel_score, decode,
# cal_recall_precison_f1) are assumed to be imported from the repository.

class Trainer(BaseTrainer):
    def __init__(self,
                 config,
                 model,
                 criterion,
                 train_loader,
                 weights_init=None):
        super(Trainer, self).__init__(config, model, criterion, train_loader,
                                      weights_init)
        self.show_images_interval = self.config['trainer'][
            'show_images_interval']
        self.test_path = self.config['data_loader']['args']['dataset'][
            'val_data_path']
        self.train_loader = train_loader
        self.train_loader_len = len(train_loader)
        if self.config['lr_scheduler']['type'] == 'PolynomialLR':
            self.scheduler = PolynomialLR(self.optimizer,
                                          self.epochs * self.train_loader_len)

        self.logger.info(
            'train dataset has {} samples, {} batches in dataloader'.format(
                self.train_loader.dataset_len, self.train_loader_len))

    def _train_epoch(self, epoch):
        self.model.train()
        epoch_start = time.time()
        batch_start = time.time()
        train_loss = 0.
        running_metric_text = runningScore(2)
        running_metric_kernel = runningScore(2)
        lr = self.optimizer.param_groups[0]['lr']
        for i, (images, labels,
                training_masks) in enumerate(self.train_loader):
            if i >= self.train_loader_len:
                break
            self.global_step += 1
            lr = self.optimizer.param_groups[0]['lr']

            # move the batch to the device
            cur_batch_size = images.size()[0]
            images, labels, training_masks = images.to(self.device), labels.to(
                self.device), training_masks.to(self.device)

            preds = self.model(images)
            loss_all, loss_tex, loss_ker, loss_agg, loss_dis = self.criterion(
                preds, labels, training_masks)
            # backward
            self.optimizer.zero_grad()
            loss_all.backward()
            self.optimizer.step()
            if self.config['lr_scheduler']['type'] == 'PolynomialLR':
                self.scheduler.step()
            # acc iou
            score_text = cal_text_score(preds[:, 0, :, :], labels[:, 0, :, :],
                                        training_masks, running_metric_text)
            score_kernel = cal_kernel_score(preds[:, 1, :, :],
                                            labels[:, 1, :, :],
                                            labels[:, 0, :, :], training_masks,
                                            running_metric_kernel)

            # log loss and accuracy
            loss_all = loss_all.item()
            loss_tex = loss_tex.item()
            loss_ker = loss_ker.item()
            loss_agg = loss_agg.item()
            loss_dis = loss_dis.item()
            train_loss += loss_all
            acc = score_text['Mean Acc']
            iou_text = score_text['Mean IoU']
            iou_kernel = score_kernel['Mean IoU']

            if (i + 1) % self.display_interval == 0:
                batch_time = time.time() - batch_start
                self.logger.info(
                    '[{}/{}], [{}/{}], global_step: {}, Speed: {:.1f} samples/sec, acc: {:.4f}, iou_text: {:.4f}, iou_kernel: {:.4f}, loss_all: {:.4f}, loss_tex: {:.4f}, loss_ker: {:.4f}, loss_agg: {:.4f}, loss_dis: {:.4f}, lr:{:.6}, time:{:.2f}'
                    .format(
                        epoch, self.epochs, i + 1, self.train_loader_len,
                        self.global_step,
                        self.display_interval * cur_batch_size / batch_time,
                        acc, iou_text, iou_kernel, loss_all, loss_tex,
                        loss_ker, loss_agg, loss_dis, lr, batch_time))
                batch_start = time.time()

            if self.tensorboard_enable:
                # write tensorboard
                self.writer.add_scalar('TRAIN/LOSS/loss_all', loss_all,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_tex', loss_tex,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_ker', loss_ker,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_agg', loss_agg,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/LOSS/loss_dis', loss_dis,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/ACC_IOU/acc', acc,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/ACC_IOU/iou_text', iou_text,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/ACC_IOU/iou_kernel', iou_kernel,
                                       self.global_step)
                self.writer.add_scalar('TRAIN/lr', lr, self.global_step)
                if i % self.show_images_interval == 0:
                    # show images on tensorboard
                    self.writer.add_images('TRAIN/imgs', images,
                                           self.global_step)
                    # text kernel and training_masks
                    gt_texts, gt_kernels = labels[:, 0, :, :], labels[:, 1, :, :]
                    gt_texts[gt_texts <= 0.5] = 0
                    gt_texts[gt_texts > 0.5] = 1
                    gt_kernels[gt_kernels <= 0.5] = 0
                    gt_kernels[gt_kernels > 0.5] = 1
                    show_label = torch.cat(
                        [gt_texts, gt_kernels,
                         training_masks.float()])
                    show_label = vutils.make_grid(show_label.unsqueeze(1),
                                                  nrow=cur_batch_size,
                                                  normalize=False,
                                                  padding=20,
                                                  pad_value=1)
                    self.writer.add_image('TRAIN/gt', show_label,
                                          self.global_step)
                    # model output (detached so visualization does not extend the graph)
                    preds = preds.detach()
                    preds[:, :2, :, :] = torch.sigmoid(preds[:, :2, :, :])
                    show_pred = torch.cat(
                        [preds[:, 0, :, :], preds[:, 1, :, :]])
                    show_pred = vutils.make_grid(show_pred.unsqueeze(1),
                                                 nrow=cur_batch_size,
                                                 normalize=False,
                                                 padding=20,
                                                 pad_value=1)
                    self.writer.add_image('TRAIN/preds', show_pred,
                                          self.global_step)

        return {
            'train_loss': train_loss / self.train_loader_len,
            'lr': lr,
            'time': time.time() - epoch_start,
            'epoch': epoch
        }

    def _eval(self):
        self.model.eval()
        # torch.cuda.empty_cache()  # speed up evaluating after training finished
        img_path = os.path.join(self.test_path, 'img')
        gt_path = os.path.join(self.test_path, 'gt')
        result_save_path = os.path.join(self.save_dir, 'result')
        if os.path.exists(result_save_path):
            shutil.rmtree(result_save_path, ignore_errors=True)
        if not os.path.exists(result_save_path):
            os.makedirs(result_save_path)
        short_size = 736
        # run inference on all test images
        img_paths = [os.path.join(img_path, x) for x in os.listdir(img_path)]
        for img_path in tqdm(img_paths, desc='test models'):
            img_name = os.path.basename(img_path).split('.')[0]
            save_name = os.path.join(result_save_path,
                                     'res_' + img_name + '.txt')

            assert os.path.exists(img_path), 'file does not exist'
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            h, w = img.shape[:2]
            scale = short_size / min(h, w)
            img = cv2.resize(img, None, fx=scale, fy=scale)
            # convert the (h, w, c) image to a (1, c, h, w) tensor
            tensor = transforms.ToTensor()(img)
            tensor = tensor.unsqueeze(0)

            tensor = tensor.to(self.device)
            with torch.no_grad():
                if self.with_cuda:
                    torch.cuda.synchronize(self.device)
                preds = self.model(tensor)[0]
                if self.with_cuda:
                    torch.cuda.synchronize(self.device)
                preds, boxes_list = decode(preds)
                scale = (preds.shape[1] / w, preds.shape[0] / h)
                if len(boxes_list):
                    boxes_list = boxes_list / scale
            np.savetxt(save_name,
                       boxes_list.reshape(-1, 8),
                       delimiter=',',
                       fmt='%d')
        # compute recall / precision / f1
        result_dict = cal_recall_precison_f1(gt_path=gt_path,
                                             result_path=result_save_path)
        return result_dict['recall'], result_dict['precision'], result_dict[
            'hmean']

    def _on_epoch_finish(self):
        self.logger.info(
            '[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                self.epoch_result['epoch'], self.epochs,
                self.epoch_result['train_loss'], self.epoch_result['time'],
                self.epoch_result['lr']))
        net_save_path = '{}/PANNet_latest.pth'.format(self.checkpoint_dir)

        save_best = False
        if self.config['trainer']['metrics'] == 'hmean':  # use f1 (hmean) to select the best model
            recall, precision, hmean = self._eval()

            if self.tensorboard_enable:
                self.writer.add_scalar('EVAL/recall', recall, self.global_step)
                self.writer.add_scalar('EVAL/precision', precision,
                                       self.global_step)
                self.writer.add_scalar('EVAL/hmean', hmean, self.global_step)
            self.logger.info(
                'test: recall: {:.6f}, precision: {:.6f}, f1: {:.6f}'.format(
                    recall, precision, hmean))

            if hmean > self.metrics['hmean']:
                save_best = True
                self.metrics['train_loss'] = self.epoch_result['train_loss']
                self.metrics['hmean'] = hmean
                self.metrics['precision'] = precision
                self.metrics['recall'] = recall
                self.metrics['best_model'] = net_save_path
        else:
            if self.epoch_result['train_loss'] < self.metrics['train_loss']:
                save_best = True
                self.metrics['train_loss'] = self.epoch_result['train_loss']
                self.metrics['best_model'] = net_save_path
        self._save_checkpoint(self.epoch_result['epoch'], net_save_path,
                              save_best)

    def _on_train_finish(self):
        for k, v in self.metrics.items():
            self.logger.info('{}:{}'.format(k, v))
        self.logger.info('finish train')
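
For reference, here is a minimal config dict covering only the keys that BaseTrainer and Trainer actually read above; every value is an illustrative assumption, not taken from the project's own config files:

config = {
    'name': 'PAN',
    'trainer': {
        'output_dir': 'output',
        'resume_checkpoint': '',
        'finetune_checkpoint': '',
        'tensorboard': True,
        'epochs': 600,
        'display_interval': 10,
        'show_images_interval': 50,
        'seed': 2,
        'gpus': [0],
        'metrics': 'hmean',
    },
    'optimizer': {'type': 'Adam', 'args': {'lr': 1e-3}},
    'lr_scheduler': {'type': 'PolynomialLR', 'args': {}},
    'data_loader': {'args': {'dataset': {
        'img_channel': 3,
        'input_size': 640,
        'val_data_path': 'datasets/test',
    }}},
}
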
Code example #3
File: train.py  Project: HAL-42/DeepLabV2YQ (fragment from inside the training loop; the surrounding loop code is omitted in the source)
            # Loss
            iter_loss = criterion(logits, labels.to(device))

            # Propagate backward (just compute gradients wrt the loss)
            iter_loss /= CONFIG.SOLVER.ITER_SIZE
            iter_loss.backward()

            loss += float(iter_loss)

        average_loss.add(loss)

        # Update weights with accumulated gradients
        optimizer.step()

        # Update learning rate (passing epoch= to step() is deprecated in newer PyTorch)
        scheduler.step(epoch=iteration)

        # TensorBoard
        if iteration % CONFIG.SOLVER.ITER_TB == 0:
            writer.add_scalar("loss/train", average_loss.value()[0], iteration)
            for i, o in enumerate(optimizer.param_groups):
                writer.add_scalar("lr/group_{}".format(i), o["lr"], iteration)
            for i in range(torch.cuda.device_count()):
                writer.add_scalar(
                    "gpu/device_{}/memory_cached".format(i),
                    # memory_cached was renamed to memory_reserved in newer PyTorch
                    torch.cuda.memory_cached(i) / 1024**3,  # GiB
                    iteration,
                )

        # Save a model
        if iteration % CONFIG.SOLVER.ITER_SAVE == 0:
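
The snippet is truncated at the checkpoint-save step in the source. Its key pattern is gradient accumulation: each sub-batch loss is scaled by 1/ITER_SIZE and backpropagated, and optimizer.step() runs once per accumulation window. A self-contained sketch of that pattern, with a placeholder model, data, and hyperparameters:

import torch
from torch import nn

model = nn.Linear(8, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
ITER_SIZE = 4  # sub-batches accumulated per weight update

optimizer.zero_grad()
for _ in range(ITER_SIZE):
    x, y = torch.randn(16, 8), torch.randn(16, 1)
    iter_loss = criterion(model(x), y) / ITER_SIZE  # scale so gradients average
    iter_loss.backward()                            # gradients accumulate in .grad
optimizer.step()       # single update with the accumulated gradients
optimizer.zero_grad()  # clear gradients for the next accumulation window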