Example #1
    def __init__(self, config, model, criterion, train_loader, weights_init=None):
        super(Trainer, self).__init__(config, model, criterion, weights_init)
        self.show_images_interval = self.config['trainer']['show_images_interval']
        self.test_path = self.config['data_loader']['args']['dataset']['val_data_path']
        self.train_loader = train_loader
        self.train_loader_len = len(train_loader)
        if self.config['lr_scheduler']['type'] == 'PolynomialLR':
            self.scheduler = PolynomialLR(self.optimizer, self.epochs * self.train_loader_len)

        self.logger.info('train dataset has {} samples, {} in dataloader'.format(
            self.train_loader.dataset_len, self.train_loader_len))
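Note: in Example #1 the scheduler is sized to the total number of iterations (epochs * batches per epoch), which implies it is stepped once per batch rather than once per epoch. A minimal per-iteration loop consistent with that setup might look like the sketch below; the method name `_train_epoch`, the batch format, and the loss handling are assumptions, not code from the original project.

    def _train_epoch(self, epoch):
        # hypothetical sketch: step the polynomial scheduler once per batch
        self.model.train()
        for i, (images, labels) in enumerate(self.train_loader):
            images, labels = images.to(self.device), labels.to(self.device)
            loss = self.criterion(self.model(images), labels)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()  # advance the polynomial decay by one iteration

            self.global_step += 1
            if i % self.display_interval == 0:
                self.logger.info('epoch {} iter {}/{} loss {:.4f}'.format(
                    epoch, i, self.train_loader_len, loss.item()))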
Example #2
    def __init__(self, config, model, criterion, train_loader, weights_init):
        config['trainer']['output_dir'] = os.path.join(
            str(pathlib.Path(os.path.abspath(__name__)).parent),
            config['trainer']['output_dir'])
        config['name'] = config['name'] + '_' + model.name
        self.save_dir = os.path.join(config['trainer']['output_dir'],
                                     config['name'])
        self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')

        if config['trainer']['resume_checkpoint'] == '' and config['trainer']['finetune_checkpoint'] == '':
            shutil.rmtree(self.save_dir, ignore_errors=True)
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.global_step = 0
        self.start_epoch = 1
        self.config = config

        self.model = model
        self.criterion = criterion
        self.train_loader = train_loader
        # logger and tensorboard
        self.tensorboard_enable = self.config['trainer']['tensorboard']
        self.epochs = self.config['trainer']['epochs']
        self.display_interval = self.config['trainer']['display_interval']
        if self.tensorboard_enable:
            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(self.save_dir)

        self.logger = setup_logger(os.path.join(self.save_dir, 'train_log'))
        self.logger.info(pformat(self.config))

        # device
        torch.manual_seed(self.config['trainer']['seed'])  # set the random seed for the CPU
        if len(self.config['trainer']['gpus']) > 0 and torch.cuda.is_available():
            self.with_cuda = True
            torch.backends.cudnn.benchmark = True
            self.logger.info('train with gpu {} and pytorch {}'.format(
                self.config['trainer']['gpus'], torch.__version__))
            self.gpus = {i: item for i, item in enumerate(self.config['trainer']['gpus'])}
            self.device = torch.device("cuda:0")
            torch.cuda.manual_seed(self.config['trainer']['seed'])  # set the random seed for the current GPU
            torch.cuda.manual_seed_all(self.config['trainer']['seed'])  # set the random seed for all GPUs
        else:
            self.with_cuda = False
            self.logger.info('train with cpu and pytorch {}'.format(torch.__version__))
            self.device = torch.device("cpu")
        self.logger.info('device {}'.format(self.device))
        self.metrics = {
            'recall': 0,
            'precision': 0,
            'hmean': 0,
            'train_loss': float('inf'),
            'best_model': ''
        }

        self.optimizer = self._initialize('optimizer', torch.optim,
                                          self.model.parameters())

        if self.config['lr_scheduler']['type'] != 'PolynomialLR':
            self.scheduler = self._initialize('lr_scheduler',
                                              torch.optim.lr_scheduler,
                                              self.optimizer)
        else:
            self.scheduler = PolynomialLR(self.optimizer,
                                          self.epochs * len(self.train_loader))

        if self.config['trainer']['resume_checkpoint'] != '':
            self._load_checkpoint(self.config['trainer']['resume_checkpoint'],
                                  resume=True)
        elif self.config['trainer']['finetune_checkpoint'] != '':
            self._load_checkpoint(
                self.config['trainer']['finetune_checkpoint'], resume=False)
        else:
            if weights_init is not None:
                self.model.apply(weights_init)

        # single machine, multi-GPU training
        num_gpus = torch.cuda.device_count()
        if num_gpus > 1:
            self.model = nn.DataParallel(self.model)
            # For sync bn
            patch_replication_callback(self.model)

        self.model.to(self.device)

        if self.tensorboard_enable:
            try:
                # add the model graph to tensorboard using a dummy input
                dataset_args = self.config['data_loader']['args']['dataset']
                dummy_input = torch.zeros(1, dataset_args['img_channel'],
                                          dataset_args['input_size'],
                                          dataset_args['input_size']).to(self.device)
                self.writer.add_graph(self.model, dummy_input)
            except Exception as e:
                self.logger.warning(
                    'add graph to tensorboard failed, error [{}]'.format(e))
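The `_initialize` calls above build the optimizer and LR scheduler by name from the config. The helper itself is not part of this listing; a plausible minimal implementation of such a config-driven factory (an assumption following the common pytorch-template pattern, not the project's actual code) is:

    def _initialize(self, name, module, *args, **kwargs):
        # e.g. config['optimizer'] = {'type': 'Adam', 'args': {'lr': 0.001}}
        module_name = self.config[name]['type']
        module_args = dict(self.config[name]['args'])
        # explicit keyword arguments must not collide with config values
        assert all(k not in module_args for k in kwargs)
        module_args.update(kwargs)
        return getattr(module, module_name)(*args, **module_args)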
Example #3
                "lr": 2 * CONFIG.SOLVER.LR,
                "weight_decay": 0.0,
            },
            {
                "params": train_params["20xbias"],
                "lr": 20 * CONFIG.SOLVER.LR,
                "weight_decay": 0.0,
            },
        ],
        momentum=CONFIG.SOLVER.MOMENTUM,
    )

    # Learning rate scheduler
    scheduler = PolynomialLR(
        optimizer=optimizer,
        step_size=CONFIG.SOLVER.LR_DECAY,
        iter_max=CONFIG.SOLVER.ITER_MAX,
        power=CONFIG.SOLVER.POLY_POWER,
    )

    # Setup loss logger
    writer = SummaryWriter(os.path.join("experiment", CONFIG.EXP_ID,
                                        "summary"))
    average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS)

    # Path to save models
    checkpoint_dir = os.path.join("experiment", CONFIG.EXP_ID, "checkpoints")
    makedirs(checkpoint_dir)
    print("Checkpoint dst:", checkpoint_dir)

    # Set the model to training mode (enables dropout and batch-norm updates)
    model.train()
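Example #3 passes step_size, iter_max, and power to PolynomialLR, so the class here is a custom scheduler rather than a stock torch.optim one. A minimal sketch consistent with that call site, built on torch.optim.lr_scheduler._LRScheduler, could look like the following; the exact decay formula and the step_size gating are assumptions inferred from the constructor arguments, not the project's actual code.

from torch.optim.lr_scheduler import _LRScheduler

class PolynomialLR(_LRScheduler):
    def __init__(self, optimizer, step_size, iter_max, power, last_epoch=-1):
        self.step_size = step_size
        self.iter_max = iter_max
        self.power = power
        super(PolynomialLR, self).__init__(optimizer, last_epoch)

    def polynomial_decay(self, lr):
        # decay the base lr polynomially towards zero over iter_max iterations
        return lr * (1 - float(self.last_epoch) / self.iter_max) ** self.power

    def get_lr(self):
        # keep the current lr between updates; recompute every step_size iterations
        if (self.last_epoch == 0 or self.last_epoch % self.step_size != 0
                or self.last_epoch > self.iter_max):
            return [group['lr'] for group in self.optimizer.param_groups]
        return [self.polynomial_decay(lr) for lr in self.base_lrs]

With a scheduler like this, scheduler.step() would be called once per training iteration, up to CONFIG.SOLVER.ITER_MAX.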