Example #1
    def __init__(self, model, resume, config, iters_per_epoch, val_logger=None, train_logger=None):
        self.model = model
        self.config = config

        self.val_logger = val_logger
        self.train_logger = train_logger
        self.logger = logging.getLogger(self.__class__.__name__)
        self.do_validation = self.config['trainer']['val']
        self.start_epoch = 1
        self.improved = False

        # SETTING THE DEVICE
        self.device, available_gpus = self._get_available_devices(self.config['n_gpu'])
        self.model = torch.nn.DataParallel(self.model, device_ids=available_gpus)
        self.model.to(self.device)

        # CONFIGS
        cfg_trainer = self.config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']

        # OPTIMIZER
        trainable_params = [{'params': filter(lambda p: p.requires_grad, self.model.module.get_other_params())},
                            {'params': filter(lambda p: p.requires_grad, self.model.module.get_backbone_params()),
                             'lr': config['optimizer']['args']['lr'] / 10}]

        self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)
        model_params = sum(p.numel() for p in model.parameters())
        opt_params = sum(p.numel() for group in self.optimizer.param_groups for p in group['params'])
        assert opt_params == model_params, 'some params are missing in the opt'

        self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler'])(optimizer=self.optimizer, num_epochs=self.epochs, 
                                        iters_per_epoch=iters_per_epoch)

        # MONITORING
        self.monitor = cfg_trainer.get('monitor', 'off')
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
            self.early_stopping = cfg_trainer.get('early_stop', math.inf)

        # CHECKPOINTS & TENSORBOARD
        date_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
        run_name = config['experim_name']
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], run_name)
        helpers.dir_exists(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=True)
         
        writer_dir = os.path.join(cfg_trainer['log_dir'], run_name)
        self.writer = tensorboard.SummaryWriter(writer_dir)
        self.html_results = HTML(web_dir=config['trainer']['save_dir'], exp_name=config['experim_name'],
                            save_name=config['experim_name'], config=config, resume=resume)

        if resume: self._resume_checkpoint(resume)
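
The optimizer above is created through a get_instance factory. A minimal sketch of what that helper is assumed to look like, given the config layout these examples share (a section holding 'type' and 'args'); the repository's actual implementation may differ:

def get_instance(module, name, config, *args):
    # Resolve config[name]['type'] inside the given module (e.g. 'SGD' in torch.optim)
    # and instantiate it with the positional args plus config[name]['args'] as kwargs.
    return getattr(module, config[name]['type'])(*args, **config[name]['args'])
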
Example #2
    def __init__(self,
                 model,
                 loss,
                 resume,
                 config,
                 train_loader,
                 val_loader=None,
                 train_logger=None):
        self.model = model
        self.loss = loss
        self.config = config
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.train_logger = train_logger
        self.logger = logging.getLogger(self.__class__.__name__)
        self.do_validation = self.config['trainer']['val']
        self.start_epoch = 1
        self.improved = False

        # SETTING THE DEVICE
        self.device, available_gpus = self._get_available_devices(
            self.config['n_gpu'])
        self.model.loss = loss
        if config["use_synch_bn"]:
            self.model = convert_model(self.model)
            self.model = DataParallelWithCallback(self.model,
                                                  device_ids=available_gpus)
        else:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=available_gpus)
        self.model.cuda()

        # CONFIGS
        cfg_trainer = self.config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']

        # OPTIMIZER
        if self.config['optimizer']['differential_lr']:
            if isinstance(self.model, torch.nn.DataParallel):
                trainable_params = [{
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.module.get_decoder_params())
                }, {
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.module.get_backbone_params()),
                    'lr':
                    config['optimizer']['args']['lr'] / 10
                }]
            else:
                trainable_params = [{
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.get_decoder_params())
                }, {
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.get_backbone_params()),
                    'lr':
                    config['optimizer']['args']['lr'] / 10
                }]
        else:
            trainable_params = filter(lambda p: p.requires_grad,
                                      self.model.parameters())
        self.optimizer = get_instance(torch.optim, 'optimizer', config,
                                      trainable_params)
        self.lr_scheduler = getattr(utils.lr_scheduler,
                                    config['lr_scheduler']['type'])(
                                        self.optimizer, self.epochs,
                                        len(train_loader))
        #self.lr_scheduler = getattr(torch.optim.lr_scheduler, config['lr_scheduler']['type'])(self.optimizer, **config['lr_scheduler']['args'])

        # MONITORING
        self.monitor = cfg_trainer.get('monitor', 'off')
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
            self.early_stopping = cfg_trainer.get('early_stop', math.inf)

        # CHECKPOINTS & TENSORBOARD
        start_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'],
                                           self.config['name'], start_time)
        helpers.dir_exists(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=True)

        writer_dir = os.path.join(cfg_trainer['log_dir'], self.config['name'],
                                  start_time)
        self.writer = tensorboard.SummaryWriter(writer_dir)

        if resume: self._resume_checkpoint(resume)
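
This example switches to synchronized batch norm through the third-party convert_model / DataParallelWithCallback pair. For comparison, a hedged sketch of the same conversion with PyTorch's built-in API, which Example #7 below uses; this variant assumes a DistributedDataParallel setup with an already-initialized process group:

import torch

def to_sync_bn_ddp(model, gpu):
    # Assumes torch.distributed.init_process_group(...) has already been called
    # in this process; convert_sync_batchnorm swaps every BatchNorm*d layer.
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    return torch.nn.parallel.DistributedDataParallel(model.cuda(gpu), device_ids=[gpu])
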
Example #3
    def __init__(self, model, resume, config, train_loader, val_loader=None, train_logger=None):
        self.model = model
        self.config = config
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.train_logger = train_logger
        self.logger = logging.getLogger(self.__class__.__name__)
        self.do_validation = self.config['trainer']['val']
        self.start_epoch = 1
        self.improved = False
        min_loss = sys.float_info.max
        # SETTING THE DEVICE
        self.device, available_gpus = self._get_available_devices(self.config['n_gpu'])
        self.model = torch.nn.DataParallel(self.model, device_ids=available_gpus)
        self.model.to(self.device)

        # CONFIGS
        cfg_trainer = self.config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']

        # OPTIMIZER
        optim_params = [
            {'params': model.parameters(), 'lr': self.config['optimizer']['args']['lr']},
            ]

        self.optimizer = torch.optim.Adam(optim_params,
                                          betas=(self.config['optimizer']['args']['momentum'], 0.99),
                                          weight_decay=self.config['optimizer']['args']['weight_decay'])
        self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler']['type'])(self.optimizer, self.epochs, len(train_loader))

        # MONITORING
        self.monitor = cfg_trainer.get('monitor', 'off')
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
            self.early_stopping = cfg_trainer.get('early_stop', math.inf)

        # CHECKPOINTS & TENSORBOARD
        start_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], self.config['name'], start_time)
        helpers.dir_exists(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=True)
            
        writepath = "/home/rtmdisp/VoxelNet_PyTorch/saved/"
        writer_dir = os.path.join(writepath, self.config['name'], start_time)
        self.writer = tensorboard.SummaryWriter(writer_dir)

        if resume: self._resume_checkpoint(resume)
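
The utils.lr_scheduler classes above are constructed with (optimizer, epochs, iters_per_epoch). A minimal sketch of a poly-decay scheduler matching that call signature, assuming the common (1 - iter / max_iter) ** 0.9 schedule; the repository's own implementation may differ in detail:

from torch.optim.lr_scheduler import _LRScheduler

class Poly(_LRScheduler):
    def __init__(self, optimizer, num_epochs, iters_per_epoch, last_epoch=-1):
        self.max_iters = num_epochs * iters_per_epoch
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts iterations here; scale every base lr by the poly factor.
        factor = (1 - self.last_epoch / self.max_iters) ** 0.9
        return [base_lr * factor for base_lr in self.base_lrs]
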
Example #4
    def __init__(self,
                 model,
                 loss,
                 resume,
                 config,
                 train_loader,
                 val_loader=None,
                 train_logger=None):
        self.model = model
        self.loss = loss
        self.config = config
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.train_logger = train_logger
        self.logger = logging.getLogger(self.__class__.__name__)
        self.do_validation = self.config['trainer']['val']
        self.start_epoch = 1
        self.improved = False

        # SETTING THE DEVICE
        self.device, available_gpus = self._get_available_devices(
            self.config['n_gpu'])
        if len(available_gpus) > 1:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=available_gpus)
            self.loss = torch.nn.DataParallel(self.loss,
                                              device_ids=available_gpus)
        self.model.to(self.device)
        self.loss.to(self.device)

        # CONFIGS
        cfg_trainer = self.config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']

        # OPTIMIZER
        if self.config['optimizer']['differential_lr']:
            if isinstance(self.model, torch.nn.DataParallel):
                # For a multi-GPU model wrapped in DataParallel the underlying module is
                # self.model.module; on a single GPU it is self.model itself.
                # filter() passes only the parameters that actually require optimization
                # (p.requires_grad == True) to the optimizer, since some layers may be frozen.
                # The parameters are split into two groups: the ResNet backbone, and the PPM /
                # auxiliary-loss layers. A group that sets 'lr' in its dict overrides the
                # optimizer-wide lr; the backbone gets a smaller lr because it is pretrained
                # and should be updated more slowly.
                trainable_params = [{
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.module.get_decoder_params())
                }, {
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.module.get_backbone_params()),
                    'lr':
                    config['optimizer']['args']['lr'] / 10
                }]
            else:
                trainable_params = [{
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.get_decoder_params())
                }, {
                    'params':
                    filter(lambda p: p.requires_grad,
                           self.model.get_backbone_params()),
                    'lr':
                    config['optimizer']['args']['lr'] / 10
                }]
        else:
            trainable_params = filter(lambda p: p.requires_grad,
                                      self.model.parameters())
        self.optimizer = get_instance(torch.optim, 'optimizer', config,
                                      trainable_params)
        self.lr_scheduler = getattr(utils.lr_scheduler,
                                    config['lr_scheduler']['type'])(
                                        self.optimizer, self.epochs,
                                        len(train_loader))

        # MONITORING
        self.monitor = cfg_trainer.get('monitor', 'off')
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
            # dict.get() looks up the key 'early_stop'; if it is missing, it falls back
            # to math.inf so that early stopping is effectively disabled.
            self.early_stopping = cfg_trainer.get('early_stop', math.inf)

        # CHECKPOINTS & TENSORBOARD
        start_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'],
                                           self.config['name'], start_time)
        helpers.dir_exists(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=True)

        writer_dir = os.path.join(cfg_trainer['log_dir'], self.config['name'],
                                  start_time)
        #self.writer = tensorboard.SummaryWriter(writer_dir)

        if resume: self._resume_checkpoint(resume)
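
A self-contained toy illustration of the two-group pattern the comments above describe: a group that sets 'lr' in its own dict overrides the optimizer-wide default. The 'backbone' / 'decoder' names are placeholders, not the repository's modules:

import torch

model = torch.nn.ModuleDict({
    'backbone': torch.nn.Linear(8, 8),  # stands in for the pretrained part
    'decoder': torch.nn.Linear(8, 2),   # stands in for the freshly initialized part
})
base_lr = 0.01
optimizer = torch.optim.SGD(
    [{'params': model['decoder'].parameters()},                       # default lr
     {'params': model['backbone'].parameters(), 'lr': base_lr / 10}], # 10x smaller lr
    lr=base_lr, momentum=0.9)
print([group['lr'] for group in optimizer.param_groups])  # [0.01, 0.001]
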
Example #5
    def __init__(
        self,
        model,
        loss,
        resume,
        config,
        train_loader,
        val_loader=None,
        train_logger=None,
    ):
        self.model = model
        self.loss = loss
        self.config = config
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.train_logger = train_logger
        self.logger = logging.getLogger(self.__class__.__name__)
        self.do_validation = self.config["trainer"]["val"]
        self.start_epoch = 1
        self.improved = False

        # SETTING THE DEVICE
        self.device, available_gpus = self._get_available_devices(
            self.config["n_gpu"])
        if config["use_synch_bn"]:
            self.model = convert_model(self.model)
            self.model = DataParallelWithCallback(self.model,
                                                  device_ids=available_gpus)
        else:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=available_gpus)
        self.model.to(self.device)

        # CONFIGS
        cfg_trainer = self.config["trainer"]
        self.epochs = cfg_trainer["epochs"]
        self.save_period = cfg_trainer["save_period"]

        # OPTIMIZER
        if self.config["optimizer"]["differential_lr"]:
            if isinstance(self.model, torch.nn.DataParallel):
                trainable_params = [
                    {
                        "params":
                        filter(
                            lambda p: p.requires_grad,
                            self.model.module.get_decoder_params(),
                        )
                    },
                    {
                        "params":
                        filter(
                            lambda p: p.requires_grad,
                            self.model.module.get_backbone_params(),
                        ),
                        "lr":
                        config["optimizer"]["args"]["lr"] / 10,
                    },
                ]
            else:
                trainable_params = [
                    {
                        "params":
                        filter(lambda p: p.requires_grad,
                               self.model.get_decoder_params())
                    },
                    {
                        "params":
                        filter(lambda p: p.requires_grad,
                               self.model.get_backbone_params()),
                        "lr":
                        config["optimizer"]["args"]["lr"] / 10,
                    },
                ]
        else:
            trainable_params = filter(lambda p: p.requires_grad,
                                      self.model.parameters())
        self.optimizer = get_instance(torch.optim, "optimizer", config,
                                      trainable_params)
        self.lr_scheduler = getattr(utils.lr_scheduler,
                                    config["lr_scheduler"]["type"])(
                                        self.optimizer, self.epochs,
                                        len(train_loader))

        # MONITORING
        self.monitor = cfg_trainer.get("monitor", "off")
        if self.monitor == "off":
            self.mnt_mode = "off"
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ["min", "max"]
            self.mnt_best = -math.inf if self.mnt_mode == "max" else math.inf
            self.early_stoping = cfg_trainer.get("early_stop", math.inf)

        # CHECKPOINTS & TENSORBOARD
        start_time = datetime.datetime.now().strftime("%m-%d_%H-%M")
        self.checkpoint_dir = os.path.join(cfg_trainer["save_dir"],
                                           self.config["name"], start_time)
        helpers.dir_exists(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, "config.json")
        with open(config_save_path, "w") as handle:
            json.dump(self.config, handle, indent=4, sort_keys=True)

        writer_dir = os.path.join(cfg_trainer["log_dir"], self.config["name"],
                                  start_time)
        self.writer = tensorboard.SummaryWriter(writer_dir)

        if resume:
            self._resume_checkpoint(resume)
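
The mnt_mode / mnt_best fields set up above are typically consumed once per epoch to decide whether the monitored metric improved. A sketch of that check under the same 'min' / 'max' convention (the function itself is an assumption; the epoch loop is not shown in these examples):

def improved(mnt_mode, mnt_best, current):
    # 'min' monitors a loss-like metric, 'max' a score-like metric.
    if mnt_mode == 'min':
        return current < mnt_best
    return current > mnt_best
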
Example #6
    def __init__(self, mode, model, rank, resume=None, config=None, loss=None,
                 train_loader=None,
                 val_loader=None,
                 checkpoint=None,
                 test_loader=None,
                 save_path=None, show=False, save_pic=False):
        self.rank = rank
        self.config = config

        self.scaler = torch.cuda.amp.GradScaler(enabled=True)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.group = 4
        self.save_pic = save_pic
        self.gt_num = config["loss"]["gt_num"]
        self.model = model

        if self.rank == 0: wandb.watch(self.model)

        cudnn.benchmark = True
        # train and val
        if mode == "train":
            self.start_epoch = 1
            self.show = show
            self.loss = loss
            # OPTIMIZER
            self.optimizer = getattr(torch.optim, config['optimizer']['type'])(self.model.parameters(),
                                                                               **config['optimizer']['args'])
            self.lr_scheduler = getattr(torch.optim.lr_scheduler, config['lr_scheduler']['type'])(
                self.optimizer, **config['lr_scheduler']['args'])

            # summary(model, input_size=(
            #     1, self.config["data_set"]["patch_size"], self.config["data_set"]["patch_size"]))
            # CONFIGS

            cfg_trainer = self.config['trainer']
            self.epochs = cfg_trainer['epochs']
            if self.rank == 0:
                self.save_period = cfg_trainer['save_period']
                # MONITORING
                self.improved = True
                self.not_improved_count = 0
                self.monitor = cfg_trainer.get('monitor', 'off')
                if self.monitor == 'off':
                    self.mnt_mode = 'off'
                    self.mnt_best = 0
                else:
                    self.mnt_mode, self.mnt_metric, self.gap = self.monitor.split()

                    assert self.mnt_mode in ['min', 'max']
                    self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
                    self.early_stopping = cfg_trainer.get('early_stop', math.inf)

                # CHECKPOINTS & TENSORBOARD

                start_time = datetime.now().strftime('%y%m%d%H%M')
                self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], self.config['model']['type'], start_time)
                self.writer = tensorboard.SummaryWriter(self.checkpoint_dir)
                dir_exists(self.checkpoint_dir)
                config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
                self.train_logger_save_path = os.path.join(self.checkpoint_dir, 'runtime.log')
                logger.add(self.train_logger_save_path)
                logger.info(self.checkpoint_dir)
                with open(config_save_path, 'w') as handle:
                    json.dump(self.config, handle, indent=4, sort_keys=True)
                self.log_step = config['trainer'].get('log_per_iter', self.train_loader.batch_size)
                if resume: self._resume_checkpoint(resume)

        # test
        if mode == "test":
            self.model.load_state_dict(checkpoint['state_dict'])
            self.checkpoint_dir = save_path
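
Example #6 creates a torch.cuda.amp.GradScaler, but the training loop that uses it is not shown. A hedged sketch of how such a scaler is typically applied per step (the function and its arguments are assumptions):

import torch

def train_step(model, criterion, optimizer, scaler, inputs, targets):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():  # run the forward pass in mixed precision
        loss = criterion(model(inputs), targets)
    scaler.scale(loss).backward()    # scale the loss so fp16 gradients don't underflow
    scaler.step(optimizer)           # unscales gradients, skips the step on inf/nan
    scaler.update()                  # adapt the scale factor for the next step
    return loss.item()
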
Example #7
    def __init__(self, model, resume, config, iters_per_epoch, train_logger=None, gpu=None, test=False):
        self.model = model
        self.config = config

        if gpu == 0:
            self.train_logger = train_logger
            self.logger = logging.getLogger(self.__class__.__name__)
            self.logger.setLevel(logging.INFO)
            log_dir = os.path.join(config['trainer']['log_dir'], config['experim_name'])
            log_path = os.path.join(log_dir, '{}.log'.format(time.time()))
            dir_exists(log_dir)
            fh = logging.FileHandler(log_path)
            fh.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)
            self.logger.info("config: {}".format(self.config))
        self.do_validation = self.config['trainer']['val']
        self.start_epoch = 1
        self.improved = False
        self.gpu = gpu 
        torch.cuda.set_device(self.gpu)

        self.model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.model)

        trainable_params = [{'params': list(filter(lambda p: p.requires_grad, self.model.get_other_params()))},
                            {'params': list(filter(lambda p: p.requires_grad, self.model.get_backbone_params())),
                             'lr': config['optimizer']['args']['lr'] / 10}]

        self.model = torch.nn.parallel.DistributedDataParallel(self.model.cuda(), device_ids=[gpu], find_unused_parameters=True)

        # CONFIGS
        cfg_trainer = self.config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']

        # OPTIMIZER
        self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)  # trainable_params must be collected before wrapping the model in DistributedDataParallel
        
        model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        opt_params = sum(p.numel() for group in self.optimizer.param_groups for p in group['params'])

        assert opt_params == model_params, 'some params are missing in the opt'

        self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler'])(optimizer=self.optimizer, num_epochs=self.epochs, 
                                        iters_per_epoch=iters_per_epoch)

        # MONITORING
        self.monitor = cfg_trainer.get('monitor', 'off')
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
            self.early_stopping = cfg_trainer.get('early_stop', math.inf)

        if self.gpu == 0:
            # CHECKPOINTS & TENSORBOARD
            date_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
            run_name = config['experim_name']
            self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], run_name)
            helpers.dir_exists(self.checkpoint_dir)
            config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
            with open(config_save_path, 'w') as handle:
                json.dump(self.config, handle, indent=4, sort_keys=True)
            
            writer_dir = os.path.join(cfg_trainer['log_dir'], run_name)
            self.writer = tensorboard.SummaryWriter(writer_dir)

        self.test = test
        if resume: self._resume_checkpoint(resume)
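
Example #7 assumes each worker process was launched with its own gpu rank and that a process group exists before the trainer is constructed. A minimal sketch of that per-process bootstrap (the address and port values are placeholders):

import os
import torch
import torch.distributed as dist

def setup_ddp(gpu, world_size):
    os.environ.setdefault('MASTER_ADDR', 'localhost')  # placeholder rendezvous address
    os.environ.setdefault('MASTER_PORT', '29500')      # placeholder port
    dist.init_process_group('nccl', rank=gpu, world_size=world_size)
    torch.cuda.set_device(gpu)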