def __init__(self, model, resume, config, iters_per_epoch, val_logger=None, train_logger=None):
    self.model = model
    self.config = config
    self.val_logger = val_logger
    self.train_logger = train_logger
    self.logger = logging.getLogger(self.__class__.__name__)
    self.do_validation = self.config['trainer']['val']
    self.start_epoch = 1
    self.improved = False

    # SETTING THE DEVICE
    self.device, availble_gpus = self._get_available_devices(self.config['n_gpu'])
    self.model = torch.nn.DataParallel(self.model, device_ids=availble_gpus)
    self.model.to(self.device)

    # CONFIGS
    cfg_trainer = self.config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']

    # OPTIMIZER: the backbone param group gets a 10x smaller learning rate than the rest of the model
    trainable_params = [{'params': filter(lambda p: p.requires_grad, self.model.module.get_other_params())},
                        {'params': filter(lambda p: p.requires_grad, self.model.module.get_backbone_params()),
                         'lr': config['optimizer']['args']['lr'] / 10}]
    self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)

    # Sanity check: every model parameter must be covered by some optimizer param group
    model_params = sum([i.shape.numel() for i in list(model.parameters())])
    opt_params = sum([i.shape.numel() for j in self.optimizer.param_groups for i in j['params']])
    assert opt_params == model_params, 'some params are missing in the opt'

    self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler'])(optimizer=self.optimizer,
                                                                            num_epochs=self.epochs,
                                                                            iters_per_epoch=iters_per_epoch)

    # MONITORING
    self.monitor = cfg_trainer.get('monitor', 'off')
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
        self.early_stoping = cfg_trainer.get('early_stop', math.inf)

    # CHECKPOINTS & TENSORBOARD
    date_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
    run_name = config['experim_name']
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], run_name)
    helpers.dir_exists(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(self.config, handle, indent=4, sort_keys=True)

    writer_dir = os.path.join(cfg_trainer['log_dir'], run_name)
    self.writer = tensorboard.SummaryWriter(writer_dir)
    self.html_results = HTML(web_dir=config['trainer']['save_dir'], exp_name=config['experim_name'],
                             save_name=config['experim_name'], config=config, resume=resume)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, resume, config, train_loader, val_loader=None, train_logger=None):
    self.model = model
    self.loss = loss
    self.config = config
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.train_logger = train_logger
    self.logger = logging.getLogger(self.__class__.__name__)
    self.do_validation = self.config['trainer']['val']
    self.start_epoch = 1
    self.improved = False

    # SETTING THE DEVICE
    self.device, availble_gpus = self._get_available_devices(self.config['n_gpu'])
    self.model.loss = loss
    if config["use_synch_bn"]:
        # Convert BatchNorm layers to synchronized BatchNorm before wrapping for multi-GPU
        self.model = convert_model(self.model)
        self.model = DataParallelWithCallback(self.model, device_ids=availble_gpus)
    else:
        self.model = torch.nn.DataParallel(self.model, device_ids=availble_gpus)
    self.model.cuda()

    # CONFIGS
    cfg_trainer = self.config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']

    # OPTIMIZER
    if self.config['optimizer']['differential_lr']:
        # Backbone parameters are optimized with a 10x smaller learning rate than the decoder
        if isinstance(self.model, torch.nn.DataParallel):
            trainable_params = [{'params': filter(lambda p: p.requires_grad, self.model.module.get_decoder_params())},
                                {'params': filter(lambda p: p.requires_grad, self.model.module.get_backbone_params()),
                                 'lr': config['optimizer']['args']['lr'] / 10}]
        else:
            trainable_params = [{'params': filter(lambda p: p.requires_grad, self.model.get_decoder_params())},
                                {'params': filter(lambda p: p.requires_grad, self.model.get_backbone_params()),
                                 'lr': config['optimizer']['args']['lr'] / 10}]
    else:
        trainable_params = filter(lambda p: p.requires_grad, self.model.parameters())
    self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)
    self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler']['type'])(self.optimizer, self.epochs, len(train_loader))
    # self.lr_scheduler = getattr(torch.optim.lr_scheduler, config['lr_scheduler']['type'])(self.optimizer, **config['lr_scheduler']['args'])

    # MONITORING
    self.monitor = cfg_trainer.get('monitor', 'off')
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
        self.early_stoping = cfg_trainer.get('early_stop', math.inf)

    # CHECKPOINTS & TENSORBOARD
    start_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], self.config['name'], start_time)
    helpers.dir_exists(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(self.config, handle, indent=4, sort_keys=True)

    writer_dir = os.path.join(cfg_trainer['log_dir'], self.config['name'], start_time)
    self.writer = tensorboard.SummaryWriter(writer_dir)

    if resume:
        self._resume_checkpoint(resume)
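# For reference, a rough sketch of the config dict shape these constructors read, inferred only from
# the keys accessed above; the concrete values and the 'SGD' / 'Poly' / 'Mean_IoU' names are
# illustrative assumptions, not this project's actual configuration:
example_config = {
    'name': 'example_run',
    'n_gpu': 1,
    'use_synch_bn': False,
    'optimizer': {
        'type': 'SGD',                        # class name looked up in torch.optim by get_instance
        'differential_lr': True,              # backbone lr = args['lr'] / 10 when True
        'args': {'lr': 0.01, 'momentum': 0.9, 'weight_decay': 1e-4},
    },
    'lr_scheduler': {'type': 'Poly', 'args': {}},
    'trainer': {
        'epochs': 80,
        'save_period': 10,
        'val': True,
        'monitor': 'max Mean_IoU',            # split into mnt_mode ('min'/'max') and mnt_metric
        'early_stop': 10,
        'save_dir': 'saved/',
        'log_dir': 'saved/runs',
    },
}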
def __init__(self, model, resume, config, train_loader, val_loader=None, train_logger=None):
    self.model = model
    self.config = config
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.train_logger = train_logger
    self.logger = logging.getLogger(self.__class__.__name__)
    self.do_validation = self.config['trainer']['val']
    self.start_epoch = 1
    self.improved = False
    min_loss = sys.float_info.max

    # SETTING THE DEVICE
    self.device, availble_gpus = self._get_available_devices(self.config['n_gpu'])
    self.model = torch.nn.DataParallel(self.model, device_ids=availble_gpus)
    self.model.to(self.device)

    # CONFIGS
    cfg_trainer = self.config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']

    # OPTIMIZER
    optim_params = [
        {'params': model.parameters(), 'lr': self.config['optimizer']['args']['lr']},
    ]
    self.optimizer = torch.optim.Adam(optim_params,
                                      betas=(self.config['optimizer']['args']['momentum'], 0.99),
                                      weight_decay=self.config['optimizer']['args']['weight_decay'])
    self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler']['type'])(self.optimizer, self.epochs, len(train_loader))

    # MONITORING
    self.monitor = cfg_trainer.get('monitor', 'off')
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        print(self.monitor)
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
        self.early_stoping = cfg_trainer.get('early_stop', math.inf)

    # CHECKPOINTS & TENSORBOARD
    start_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], self.config['name'], start_time)
    helpers.dir_exists(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(self.config, handle, indent=4, sort_keys=True)

    # NOTE: the TensorBoard log directory is hard-coded here rather than taken from the config
    writepath = "/home/rtmdisp/VoxelNet_PyTorch/saved/"
    writer_dir = str(writepath + '/' + self.config['name'] + '/' + start_time)
    # if os.path.isdir(writer_dir):
    #     self.writer = tensorboard.SummaryWriter(writer_dir)
    # else:
    #     print("set logdir properly")
    #     print(writer_dir)
    #     exit()
    # import pdb; pdb.set_trace()
    self.writer = tensorboard.SummaryWriter(writer_dir)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, resume, config, train_loader, val_loader=None, train_logger=None):
    self.model = model
    self.loss = loss
    self.config = config
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.train_logger = train_logger
    self.logger = logging.getLogger(self.__class__.__name__)
    self.do_validation = self.config['trainer']['val']
    self.start_epoch = 1
    self.improved = False

    # SETTING THE DEVICE
    self.device, availble_gpus = self._get_available_devices(self.config['n_gpu'])
    if len(availble_gpus) > 1:
        self.model = torch.nn.DataParallel(self.model, device_ids=availble_gpus)
        self.loss = torch.nn.DataParallel(self.loss, device_ids=availble_gpus)
    self.model.to(self.device)
    self.loss.to(self.device)

    # CONFIGS
    cfg_trainer = self.config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']

    # OPTIMIZER
    if self.config['optimizer']['differential_lr']:
        if isinstance(self.model, torch.nn.DataParallel):
            # When the model is wrapped for multi-GPU training it is accessed via self.model.module;
            # on a single GPU it is just self.model.
            # filter() passes only the parameters that require gradients (p.requires_grad == True)
            # to the optimizer, since some layers may have been frozen.
            # Parameters are split into two groups: the decoder (e.g. PPM and auxiliary-loss layers)
            # and the backbone (e.g. ResNet); a group that sets 'lr' in its dict overrides the
            # optimizer's default lr. The backbone uses a smaller lr because it is loaded from a
            # pretrained model and should be updated more slowly.
            trainable_params = [{'params': filter(lambda p: p.requires_grad, self.model.module.get_decoder_params())},
                                {'params': filter(lambda p: p.requires_grad, self.model.module.get_backbone_params()),
                                 'lr': config['optimizer']['args']['lr'] / 10}]
        else:
            trainable_params = [{'params': filter(lambda p: p.requires_grad, self.model.get_decoder_params())},
                                {'params': filter(lambda p: p.requires_grad, self.model.get_backbone_params()),
                                 'lr': config['optimizer']['args']['lr'] / 10}]
    else:
        trainable_params = filter(lambda p: p.requires_grad, self.model.parameters())
    self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)
    self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler']['type'])(self.optimizer, self.epochs, len(train_loader))

    # MONITORING
    self.monitor = cfg_trainer.get('monitor', 'off')
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
        # dict.get() returns the value for 'early_stop'; if the key is missing, default to math.inf.
        self.early_stoping = cfg_trainer.get('early_stop', math.inf)

    # CHECKPOINTS & TENSORBOARD
    start_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], self.config['name'], start_time)
    helpers.dir_exists(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(self.config, handle, indent=4, sort_keys=True)

    writer_dir = os.path.join(cfg_trainer['log_dir'], self.config['name'], start_time)
    # self.writer = tensorboard.SummaryWriter(writer_dir)

    if resume:
        self._resume_checkpoint(resume)
def __init__(
    self,
    model,
    loss,
    resume,
    config,
    train_loader,
    val_loader=None,
    train_logger=None,
):
    self.model = model
    self.loss = loss
    self.config = config
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.train_logger = train_logger
    self.logger = logging.getLogger(self.__class__.__name__)
    self.do_validation = self.config["trainer"]["val"]
    self.start_epoch = 1
    self.improved = False

    # SETTING THE DEVICE
    self.device, availble_gpus = self._get_available_devices(self.config["n_gpu"])
    if config["use_synch_bn"]:
        self.model = convert_model(self.model)
        self.model = DataParallelWithCallback(self.model, device_ids=availble_gpus)
    else:
        self.model = torch.nn.DataParallel(self.model, device_ids=availble_gpus)
    self.model.to(self.device)

    # CONFIGS
    cfg_trainer = self.config["trainer"]
    self.epochs = cfg_trainer["epochs"]
    self.save_period = cfg_trainer["save_period"]

    # OPTIMIZER
    if self.config["optimizer"]["differential_lr"]:
        if isinstance(self.model, torch.nn.DataParallel):
            trainable_params = [
                {
                    "params": filter(
                        lambda p: p.requires_grad,
                        self.model.module.get_decoder_params(),
                    )
                },
                {
                    "params": filter(
                        lambda p: p.requires_grad,
                        self.model.module.get_backbone_params(),
                    ),
                    "lr": config["optimizer"]["args"]["lr"] / 10,
                },
            ]
        else:
            trainable_params = [
                {"params": filter(lambda p: p.requires_grad, self.model.get_decoder_params())},
                {
                    "params": filter(lambda p: p.requires_grad, self.model.get_backbone_params()),
                    "lr": config["optimizer"]["args"]["lr"] / 10,
                },
            ]
    else:
        trainable_params = filter(lambda p: p.requires_grad, self.model.parameters())
    self.optimizer = get_instance(torch.optim, "optimizer", config, trainable_params)
    self.lr_scheduler = getattr(utils.lr_scheduler, config["lr_scheduler"]["type"])(
        self.optimizer, self.epochs, len(train_loader))

    # MONITORING
    self.monitor = cfg_trainer.get("monitor", "off")
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = -math.inf if self.mnt_mode == "max" else math.inf
        self.early_stoping = cfg_trainer.get("early_stop", math.inf)

    # CHECKPOINTS & TENSORBOARD
    start_time = datetime.datetime.now().strftime("%m-%d_%H-%M")
    self.checkpoint_dir = os.path.join(cfg_trainer["save_dir"], self.config["name"], start_time)
    helpers.dir_exists(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, "config.json")
    with open(config_save_path, "w") as handle:
        json.dump(self.config, handle, indent=4, sort_keys=True)

    writer_dir = os.path.join(cfg_trainer["log_dir"], self.config["name"], start_time)
    self.writer = tensorboard.SummaryWriter(writer_dir)

    if resume:
        self._resume_checkpoint(resume)
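# get_instance() is a project helper defined elsewhere in the repo. A plausible minimal sketch of
# what it does, judging from the getattr-based optimizer construction in the variant further below,
# is to look up config[name]['type'] in the given module and instantiate it with the supplied
# positional args plus config[name]['args'] as keyword args. This is an assumption about the helper,
# not its verified source:
def get_instance(module, name, config, *args):
    # e.g. get_instance(torch.optim, 'optimizer', config, trainable_params)
    #   -> torch.optim.SGD(trainable_params, **config['optimizer']['args'])  (if type == 'SGD')
    return getattr(module, config[name]['type'])(*args, **config[name]['args'])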
def __init__(self, mode, model, rank, resume=None, config=None, loss=None, train_loader=None,
             val_loader=None, checkpoint=None, test_loader=None, save_path=None, show=False, save_pic=False):
    self.rank = rank
    self.config = config
    self.scaler = torch.cuda.amp.GradScaler(enabled=True)
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.test_loader = test_loader
    self.group = 4
    self.save_pic = save_pic
    self.gt_num = config["loss"]["gt_num"]
    self.model = model
    if self.rank == 0:
        wandb.watch(self.model)
    cudnn.benchmark = True

    # train and val
    if mode == "train":
        self.start_epoch = 1
        self.show = show
        self.loss = loss

        # OPTIMIZER
        self.optimizer = getattr(torch.optim, config['optimizer']['type'])(
            self.model.parameters(), **config['optimizer']['args'])
        self.lr_scheduler = getattr(torch.optim.lr_scheduler, config['lr_scheduler']['type'])(
            self.optimizer, **config['lr_scheduler']['args'])
        # summary(model, input_size=(
        #     1, self.config["data_set"]["patch_size"], self.config["data_set"]["patch_size"]))

        # CONFIGS
        cfg_trainer = self.config['trainer']
        self.epochs = cfg_trainer['epochs']
        if self.rank == 0:
            self.save_period = cfg_trainer['save_period']

            # MONITORING
            self.improved = True
            self.not_improved_count = 0
            self.monitor = cfg_trainer.get('monitor', 'off')
            if self.monitor == 'off':
                self.mnt_mode = 'off'
                self.mnt_best = 0
            else:
                self.mnt_mode, self.mnt_metric, self.gap = self.monitor.split()
                assert self.mnt_mode in ['min', 'max']
                self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
                self.early_stopping = cfg_trainer.get('early_stop', math.inf)

            # CHECKPOINTS & TENSORBOARD
            start_time = datetime.now().strftime('%y%m%d%H%M')
            self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], self.config['model']['type'], start_time)
            self.writer = tensorboard.SummaryWriter(self.checkpoint_dir)
            dir_exists(self.checkpoint_dir)
            config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
            self.train_logger_save_path = os.path.join(self.checkpoint_dir, 'runtime.log')
            logger.add(self.train_logger_save_path)
            logger.info(self.checkpoint_dir)
            with open(config_save_path, 'w') as handle:
                json.dump(self.config, handle, indent=4, sort_keys=True)
            self.writer = tensorboard.SummaryWriter(self.checkpoint_dir)
        self.log_step = config['trainer'].get('log_per_iter', self.train_loader.batch_size)

        if resume:
            self._resume_checkpoint(resume)

    # test
    if mode == "test":
        self.model.load_state_dict(checkpoint['state_dict'])
        self.checkpoint_dir = save_path
def __init__(self, model, resume, config, iters_per_epoch, train_logger=None, gpu=None, test=False):
    self.model = model
    self.config = config

    if gpu == 0:
        self.train_logger = train_logger
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.INFO)
        log_dir = os.path.join(config['trainer']['log_dir'], config['experim_name'])
        log_path = os.path.join(log_dir, '{}.log'.format(time.time()))
        dir_exists(log_dir)
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)
        self.logger.info("config: {}".format(self.config))

    self.do_validation = self.config['trainer']['val']
    self.start_epoch = 1
    self.improved = False
    self.gpu = gpu
    torch.cuda.set_device(self.gpu)

    self.model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.model)
    trainable_params = [{'params': list(filter(lambda p: p.requires_grad, self.model.get_other_params()))},
                        {'params': list(filter(lambda p: p.requires_grad, self.model.get_backbone_params())),
                         'lr': config['optimizer']['args']['lr'] / 10}]
    self.model = torch.nn.parallel.DistributedDataParallel(self.model.cuda(), device_ids=[gpu],
                                                           find_unused_parameters=True)

    # CONFIGS
    cfg_trainer = self.config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']

    # OPTIMIZER
    # trainable_params should be obtained before wrapping the model with DistributedDataParallel
    self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)

    # Sanity check: every trainable model parameter must be covered by some optimizer param group
    model_params = sum([i.shape.numel() for i in list(filter(lambda p: p.requires_grad, model.parameters()))])
    opt_params = sum([i.shape.numel() for j in self.optimizer.param_groups for i in j['params']])
    assert opt_params == model_params, 'some params are missing in the opt'

    self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler'])(optimizer=self.optimizer,
                                                                            num_epochs=self.epochs,
                                                                            iters_per_epoch=iters_per_epoch)

    # MONITORING
    self.monitor = cfg_trainer.get('monitor', 'off')
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
        self.early_stoping = cfg_trainer.get('early_stop', math.inf)

    if self.gpu == 0:
        # CHECKPOINTS & TENSORBOARD
        date_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
        run_name = config['experim_name']
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], run_name)
        helpers.dir_exists(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=True)

        writer_dir = os.path.join(cfg_trainer['log_dir'], run_name)
        self.writer = tensorboard.SummaryWriter(writer_dir)

    self.test = test
    if resume:
        self._resume_checkpoint(resume)
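# For context, a minimal sketch of how a DistributedDataParallel trainer like the one above might be
# launched with one process per GPU. Trainer refers to the class whose __init__ is shown above;
# build_model, config and iters_per_epoch are placeholders for this repository's actual entry point,
# and the rendezvous settings and the trainer.train() call are assumptions, not taken from the repo:
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def _worker(gpu, world_size, build_model, config, iters_per_epoch):
    # One process per GPU: the default process group must exist before the trainer wraps the model
    # with DistributedDataParallel(device_ids=[gpu]).
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:29500',
                            world_size=world_size, rank=gpu)
    trainer = Trainer(model=build_model(), resume=None, config=config,
                      iters_per_epoch=iters_per_epoch, gpu=gpu)
    trainer.train()  # assuming the trainer exposes a train() loop, as in similar base trainers
    dist.destroy_process_group()


# Example launch (build_model / config / iters_per_epoch are placeholders):
# world_size = torch.cuda.device_count()
# mp.spawn(_worker, nprocs=world_size, args=(world_size, build_model, config, iters_per_epoch))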