def save_checkpoint(net, checkpoints_path, epoch=None, prefix='', verbose=True):
    """Serialize ``net.state_dict()`` into *checkpoints_path*.

    Args:
        net: torch module whose parameters are saved.
        checkpoints_path: ``pathlib.Path`` of the checkpoint directory;
            created (with parents) if it does not exist.
        epoch: when ``None`` the file is named ``last_checkpoint.params``,
            otherwise ``{epoch:03d}.params``.
        prefix: optional tag prepended to the file name as ``{prefix}_``.
        verbose: when True, log the destination path via the module logger.
    """
    if epoch is None:
        checkpoint_name = 'last_checkpoint.params'
    else:
        checkpoint_name = f'{epoch:03d}.params'

    if prefix:
        checkpoint_name = f'{prefix}_{checkpoint_name}'

    # exist_ok=True replaces the original exists()/mkdir() pair, which was
    # racy when several processes saved checkpoints concurrently.
    checkpoints_path.mkdir(parents=True, exist_ok=True)

    checkpoint_path = checkpoints_path / checkpoint_name
    if verbose:
        logger.info(f'Save checkpoint to {str(checkpoint_path)}')

    torch.save(net.state_dict(), str(checkpoint_path))
def init_experiment(experiment_name, add_exp_args, script_path=None):
    """Parse CLI arguments and set up a fresh experiment run directory.

    Creates ``./experiments/<experiment_name>/<NNN>[_<args.exp_name>]`` with
    ``logs/`` and ``checkpoints/`` subdirectories, optionally snapshots the
    launching script into it, attaches a file handler to the module logger,
    and selects the compute device. Returns the augmented ``args`` namespace
    with ``logs_path``, ``run_path``, ``checkpoints_path`` and ``device`` set.

    Args:
        experiment_name: name of the experiment group directory.
        add_exp_args: callable that adds experiment-specific CLI options
            to the base parser.
        script_path: optional path of the launching script; when given, a
            timestamped copy is stored inside the experiment directory.
    """
    parser = get_train_arguments()
    parser = add_exp_args(parser)
    args = parser.parse_args()

    experiments_path = Path('./experiments') / experiment_name
    experiments_path.mkdir(parents=True, exist_ok=True)
    # Next free run index within the experiment group (e.g. 000, 001, ...).
    exp_indx = find_last_exp_indx(experiments_path)

    experiment_name = f'{exp_indx:03d}'
    if args.exp_name:
        experiment_name += f'_{args.exp_name}'
    experiment_path = experiments_path / experiment_name

    args.logs_path = experiment_path / 'logs'
    args.run_path = experiment_path
    args.checkpoints_path = experiment_path / 'checkpoints'
    # No exist_ok here: a pre-existing run directory is treated as an error.
    experiment_path.mkdir(parents=True)

    if script_path is not None:
        # Snapshot the launching script for reproducibility, timestamped.
        temp_script_name = Path(script_path).stem + datetime.strftime(
            datetime.today(), '_%Y-%m-%d_%H-%M-%S.py')
        shutil.copy(script_path, experiment_path / temp_script_name)

    if not args.checkpoints_path.exists():
        args.checkpoints_path.mkdir(parents=True)
    if not args.logs_path.exists():
        args.logs_path.mkdir(parents=True)

    stdout_log_path = args.logs_path / 'train_log.txt'
    # NOTE(review): this check is always true — stdout_log_path was just
    # assigned a Path above; kept as-is for behavior parity.
    if stdout_log_path is not None:
        # Mirror logger output into the run's train_log.txt.
        fh = logging.FileHandler(str(stdout_log_path))
        formatter = logging.Formatter(
            fmt='(%(levelname)s) %(asctime)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    if args.no_cuda:
        logger.info('Using CPU')
        args.device = torch.device('cpu')
    else:
        if args.gpus:
            # Pin visibility to the first listed GPU only; the remapped
            # device is then always cuda:0 below.
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.gpus.split(',')[0]}"
        args.device = torch.device(f'cuda:{0}')
        # Single-GPU setup is hard-coded here regardless of how many GPUs
        # were passed in args.gpus.
        args.ngpus = 1
        logger.info(f'Number of GPUs: {args.ngpus}')
        if args.ngpus < 2:
            # SyncBatchNorm is pointless on a single device.
            args.syncbn = False

    logger.info(args)
    return args
def __init__(self, args, model, model_cfg, loss_cfg, trainset, valset,
             optimizer_params, image_dump_interval=200, checkpoint_interval=10,
             tb_dump_period=25, num_epochs=1, lr_scheduler=None, metrics=None,
             additional_val_metrics=None, train_proposals=False):
    """PyTorch AdaptIS trainer: wires up data loaders, model, optimizer,
    LR scheduler, metrics and (optionally) DataParallel.

    Args:
        args: parsed CLI namespace (expects batch_size, val_batch_size,
            workers, device, gpus, start_epoch, input_normalization, ...).
        model: network exposing ``get_trainable_params()``.
        model_cfg / loss_cfg: model and loss configuration objects.
        trainset / valset: datasets for the train/val DataLoaders.
        optimizer_params: kwargs for ``torch.optim.Adam`` (must contain 'lr').
        image_dump_interval: visualize every N global steps (<=0 disables).
        checkpoint_interval: save a numbered checkpoint every N epochs.
        tb_dump_period: tensorboard averaging/dump period.
        num_epochs: total epochs, used to size the cosine schedule.
        lr_scheduler: partial that builds a scheduler from (optimizer, T_max).
        metrics: training metrics; deep-copied for validation.
        additional_val_metrics: extra metrics used only at validation.
        train_proposals: when True, train the proposals head
            (checkpoints get the 'proposals' prefix).
    """
    self.args = args
    self.model_cfg = model_cfg
    self.loss_cfg = loss_cfg
    self.val_loss_cfg = deepcopy(loss_cfg)
    self.tb_dump_period = tb_dump_period

    self.train_metrics = metrics if metrics is not None else []
    self.val_metrics = deepcopy(self.train_metrics)
    if additional_val_metrics is not None:
        self.val_metrics.extend(additional_val_metrics)

    self.checkpoint_interval = checkpoint_interval
    self.image_dump_interval = image_dump_interval
    self.train_proposals = train_proposals
    self.task_prefix = ''
    self.summary_writer = None

    self.trainset = trainset
    self.valset = valset
    self.train_loader = DataLoader(trainset, batch_size=args.batch_size,
                                   pin_memory=True, shuffle=True,
                                   num_workers=args.workers, drop_last=True)
    self.val_loader = DataLoader(valset, batch_size=args.val_batch_size,
                                 pin_memory=True, shuffle=False,
                                 num_workers=args.workers, drop_last=True)

    self.device = torch.device(args.device)
    log.logger.info(model)
    # BUG FIX: the original assigned self.net only when `not args.no_cuda`,
    # leaving self.net undefined on CPU runs and crashing below at
    # self.net.get_trainable_params(). args.device is already 'cpu' in the
    # no_cuda case (set by init_experiment), so an unconditional .to() is safe.
    self.net = model.to(self.device)
    self.evaluator = None
    self._load_weights()

    if train_proposals:
        self.task_prefix = 'proposals'

    self.optim = torch.optim.Adam(self.net.get_trainable_params(),
                                  **optimizer_params)
    self.tqdm_out = log.TqdmToLogger(log.logger, level=log.logging.INFO)

    self.lr_scheduler = None
    self.lr = optimizer_params['lr']
    if lr_scheduler is not None:
        # T_max counts iterations, and training() calls scheduler.step()
        # once per batch.
        self.lr_scheduler = lr_scheduler(optimizer=self.optim,
                                         T_max=num_epochs * len(self.train_loader))
        if args.start_epoch > 0:
            # BUG FIX: the scheduler advances per *iteration*, so resuming
            # must fast-forward start_epoch * iterations_per_epoch steps;
            # the original stepped only start_epoch times, leaving the
            # resumed LR far too high.
            for _ in range(args.start_epoch * len(self.train_loader)):
                self.lr_scheduler.step()

    if args.input_normalization:
        # Inverse of the input Normalize transform, used when dumping
        # visualizations: x_denorm = (x - (-mean/std)) / (1/std) ... wait —
        # Normalize(m, s) computes (x - m) / s, so this yields x*std + mean.
        mean = torch.tensor(args.input_normalization['mean'],
                            dtype=torch.float32)
        std = torch.tensor(args.input_normalization['std'],
                           dtype=torch.float32)
        self.denormalizator = Normalize((-mean / std), (1.0 / std))
    else:
        self.denormalizator = lambda x: x

    if len(args.gpus.split(",")) > 1:
        logger.info("could use {} gpus.".format(torch.cuda.device_count()))
        assert args.batch_size % torch.cuda.device_count() == 0, \
            "batch size should be divided by device count"
        self.net = torch.nn.DataParallel(self.net)

    self.epoch_loss = AverageMeter()
    # Initial best-loss threshold; checkpointing of "best" models starts
    # once the epoch average drops below this.
    self.best_loss = 2.0
def training(self, epoch):
    """Run one training epoch: forward/backward over the train loader,
    per-iteration LR stepping, tensorboard logging, visualization dumps,
    and end-of-epoch checkpointing (including a best-loss snapshot).

    Args:
        epoch: zero-based epoch index; used for global-step computation,
            checkpoint naming and epoch-level metric logging.
    """
    if self.summary_writer is None:
        # Lazily created so the writer binds to the logs path of this run.
        self.summary_writer = log.SummaryWriterAvg(
            log_dir=str(self.args.logs_path),
            flush_secs=10,
            dump_period=self.tb_dump_period)

    log_prefix = 'Train' + self.task_prefix.capitalize()
    tbar = tqdm(self.train_loader, file=self.tqdm_out, ncols=100)
    train_loss = 0.0
    for metric in self.train_metrics:
        metric.reset_epoch_stats()

    for i, batch_data in enumerate(tbar):
        global_step = epoch * len(self.train_loader) + i

        loss, losses_logging, batch_data, outputs = self.batch_forward_parallelloss(
            batch_data)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        if self.lr_scheduler is not None:
            # Scheduler advances once per iteration, not per epoch.
            self.lr_scheduler.step()

        # Detach to a scalar; .mean() collapses per-GPU losses when
        # DataParallel returns a vector.
        loss = loss.detach().cpu().numpy().mean()
        train_loss += loss

        for loss_name, loss_values in losses_logging.items():
            self.summary_writer.add_scalar(
                tag=f'{log_prefix}Losses/{loss_name}',
                value=np.array(loss_values).mean(),
                global_step=global_step)
        self.summary_writer.add_scalar(tag=f'{log_prefix}Losses/overall',
                                       value=loss,
                                       global_step=global_step)

        # Let active weighted loss terms dump their internal state
        # (e.g. adaptive focal-loss statistics) to tensorboard.
        for k, v in self.loss_cfg.items():
            if '_loss' in k and hasattr(
                    v, 'log_states') and self.loss_cfg.get(
                        k + '_weight', 0.0) > 0:
                v.log_states(self.summary_writer,
                             f'{log_prefix}Losses/{k}', global_step)

        if self.image_dump_interval > 0 and global_step % self.image_dump_interval == 0:
            self.save_visualization(batch_data, outputs, global_step,
                                    prefix='train')

        self.summary_writer.add_scalar(
            tag=f'{log_prefix}States/learning_rate',
            value=self.lr if self.lr_scheduler is None else
            self.lr_scheduler.get_lr(),
            global_step=global_step)

        tbar.set_description(
            f'Epoch {epoch}, training loss {train_loss / (i + 1):.6f}')
        for metric in self.train_metrics:
            metric.log_states(self.summary_writer,
                              f'{log_prefix}Metrics/{metric.name}',
                              global_step)
        # Running epoch-average loss, weighted by batch size; feeds the
        # best-loss checkpoint below.
        self.epoch_loss.update(loss.item(), batch_data['instances'].size(0))

    # Epoch-level metric values (disable_avg bypasses the writer's
    # period averaging).
    for metric in self.train_metrics:
        self.summary_writer.add_scalar(
            tag=f'{log_prefix}Metrics/{metric.name}',
            value=metric.get_epoch_value(),
            global_step=epoch,
            disable_avg=True)

    # Always refresh 'last_checkpoint'; keep a numbered one every
    # checkpoint_interval epochs.
    misc.save_checkpoint(self.net,
                         self.args.checkpoints_path,
                         prefix=self.task_prefix,
                         epoch=None)
    if epoch % self.checkpoint_interval == 0:
        misc.save_checkpoint(self.net,
                             self.args.checkpoints_path,
                             prefix=self.task_prefix,
                             epoch=epoch)

    # NOTE(review): .module assumes the net is wrapped in DataParallel;
    # this line would fail on a single-GPU/CPU run — confirm.
    model_state_dic = self.net.module.state_dict()  # DataParallel
    if self.epoch_loss.get_avg() < self.best_loss:
        self.best_loss = self.epoch_loss.get_avg()
        logger.info("save best loss model epoch {}".format(epoch))
        torch.save(
            model_state_dic,
            os.path.join(
                self.args.checkpoints_path,
                'ep-{}-loss-{}_model.pth'.format(epoch, self.best_loss)))
def train(model, model_cfg, args, train_proposals, start_epoch=0):
    """Train AdaptIS on the toy dataset (Gluon trainer path).

    Configures losses, augmentation, datasets, Adam optimizer and a cosine
    LR schedule, then runs training/validation for all epochs. The two
    phases share one entry point: the segmentation phase
    (``train_proposals=False``) runs 160 epochs with 12 points; the
    proposals phase runs 10 epochs with 32 points.

    Args:
        model: AdaptIS network instance.
        model_cfg: model configuration (input normalization/transform).
        args: parsed CLI namespace (batch_size, dataset_path, ...).
        train_proposals: True to train the proposals head instead of
            the segmentation branches.
        start_epoch: epoch index to resume from.
    """
    loss_cfg = edict()
    loss_cfg.instance_loss = NormalizedFocalLossSigmoid(alpha=0.50, gamma=2)
    loss_cfg.instance_loss_weight = 1.0 if not train_proposals else 0.0

    if not train_proposals:
        num_epochs = 160
        num_points = 12
        loss_cfg.segmentation_loss = NormalizedFocalLossSoftmax(
            ignore_label=-1, gamma=1)
        loss_cfg.segmentation_loss_weight = 0.75
    else:
        num_epochs = 10
        num_points = 32
        loss_cfg.proposals_loss = AdaptISProposalsLossIoU(args.batch_size)
        loss_cfg.proposals_loss_weight = 1.0

    args.val_batch_size = args.batch_size
    args.input_normalization = model_cfg.input_normalization

    train_augmentator = Compose([Flip()], p=1.0)
    trainset = ToyDataset(args.dataset_path,
                          split='train',
                          num_points=num_points,
                          augmentator=train_augmentator,
                          with_segmentation=True,
                          points_from_one_object=train_proposals,
                          input_transform=model_cfg.input_transform,
                          epoch_len=10000)
    valset = ToyDataset(args.dataset_path,
                        split='test',
                        augmentator=None,
                        num_points=num_points,
                        with_segmentation=True,
                        points_from_one_object=train_proposals,
                        input_transform=model_cfg.input_transform)

    optimizer_params = {
        'learning_rate': 5e-4,
        'beta1': 0.9,
        'beta2': 0.999,
        'epsilon': 1e-8
    }
    # FIX: the original if/else built a byte-identical cosine scheduler in
    # both branches; the dead duplication is collapsed into one expression.
    lr_scheduler = partial(LRScheduler,
                           mode='cosine',
                           baselr=optimizer_params['learning_rate'],
                           nepochs=num_epochs)

    trainer = AdaptISTrainer(
        args,
        model,
        model_cfg,
        loss_cfg,
        trainset,
        valset,
        optimizer='adam',
        optimizer_params=optimizer_params,
        lr_scheduler=lr_scheduler,
        checkpoint_interval=40 if not train_proposals else 5,
        image_dump_interval=200 if not train_proposals else -1,
        train_proposals=train_proposals,
        hybridize_model=not train_proposals,
        metrics=[AdaptiveIoU()])

    logger.info(f'Starting Epoch: {start_epoch}')
    logger.info(f'Total Epochs: {num_epochs}')
    for epoch in range(start_epoch, num_epochs):
        trainer.training(epoch)
        trainer.validation(epoch)
def train(model, model_cfg, args, train_proposals, start_epoch=0):
    """Train AdaptIS on Cityscapes.

    Builds the loss configuration, train/val augmentation pipelines and
    datasets, picks SGD+poly (segmentation phase) or Adam+cosine
    (proposals phase), then iterates training/validation epochs.

    Args:
        model: AdaptIS network instance.
        model_cfg: model configuration (crop size, normalization, transform).
        args: parsed CLI namespace (batch_size, dataset_path, ...).
        train_proposals: True to train the proposals head; False for the
            segmentation phase.
        start_epoch: epoch index to resume from.
    """
    args.val_batch_size = args.batch_size
    args.input_normalization = model_cfg.input_normalization
    crop_size = model_cfg.crop_size

    loss_cfg = edict()
    loss_cfg.instance_loss = NormalizedFocalLossSigmoid(alpha=0.25, gamma=2)
    loss_cfg.instance_loss_weight = 0.0 if train_proposals else 1.0

    if train_proposals:
        num_epochs, num_points = 8, 48
        loss_cfg.proposals_loss = AdaptISProposalsLossIoU(args.batch_size)
        loss_cfg.proposals_loss_weight = 1.0
    else:
        num_epochs, num_points = 250, 6
        loss_cfg.segmentation_loss = NormalizedFocalLossSoftmax(
            ignore_label=-1, gamma=1)
        loss_cfg.segmentation_loss_weight = 0.75

    # Geometric + photometric train-time augmentation; val only pads/crops.
    pad_to_crop = PadIfNeeded(
        min_height=crop_size[0], min_width=crop_size[1], border_mode=0)
    train_augmentator = Compose([
        HorizontalFlip(),
        ShiftScaleRotate(shift_limit=0.03,
                         scale_limit=0,
                         rotate_limit=(-3, 3),
                         border_mode=0,
                         p=0.75),
        pad_to_crop,
        RandomCrop(*crop_size),
        RandomBrightness(limit=(-0.25, 0.25), p=0.75),
        RandomContrast(limit=(-0.15, 0.4), p=0.75),
        RGBShift(r_shift_limit=10, g_shift_limit=10, b_shift_limit=10, p=0.75)
    ], p=1.0)
    val_augmentator = Compose([pad_to_crop, RandomCrop(*crop_size)], p=1.0)

    def scale_func(image_shape):
        # Random global rescale in [0.85, 1.15]; the shape argument is
        # part of the dataset's callback API and intentionally unused.
        return random.uniform(0.85, 1.15)

    trainset = CityscapesDataset(args.dataset_path,
                                 split='train',
                                 num_points=num_points,
                                 augmentator=train_augmentator,
                                 with_segmentation=True,
                                 points_from_one_object=train_proposals,
                                 input_transform=model_cfg.input_transform,
                                 min_object_area=80,
                                 sample_ignore_object_prob=0.025,
                                 keep_background_prob=0.05,
                                 image_rescale=scale_func,
                                 use_jpeg=False)
    valset = CityscapesDataset(args.dataset_path,
                               split='test',
                               augmentator=val_augmentator,
                               num_points=num_points,
                               with_segmentation=True,
                               points_from_one_object=train_proposals,
                               input_transform=model_cfg.input_transform,
                               min_object_area=80,
                               image_rescale=scale_func,
                               use_jpeg=False)

    if train_proposals:
        optimizer_params = {
            'learning_rate': 5e-4,
            'beta1': 0.9,
            'beta2': 0.999,
            'epsilon': 1e-8
        }
        schedule_mode = 'cosine'
    else:
        optimizer_params = {'learning_rate': 0.01, 'momentum': 0.9, 'wd': 1e-4}
        schedule_mode = 'poly'
    lr_scheduler = partial(LRScheduler,
                           mode=schedule_mode,
                           baselr=optimizer_params['learning_rate'],
                           nepochs=num_epochs)

    trainer = AdaptISTrainer(
        args,
        model,
        model_cfg,
        loss_cfg,
        trainset,
        valset,
        optimizer='adam' if train_proposals else 'sgd',
        optimizer_params=optimizer_params,
        lr_scheduler=lr_scheduler,
        checkpoint_interval=2 if train_proposals else 40,
        image_dump_interval=-1 if train_proposals else 100,
        train_proposals=train_proposals,
        hybridize_model=not train_proposals,
        metrics=[AdaptiveIoU()])

    logger.info(f'Starting Epoch: {start_epoch}')
    logger.info(f'Total Epochs: {num_epochs}')
    for epoch in range(start_epoch, num_epochs):
        trainer.training(epoch)
        trainer.validation(epoch)
def __init__(self, args, model, model_cfg, loss_cfg, trainset, valset,
             optimizer_params, optimizer='adam', image_dump_interval=200,
             checkpoint_interval=10, tb_dump_period=25, lr_scheduler=None,
             metrics=None, additional_val_metrics=None, train_proposals=False,
             hybridize_model=True):
    """MXNet/Gluon AdaptIS trainer: builds data loaders, casts/places the
    model on the configured contexts, optionally loads weights, and creates
    a ``gluon.Trainer`` over either the full network or only the proposals
    head.

    Args:
        args: parsed CLI namespace (batch_size, val_batch_size, workers,
            thread_pool, dtype, ctx, weights, kvstore, ...).
        model: Gluon network; must expose ``proposals_head`` when
            ``train_proposals`` is True.
        model_cfg / loss_cfg: model and loss configuration objects.
        trainset / valset: datasets for the train/val loaders.
        optimizer_params: kwargs forwarded to the Gluon optimizer;
            'lr_scheduler' is injected here when a scheduler is given.
        optimizer: Gluon optimizer name (e.g. 'adam', 'sgd').
        image_dump_interval: visualize every N steps (<=0 disables).
        checkpoint_interval: save a numbered checkpoint every N epochs.
        tb_dump_period: tensorboard averaging/dump period.
        lr_scheduler: partial constructed from (niters=#batches per epoch).
        metrics: training metrics; deep-copied for validation.
        additional_val_metrics: extra metrics used only at validation.
        train_proposals: when True, only the proposals head is trained
            and checkpoints get the 'proposals' prefix.
        hybridize_model: whether the model should be hybridized
            (consumed elsewhere in the class).
    """
    self.args = args
    self.model_cfg = model_cfg
    self.loss_cfg = loss_cfg
    self.val_loss_cfg = deepcopy(loss_cfg)
    self.tb_dump_period = tb_dump_period

    if metrics is None:
        metrics = []
    self.train_metrics = metrics
    self.val_metrics = deepcopy(metrics)
    if additional_val_metrics is not None:
        self.val_metrics.extend(additional_val_metrics)

    self.hybridize_model = hybridize_model
    self.checkpoint_interval = checkpoint_interval
    self.train_proposals = train_proposals
    self.task_prefix = ''

    self.trainset = trainset
    self.valset = valset
    # 'rollover' keeps leftover samples for the next epoch instead of
    # emitting a short final batch.
    self.train_data = gluon.data.DataLoader(
        trainset,
        args.batch_size,
        shuffle=True,
        last_batch='rollover',
        batchify_fn=get_dict_batchify_fn(args.workers),
        thread_pool=args.thread_pool,
        num_workers=args.workers)
    self.val_data = gluon.data.DataLoader(valset,
                                          args.val_batch_size,
                                          batchify_fn=get_dict_batchify_fn(
                                              args.workers),
                                          last_batch='rollover',
                                          thread_pool=args.thread_pool,
                                          num_workers=args.workers)

    logger.info(model)
    # Cast to the configured dtype (e.g. float16) and place parameters on
    # the target contexts before building the trainer.
    model.cast(args.dtype)
    model.collect_params().reset_ctx(ctx=args.ctx)
    self.net = model
    self.evaluator = None

    if args.weights is not None:
        if os.path.isfile(args.weights):
            # allow_missing tolerates heads absent from the checkpoint
            # (e.g. resuming the proposals phase from segmentation weights).
            model.load_parameters(args.weights,
                                  ctx=args.ctx,
                                  allow_missing=True)
            # Consumed once: prevent re-loading on subsequent phases.
            args.weights = None
        else:
            raise RuntimeError(
                f"=> no checkpoint found at '{args.weights}'")

    self.lr_scheduler = None
    if lr_scheduler is not None:
        # niters = batches per epoch; the scheduler is handed to the Gluon
        # optimizer rather than stepped manually.
        self.lr_scheduler = lr_scheduler(niters=len(self.train_data))
        optimizer_params['lr_scheduler'] = self.lr_scheduler

    kv = mx.kv.create(args.kvstore)
    if not train_proposals:
        train_params = self.net.collect_params()
    else:
        # Freeze everything except the proposals head in this phase.
        train_params = self.net.proposals_head.collect_params()
        self.task_prefix = 'proposals'
    # update_on_kvstore only for multi-context runs.
    self.trainer = gluon.Trainer(train_params,
                                 optimizer,
                                 optimizer_params,
                                 kvstore=kv,
                                 update_on_kvstore=len(args.ctx) > 1)

    self.tqdm_out = TqdmToLogger(logger, level=logging.INFO)
    if args.input_normalization:
        # Inverse of input normalization, used when dumping visualizations.
        self.denormalizator = DeNormalize(args.input_normalization['mean'],
                                          args.input_normalization['std'])
    else:
        self.denormalizator = lambda x: x

    self.sw = None
    self.image_dump_interval = image_dump_interval