def run(self):
    ## init distributed
    self.cfg = init_distributed(self.cfg)
    cfg = self.cfg
    # cfg.print()

    ## parser_dict
    self.dictionary = self._parser_dict()

    ## parser_datasets
    datasets, dataloaders, data_samplers = self._parser_datasets()
    # dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}
    # class_names = datasets['train'].classes

    ## parser_model
    model_ft = self._parser_model()

    ## parser_optimizer
    # Scale learning rate based on global batch size
    # cfg.INIT_LR = cfg.INIT_LR * float(self.batch_size_all) / 256
    optimizer_ft = parser_optimizer(cfg, model_ft)

    ## parser_lr_scheduler
    lr_scheduler_ft = parser_lr_scheduler(cfg, optimizer_ft)
    '''
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: (((1 + math.cos(x * math.pi / self.cfg.N_MAX_EPOCHS)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
    lr_scheduler_ft = lr_scheduler.LambdaLR(optimizer_ft, lr_lambda=lf)
    '''

    if cfg.distributed:
        model_ft = DDP(model_ft, device_ids=[cfg.local_rank], output_device=cfg.local_rank)

    # Freeze
    freeze = ['']  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model_ft.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    if self.cfg.PRETRAIN_MODEL is not None:
        if self.cfg.RESUME:
            self.start_epoch = self.ckpts.load_checkpoint(
                self.cfg.PRETRAIN_MODEL, model_ft, optimizer_ft, lr_scheduler_ft)
        else:
            self.ckpts.load_checkpoint(self.cfg.PRETRAIN_MODEL, model_ft)

    ## vis net graph
    if self.cfg.TENSORBOARD_MODEL and False:
        self.tb_writer.add_graph(model_ft, (model_ft.dummy_input.cuda(),))

    self.n_steps_per_epoch = int(ceil(sum(len(t) for t in datasets['train'])))

    best_acc = 0.0
    scaler = amp.GradScaler(enabled=True)
    for epoch in range(self.start_epoch + 1, self.cfg.N_MAX_EPOCHS):
        if cfg.distributed:
            dataloaders['train'].sampler.set_epoch(epoch)

        self.train_epoch(scaler, epoch, model_ft, dataloaders['train'], optimizer_ft)
        lr_scheduler_ft.step()

        if self.cfg.DATASET.VAL:
            acc = self.val_epoch(epoch, model_ft, dataloaders['val'])
            if cfg.local_rank == 0:
                # start to save best performance model after learning rate decay to 1e-6
                if best_acc < acc:
                    self.ckpts.autosave_checkpoint(model_ft, epoch, 'best',
                                                   optimizer_ft, lr_scheduler_ft)
                    best_acc = acc
                    # continue

        if not epoch % cfg.N_EPOCHS_TO_SAVE_MODEL:
            if cfg.local_rank == 0:
                self.ckpts.autosave_checkpoint(model_ft, epoch, 'autosave',
                                               optimizer_ft, lr_scheduler_ft)

    if cfg.local_rank == 0:
        self.tb_writer.close()
    dist.destroy_process_group() if cfg.local_rank != 0 else None
    torch.cuda.empty_cache()
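# The distributed run() above creates a torch.cuda.amp.GradScaler and hands it to
# self.train_epoch, which is not shown in this section. The function below is only
# an illustrative sketch of what a mixed-precision training step driven by that
# scaler typically looks like; `criterion` and the (images, targets) batch layout
# are assumptions made for the example, not names taken from the project.
import torch

def train_epoch_sketch(scaler, model, dataloader, optimizer, criterion):
    model.train()
    for images, targets in dataloader:
        images = images.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)
        optimizer.zero_grad()
        # run the forward pass under autocast so eligible ops use float16
        with torch.cuda.amp.autocast(enabled=True):
            outputs = model(images)
            loss = criterion(outputs, targets)
        # scale the loss before backward, then step the optimizer through the scaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()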
def run(self):
    cfg = self.cfg
    # cfg.print()

    ## parser_dict
    dictionary = self._parser_dict()

    ## parser_datasets
    datasets, dataloaders = self._parser_datasets()
    # dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}
    # class_names = datasets['train'].classes

    ## parser_model
    model_ft = self._parser_model(dictionary)

    ## parser_optimizer
    optimizer_ft = parser_optimizer(cfg, model_ft)

    ## parser_lr_scheduler
    lr_scheduler_ft = parser_lr_scheduler(cfg, optimizer_ft)

    if self.cfg.PRETRAIN_MODEL is not None:
        if self.cfg.RESUME:
            self.start_epoch = self.checkpoints.load_checkpoint(
                self.cfg.PRETRAIN_MODEL, model_ft, optimizer_ft, lr_scheduler_ft)
        else:
            self.checkpoints.load_checkpoint(self.cfg.PRETRAIN_MODEL, model_ft)

    if torch.cuda.is_available():
        model_ft = model_ft.cuda()
        cudnn.benchmark = True
        # move optimizer state restored from a CPU checkpoint onto the GPU
        for state in optimizer_ft.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()

    ## vis net graph
    if self.cfg.TENSORBOARD_MODEL and False:
        self.tb_writer.add_graph(model_ft, (model_ft.dummy_input.cuda(),))

    if self.cfg.HALF:
        model_ft.half()

    self.n_steps_per_epoch = int(ceil(sum(len(t) for t in datasets['train'])))

    best_acc = 0.0
    for epoch in range(self.start_epoch + 1, self.cfg.N_MAX_EPOCHS):
        self.train_epoch(epoch, model_ft, dataloaders['train'], optimizer_ft,
                         lr_scheduler_ft, None)

        if self.cfg.DATASET.VAL:
            acc = self.val_epoch(epoch, model_ft, dataloaders['val'],
                                 optimizer=optimizer_ft, lr_scheduler=lr_scheduler_ft)
            # start to save best performance model after learning rate decay to 1e-6
            if best_acc < acc:
                self.checkpoints.autosave_checkpoint(model_ft, epoch, 'best',
                                                     optimizer_ft, lr_scheduler_ft)
                best_acc = acc
                continue

        if not epoch % cfg.N_EPOCHS_TO_SAVE_MODEL:
            self.checkpoints.autosave_checkpoint(model_ft, epoch, 'autosave',
                                                 optimizer_ft, lr_scheduler_ft)

    self.tb_writer.close()
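# The self.checkpoints / self.ckpts helpers (load_checkpoint, autosave_checkpoint)
# used in both versions of run() are not shown in this section. The two functions
# below are a minimal sketch of what such helpers usually do, assuming the
# checkpoint file is a dict of state_dicts plus the epoch number; they are not the
# project's actual implementation.
import torch

def save_checkpoint_sketch(path, model, epoch, optimizer=None, lr_scheduler=None):
    state = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict() if optimizer is not None else None,
        'lr_scheduler': lr_scheduler.state_dict() if lr_scheduler is not None else None,
    }
    torch.save(state, path)

def load_checkpoint_sketch(path, model, optimizer=None, lr_scheduler=None):
    state = torch.load(path, map_location='cpu')
    model.load_state_dict(state['model'])
    if optimizer is not None and state.get('optimizer') is not None:
        optimizer.load_state_dict(state['optimizer'])
    if lr_scheduler is not None and state.get('lr_scheduler') is not None:
        lr_scheduler.load_state_dict(state['lr_scheduler'])
    # return the stored epoch so the caller can resume training from epoch + 1
    return state.get('epoch', -1)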