def validation_epoch_end(self, validation_step_outputs): """ Called at the end of the validation epoch with the outputs of all validation steps. Evaluating results and save best model. Args: validation_step_outputs: A list of val outputs """ results = {} for res in validation_step_outputs: results.update(res) eval_results = self.evaluator.evaluate(results, self.cfg.save_dir, rank=self.local_rank) metric = eval_results[self.cfg.evaluator.save_key] # save best model if metric > self.save_flag: self.save_flag = metric best_save_path = os.path.join(self.cfg.save_dir, 'model_best') mkdir(self.local_rank, best_save_path) self.trainer.save_checkpoint(os.path.join(best_save_path, "model_best.ckpt")) txt_path = os.path.join(best_save_path, "eval_results.txt") if self.local_rank < 1: with open(txt_path, "a") as f: f.write("Epoch:{}\n".format(self.current_epoch+1)) for k, v in eval_results.items(): f.write("{}: {}\n".format(k, v)) else: warnings.warn('Warning! Save_key is not in eval results! Only save model last!') if self.log_style == 'Lightning': for k, v in eval_results.items(): self.log('Val_metrics/' + k, v, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True) elif self.log_style == 'NanoDet': for k, v in eval_results.items(): self.scalar_summary('Val_metrics/' + k, 'Val', v, self.current_epoch+1)
def validation_epoch_end(self, validation_step_outputs):
    results = {}
    for res in validation_step_outputs:
        results.update(res)
    eval_results = self.evaluator.evaluate(results, self.cfg.save_dir,
                                           self.current_epoch, self._logger,
                                           rank=self.local_rank)
    metric = eval_results[self.cfg.evaluator.save_key]
    # ------save best model--------
    if metric > self.save_flag:
        self.save_flag = metric
        best_save_path = os.path.join(self.cfg.save_dir, 'model_best')
        mkdir(self.local_rank, best_save_path)
        # TODO: replace with saving checkpoint
        save_model(self.local_rank, self.model,
                   os.path.join(best_save_path, 'model_best.pth'),
                   self.current_epoch + 1, self.global_step)
        txt_path = os.path.join(best_save_path, "eval_results.txt")
        if self.local_rank < 1:
            with open(txt_path, "a") as f:
                f.write("Epoch:{}\n".format(self.current_epoch + 1))
                for k, v in eval_results.items():
                    f.write("{}: {}\n".format(k, v))
    else:
        warnings.warn('Warning! Save_key is not in eval results! '
                      'Only save model last!')
def run(self, train_loader, val_loader, evaluator):
    """
    Run the full training loop: warm up, train each epoch, and
    periodically evaluate and save the best model.
    :param train_loader:
    :param val_loader:
    :param evaluator:
    """
    start_epoch = self.epoch
    save_flag = -10
    if self.cfg.schedule.warmup.steps > 0 and start_epoch == 1:
        self.logger.log('Start warming up...')
        self.warm_up(train_loader)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.cfg.schedule.optimizer.lr
    self._init_scheduler()
    self.lr_scheduler.last_epoch = start_epoch - 1

    # resume learning rate of last epoch
    if start_epoch > 1:
        for param_group, lr in zip(self.optimizer.param_groups,
                                   self.lr_scheduler.get_lr()):
            param_group['lr'] = lr

    for epoch in range(start_epoch, self.cfg.schedule.total_epochs + 1):
        results, train_loss_dict = self.run_epoch(epoch, train_loader,
                                                  mode='train')
        self.lr_scheduler.step()
        save_model(self.rank, self.model,
                   os.path.join(self.cfg.save_dir, 'model_last.pth'),
                   epoch, self._iter, self.optimizer)
        for k, v in train_loss_dict.items():
            self.logger.scalar_summary('Epoch_loss/' + k, 'train', v, epoch)

        # --------evaluate----------
        if (self.cfg.schedule.val_intervals > 0
                and epoch % self.cfg.schedule.val_intervals == 0):
            with torch.no_grad():
                results, val_loss_dict = self.run_epoch(self.epoch,
                                                        val_loader,
                                                        mode='val')
            for k, v in val_loss_dict.items():
                self.logger.scalar_summary('Epoch_loss/' + k, 'val', v, epoch)
            eval_results = evaluator.evaluate(results, self.cfg.save_dir,
                                              epoch, self.logger,
                                              rank=self.rank)
            if self.cfg.evaluator.save_key in eval_results:
                metric = eval_results[self.cfg.evaluator.save_key]
                if metric > save_flag:
                    # ------save best model--------
                    save_flag = metric
                    best_save_path = os.path.join(self.cfg.save_dir,
                                                  'model_best')
                    mkdir(self.rank, best_save_path)
                    save_model(self.rank, self.model,
                               os.path.join(best_save_path, 'model_best.pth'),
                               epoch, self._iter, self.optimizer)
                    txt_path = os.path.join(best_save_path,
                                            "eval_results.txt")
                    if self.rank < 1:
                        with open(txt_path, "a") as f:
                            f.write("Epoch:{}\n".format(epoch))
                            for k, v in eval_results.items():
                                f.write("{}: {}\n".format(k, v))
            else:
                warnings.warn('Warning! Save_key is not in eval results! '
                              'Only save model last!')
        self.epoch += 1
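# The snippets above call a save_model helper that is not shown here. Below is
# a minimal sketch of what such a helper might look like, inferred from the
# call sites save_model(rank, model, path, epoch, iter, optimizer) and
# save_model(local_rank, model, path, epoch, global_step); the actual repo
# implementation may differ. Writing is restricted to the main process so
# that DDP workers do not race on the same checkpoint file.
import os

import torch


def save_model(rank, model, path, epoch, iter_num, optimizer=None):
    """Sketch: save model (and optionally optimizer) state from rank <= 0 only."""
    if rank > 0:  # non-main DDP processes skip the write
        return
    state_dict = (model.module.state_dict()
                  if hasattr(model, 'module')  # unwrap DistributedDataParallel
                  else model.state_dict())
    data = {'epoch': epoch, 'iter': iter_num, 'state_dict': state_dict}
    if optimizer is not None:
        data['optimizer'] = optimizer.state_dict()
    torch.save(data, path)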
def main(args):
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__("%Y%m%d%H%M%S")
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = NanoDetLightningLogger(cfg.save_dir)

    assert args.task in ["val", "test"]
    cfg.update({"test_mode": args.task})

    logger.info("Setting up data...")
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=False,
    )
    evaluator = build_evaluator(cfg.evaluator, val_dataset)

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    ckpt = torch.load(args.model)
    if "pytorch-lightning_version" not in ckpt:
        warnings.warn(
            "Warning! Old .pth checkpoint is deprecated. "
            "Convert the checkpoint with tools/convert_old_checkpoint.py")
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt["state_dict"])

    if cfg.device.gpu_ids == -1:
        logger.info("Using CPU training")
        accelerator, devices = "cpu", None
    else:
        accelerator, devices = "gpu", cfg.device.gpu_ids

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        logger=logger,
    )
    logger.info("Starting testing...")
    trainer.test(task, val_dataloader)
def main(args):
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        raise ValueError(
            'cfg.model.arch.head.num_classes must equal len(cfg.class_names), '
            'but got {} and {}'.format(cfg.model.arch.head.num_classes,
                                       len(cfg.class_names)))
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        pl.seed_everything(args.seed)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')
    evaluator = build_evaluator(cfg, val_dataset)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    # TODO: batch eval
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    if 'load_model' in cfg.schedule:
        ckpt = torch.load(cfg.schedule.load_model)
        if 'pytorch-lightning_version' not in ckpt:
            warnings.warn('Warning! Old .pth checkpoint is deprecated. '
                          'Convert the checkpoint with '
                          'tools/convert_old_checkpoint.py')
            ckpt = convert_old_model(ckpt)
        task.load_state_dict(ckpt['state_dict'], strict=False)

    model_resume_path = (os.path.join(cfg.save_dir, 'model_last.ckpt')
                         if 'resume' in cfg.schedule else None)

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        resume_from_checkpoint=model_resume_path,
        callbacks=[ProgressBar(refresh_rate=0)],  # disable tqdm bar
    )
    trainer.fit(task, train_dataloader, val_dataloader)
def main(args):
    warnings.warn('Warning! Old testing code is deprecated and will be '
                  'deleted in next version. Please use tools/test.py')
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    cfg.freeze()
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)
    cfg.schedule.update({'load_model': args.model})
    trainer.load_model(cfg)
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting testing...')
    with torch.no_grad():
        results, val_loss_dict = trainer.run_epoch(0, val_dataloader,
                                                   mode=args.task)
    if args.task == 'test':
        res_json = evaluator.results2json(results)
        json_path = os.path.join(cfg.save_dir,
                                 'results{}.json'.format(timestr))
        with open(json_path, 'w') as f:
            json.dump(res_json, f)
    elif args.task == 'val':
        eval_results = evaluator.evaluate(results, cfg.save_dir,
                                          rank=local_rank)
        if args.save_result:
            txt_path = os.path.join(cfg.save_dir,
                                    'eval_results{}.txt'.format(timestr))
            with open(txt_path, 'a') as f:
                for k, v in eval_results.items():
                    f.write('{}: {}\n'.format(k, v))
def main(args):
    warnings.warn('Warning! Old training code is deprecated and will be '
                  'deleted in next version. Please use tools/train.py')
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # mkdir is wrapped with @rank_filter, so only the main process
    # creates save_dir
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg.device.batchsize_per_gpu,
            num_workers=cfg.device.workers_per_gpu,
            pin_memory=True,
            collate_fn=collate_function,
            sampler=train_sampler,
            drop_last=True)
    else:
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg.device.batchsize_per_gpu,
            shuffle=True,
            num_workers=cfg.device.workers_per_gpu,
            pin_memory=True,
            collate_fn=collate_function,
            drop_last=True)

    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)

    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator)
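# A hedged sketch of the @rank_filter decorator referenced in the comment
# above; the names and behavior are assumptions inferred from how
# mkdir(local_rank, path) is called throughout these snippets. The idea is
# that the wrapped function only executes on the main process (rank -1 for
# single-process runs, rank 0 under DDP), avoiding duplicate side effects.
import functools
import os


def rank_filter(func):
    """Sketch: run the wrapped function only when rank < 1."""
    @functools.wraps(func)
    def wrapper(rank, *args, **kwargs):
        if rank < 1:  # -1 (no DDP) or 0 (DDP main process)
            return func(*args, **kwargs)
    return wrapper


@rank_filter
def mkdir(path):
    # callers pass (local_rank, path); the decorator consumes the rank
    os.makedirs(path, exist_ok=True)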
def main(args):
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    # TODO: replace with lightning random seed
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator, logger)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    # TODO: batch eval
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0)
    trainer.fit(task, train_dataloader, val_dataloader)
def main(args):
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    assert args.task in ['val', 'test']
    cfg.update({'test_mode': args.task})

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    ckpt = torch.load(args.model)
    if 'pytorch-lightning_version' not in ckpt:
        warnings.warn(
            'Warning! Old .pth checkpoint is deprecated. '
            'Convert the checkpoint with tools/convert_old_checkpoint.py')
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt['state_dict'])

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        gpus=cfg.device.gpu_ids,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
    )
    logger.log('Starting testing...')
    trainer.test(task, val_dataloader)
def validation_epoch_end(self, validation_step_outputs): """ Called at the end of the validation epoch with the outputs of all validation steps.Evaluating results and save best model. Args: validation_step_outputs: A list of val outputs """ results = {} for res in validation_step_outputs: results.update(res) all_results = (gather_results(results) if dist.is_available() and dist.is_initialized() else results) if all_results: eval_results = self.evaluator.evaluate(all_results, self.cfg.save_dir, rank=self.local_rank) metric = eval_results[self.cfg.evaluator.save_key] # save best model if metric > self.save_flag: self.save_flag = metric best_save_path = os.path.join(self.cfg.save_dir, "model_best") mkdir(self.local_rank, best_save_path) self.trainer.save_checkpoint( os.path.join(best_save_path, "model_best.ckpt")) self.save_model_state( os.path.join(best_save_path, "nanodet_model_best.pth")) txt_path = os.path.join(best_save_path, "eval_results.txt") if self.local_rank < 1: with open(txt_path, "a") as f: f.write("Epoch:{}\n".format(self.current_epoch + 1)) for k, v in eval_results.items(): f.write("{}: {}\n".format(k, v)) else: warnings.warn( "Warning! Save_key is not in eval results! Only save model last!" ) self.logger.log_metrics(eval_results, self.current_epoch + 1) else: self.logger.info("Skip val on rank {}".format(self.local_rank))
def validation_epoch_end(self, validation_step_outputs):
    results = {}
    for res in validation_step_outputs:
        results.update(res)
    eval_results = self.evaluator.evaluate(results, self.cfg.save_dir,
                                           self.current_epoch + 1,
                                           self._logger,
                                           rank=self.local_rank)
    metric = eval_results[self.cfg.evaluator.save_key]
    # save best model
    if metric > self.save_flag:
        self.save_flag = metric
        best_save_path = os.path.join(self.cfg.save_dir, 'model_best')
        mkdir(self.local_rank, best_save_path)
        self.trainer.save_checkpoint(
            os.path.join(best_save_path, 'model_best.ckpt'))
        txt_path = os.path.join(best_save_path, 'eval_results.txt')
        if self.local_rank < 1:
            with open(txt_path, 'a') as f:
                f.write('Epoch:{}\n'.format(self.current_epoch + 1))
                for k, v in eval_results.items():
                    f.write('{}: {}\n'.format(k, v))
    else:
        warnings.warn('Warning! Save_key is not in eval results! '
                      'Only save model last!')
    if self.log_style == 'Lightning':
        for k, v in eval_results.items():
            self.log('Val/' + k, v, on_step=False, on_epoch=True,
                     prog_bar=False, sync_dist=True)
def main(args): load_config(cfg, args.config) if cfg.model.arch.head.num_classes != len(cfg.class_names): raise ValueError( "cfg.model.arch.head.num_classes must equal len(cfg.class_names), " "but got {} and {}".format(cfg.model.arch.head.num_classes, len(cfg.class_names))) local_rank = int(args.local_rank) torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True mkdir(local_rank, cfg.save_dir) logger = NanoDetLightningLogger(cfg.save_dir) logger.dump_cfg(cfg) if args.seed is not None: logger.info("Set random seed to {}".format(args.seed)) pl.seed_everything(args.seed) logger.info("Setting up data...") train_dataset = build_dataset(cfg.data.train, "train") val_dataset = build_dataset(cfg.data.val, "test") evaluator = build_evaluator(cfg.evaluator, val_dataset) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=cfg.device.batchsize_per_gpu, shuffle=True, num_workers=cfg.device.workers_per_gpu, pin_memory=True, collate_fn=naive_collate, drop_last=True, ) val_dataloader = torch.utils.data.DataLoader( val_dataset, batch_size=cfg.device.batchsize_per_gpu, shuffle=False, num_workers=cfg.device.workers_per_gpu, pin_memory=True, collate_fn=naive_collate, drop_last=False, ) logger.info("Creating model...") task = TrainingTask(cfg, evaluator) if "load_model" in cfg.schedule: ckpt = torch.load(cfg.schedule.load_model) if "pytorch-lightning_version" not in ckpt: warnings.warn( "Warning! Old .pth checkpoint is deprecated. " "Convert the checkpoint with tools/convert_old_checkpoint.py ") ckpt = convert_old_model(ckpt) load_model_weight(task.model, ckpt, logger) logger.info("Loaded model weight from {}".format( cfg.schedule.load_model)) model_resume_path = (os.path.join(cfg.save_dir, "model_last.ckpt") if "resume" in cfg.schedule else None) accelerator = None if len(cfg.device.gpu_ids) <= 1 else "ddp" trainer = pl.Trainer( default_root_dir=cfg.save_dir, max_epochs=cfg.schedule.total_epochs, gpus=cfg.device.gpu_ids, check_val_every_n_epoch=cfg.schedule.val_intervals, accelerator=accelerator, log_every_n_steps=cfg.log.interval, num_sanity_val_steps=0, resume_from_checkpoint=model_resume_path, callbacks=[ProgressBar(refresh_rate=0)], # disable tqdm bar logger=logger, benchmark=True, gradient_clip_val=cfg.get("grad_clip", 0.0), ) trainer.fit(task, train_dataloader, val_dataloader)
def startNanodetTrain(self):
    # load the config file
    load_config(cfg, self.nanoTrainConfig['cfg'])
    # determine this host's role in distributed training
    local_rank = int(self.nanoTrainConfig["local_rank"])
    # torch.backends.cudnn.enabled = True
    # torch.backends.cudnn.benchmark = True
    mkdir(local_rank, self.nanoTrainConfig["save_dir"])
    logger = Logger(local_rank, self.nanoTrainConfig["save_dir"])
    if "seed" in self.nanoTrainConfig:
        logger.log('Set random seed to {}'.format(
            self.nanoTrainConfig['seed']))
        self.init_seeds(self.nanoTrainConfig['seed'])

    # 1. create the model
    model = build_model(cfg.model)
    model = model.cpu()

    # 2. load the data
    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train',
                                  self.nanoTrainConfig)
    val_dataset = build_dataset(cfg.data.val, 'test', self.nanoTrainConfig)
    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg.device.batchsize_per_gpu,
            num_workers=cfg.device.workers_per_gpu,
            pin_memory=True,
            collate_fn=collate_function,
            sampler=train_sampler,
            drop_last=True)
    else:
        print('Loading data...')
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg.device.batchsize_per_gpu,
            shuffle=True,
            num_workers=cfg.device.workers_per_gpu,
            pin_memory=True,
            collate_fn=collate_function,
            drop_last=True)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)
    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator,
                self.nanoTrainConfig)
def run(self, train_loader, val_loader, evaluator): """ start running :param train_loader: :param val_loader: :param evaluator: """ start_epoch = self.epoch save_flag = -10 if self.cfg.schedule.warmup.steps > 0 and start_epoch == 1: self.logger.log('Start warming up...') self.warm_up(train_loader) for param_group in self.optimizer.param_groups: param_group['lr'] = self.cfg.schedule.optimizer.lr self._init_scheduler() self.lr_scheduler.last_epoch = start_epoch - 1 # ---------- traverse each epoch for epoch_i, epoch in enumerate( range(start_epoch, self.cfg.schedule.total_epochs + 1)): # # ----- validate before training actually starts # ret_dict, val_loss_dict = self.run_epoch(self.epoch, val_loader, mode='val') # if self.cfg.evaluator.name == 'MyDetectionEvaluator': # evaluator.evaluate(ret_dict) # ----- run an epoch on train dataset, schedule lr, save model and logging ret_dict, train_loss_dict = self.run_epoch(epoch, train_loader, mode='train') self.lr_scheduler.step() save_model(self.rank, self.model, os.path.join(self.cfg.save_dir, 'model_last.pth'), epoch, self._iter, self.optimizer) for k, v in train_loss_dict.items(): self.logger.scalar_summary('Epoch_loss/' + k, 'train', v, epoch) # --------evaluate---------- if evaluator is None: # do not evaluate, save current epoch's checkpoint best_save_path = os.path.join( self.cfg.save_dir, 'epoch_{:d}'.format(start_epoch + epoch_i)) mkdir(self.rank, best_save_path) save_model(self.rank, self.model, os.path.join(best_save_path, 'model_best.pth'), epoch, self._iter, self.optimizer) else: # do evaluation if epoch % self.cfg.schedule.val_intervals == 0: with torch.no_grad( ): # train an epoch on validation dataset ret_dict, val_loss_dict = self.run_epoch(self.epoch, val_loader, mode='val') for k, v in val_loss_dict.items(): self.logger.scalar_summary('Epoch_loss/' + k, 'val', v, epoch) # ----- do evaluation, ret_dict, key: img_id, val: dets_dict if self.cfg.evaluator.name == 'CocoDetectionEvaluator': eval_results = evaluator.evaluate(ret_dict, self.cfg.save_dir, epoch, self.logger, rank=self.rank) elif self.cfg.evaluator.name == 'MyDetectionEvaluator': eval_results = evaluator.evaluate(ret_dict) if eval_results is None: continue if self.cfg.evaluator.save_key in eval_results: metric = eval_results[self.cfg.evaluator.save_key] if metric > save_flag: # ------save best model-------- save_flag = metric best_save_path = os.path.join( self.cfg.save_dir, 'model_best') mkdir(self.rank, best_save_path) save_model( self.rank, self.model, os.path.join(best_save_path, 'model_best.pth'), epoch, self._iter, self.optimizer) txt_path = os.path.join(best_save_path, "eval_results.txt") if self.rank < 1: with open(txt_path, "a") as f: f.write("Epoch:{}\n".format(epoch)) for k, v in eval_results.items(): f.write("{}: {}\n".format(k, v)) else: warnings.warn( 'Warning! Save_key is not in eval results! Only save model last!' ) self.epoch += 1
def run(args): """ :param args: :return: """ load_config(cfg, args.config) local_rank = int(args.local_rank) # what's this? torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True mkdir(local_rank, cfg.save_dir) logger = Logger(local_rank, cfg.save_dir) if args.seed is not None: logger.log('Set random seed to {}'.format(args.seed)) init_seeds(args.seed) logger.log('Creating model...') model = build_model(cfg.model) logger.log('Setting up data...') train_dataset = build_dataset(cfg.data.train, 'train') # build_dataset(cfg.data.train, 'train') val_dataset = build_dataset(cfg.data.val, 'test') if len(cfg.device.gpu_ids) > 1: # More than one GPU(distributed training) print('rank = ', local_rank) num_gpus = torch.cuda.device_count() torch.cuda.set_device(local_rank % num_gpus) dist.init_process_group(backend='nccl') train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.is_debug: train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu, num_workers=0, pin_memory=True, collate_fn=collate_function, sampler=train_sampler, drop_last=True) else: train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu, num_workers=cfg.device.workers_per_gpu, pin_memory=True, collate_fn=collate_function, sampler=train_sampler, drop_last=True) else: if args.is_debug: train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu, shuffle=True, num_workers=0, pin_memory=True, collate_fn=collate_function, drop_last=True) else: train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu, shuffle=True, num_workers=cfg.device.workers_per_gpu, pin_memory=True, collate_fn=collate_function, drop_last=True) if args.is_debug: val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_function, drop_last=True) else: val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True, collate_fn=collate_function, drop_last=True) # ----- trainer = build_trainer(local_rank, cfg, model, logger) if 'load_model' in cfg.schedule: trainer.load_model(cfg) if 'resume' in cfg.schedule: trainer.resume(cfg) # ----- Build a evaluator evaluator = build_evaluator(cfg, val_dataset) # evaluator = None logger.log('Starting training...') trainer.run(train_data_loader, val_data_loader, evaluator)