コード例 #1
0
    def validation_epoch_end(self, validation_step_outputs):
        """
        Called at the end of the validation epoch with the outputs of all validation steps.
        Evaluates the merged results, saves the best model and logs the metrics.

        The best checkpoint is refreshed only when ``cfg.evaluator.save_key`` is
        present in the evaluation results and its value is greater than
        ``self.save_flag``. A missing key produces a warning instead of a
        ``KeyError``, and a metric that merely fails to improve no longer
        triggers the (unrelated) "Save_key is not in eval results" warning.

        Args:
            validation_step_outputs: A list of val outputs; every item is merged
                into a single dict with ``dict.update``.

        """
        results = {}
        for res in validation_step_outputs:
            results.update(res)
        eval_results = self.evaluator.evaluate(results, self.cfg.save_dir, rank=self.local_rank)
        save_key = self.cfg.evaluator.save_key
        if save_key in eval_results:
            metric = eval_results[save_key]
            # save best model
            if metric > self.save_flag:
                self.save_flag = metric
                best_save_path = os.path.join(self.cfg.save_dir, 'model_best')
                mkdir(self.local_rank, best_save_path)
                self.trainer.save_checkpoint(os.path.join(best_save_path, "model_best.ckpt"))
                txt_path = os.path.join(best_save_path, "eval_results.txt")
                # only ranks below 1 append to the shared results file
                if self.local_rank < 1:
                    with open(txt_path, "a") as f:
                        f.write("Epoch:{}\n".format(self.current_epoch+1))
                        for k, v in eval_results.items():
                            f.write("{}: {}\n".format(k, v))
        else:
            # the warning belongs to the "key missing" case, not to "metric did not improve"
            warnings.warn('Warning! Save_key is not in eval results! Only save model last!')
        if self.log_style == 'Lightning':
            for k, v in eval_results.items():
                self.log('Val_metrics/' + k, v, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
        elif self.log_style == 'NanoDet':
            for k, v in eval_results.items():
                self.scalar_summary('Val_metrics/' + k, 'Val', v, self.current_epoch+1)
コード例 #2
0
ファイル: task.py プロジェクト: wwdok/nanodet
    def validation_epoch_end(self, validation_step_outputs):
        """
        Evaluate the merged validation outputs and save the best model.

        The per-step outputs are merged into one dict and handed to the
        evaluator. When ``cfg.evaluator.save_key`` is present in the evaluation
        results and its value is greater than ``self.save_flag``, the model is
        saved under ``<save_dir>/model_best`` and the metrics are appended to
        ``eval_results.txt``. A missing key produces a warning instead of a
        ``KeyError``; a metric that does not improve is silently ignored.

        Args:
            validation_step_outputs: A list of val outputs; every item is merged
                into a single dict with ``dict.update``.
        """
        results = {}
        for res in validation_step_outputs:
            results.update(res)

        eval_results = self.evaluator.evaluate(results,
                                               self.cfg.save_dir,
                                               self.current_epoch,
                                               self._logger,
                                               rank=self.local_rank)
        save_key = self.cfg.evaluator.save_key

        # ------save best model--------
        if save_key in eval_results:
            metric = eval_results[save_key]
            if metric > self.save_flag:
                self.save_flag = metric
                best_save_path = os.path.join(self.cfg.save_dir, 'model_best')
                mkdir(self.local_rank, best_save_path)
                # TODO: replace with saving checkpoint
                save_model(self.local_rank, self.model,
                           os.path.join(best_save_path, 'model_best.pth'),
                           self.current_epoch + 1, self.global_step)
                txt_path = os.path.join(best_save_path, "eval_results.txt")
                # only ranks below 1 append to the shared results file
                if self.local_rank < 1:
                    with open(txt_path, "a") as f:
                        f.write("Epoch:{}\n".format(self.current_epoch + 1))
                        for k, v in eval_results.items():
                            f.write("{}: {}\n".format(k, v))
        else:
            # the warning belongs to the "key missing" case, not to "metric did not improve"
            warnings.warn(
                'Warning! Save_key is not in eval results! Only save model last!'
            )
コード例 #3
0
ファイル: trainer.py プロジェクト: wwdok/nanodet
    def run(self, train_loader, val_loader, evaluator):
        """
        Run training from ``self.epoch`` up to ``cfg.schedule.total_epochs``.

        Each epoch trains once, steps the LR scheduler and overwrites
        ``model_last.pth``. On validation epochs the evaluator is run and,
        when the configured save key improves, ``model_best/model_best.pth``
        and ``eval_results.txt`` are refreshed.

        :param train_loader: loader passed to ``run_epoch`` in ``'train'`` mode
        :param val_loader: loader passed to ``run_epoch`` in ``'val'`` mode
        :param evaluator: object whose ``evaluate`` returns a metrics mapping
        """
        first_epoch = self.epoch
        best_metric = -10

        # optional warm-up, only when starting from the very first epoch
        if self.cfg.schedule.warmup.steps > 0 and first_epoch == 1:
            self.logger.log('Start warming up...')
            self.warm_up(train_loader)
            for group in self.optimizer.param_groups:
                group['lr'] = self.cfg.schedule.optimizer.lr

        self._init_scheduler()
        self.lr_scheduler.last_epoch = first_epoch - 1

        # resume learning rate of last epoch
        if first_epoch > 1:
            for group, lr in zip(self.optimizer.param_groups, self.lr_scheduler.get_lr()):
                group['lr'] = lr

        final_epoch = self.cfg.schedule.total_epochs
        for epoch in range(first_epoch, final_epoch + 1):
            _, train_losses = self.run_epoch(epoch, train_loader, mode='train')
            self.lr_scheduler.step()
            last_ckpt = os.path.join(self.cfg.save_dir, 'model_last.pth')
            save_model(self.rank, self.model, last_ckpt, epoch, self._iter, self.optimizer)
            for name, value in train_losses.items():
                self.logger.scalar_summary('Epoch_loss/' + name, 'train', value, epoch)

            # --------evaluate----------
            if self.cfg.schedule.val_intervals > 0 and epoch % self.cfg.schedule.val_intervals == 0:
                with torch.no_grad():
                    val_outputs, val_losses = self.run_epoch(self.epoch, val_loader, mode='val')
                for name, value in val_losses.items():
                    self.logger.scalar_summary('Epoch_loss/' + name, 'val', value, epoch)
                eval_results = evaluator.evaluate(val_outputs, self.cfg.save_dir, epoch, self.logger,
                                                  rank=self.rank)
                if self.cfg.evaluator.save_key not in eval_results:
                    warnings.warn('Warning! Save_key is not in eval results! Only save model last!')
                else:
                    metric = eval_results[self.cfg.evaluator.save_key]
                    if metric > best_metric:
                        # ------save best model--------
                        best_metric = metric
                        best_dir = os.path.join(self.cfg.save_dir, 'model_best')
                        mkdir(self.rank, best_dir)
                        best_ckpt = os.path.join(best_dir, 'model_best.pth')
                        save_model(self.rank, self.model, best_ckpt, epoch, self._iter, self.optimizer)
                        report_path = os.path.join(best_dir, "eval_results.txt")
                        # only ranks below 1 append to the shared results file
                        if self.rank < 1:
                            with open(report_path, "a") as report:
                                report.write("Epoch:{}\n".format(epoch))
                                for name, value in eval_results.items():
                                    report.write("{}: {}\n".format(name, value))
            self.epoch += 1
コード例 #4
0
def main(args):
    """
    Evaluate a checkpoint with a Lightning ``Trainer.test`` run.

    Loads the config named by ``args.config``, redirects ``cfg.save_dir`` to a
    timestamped sub-directory, builds the dataset selected by ``args.task``
    (``"val"`` or ``"test"``), loads the weights from ``args.model`` and runs
    the test loop.
    """
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # every run writes into its own timestamped directory
    cfg.defrost()
    timestr = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = NanoDetLightningLogger(cfg.save_dir)

    assert args.task in ["val", "test"]
    cfg.update({"test_mode": args.task})

    logger.info("Setting up data...")
    val_dataset = build_dataset(cfg.data.val, args.task)
    loader_options = dict(
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=False,
    )
    val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_options)
    evaluator = build_evaluator(cfg.evaluator, val_dataset)

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    ckpt = torch.load(args.model)
    # checkpoints without the Lightning version marker are converted first
    if "pytorch-lightning_version" not in ckpt:
        warnings.warn(
            "Warning! Old .pth checkpoint is deprecated. "
            "Convert the checkpoint with tools/convert_old_checkpoint.py ")
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt["state_dict"])

    # gpu_ids == -1 selects the CPU accelerator
    run_on_cpu = cfg.device.gpu_ids == -1
    if run_on_cpu:
        logger.info("Using CPU training")
        accelerator = "cpu"
        devices = None
    else:
        accelerator = "gpu"
        devices = cfg.device.gpu_ids

    trainer_options = dict(
        default_root_dir=cfg.save_dir,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        logger=logger,
    )
    trainer = pl.Trainer(**trainer_options)
    logger.info("Starting testing...")
    trainer.test(task, val_dataloader)
コード例 #5
0
ファイル: train.py プロジェクト: ycdhqzhiai/nanodet
def main(args):
    """
    Train a ``TrainingTask`` with a PyTorch Lightning ``Trainer``.

    Loads the config named by ``args.config``, checks that the head's class
    count matches ``cfg.class_names``, builds the datasets, evaluator and data
    loaders, optionally loads weights from ``cfg.schedule.load_model`` and
    starts ``trainer.fit``.

    Raises:
        ValueError: if ``cfg.model.arch.head.num_classes`` differs from
            ``len(cfg.class_names)``.
    """
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        raise ValueError(
            'cfg.model.arch.head.num_classes must equal len(cfg.class_names),but got {} and {}'.format(
                cfg.model.arch.head.num_classes, len(cfg.class_names)))
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        pl.seed_everything(args.seed)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    evaluator = build_evaluator(cfg, val_dataset)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )
    # TODO: batch eval
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    if 'load_model' in cfg.schedule:
        ckpt = torch.load(cfg.schedule.load_model)
        # checkpoints without the Lightning version marker are converted first
        if 'pytorch-lightning_version' not in ckpt:
            warnings.warn('Warning! Old .pth checkpoint is deprecated. '
                          'Convert the checkpoint with tools/convert_old_checkpoint.py ')
            ckpt = convert_old_model(ckpt)
        task.load_state_dict(ckpt['state_dict'], strict=False)

    # resume from <save_dir>/model_last.ckpt only when the schedule asks for it
    if 'resume' in cfg.schedule:
        model_resume_path = os.path.join(cfg.save_dir, 'model_last.ckpt')
    else:
        model_resume_path = None

    trainer_options = dict(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        resume_from_checkpoint=model_resume_path,
        callbacks=[ProgressBar(refresh_rate=0)],  # disable tqdm bar
    )
    trainer = pl.Trainer(**trainer_options)

    trainer.fit(task, train_dataloader, val_dataloader)
コード例 #6
0
def main(args):
    """
    Run the deprecated (non-Lightning) test/validation entry point.

    Loads the config named by ``args.config``, redirects ``cfg.save_dir`` to a
    timestamped sub-directory, loads the weights from ``args.model`` and runs
    one epoch over the validation loader in ``args.task`` mode.

    * ``args.task == 'test'``: the results are converted with
      ``evaluator.results2json`` and written to ``results<timestamp>.json``.
    * ``args.task == 'val'``: the results are evaluated and, when
      ``args.save_result`` is truthy, appended to
      ``eval_results<timestamp>.txt``.

    The JSON output file is written inside a ``with`` block so the handle is
    closed (and the data flushed) deterministically.
    """
    warnings.warn(
        'Warning! Old testing code is deprecated and will be deleted '
        'in next version. Please use tools/test.py')
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # every run writes into its own timestamped directory
    cfg.defrost()
    timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    cfg.freeze()
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    trainer = build_trainer(local_rank, cfg, model, logger)
    cfg.schedule.update({'load_model': args.model})
    trainer.load_model(cfg)
    evaluator = build_evaluator(cfg, val_dataset)
    logger.log('Starting testing...')
    with torch.no_grad():
        results, val_loss_dict = trainer.run_epoch(0,
                                                   val_dataloader,
                                                   mode=args.task)
    if args.task == 'test':
        res_json = evaluator.results2json(results)
        json_path = os.path.join(cfg.save_dir,
                                 'results{}.json'.format(timestr))
        # close the file explicitly instead of leaking the handle
        with open(json_path, 'w') as json_file:
            json.dump(res_json, json_file)
    elif args.task == 'val':
        eval_results = evaluator.evaluate(results,
                                          cfg.save_dir,
                                          rank=local_rank)
        if args.save_result:
            txt_path = os.path.join(cfg.save_dir,
                                    "eval_results{}.txt".format(timestr))
            with open(txt_path, "a") as f:
                for k, v in eval_results.items():
                    f.write("{}: {}\n".format(k, v))
コード例 #7
0
ファイル: train.py プロジェクト: AlbertiPot/nanodet
def main(args):
    """
    Run the deprecated (non-Lightning) training entry point.

    Loads the config named by ``args.config``, builds the model, datasets and
    data loaders (with a ``DistributedSampler`` when more than one GPU id is
    configured), optionally loads/resumes weights, and calls ``trainer.run``.
    """
    warnings.warn('Warning! Old training code is deprecated and will be deleted '
                  'in next version. Please use tools/train.py')
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # mkdir is wrapped with @rank_filter, so the main process creates save_dir
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    distributed = len(cfg.device.gpu_ids) > 1
    if distributed:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        # the sampler takes over ordering, so no ``shuffle`` is passed here
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg.device.batchsize_per_gpu,
            num_workers=cfg.device.workers_per_gpu,
            pin_memory=True,
            collate_fn=collate_function,
            sampler=train_sampler,
            drop_last=True,
        )
    else:
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg.device.batchsize_per_gpu,
            shuffle=True,
            num_workers=cfg.device.workers_per_gpu,
            pin_memory=True,
            collate_fn=collate_function,
            drop_last=True,
        )

    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )

    trainer = build_trainer(local_rank, cfg, model, logger)

    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator)
コード例 #8
0
ファイル: train_pl.py プロジェクト: wwdok/nanodet
def main(args):
    """
    Train a ``TrainingTask`` with a PyTorch Lightning ``Trainer`` (DDP).

    Loads the config named by ``args.config``, optionally seeds the RNGs,
    builds the datasets, evaluator, task and data loaders, then calls
    ``trainer.fit``.
    """
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)
    # TODO: replace with lightning random seed
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator, logger)

    train_loader_options = dict(
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )
    train_dataloader = torch.utils.data.DataLoader(train_dataset, **train_loader_options)

    # TODO: batch eval
    val_loader_options = dict(
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )
    val_dataloader = torch.utils.data.DataLoader(val_dataset, **val_loader_options)

    trainer_options = dict(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
    )
    trainer = pl.Trainer(**trainer_options)

    trainer.fit(task, train_dataloader, val_dataloader)
コード例 #9
0
def main(args):
    """
    Evaluate a checkpoint with a Lightning ``Trainer.test`` run (DDP).

    Loads the config named by ``args.config``, redirects ``cfg.save_dir`` to a
    timestamped sub-directory, builds the dataset selected by ``args.task``
    (``'val'`` or ``'test'``), loads the weights from ``args.model`` and runs
    the test loop.
    """
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # every run writes into its own timestamped directory
    cfg.defrost()
    timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    assert args.task in ['val', 'test']
    cfg.update({'test_mode': args.task})

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    loader_options = dict(
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )
    val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_options)
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    ckpt = torch.load(args.model)
    # checkpoints without the Lightning version marker are converted first
    if 'pytorch-lightning_version' not in ckpt:
        warnings.warn(
            'Warning! Old .pth checkpoint is deprecated. '
            'Convert the checkpoint with tools/convert_old_checkpoint.py ')
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt['state_dict'])

    trainer_options = dict(
        default_root_dir=cfg.save_dir,
        gpus=cfg.device.gpu_ids,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
    )
    trainer = pl.Trainer(**trainer_options)
    logger.log('Starting testing...')
    trainer.test(task, val_dataloader)
コード例 #10
0
    def validation_epoch_end(self, validation_step_outputs):
        """
        Called at the end of the validation epoch with the
        outputs of all validation steps. Evaluates the results
        and saves the best model.

        When ``torch.distributed`` is available and initialized the merged
        results are passed through ``gather_results`` first; a falsy gathered
        result skips evaluation on this rank.

        The best checkpoint is refreshed only when ``cfg.evaluator.save_key``
        is present in the evaluation results and its value is greater than
        ``self.save_flag``. A missing key produces a warning instead of a
        ``KeyError``, and a metric that merely fails to improve no longer
        triggers the (unrelated) "Save_key is not in eval results" warning.

        Args:
            validation_step_outputs: A list of val outputs; every item is
                merged into a single dict with ``dict.update``.

        """
        results = {}
        for res in validation_step_outputs:
            results.update(res)
        all_results = (gather_results(results) if dist.is_available()
                       and dist.is_initialized() else results)
        if all_results:
            eval_results = self.evaluator.evaluate(all_results,
                                                   self.cfg.save_dir,
                                                   rank=self.local_rank)
            save_key = self.cfg.evaluator.save_key
            if save_key in eval_results:
                metric = eval_results[save_key]
                # save best model
                if metric > self.save_flag:
                    self.save_flag = metric
                    best_save_path = os.path.join(self.cfg.save_dir,
                                                  "model_best")
                    mkdir(self.local_rank, best_save_path)
                    self.trainer.save_checkpoint(
                        os.path.join(best_save_path, "model_best.ckpt"))
                    self.save_model_state(
                        os.path.join(best_save_path,
                                     "nanodet_model_best.pth"))
                    txt_path = os.path.join(best_save_path,
                                            "eval_results.txt")
                    # only ranks below 1 append to the shared results file
                    if self.local_rank < 1:
                        with open(txt_path, "a") as f:
                            f.write("Epoch:{}\n".format(self.current_epoch +
                                                        1))
                            for k, v in eval_results.items():
                                f.write("{}: {}\n".format(k, v))
            else:
                # the warning belongs to the "key missing" case,
                # not to "metric did not improve"
                warnings.warn(
                    "Warning! Save_key is not in eval results! Only save model last!"
                )
            self.logger.log_metrics(eval_results, self.current_epoch + 1)
        else:
            self.logger.info("Skip val on rank {}".format(self.local_rank))
コード例 #11
0
ファイル: task.py プロジェクト: karthiksharma98/nanodet
 def validation_epoch_end(self, validation_step_outputs):
     """
     Evaluate the merged validation outputs, save the best model and log metrics.

     The best checkpoint is refreshed only when ``cfg.evaluator.save_key`` is
     present in the evaluation results and its value is greater than
     ``self.save_flag``. A missing key produces a warning instead of a
     ``KeyError``; a metric that does not improve is silently ignored.

     Args:
         validation_step_outputs: A list of val outputs; every item is merged
             into a single dict with ``dict.update``.
     """
     results = {}
     for res in validation_step_outputs:
         results.update(res)
     eval_results = self.evaluator.evaluate(results, self.cfg.save_dir, self.current_epoch+1,
                                            self._logger, rank=self.local_rank)
     save_key = self.cfg.evaluator.save_key
     if save_key in eval_results:
         metric = eval_results[save_key]
         # save best model
         if metric > self.save_flag:
             self.save_flag = metric
             best_save_path = os.path.join(self.cfg.save_dir, 'model_best')
             mkdir(self.local_rank, best_save_path)
             self.trainer.save_checkpoint(os.path.join(best_save_path, "model_best.ckpt"))
             txt_path = os.path.join(best_save_path, "eval_results.txt")
             # only ranks below 1 append to the shared results file
             if self.local_rank < 1:
                 with open(txt_path, "a") as f:
                     f.write("Epoch:{}\n".format(self.current_epoch+1))
                     for k, v in eval_results.items():
                         f.write("{}: {}\n".format(k, v))
     else:
         # the warning belongs to the "key missing" case, not to "metric did not improve"
         warnings.warn('Warning! Save_key is not in eval results! Only save model last!')
     if self.log_style == 'Lightning':
         for k, v in eval_results.items():
             self.log('Val/' + k, v, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
コード例 #12
0
def main(args):
    """
    Train a ``TrainingTask`` with a PyTorch Lightning ``Trainer``.

    Loads the config named by ``args.config``, checks that the head's class
    count matches ``cfg.class_names``, dumps the config through the logger,
    builds datasets, evaluator and data loaders, optionally loads weights
    from ``cfg.schedule.load_model`` and starts ``trainer.fit``.

    Raises:
        ValueError: if ``cfg.model.arch.head.num_classes`` differs from
            ``len(cfg.class_names)``.
    """
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        mismatch_message = (
            "cfg.model.arch.head.num_classes must equal len(cfg.class_names), "
            "but got {} and {}"
        )
        raise ValueError(
            mismatch_message.format(cfg.model.arch.head.num_classes,
                                    len(cfg.class_names)))
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)

    logger = NanoDetLightningLogger(cfg.save_dir)
    logger.dump_cfg(cfg)

    if args.seed is not None:
        logger.info("Set random seed to {}".format(args.seed))
        pl.seed_everything(args.seed)

    logger.info("Setting up data...")
    train_dataset = build_dataset(cfg.data.train, "train")
    val_dataset = build_dataset(cfg.data.val, "test")

    evaluator = build_evaluator(cfg.evaluator, val_dataset)

    train_loader_options = dict(
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=True,
    )
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   **train_loader_options)
    val_loader_options = dict(
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=False,
    )
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 **val_loader_options)

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    if "load_model" in cfg.schedule:
        ckpt = torch.load(cfg.schedule.load_model)
        # checkpoints without the Lightning version marker are converted first
        if "pytorch-lightning_version" not in ckpt:
            warnings.warn(
                "Warning! Old .pth checkpoint is deprecated. "
                "Convert the checkpoint with tools/convert_old_checkpoint.py ")
            ckpt = convert_old_model(ckpt)
        load_model_weight(task.model, ckpt, logger)
        logger.info("Loaded model weight from {}".format(
            cfg.schedule.load_model))

    # resume from <save_dir>/model_last.ckpt only when the schedule asks for it
    if "resume" in cfg.schedule:
        model_resume_path = os.path.join(cfg.save_dir, "model_last.ckpt")
    else:
        model_resume_path = None

    # "ddp" is requested only when more than one GPU id is configured
    accelerator = "ddp" if len(cfg.device.gpu_ids) > 1 else None

    trainer_options = dict(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator=accelerator,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        resume_from_checkpoint=model_resume_path,
        callbacks=[ProgressBar(refresh_rate=0)],  # disable tqdm bar
        logger=logger,
        benchmark=True,
        gradient_clip_val=cfg.get("grad_clip", 0.0),
    )
    trainer = pl.Trainer(**trainer_options)

    trainer.fit(task, train_dataloader, val_dataloader)
コード例 #13
0
    def startNanodetTrain(self):
        """
        Build model, data loaders, trainer and evaluator, then start training.

        Settings are read from ``self.nanoTrainConfig`` (keys used here:
        ``'cfg'``, ``'local_rank'``, ``'save_dir'`` and the optional
        ``'seed'``); the same mapping is forwarded to ``build_dataset`` and
        ``trainer.run``.
        """
        # load the configuration file
        load_config(cfg, self.nanoTrainConfig['cfg'])
        # rank of this process in distributed training
        local_rank = int(self.nanoTrainConfig["local_rank"])
        # torch.backends.cudnn.enabled = True
        # torch.backends.cudnn.benchmark = True
        mkdir(local_rank, self.nanoTrainConfig["save_dir"])
        logger = Logger(local_rank, self.nanoTrainConfig["save_dir"])
        if "seed" in self.nanoTrainConfig.keys():
            seed_message = 'Set random seed to {}'.format(
                self.nanoTrainConfig['seed'])
            logger.log(seed_message)
            self.init_seeds(self.nanoTrainConfig['seed'])

        # 1. create the model
        model = build_model(cfg.model)
        model = model.cpu()

        # 2. load the data
        logger.log('Setting up data...')
        train_dataset = build_dataset(cfg.data.train, 'train',
                                      self.nanoTrainConfig)
        val_dataset = build_dataset(cfg.data.val, 'test', self.nanoTrainConfig)

        multi_gpu = len(cfg.device.gpu_ids) > 1
        if multi_gpu:
            print('rank = ', local_rank)
            num_gpus = torch.cuda.device_count()
            torch.cuda.set_device(local_rank % num_gpus)
            dist.init_process_group(backend='nccl')
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)
            # the sampler takes over ordering, so no ``shuffle`` is passed here
            train_loader_options = dict(
                batch_size=cfg.device.batchsize_per_gpu,
                num_workers=cfg.device.workers_per_gpu,
                pin_memory=True,
                collate_fn=collate_function,
                sampler=train_sampler,
                drop_last=True,
            )
        else:
            print("加载数据...")
            train_loader_options = dict(
                batch_size=cfg.device.batchsize_per_gpu,
                shuffle=True,
                num_workers=cfg.device.workers_per_gpu,
                pin_memory=True,
                collate_fn=collate_function,
                drop_last=True,
            )
        train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                       **train_loader_options)

        val_loader_options = dict(
            batch_size=1,
            shuffle=False,
            num_workers=1,
            pin_memory=True,
            collate_fn=collate_function,
            drop_last=True,
        )
        val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                     **val_loader_options)

        trainer = build_trainer(local_rank, cfg, model, logger)

        if 'load_model' in cfg.schedule:
            trainer.load_model(cfg)
        if 'resume' in cfg.schedule:
            trainer.resume(cfg)

        evaluator = build_evaluator(cfg, val_dataset)

        logger.log('Starting training...')
        trainer.run(train_dataloader, val_dataloader, evaluator,
                    self.nanoTrainConfig)
コード例 #14
0
ファイル: trainer.py プロジェクト: CaptainEven/MyNanoDet
    def run(self, train_loader, val_loader, evaluator):
        """
        Run training from ``self.epoch`` up to ``cfg.schedule.total_epochs``.

        Each epoch trains once, steps the LR scheduler and overwrites
        ``model_last.pth``. Without an evaluator a per-epoch checkpoint is
        written to ``<save_dir>/epoch_<n>/model_best.pth``. With an evaluator,
        every ``cfg.schedule.val_intervals`` epochs the validation set is run
        and, when the configured save key improves,
        ``model_best/model_best.pth`` and ``eval_results.txt`` are refreshed.

        ``self.epoch`` is advanced at the end of every iteration, including
        the ones where the evaluator returns ``None``, so it stays in step
        with the loop's ``epoch``.

        :param train_loader: loader passed to ``run_epoch`` in ``'train'`` mode
        :param val_loader: loader passed to ``run_epoch`` in ``'val'`` mode
        :param evaluator: evaluator object, or ``None`` to skip evaluation
        """
        start_epoch = self.epoch
        save_flag = -10
        if self.cfg.schedule.warmup.steps > 0 and start_epoch == 1:
            self.logger.log('Start warming up...')
            self.warm_up(train_loader)
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.cfg.schedule.optimizer.lr

        self._init_scheduler()
        self.lr_scheduler.last_epoch = start_epoch - 1

        # ---------- traverse each epoch
        for epoch in range(start_epoch, self.cfg.schedule.total_epochs + 1):
            # ----- run an epoch on train dataset, schedule lr, save model and logging
            ret_dict, train_loss_dict = self.run_epoch(epoch,
                                                       train_loader,
                                                       mode='train')
            self.lr_scheduler.step()
            save_model(self.rank, self.model,
                       os.path.join(self.cfg.save_dir, 'model_last.pth'),
                       epoch, self._iter, self.optimizer)
            for k, v in train_loss_dict.items():
                self.logger.scalar_summary('Epoch_loss/' + k, 'train', v,
                                           epoch)

            # --------evaluate----------
            if evaluator is None:
                # do not evaluate, save current epoch's checkpoint
                best_save_path = os.path.join(self.cfg.save_dir,
                                              'epoch_{:d}'.format(epoch))
                mkdir(self.rank, best_save_path)
                save_model(self.rank, self.model,
                           os.path.join(best_save_path, 'model_best.pth'),
                           epoch, self._iter, self.optimizer)
            elif epoch % self.cfg.schedule.val_intervals == 0:
                # run one epoch over the validation dataset without gradients
                with torch.no_grad():
                    ret_dict, val_loss_dict = self.run_epoch(self.epoch,
                                                             val_loader,
                                                             mode='val')

                for k, v in val_loss_dict.items():
                    self.logger.scalar_summary('Epoch_loss/' + k, 'val', v,
                                               epoch)

                # ----- do evaluation, ret_dict, key: img_id, val: dets_dict
                # NOTE(review): any other evaluator name leaves eval_results
                # unassigned here — confirm only these two names are configured.
                if self.cfg.evaluator.name == 'CocoDetectionEvaluator':
                    eval_results = evaluator.evaluate(ret_dict,
                                                      self.cfg.save_dir,
                                                      epoch,
                                                      self.logger,
                                                      rank=self.rank)
                elif self.cfg.evaluator.name == 'MyDetectionEvaluator':
                    eval_results = evaluator.evaluate(ret_dict)

                # A None result means "nothing to compare". Do not `continue`
                # here: that would skip the epoch counter update below.
                if eval_results is not None:
                    if self.cfg.evaluator.save_key in eval_results:
                        metric = eval_results[self.cfg.evaluator.save_key]
                        if metric > save_flag:
                            # ------save best model--------
                            save_flag = metric
                            best_save_path = os.path.join(
                                self.cfg.save_dir, 'model_best')
                            mkdir(self.rank, best_save_path)
                            save_model(
                                self.rank, self.model,
                                os.path.join(best_save_path, 'model_best.pth'),
                                epoch, self._iter, self.optimizer)
                            txt_path = os.path.join(best_save_path,
                                                    "eval_results.txt")
                            # only ranks below 1 append to the results file
                            if self.rank < 1:
                                with open(txt_path, "a") as f:
                                    f.write("Epoch:{}\n".format(epoch))
                                    for k, v in eval_results.items():
                                        f.write("{}: {}\n".format(k, v))
                    else:
                        warnings.warn(
                            'Warning! Save_key is not in eval results! Only save model last!'
                        )

            self.epoch += 1
コード例 #15
0
ファイル: train.py プロジェクト: CaptainEven/MyNanoDet
def run(args):
    """
    Load the config, build the model, datasets, data loaders, trainer and
    evaluator, then hand the loaders and evaluator to ``trainer.run``.

    :param args: object exposing ``config`` (passed to ``load_config``),
        ``local_rank`` (converted with ``int``), ``seed`` (``None`` skips
        seeding) and ``is_debug`` (truthy makes every loader use 0 workers).
    :return: None
    """
    load_config(cfg, args.config)

    # Integer rank handed to mkdir/Logger/build_trainer; in the multi-GPU
    # branch it also selects the CUDA device.
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    # The train loader differs between the two setups only in how samples
    # are ordered: a DistributedSampler when several GPU ids are configured,
    # plain shuffling otherwise.
    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        visible_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % visible_gpus)
        dist.init_process_group(backend='nccl')
        ordering_kwargs = {
            'sampler': torch.utils.data.distributed.DistributedSampler(train_dataset),
        }
    else:
        ordering_kwargs = {'shuffle': True}

    # Debug mode keeps data loading in the main process (0 workers).
    debug_mode = args.is_debug

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        num_workers=0 if debug_mode else cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
        **ordering_kwargs
    )

    # Validation always runs one sample per batch, in dataset order.
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=0 if debug_mode else 1,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True
    )

    # ----- Build the trainer and optionally restore weights / training state
    trainer = build_trainer(local_rank, cfg, model, logger)

    for schedule_key, restore in (('load_model', trainer.load_model),
                                  ('resume', trainer.resume)):
        if schedule_key in cfg.schedule:
            restore(cfg)

    # ----- Build an evaluator
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_data_loader, val_data_loader, evaluator)