Ejemplo n.º 1
0
def main(args):
    warnings.warn(
        'Warning! Old testing code is deprecated and will be deleted '
        'in next version. Please use tools/test.py')
    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    cfg.freeze()
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    trainer = build_trainer(local_rank, cfg, model, logger)
    cfg.schedule.update({'load_model': args.model})
    trainer.load_model(cfg)
    evaluator = build_evaluator(cfg, val_dataset)
    logger.log('Starting testing...')
    with torch.no_grad():
        results, val_loss_dict = trainer.run_epoch(0,
                                                   val_dataloader,
                                                   mode=args.task)
    if args.task == 'test':
        res_json = evaluator.results2json(results)
        json_path = os.path.join(cfg.save_dir,
                                 'results{}.json'.format(timestr))
        json.dump(res_json, open(json_path, 'w'))
    elif args.task == 'val':
        eval_results = evaluator.evaluate(results,
                                          cfg.save_dir,
                                          rank=local_rank)
        if args.save_result:
            txt_path = os.path.join(cfg.save_dir,
                                    "eval_results{}.txt".format(timestr))
            with open(txt_path, "a") as f:
                for k, v in eval_results.items():
                    f.write("{}: {}\n".format(k, v))
Ejemplo n.º 2
0
def main(args):
    warnings.warn('Warning! Old training code is deprecated and will be deleted '
                  'in next version. Please use tools/train.py')
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)                             # mkdir用@rank_filter包裹,主进程创建save_dir
    logger = Logger(local_rank, cfg.save_dir)
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                       num_workers=cfg.device.workers_per_gpu, pin_memory=True,
                                                       collate_fn=collate_function, sampler=train_sampler,
                                                       drop_last=True)
    else:
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                       shuffle=True, num_workers=cfg.device.workers_per_gpu,
                                                       pin_memory=True, collate_fn=collate_function, drop_last=True)

    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                 shuffle=False, num_workers=cfg.device.workers_per_gpu,
                                                 pin_memory=True, collate_fn=collate_function, drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)

    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator)
Ejemplo n.º 3
0
    def startNanodetTrain(self):
        #加载配置文件
        load_config(cfg, self.nanoTrainConfig['cfg'])
        #判断分布式训练当中该主机的角色
        local_rank = int(self.nanoTrainConfig["local_rank"])
        # torch.backends.cudnn.enabled = True
        # torch.backends.cudnn.benchmark = True
        mkdir(local_rank, self.nanoTrainConfig["save_dir"])
        logger = Logger(local_rank, self.nanoTrainConfig["save_dir"])
        if self.nanoTrainConfig.keys().__contains__("seed"):
            logger.log('Set random seed to {}'.format(
                self.nanoTrainConfig['seed']))
            self.init_seeds(self.nanoTrainConfig['seed'])

        #1.创建模型
        model = build_model(cfg.model)
        model = model.cpu()

        #2.加载数据
        logger.log('Setting up data...')
        train_dataset = build_dataset(cfg.data.train, 'train',
                                      self.nanoTrainConfig)
        val_dataset = build_dataset(cfg.data.val, 'test', self.nanoTrainConfig)

        if len(cfg.device.gpu_ids) > 1:
            print('rank = ', local_rank)
            num_gpus = torch.cuda.device_count()
            torch.cuda.set_device(local_rank % num_gpus)
            dist.init_process_group(backend='nccl')
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=cfg.device.batchsize_per_gpu,
                num_workers=cfg.device.workers_per_gpu,
                pin_memory=True,
                collate_fn=collate_function,
                sampler=train_sampler,
                drop_last=True)
        else:
            print("加载数据...")
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=cfg.device.batchsize_per_gpu,
                shuffle=True,
                num_workers=cfg.device.workers_per_gpu,
                pin_memory=True,
                collate_fn=collate_function,
                drop_last=True)

        val_dataloader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=1,
            pin_memory=True,
            collate_fn=collate_function,
            drop_last=True)

        trainer = build_trainer(local_rank, cfg, model, logger)

        if 'load_model' in cfg.schedule:
            trainer.load_model(cfg)
        if 'resume' in cfg.schedule:
            trainer.resume(cfg)

        evaluator = build_evaluator(cfg, val_dataset)

        logger.log('Starting training...')
        trainer.run(train_dataloader, val_dataloader, evaluator,
                    self.nanoTrainConfig)
Ejemplo n.º 4
0
def run(args):
    """
    :param args:
    :return:
    """
    load_config(cfg, args.config)

    local_rank = int(args.local_rank)  # what's this?
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')  # build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    if len(cfg.device.gpu_ids) > 1:  # More than one GPU(distributed training)
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

        if args.is_debug:
            train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                            batch_size=cfg.device.batchsize_per_gpu,
                                                            num_workers=0,
                                                            pin_memory=True,
                                                            collate_fn=collate_function,
                                                            sampler=train_sampler,
                                                            drop_last=True)
        else:
            train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                            batch_size=cfg.device.batchsize_per_gpu,
                                                            num_workers=cfg.device.workers_per_gpu,
                                                            pin_memory=True,
                                                            collate_fn=collate_function,
                                                            sampler=train_sampler,
                                                            drop_last=True)
    else:
        if args.is_debug:
            train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                            batch_size=cfg.device.batchsize_per_gpu,
                                                            shuffle=True,
                                                            num_workers=0,
                                                            pin_memory=True,
                                                            collate_fn=collate_function,
                                                            drop_last=True)
        else:
            train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                            batch_size=cfg.device.batchsize_per_gpu,
                                                            shuffle=True,
                                                            num_workers=cfg.device.workers_per_gpu,
                                                            pin_memory=True,
                                                            collate_fn=collate_function,
                                                            drop_last=True)

    if args.is_debug:
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                      batch_size=1,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      pin_memory=True,
                                                      collate_fn=collate_function, drop_last=True)
    else:
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                      batch_size=1,
                                                      shuffle=False,
                                                      num_workers=1,
                                                      pin_memory=True,
                                                      collate_fn=collate_function, drop_last=True)

    # -----
    trainer = build_trainer(local_rank, cfg, model, logger)

    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)

    # ----- Build a evaluator
    evaluator = build_evaluator(cfg, val_dataset)
    # evaluator = None

    logger.log('Starting training...')
    trainer.run(train_data_loader, val_data_loader, evaluator)