Ejemplo n.º 1
0
def main():
    """Entry point for checkpoint evaluation.

    Parses the config, optionally initialises distributed mode through the
    ``init_dist_<launcher>`` helper in ``common_utils``, prepares the
    evaluation output directory and logger, builds the dataloader (with
    ``training=False``) and the network, then evaluates either every
    checkpoint (``args.eval_all``) or a single checkpoint.
    """
    args, cfg = parse_config()

    dist_test = args.launcher != 'none'
    if dist_test:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        args.batch_size, cfg.LOCAL_RANK = init_dist(
            args.batch_size, args.tcp_port, args.local_rank, backend='nccl'
        )

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)

    # Results go under <output>/eval/<epoch or eval_all_default>[/<eval_tag>].
    eval_root = output_dir / 'eval'
    if args.eval_all:
        eval_output_dir = eval_root / 'eval_all_default'
    else:
        # The last run of digits in the checkpoint path is used as the epoch id.
        digit_runs = re.findall(r'\d+', args.ckpt) if args.ckpt is not None else []
        epoch_id = digit_runs[-1] if digit_runs else 'no_number'
        eval_output_dir = eval_root / ('epoch_%s' % epoch_id) / cfg.DATA_CONFIG.DATA_SPLIT['test']

    if args.eval_tag is not None:
        eval_output_dir = eval_output_dir / args.eval_tag
    eval_output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = eval_output_dir / ('log_eval_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_test:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    if args.ckpt_dir is not None:
        ckpt_dir = args.ckpt_dir
    else:
        ckpt_dir = output_dir / 'ckpt'

    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_test,
        workers=args.workers,
        logger=logger,
        training=False,
    )

    model = build_network(model_cfg=cfg.MODEL, num_class=len(cfg.CLASS_NAMES), dataset=test_set)
    with torch.no_grad():
        if args.eval_all:
            repeat_eval_ckpt(model, test_loader, args, eval_output_dir, logger, ckpt_dir, dist_test=dist_test)
        else:
            eval_single_ckpt(model, test_loader, args, eval_output_dir, logger, epoch_id, dist_test=dist_test)
Ejemplo n.º 2
0
def main():
    """Train a model and then evaluate the checkpoints of the last epochs.

    Steps performed, in order:
      1. Parse the config and, unless ``args.launcher == 'none'``, initialise
         distributed mode via ``common_utils.init_dist_<launcher>``.
      2. Create the output / checkpoint directories, the logger and (on local
         rank 0) the TensorBoard writer; copy the config file next to the logs.
      3. Build the training dataloader, network and optimizer.
      4. Load pretrained weights and/or resume from ``args.ckpt`` or, failing
         that, from the most recently modified checkpoint in the ckpt directory.
      5. Train via ``train_model``.
      6. Build a ``training=False`` dataloader and run ``repeat_eval_ckpt``
         starting from ``max(args.epochs - 10, 0)``.
    """
    args, cfg = parse_config()
    if args.launcher == 'none':
        dist_train = False
    else:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        args.batch_size, cfg.LOCAL_RANK = init_dist(
            args.batch_size, args.tcp_port, args.local_rank, backend='nccl')
        dist_train = True
    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    ckpt_dir = output_dir / 'ckpt'
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    log_file = output_dir / ('log_train_%s.txt' %
                             datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        # Copy the config next to the logs. A direct file copy (instead of a
        # shell `cp` string) also works for paths containing spaces or shell
        # metacharacters. The copy stays best-effort: a failure is logged and
        # training continues.
        import shutil
        try:
            shutil.copy(args.cfg_file, output_dir)
        except OSError as err:
            logger.info('Could not copy %s to %s: %s' %
                        (args.cfg_file, output_dir, err))

    tb_log = SummaryWriter(
        log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs)

    model = build_network(model_cfg=cfg.MODEL,
                          num_class=len(cfg.CLASS_NAMES),
                          dataset=train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.OPTIMIZATION)

    # load checkpoint if it is possible
    # NOTE: `to_cpu` receives the boolean `dist_train`. The previous code passed
    # `dist` (the distributed module object used above), which is always truthy.
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model,
                                    to_cpu=dist_train,
                                    logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt,
                                                           to_cpu=dist_train,
                                                           optimizer=optimizer,
                                                           logger=logger)
        last_epoch = start_epoch + 1
    else:
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if ckpt_list:
            # Resume from the most recently modified checkpoint file.
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1], to_cpu=dist_train, optimizer=optimizer, logger=logger)
            last_epoch = start_epoch + 1

    # before wrap to DistributedDataParallel to support fixed some parameters
    model.train()
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
    logger.info(model)

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer,
        total_iters_each_epoch=len(train_loader),
        total_epochs=args.epochs,
        last_epoch=last_epoch,
        optim_cfg=cfg.OPTIMIZATION)

    # -----------------------start training---------------------------
    logger.info(
        '**********************Start training %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    train_model(model,
                optimizer,
                train_loader,
                model_func=model_fn_decorator(),
                lr_scheduler=lr_scheduler,
                optim_cfg=cfg.OPTIMIZATION,
                start_epoch=start_epoch,
                total_epochs=args.epochs,
                start_iter=it,
                rank=cfg.LOCAL_RANK,
                tb_log=tb_log,
                ckpt_save_dir=ckpt_dir,
                train_sampler=train_sampler,
                lr_warmup_scheduler=lr_warmup_scheduler,
                ckpt_save_interval=args.ckpt_save_interval,
                max_ckpt_save_num=args.max_ckpt_save_num,
                merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch)

    logger.info(
        '**********************End training %s/%s(%s)**********************\n\n\n'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))

    logger.info(
        '**********************Start evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=False)
    eval_output_dir = output_dir / 'eval' / 'eval_with_train'
    eval_output_dir.mkdir(parents=True, exist_ok=True)
    # Only evaluate the last 10 epochs
    args.start_epoch = max(args.epochs - 10, 0)

    # Evaluate the underlying module when the model is wrapped for distributed use.
    repeat_eval_ckpt(model.module if dist_train else model,
                     test_loader,
                     args,
                     eval_output_dir,
                     logger,
                     ckpt_dir,
                     dist_test=dist_train)
    logger.info(
        '**********************End evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
Ejemplo n.º 3
0
def main():
    """Export a model to ONNX and optionally evaluate the export wrapper.

    Loads either a whole pruned model (``args.pruned_model``) or builds the
    network and loads ``args.pretrained_model``, wraps it in
    ``OnnxModelPointPillars``, exports it with a fixed-size (25000 x 4) input
    filled from the point file ``args.pcs_for_export``, and, when
    ``args.eval_onnx_model`` is set, runs ``eval_single_ckpt_onnx``.

    Raises:
        RuntimeError: if neither a pruned nor a pretrained model is given.
    """
    args, cfg = parse_config()

    dist_test = args.launcher != 'none'
    if dist_test:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        total_gpus, cfg.LOCAL_RANK = init_dist(args.tcp_port,
                                               args.local_rank,
                                               backend='nccl')
    else:
        total_gpus = 1

    # Export is only allowed non-distributed with a batch size of exactly 1.
    assert not dist_test and args.batch_size == 1
    if args.batch_size is None:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU
    else:
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)

    eval_output_dir = Path('deploy/eval')
    if args.eval_tag is not None:
        eval_output_dir = eval_output_dir / args.eval_tag
    eval_output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = eval_output_dir / ('log_eval_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_test:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_test,
        workers=args.workers,
        logger=logger,
        training=False)

    # Pick the weights source; a pruned model takes precedence over a pretrained one.
    if args.pruned_model is None and args.pretrained_model is None:
        raise RuntimeError('error: please input weights.')
    if args.pruned_model is not None:
        tag = 'pruned_model'
        model = torch.load(args.pruned_model, map_location=torch.device('cpu'))
    else:
        tag = 'large_model'
        model = build_network(model_cfg=cfg.MODEL,
                              num_class=len(cfg.CLASS_NAMES),
                              dataset=test_set)
        model.load_params_from_file(filename=args.pretrained_model,
                                    logger=logger,
                                    to_cpu=dist_test)

    model = model.cuda()
    model.eval()

    export_model = OnnxModelPointPillars(model)
    export_model.eval()
    export_model = export_model.cuda()

    # Read the example point file as rows of 4 float32 values and copy the rows
    # into the front of a zero-filled (25000, 4) buffer; `valid` carries the
    # number of rows read.
    raw_points = np.fromfile(args.pcs_for_export, dtype=np.float32).reshape(-1, 4)
    points = torch.from_numpy(raw_points).float().cuda()
    points = torch.autograd.Variable(points.contiguous())
    num_points = len(points)
    valid = torch.Tensor([num_points]).int().cuda()
    dummy_input = torch.zeros((25000, 4)).float().cuda()
    dummy_input[:num_points] = points

    onnx_path = "pointpillars_%s.onnx" % tag
    torch.onnx.export(
        export_model, (dummy_input, valid),
        onnx_path,
        verbose=True,
        training=False,
        operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
        opset_version=10,
        input_names=['points', 'valid'],
        output_names=['pointpillars_output1', 'pointpillars_output2'])

    if not args.eval_onnx_model:
        return
    with torch.no_grad():
        eval_single_ckpt_onnx(export_model,
                              test_loader,
                              args,
                              eval_output_dir,
                              logger,
                              tag,
                              dist_test=dist_test)
Ejemplo n.º 4
0
def main():
    """Train a model, logging to file, TensorBoard and (rank 0) Weights & Biases.

    Parses the config, optionally initialises distributed mode via
    ``common_utils.init_dist_<launcher>``, creates the output / checkpoint
    directories and loggers, builds the training dataloader, network and
    optimizer, resumes from ``args.ckpt`` or the most recently modified
    checkpoint in the ckpt directory, and runs ``train_model``.
    """
    args, cfg = parge_config()
    if args.launcher == 'none':
        dist_train = False
    else:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        args.batch_size, cfg.LOCAL_RANK = init_dist(args.batch_size,
                                                    args.tcp_port,
                                                    args.local_rank,
                                                    backend='nccl')
        dist_train = True
    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir = output_dir / 'ckpt'
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    log_file = output_dir / ('log_train_%s.txt' %
                             datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    tb_log = SummaryWriter(
        log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None
    # NOTE(review): this gate uses args.local_rank while the TensorBoard writer
    # above uses cfg.LOCAL_RANK -- confirm the two always agree.
    if args.local_rank == 0:
        wandb.init(project='BEVSEG-PCDet',
                   sync_tensorboard=True,
                   name=args.extra_tag,
                   config={
                       **vars(args),
                       **cfg
                   })

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        cfg.DATA_CONFIG.DATA_DIR,
        args.batch_size,
        dist_train,
        workers=args.workers,
        logger=logger,
        training=True)

    model = build_network(train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.MODEL.TRAIN.OPTIMIZATION)

    # load checkpoint if it is possible
    # NOTE: `to_cpu` receives the boolean `dist_train`. The previous code passed
    # `dist` (the distributed module object used above), which is always truthy.
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model,
                                    to_cpu=dist_train,
                                    logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt,
                                                           to_cpu=dist_train,
                                                           optimizer=optimizer,
                                                           logger=logger)
        last_epoch = start_epoch + 1
    else:
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if ckpt_list:
            # Resume from the most recently modified checkpoint file.
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1], to_cpu=dist_train, optimizer=optimizer, logger=logger)
            last_epoch = start_epoch + 1

    # before wrap to DistributedDataParallel to support fixed some parameters
    model.train()
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()],
            # Currently enabled; remove this argument once no parameters are
            # left unused in the forward pass.
            find_unused_parameters=True)
    logger.info(model)

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer,
        total_iters_each_epoch=len(train_loader),
        total_epochs=args.epochs,
        last_epoch=last_epoch,
        optim_cfg=cfg.MODEL.TRAIN.OPTIMIZATION)

    # -----------------------start training---------------------------
    logger.info(
        '**********************Start training %s(%s)**********************' %
        (cfg.TAG, args.extra_tag))
    train_model(model,
                optimizer,
                train_loader,
                model_func=model_fn_decorator(),
                lr_scheduler=lr_scheduler,
                optim_cfg=cfg.MODEL.TRAIN.OPTIMIZATION,
                start_epoch=start_epoch,
                total_epochs=args.epochs,
                start_iter=it,
                rank=cfg.LOCAL_RANK,
                tb_log=tb_log,
                ckpt_save_dir=ckpt_dir,
                train_sampler=train_sampler,
                lr_warmup_scheduler=lr_warmup_scheduler,
                ckpt_save_interval=args.ckpt_save_interval,
                max_ckpt_save_num=args.max_ckpt_save_num)

    logger.info('**********************End training**********************')
    '''
Ejemplo n.º 5
0
def main():
    """Run a single-process evaluation with hard-coded epoch id and eval tag.

    Uses epoch id ``'110'`` and tag ``'play'`` to build the evaluation output
    directory, builds the dataloader (``training=False``) and the network, and
    calls ``eval_single_ckpt`` inside ``torch.no_grad()``.
    """
    args, cfg = parse_config()

    # This variant never runs distributed.
    dist_test = False
    total_gpus = 1

    if args.batch_size is not None:
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus
    else:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)

    epoch_id = '110'
    eval_tag = 'play'
    eval_output_dir = (output_dir / 'eval' / ('epoch_%s' % epoch_id) /
                       cfg.DATA_CONFIG.DATA_SPLIT['test'] / eval_tag)
    eval_output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = eval_output_dir / ('log_eval_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_test:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_test,
        workers=args.workers,
        logger=logger,
        training=False,
    )

    model = build_network(
        model_cfg=cfg.MODEL,
        num_class=len(cfg.CLASS_NAMES),
        dataset=test_set,
    )
    with torch.no_grad():
        eval_single_ckpt(
            model,
            test_loader,
            args,
            output_dir,
            eval_output_dir,
            logger,
            epoch_id,
            dist_test=dist_test,
        )
Ejemplo n.º 6
0
def main():
    """Evaluate one checkpoint or all checkpoints, writing under a fixed 'kittitracking' tag.

    ``args.extra_tag`` is overwritten with ``'kittitracking'`` and only the
    last path component of ``cfg.EXP_GROUP_PATH`` is used for the output
    directory. The dataloader is built with ``training=False`` plus the
    colour / background related flags read from ``cfg``.
    """
    args, cfg = parse_config()

    dist_test = args.launcher != 'none'
    if dist_test:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        total_gpus, cfg.LOCAL_RANK = init_dist(args.tcp_port,
                                               args.local_rank,
                                               backend='nccl')
    else:
        total_gpus = 1
        torch.cuda.set_device(0)

    if args.batch_size is not None:
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus
    else:
        # The config value is read but then immediately overridden with 1.
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU
        args.batch_size = 1

    exp_group_path = cfg.EXP_GROUP_PATH.split('/')[-1]
    args.extra_tag = 'kittitracking'
    output_dir = cfg.ROOT_DIR / 'output' / exp_group_path / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)

    # Results go under <output>/eval/<epoch or eval_all_default>[/<eval_tag>].
    eval_root = output_dir / 'eval'
    if args.eval_all:
        eval_output_dir = eval_root / 'eval_all_default'
    else:
        # The last run of digits in the checkpoint path is used as the epoch id.
        digit_runs = re.findall(r'\d+', args.ckpt) if args.ckpt is not None else []
        epoch_id = digit_runs[-1] if digit_runs else 'no_number'
        eval_output_dir = eval_root / ('epoch_%s' % epoch_id) / cfg.DATA_CONFIG.DATA_SPLIT['test']

    if args.eval_tag is not None:
        eval_output_dir = eval_output_dir / args.eval_tag
    eval_output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = eval_output_dir / ('log_eval_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_test:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    if args.ckpt_dir is not None:
        ckpt_dir = args.ckpt_dir
    else:
        ckpt_dir = output_dir / 'ckpt'

    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_test,
        workers=args.workers,
        logger=logger,
        training=False,
        use_color=cfg.USE_COLOR,
        use_rgb=cfg.USE_RGB,
        nbg=cfg.USE_NBG,
        raw=cfg.RAW,
    )

    model = build_network(
        model_cfg=cfg.MODEL,
        num_class=len(cfg.CLASS_NAMES),
        dataset=test_set,
    )
    with torch.no_grad():
        if args.eval_all:
            repeat_eval_ckpt(
                model,
                test_loader,
                args,
                eval_output_dir,
                logger,
                ckpt_dir,
                dist_test=dist_test,
            )
        else:
            eval_single_ckpt(
                model,
                test_loader,
                args,
                eval_output_dir,
                logger,
                epoch_id,
                dist_test=dist_test,
            )
Ejemplo n.º 7
0
def main():
    """Run inference with a previously saved model, then train and evaluate a new one.

    Steps performed, in order:
      1. Parse the config, optionally initialise distributed mode, and resolve
         the per-GPU batch size and number of epochs.
      2. Create output / checkpoint directories, logger and (rank 0)
         TensorBoard writer; copy the config file next to the logs.
      3. Build the training dataloader.
      4. Load a whole pickled model from ``MODEL_PATH`` and run it over the
         training loader, collecting per-frame centers / scores / labels.
      5. Build the network and optimizer, load pretrained weights and/or resume
         from a checkpoint, and train via ``train_model``.
      6. Save the entire trained model to ``MODEL_PATH`` and run
         ``repeat_eval_ckpt`` starting from ``max(args.epochs - 10, 0)``.
    """
    args, cfg = parse_config()
    if args.launcher == 'none':
        dist_train = False
        total_gpus = 1
    else:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        total_gpus, cfg.LOCAL_RANK = init_dist(args.tcp_port,
                                               args.local_rank,
                                               backend='nccl')
        dist_train = True

    if args.batch_size is None:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU
    else:
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus

    args.epochs = cfg.OPTIMIZATION.NUM_EPOCHS if args.epochs is None else args.epochs

    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    ckpt_dir = output_dir / 'ckpt'
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    log_file = output_dir / ('log_train_%s.txt' %
                             datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        # Copy the config next to the logs. A direct file copy (instead of a
        # shell `cp` string) also works for paths containing spaces or shell
        # metacharacters. The copy stays best-effort: a failure is logged and
        # the run continues.
        import shutil
        try:
            shutil.copy(args.cfg_file, output_dir)
        except OSError as err:
            logger.info('Could not copy %s to %s: %s' %
                        (args.cfg_file, output_dir, err))

    tb_log = SummaryWriter(
        log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs)

    logger.info(
        '**********************Starting Inference on Pointpillars**********************'
    )

    # Load model to GPU and deactivate gradients
    # NOTE(review): torch.load unpickles an arbitrary object; only use this
    # with a trusted file. The same path is written at the end of this function.
    MODEL_PATH = '/home/triasamo/entire_model.pth'
    model_point = torch.load(MODEL_PATH)
    model_point.cuda()
    model_point.eval()

    start_time = time.time()
    # NOTE(review): all_predictions is filled but not read again in this function.
    all_predictions = []
    with torch.no_grad():
        for data_dict in tqdm(train_loader):
            load_data_to_gpu(data_dict)
            # feed point cloud into model;
            # returns a list of dictionaries (one for each frame fed into the model)
            predictions, _ = model_point(data_dict)
            for index, pred_dict in enumerate(predictions):
                # Sort out predictions into boxes, scores, labels and centers
                pred_boxes = pred_dict['pred_boxes'].cpu().numpy()
                all_predictions.append({
                    'frame_id': data_dict['frame_id'][index],
                    # The first three box columns are taken as the center.
                    'pred_centers': pred_boxes[:, :3],
                    'pred_scores': pred_dict['pred_scores'].cpu().numpy(),
                    'pred_labels': pred_dict['pred_labels'].cpu().numpy(),
                })
    logger.info("Inferece of dataset executed in: %.2f sec" %
                (time.time() - start_time))

    logger.info(
        '**********************Finished Inference on Pointpillars**********************'
    )

    model = build_network(model_cfg=cfg.MODEL,
                          num_class=len(cfg.CLASS_NAMES),
                          dataset=train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.OPTIMIZATION)

    # load checkpoint if it is possible
    # NOTE: `to_cpu` receives the boolean `dist_train`. The previous code passed
    # `dist`, which is not the distributed-mode flag of this function.
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model,
                                    to_cpu=dist_train,
                                    logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt,
                                                           to_cpu=dist_train,
                                                           optimizer=optimizer,
                                                           logger=logger)
        last_epoch = start_epoch + 1
    else:
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if ckpt_list:
            # Resume from the most recently modified checkpoint file.
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1], to_cpu=dist_train, optimizer=optimizer, logger=logger)
            last_epoch = start_epoch + 1

    # before wrap to DistributedDataParallel to support fixed some parameters
    model.train()
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
    logger.info(model)

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer,
        total_iters_each_epoch=len(train_loader),
        total_epochs=args.epochs,
        last_epoch=last_epoch,
        optim_cfg=cfg.OPTIMIZATION)

    # -----------------------start training---------------------------
    logger.info(
        '**********************Start training %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    train_model(model,
                optimizer,
                train_loader,
                model_func=model_fn_decorator(),
                lr_scheduler=lr_scheduler,
                optim_cfg=cfg.OPTIMIZATION,
                start_epoch=start_epoch,
                total_epochs=args.epochs,
                start_iter=it,
                rank=cfg.LOCAL_RANK,
                tb_log=tb_log,
                ckpt_save_dir=ckpt_dir,
                train_sampler=train_sampler,
                lr_warmup_scheduler=lr_warmup_scheduler,
                ckpt_save_interval=args.ckpt_save_interval,
                max_ckpt_save_num=args.max_ckpt_save_num,
                merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch)

    logger.info(
        '**********************End training %s/%s(%s)**********************\n\n\n'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    logger.info('************** Saving Entire model  **************\n\n\n')
    torch.save(model, MODEL_PATH)
    logger.info(
        '************** Saved model at %s  **************\n\n\n' % MODEL_PATH
    )
    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=False)
    eval_output_dir = output_dir / 'eval' / 'eval_with_train'
    eval_output_dir.mkdir(parents=True, exist_ok=True)
    # Only evaluate the last 10 epochs
    args.start_epoch = max(args.epochs - 10, 0)

    # Evaluate the underlying module when the model is wrapped for distributed use.
    repeat_eval_ckpt(model.module if dist_train else model,
                     test_loader,
                     args,
                     eval_output_dir,
                     logger,
                     ckpt_dir,
                     dist_test=dist_train)
    logger.info(
        '**********************End evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
Ejemplo n.º 8
0
def main():
    """Evaluate checkpoints, forwarding attack-related CLI options to the evaluator.

    Prints the attack options (``epsilon``, ``norm``, ``iterations``,
    ``rec_type``, ``pgd``, ``momentum``) and the checkpoint, sets up the output
    directory and logger, builds the dataloader (``training=False``) and the
    network, then calls ``repeat_eval_ckpt`` or ``eval_single_ckpt``. Unlike a
    plain evaluation, the calls are not wrapped in ``torch.no_grad()``.
    """
    args, cfg = parse_config()
    print("epsilon", args.epsilon, "ord", args.norm, "iterations",
          args.iterations, "rec_type", args.rec_type, "pgd", args.pgd,
          "momentum", args.momentum, "ckpt", args.ckpt)

    dist_test = args.launcher != 'none'
    if dist_test:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        total_gpus, cfg.LOCAL_RANK = init_dist(args.tcp_port,
                                               args.local_rank,
                                               backend='nccl')
    else:
        total_gpus = 1

    if args.batch_size is not None:
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus
    else:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)

    # Results go under <output>/eval/<epoch or eval_all_default>[/<eval_tag>].
    eval_root = output_dir / 'eval'
    if args.eval_all:
        eval_output_dir = eval_root / 'eval_all_default'
    else:
        # The last run of digits in the checkpoint path is used as the epoch id.
        digit_runs = re.findall(r'\d+', args.ckpt) if args.ckpt is not None else []
        epoch_id = digit_runs[-1] if digit_runs else 'no_number'
        eval_output_dir = eval_root / ('epoch_%s' % epoch_id) / cfg.DATA_CONFIG.DATA_SPLIT['test']

    if args.eval_tag is not None:
        eval_output_dir = eval_output_dir / args.eval_tag
    eval_output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = eval_output_dir / ('log_eval_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_test:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    if args.ckpt_dir is not None:
        ckpt_dir = args.ckpt_dir
    else:
        ckpt_dir = output_dir / 'ckpt'
    print(cfg.DATA_CONFIG.DATA_PATH)

    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_test,
        workers=args.workers,
        logger=logger,
        training=False,
    )
    model = build_network(
        model_cfg=cfg.MODEL,
        num_class=len(cfg.CLASS_NAMES),
        dataset=test_set,
    )

    if args.eval_all:
        repeat_eval_ckpt(
            model,
            test_loader,
            args,
            eval_output_dir,
            logger,
            ckpt_dir,
            dist_test=dist_test,
        )
        return

    eval_single_ckpt(
        model,
        test_loader,
        args,
        eval_output_dir,
        logger,
        args.epsilon,
        args.norm,
        args.iterations,
        args.rec_type,
        args.pgd,
        args.momentum,
        epoch_id,
        dist_test=dist_test,
    )