Exemple #1
0
def main():
    """Train a model, then evaluate the checkpoints written during training.

    The function runs these stages in order:

    1. Parse the command line / config via ``parse_config`` and, unless
       ``args.launcher == 'none'``, initialise distributed training through
       ``common_utils.init_dist_<launcher>``.
    2. Create the output and checkpoint directories, a timestamped log file
       and (on local rank 0) a TensorBoard writer.
    3. Build the training dataloader, the network and the optimizer.
    4. Restore weights from ``args.pretrained_model`` and then either
       ``args.ckpt`` or the most recently modified checkpoint found in the
       checkpoint directory.
    5. Call ``train_model`` and afterwards ``repeat_eval_ckpt`` on a
       dataloader built with ``training=False``.
    """
    # Function-scope import so this block does not depend on the file header.
    import shutil

    args, cfg = parse_config()

    dist_train = args.launcher != 'none'
    if dist_train:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        args.batch_size, cfg.LOCAL_RANK = init_dist(args.batch_size,
                                                    args.tcp_port,
                                                    args.local_rank,
                                                    backend='nccl')
    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    ckpt_dir = output_dir / 'ckpt'
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = output_dir / ('log_train_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        # Keep a copy of the config next to the results. shutil.copy avoids
        # building a shell command from paths (spaces / metacharacters would
        # break `cp %s %s`). The copy stays best-effort, as the unchecked
        # os.system call was.
        try:
            shutil.copy(args.cfg_file, output_dir)
        except OSError as err:
            logger.warning('Could not copy config file %s to %s: %s' %
                           (args.cfg_file, output_dir, err))

    # Only local rank 0 writes TensorBoard summaries.
    tb_log = None
    if cfg.LOCAL_RANK == 0:
        tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard'))

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs)

    model = build_network(model_cfg=cfg.MODEL,
                          num_class=len(cfg.CLASS_NAMES),
                          dataset=train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.OPTIMIZATION)

    # load checkpoint if it is possible
    # NOTE: `to_cpu` receives the boolean `dist_train`. Passing `dist` (the
    # object used for dist.get_world_size() above) is not a flag and would
    # be truthy whatever launcher was selected.
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model,
                                    to_cpu=dist_train,
                                    logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt,
                                                           to_cpu=dist_train,
                                                           optimizer=optimizer,
                                                           logger=logger)
        last_epoch = start_epoch + 1
    else:
        # No explicit checkpoint: resume from the most recently modified one.
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if ckpt_list:
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1],
                to_cpu=dist_train,
                optimizer=optimizer,
                logger=logger)
            last_epoch = start_epoch + 1

    # Switch to train mode before wrapping in DistributedDataParallel to
    # support fixing some parameters.
    model.train()
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
    logger.info(model)

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer,
        total_iters_each_epoch=len(train_loader),
        total_epochs=args.epochs,
        last_epoch=last_epoch,
        optim_cfg=cfg.OPTIMIZATION)

    # -----------------------start training---------------------------
    logger.info(
        '**********************Start training %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    train_model(model,
                optimizer,
                train_loader,
                model_func=model_fn_decorator(),
                lr_scheduler=lr_scheduler,
                optim_cfg=cfg.OPTIMIZATION,
                start_epoch=start_epoch,
                total_epochs=args.epochs,
                start_iter=it,
                rank=cfg.LOCAL_RANK,
                tb_log=tb_log,
                ckpt_save_dir=ckpt_dir,
                train_sampler=train_sampler,
                lr_warmup_scheduler=lr_warmup_scheduler,
                ckpt_save_interval=args.ckpt_save_interval,
                max_ckpt_save_num=args.max_ckpt_save_num,
                merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch)

    logger.info(
        '**********************End training %s/%s(%s)**********************\n\n\n'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))

    # -----------------------evaluate saved checkpoints---------------------------
    logger.info(
        '**********************Start evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=False)
    eval_output_dir = output_dir / 'eval' / 'eval_with_train'
    eval_output_dir.mkdir(parents=True, exist_ok=True)
    # Only evaluate the last 10 epochs (args is handed to repeat_eval_ckpt).
    args.start_epoch = max(args.epochs - 10, 0)

    # Evaluate the bare module, not the DistributedDataParallel wrapper.
    repeat_eval_ckpt(model.module if dist_train else model,
                     test_loader,
                     args,
                     eval_output_dir,
                     logger,
                     ckpt_dir,
                     dist_test=dist_train)
    logger.info(
        '**********************End evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
Exemple #2
0
def main():
    """Build the data pipeline, model and optimizer from the config and train.

    The function reads everything from the object returned by
    ``get_config``: it optionally initialises distributed training
    (``cfg.launcher != 'none'``), seeds the RNGs, builds the training
    dataloader, model, optimizer and LR schedulers, optionally resumes from
    ``cfg.ckpt`` and finally calls ``train_model``.
    """
    cfg = get_config()
    logger = get_logger(cfg.log_dir, cfg.tag)
    logger.info(pprint.pformat(cfg))

    dist_train = cfg.launcher != 'none'
    if dist_train:
        cfg.batch_size, cfg.local_rank = dist.init_dist_pytorch(
            cfg.batch_size, cfg.local_rank, backend='nccl'
        )
        # Keep the dataloader config in sync with the batch size returned by
        # the distributed initialisation.
        cfg.data.train.batch_size = cfg.batch_size

    set_random_seed(cfg['random_seed'])

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * cfg.batch_size))

    # Only local rank 0 writes TensorBoard summaries.
    tb_log = None
    if cfg.local_rank == 0:
        tb_log = SummaryWriter(log_dir=str(cfg.log_dir))

    train_set, train_loader, train_sampler = build_dataloader(
        cfg.data.train, dist=dist_train, training=True, logger=logger)

    model = build_model(cfg.model)
    if cfg.get('sync_bn', False):
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.optimization)

    start_epoch = it = 0
    last_epoch = -1
    if cfg.ckpt is not None:
        # `to_cpu` receives the boolean `dist_train`. Passing `dist` (the
        # helper object used for init_dist_pytorch / get_world_size above)
        # is not a flag and would be truthy whatever launcher was selected.
        it, start_epoch = model.load_params_with_optimizer(
            cfg.ckpt, to_cpu=dist_train, optimizer=optimizer, logger=logger)
        last_epoch = start_epoch + 1

    # Switch to train mode before the model is wrapped for distributed use.
    model.train()
    if dist_train:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.local_rank % torch.cuda.device_count()])
    logger.info(model)

    total_iters_each_epoch = len(train_loader)
    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer, total_iters_each_epoch=total_iters_each_epoch, total_epochs=cfg.epochs,
        last_epoch=last_epoch, optim_cfg=cfg.optimization
    )

    # -----------------------start training---------------------------
    logger.info('*'*20 + 'Start training' +'*'*20)

    train_model(
        model,
        optimizer,
        train_loader,
        model_func=model_fn_decorator(),
        lr_scheduler=lr_scheduler,
        optim_cfg=cfg.optimization,
        start_epoch=start_epoch,
        total_epochs=cfg.epochs,
        start_iter=it,
        rank=cfg.local_rank,
        tb_log=tb_log,
        ckpt_save_dir=cfg.model_dir,
        train_sampler=train_sampler,
        lr_warmup_scheduler=lr_warmup_scheduler,
        ckpt_save_interval=cfg.ckpt_save_interval,
        max_ckpt_save_num=cfg.max_ckpt_save_num,
        merge_all_iters_to_one_epoch=False
    )
    logger.info('*'*20 + 'End training' +'*'*20)
Exemple #3
0
def main():
    """Train a model with TensorBoard and Weights & Biases logging.

    The function parses the config, optionally initialises distributed
    training via ``common_utils.init_dist_<launcher>``, prepares the output
    and checkpoint directories and the logger, builds the training
    dataloader, network and optimizer, restores weights from
    ``args.pretrained_model`` and then either ``args.ckpt`` or the most
    recently modified checkpoint in the checkpoint directory, and finally
    calls ``train_model``.
    """
    args, cfg = parge_config()

    dist_train = args.launcher != 'none'
    if dist_train:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        args.batch_size, cfg.LOCAL_RANK = init_dist(args.batch_size,
                                                    args.tcp_port,
                                                    args.local_rank,
                                                    backend='nccl')
    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.TAG / args.extra_tag
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir = output_dir / 'ckpt'
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = output_dir / ('log_train_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)

    # Only local rank 0 writes TensorBoard summaries.
    tb_log = None
    if cfg.LOCAL_RANK == 0:
        tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard'))
    # NOTE(review): this guard reads args.local_rank while the TensorBoard
    # guard reads cfg.LOCAL_RANK; confirm both identify the same process.
    if args.local_rank == 0:
        wandb.init(project='BEVSEG-PCDet',
                   sync_tensorboard=True,
                   name=args.extra_tag,
                   config={
                       **vars(args),
                       **cfg
                   })

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        cfg.DATA_CONFIG.DATA_DIR,
        args.batch_size,
        dist_train,
        workers=args.workers,
        logger=logger,
        training=True)

    model = build_network(train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.MODEL.TRAIN.OPTIMIZATION)

    # load checkpoint if it is possible
    # NOTE: `to_cpu` receives the boolean `dist_train`. Passing `dist` (the
    # object used for dist.get_world_size() above) is not a flag and would
    # be truthy whatever launcher was selected.
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model,
                                    to_cpu=dist_train,
                                    logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt,
                                                           to_cpu=dist_train,
                                                           optimizer=optimizer,
                                                           logger=logger)
        last_epoch = start_epoch + 1
    else:
        # No explicit checkpoint: resume from the most recently modified one.
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if ckpt_list:
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1],
                to_cpu=dist_train,
                optimizer=optimizer,
                logger=logger)
            last_epoch = start_epoch + 1

    # Switch to train mode before wrapping in DistributedDataParallel to
    # support fixing some parameters.
    model.train()
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()],
            # Currently enabled; this flag was introduced to debug unused
            # parameters and can be dropped once none remain.
            find_unused_parameters=True)
    logger.info(model)

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer,
        total_iters_each_epoch=len(train_loader),
        total_epochs=args.epochs,
        last_epoch=last_epoch,
        optim_cfg=cfg.MODEL.TRAIN.OPTIMIZATION)

    # -----------------------start training---------------------------
    logger.info(
        '**********************Start training %s(%s)**********************' %
        (cfg.TAG, args.extra_tag))
    train_model(model,
                optimizer,
                train_loader,
                model_func=model_fn_decorator(),
                lr_scheduler=lr_scheduler,
                optim_cfg=cfg.MODEL.TRAIN.OPTIMIZATION,
                start_epoch=start_epoch,
                total_epochs=args.epochs,
                start_iter=it,
                rank=cfg.LOCAL_RANK,
                tb_log=tb_log,
                ckpt_save_dir=ckpt_dir,
                train_sampler=train_sampler,
                lr_warmup_scheduler=lr_warmup_scheduler,
                ckpt_save_interval=args.ckpt_save_interval,
                max_ckpt_save_num=args.max_ckpt_save_num)

    logger.info('**********************End training**********************')
    '''
Exemple #4
0
def main():
    """Run inference with a saved model, then train and evaluate a new one.

    The function runs these stages in order:

    1. Parse the config, optionally initialise distributed training and
       derive the per-GPU batch size and the number of epochs.
    2. Create output/checkpoint directories, the logger and (on local
       rank 0) a TensorBoard writer, then build the training dataloader.
    3. Load a whole pickled model from ``entire_model_path`` and run it over
       the training dataloader with gradients disabled, collecting box
       centers, scores and labels per frame.
    4. Build a fresh network and optimizer, restore weights from
       ``args.pretrained_model`` / ``args.ckpt`` / the most recently
       modified checkpoint, and call ``train_model``.
    5. Save the trained model back to ``entire_model_path`` and run
       ``repeat_eval_ckpt`` on a dataloader built with ``training=False``.
    """
    # Function-scope import so this block does not depend on the file header.
    import shutil

    # Single definition of the path that is both read (step 3) and written
    # (step 5); previously the literal was repeated.
    entire_model_path = '/home/triasamo/entire_model.pth'

    args, cfg = parse_config()

    dist_train = args.launcher != 'none'
    if dist_train:
        init_dist = getattr(common_utils, 'init_dist_%s' % args.launcher)
        total_gpus, cfg.LOCAL_RANK = init_dist(args.tcp_port,
                                               args.local_rank,
                                               backend='nccl')
    else:
        total_gpus = 1

    if args.batch_size is None:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU
    else:
        # A user-supplied batch size is the total; split it evenly per GPU.
        assert args.batch_size % total_gpus == 0, 'Batch size should match the number of gpus'
        args.batch_size = args.batch_size // total_gpus

    if args.epochs is None:
        args.epochs = cfg.OPTIMIZATION.NUM_EPOCHS

    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    ckpt_dir = output_dir / 'ckpt'
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_file = output_dir / ('log_train_%s.txt' % timestamp)
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    # log to file
    logger.info('**********************Start logging**********************')
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        # Keep a copy of the config next to the results. shutil.copy avoids
        # building a shell command from paths (spaces / metacharacters would
        # break `cp %s %s`). The copy stays best-effort, as the unchecked
        # os.system call was.
        try:
            shutil.copy(args.cfg_file, output_dir)
        except OSError as err:
            logger.warning('Could not copy config file %s to %s: %s' %
                           (args.cfg_file, output_dir, err))

    # Only local rank 0 writes TensorBoard summaries.
    tb_log = None
    if cfg.LOCAL_RANK == 0:
        tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard'))

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs)

    logger.info(
        '**********************Starting Inference on Pointpillars**********************'
    )

    # Load the whole pickled model onto the GPU and switch it to eval mode.
    # SECURITY: torch.load unpickles arbitrary objects; only point
    # entire_model_path at a file this project produced itself.
    # NOTE(review): the file is written at the end of this function, so a
    # first-ever run needs it to exist already -- confirm intended workflow.
    model_point = torch.load(entire_model_path)
    model_point.cuda()
    model_point.eval()

    start_time = time.time()
    all_predictions = []
    with torch.no_grad():
        for data_dict in tqdm(train_loader):
            load_data_to_gpu(data_dict)
            # Feed the batch into the model; the first returned value is
            # iterated as one prediction dict per frame in the batch.
            predictions, _ = model_point(data_dict)
            for index, pred_dict in enumerate(predictions):
                # Move boxes, scores and labels to host memory; the first
                # three box columns are kept as the center.
                pred_boxes = pred_dict['pred_boxes'].cpu().numpy()
                all_predictions.append({
                    'frame_id': data_dict['frame_id'][index],
                    'pred_centers': pred_boxes[:, :3],
                    'pred_scores': pred_dict['pred_scores'].cpu().numpy(),
                    'pred_labels': pred_dict['pred_labels'].cpu().numpy(),
                })
    logger.info("Inference of dataset executed in: %.2f sec" %
                (time.time() - start_time))

    # Batch dict keys recorded from an earlier debug print (may be stale):
    #   'points', 'frame_id', 'gt_boxes', 'use_lead_xyz', 'voxels',
    #   'voxel_coords', 'voxel_num_points', 'image_shape', 'batch_size'

    logger.info(
        '**********************Finished Inference on Pointpillars**********************'
    )

    model = build_network(model_cfg=cfg.MODEL,
                          num_class=len(cfg.CLASS_NAMES),
                          dataset=train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    optimizer = build_optimizer(model, cfg.OPTIMIZATION)

    # load checkpoint if it is possible
    # NOTE: `to_cpu` receives the boolean `dist_train`; the name `dist` is
    # not a launcher flag and must not be passed here.
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model,
                                    to_cpu=dist_train,
                                    logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt,
                                                           to_cpu=dist_train,
                                                           optimizer=optimizer,
                                                           logger=logger)
        last_epoch = start_epoch + 1
    else:
        # No explicit checkpoint: resume from the most recently modified one.
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if ckpt_list:
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1],
                to_cpu=dist_train,
                optimizer=optimizer,
                logger=logger)
            last_epoch = start_epoch + 1

    # Switch to train mode before wrapping in DistributedDataParallel to
    # support fixing some parameters.
    model.train()
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
    logger.info(model)

    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer,
        total_iters_each_epoch=len(train_loader),
        total_epochs=args.epochs,
        last_epoch=last_epoch,
        optim_cfg=cfg.OPTIMIZATION)

    # -----------------------start training---------------------------
    logger.info(
        '**********************Start training %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    train_model(model,
                optimizer,
                train_loader,
                model_func=model_fn_decorator(),
                lr_scheduler=lr_scheduler,
                optim_cfg=cfg.OPTIMIZATION,
                start_epoch=start_epoch,
                total_epochs=args.epochs,
                start_iter=it,
                rank=cfg.LOCAL_RANK,
                tb_log=tb_log,
                ckpt_save_dir=ckpt_dir,
                train_sampler=train_sampler,
                lr_warmup_scheduler=lr_warmup_scheduler,
                ckpt_save_interval=args.ckpt_save_interval,
                max_ckpt_save_num=args.max_ckpt_save_num,
                merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch)

    logger.info(
        '**********************End training %s/%s(%s)**********************\n\n\n'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))

    # The bare module is what gets saved and evaluated, never the
    # DistributedDataParallel wrapper.
    bare_model = model.module if dist_train else model

    logger.info('************** Saving Entire model  **************\n\n\n')
    if cfg.LOCAL_RANK == 0:
        # Single writer: every rank writing the same path concurrently could
        # corrupt the file.
        torch.save(bare_model, entire_model_path)
    logger.info(
        '************** Saved model at %s  **************\n\n\n' %
        entire_model_path)

    # -----------------------evaluate saved checkpoints---------------------------
    logger.info(
        '**********************Start evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train,
        workers=args.workers,
        logger=logger,
        training=False)
    eval_output_dir = output_dir / 'eval' / 'eval_with_train'
    eval_output_dir.mkdir(parents=True, exist_ok=True)
    # Only evaluate the last 10 epochs (args is handed to repeat_eval_ckpt).
    args.start_epoch = max(args.epochs - 10, 0)

    repeat_eval_ckpt(bare_model,
                     test_loader,
                     args,
                     eval_output_dir,
                     logger,
                     ckpt_dir,
                     dist_test=dist_train)
    logger.info(
        '**********************End evaluation %s/%s(%s)**********************'
        % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))