Example #1
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
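
For context, here is a minimal sketch of a driver script that builds cfg and args and then calls this train() function. The flag names, the ssd.config import path, and the WORLD_SIZE handling are assumptions about a typical yacs / torch.distributed.launch setup, not something taken from the example above.

import argparse
import logging
import os

import torch


def main():
    parser = argparse.ArgumentParser(description="SSD training entry point (sketch)")
    parser.add_argument("--config-file", default="", metavar="FILE", help="path to a yacs config file")
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    # Assumed: the project exposes a yacs-style default config as `cfg`.
    from ssd.config import cfg
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.freeze()

    # Assumed launch via `python -m torch.distributed.launch`, which sets WORLD_SIZE.
    args.num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = args.num_gpus > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl")

    logging.basicConfig(level=logging.INFO)
    train(cfg, args)


if __name__ == "__main__":
    main()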
Example #2
def train(cfg, args):
    # Factory-pattern logger setup; the logging configuration is not our concern for now
    logger = logging.getLogger('SSD.trainer')
    # Build the object detection model
    model = build_detection_model(cfg)
    # Pick the device and move the model onto it
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Set the learning rate, the optimizer and the LR decay milestones; loosely like simulated annealing: large steps early, smaller steps later
    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    # **** This is where training resumes from a checkpoint ****
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR, save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    # Important: the dataset is loaded through torch's data-loading machinery
    # The key part is how the dataset gets loaded; the model construction can be treated as a black box
    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True, distributed=args.distributed, max_iter=max_iter, start_iter=arguments['iteration'])

    # Start the actual training? Skipped for now.
    # Even without training, the dataset still has to be loaded, so do_train is left commented out to focus on the data-loading path
    # model = do_train(cfg, model, train_loader, optimizer, scheduler, checkpointer, device, arguments, args)
    return model
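
Since do_train is commented out above specifically to look at data loading, a quick way to see what make_data_loader yields is to iterate a few batches. The (images, targets, img_ids) batch layout below is an assumption about a typical SSD loader, not something confirmed by the example.

# Snippet one could place before `return model` above to inspect a few batches;
# the (images, targets, img_ids) layout is an assumption about the SSD loader.
for i, batch in enumerate(train_loader):
    images, targets, img_ids = batch
    print(i, tuple(images.shape), type(targets).__name__, img_ids)
    if i == 2:
        break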
Example #3
def start_train(cfg):
    logger = logging.getLogger('SSD.trainer')
    model = SSDDetector(cfg)
    model = torch_utils.to_cuda(model)

    lr = cfg.SOLVER.LR
    optimizer = make_optimizer(cfg, model, lr)

    milestones = list(cfg.SOLVER.LR_STEPS)
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = True
    checkpointer = CheckPointer(cfg, model, optimizer, scheduler,
                                cfg.OUTPUT_DIR, save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, arguments)
    return model
Example #4
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # Optional complexity profiling, left disabled.
    # With thop:
    # macs, params = profile(model, inputs=(inputs, ))
    # macs, params = clever_format([macs, params], "%.3f")
    #
    # With ptflops:
    # net = model.to(device)
    # macs, params = get_model_complexity_info(net, (3, 512, 512), as_strings=True,
    #                                          print_per_layer_stat=True, verbose=True)
    # print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
    # print('{:<30}  {:<8}'.format('Number of parameters: ', params))

    # Count the trainable parameters.
    n_params = sum(p.numel() for name, p in model.named_parameters()
                   if p.requires_grad)
    print(n_params)

    # With torchprofile:
    # inputs = torch.randn(1, 3, 300, 300)
    # macs = profile_macs(model, inputs)
    # print(macs)

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
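
The commented-out lines above hint at MAC/FLOP profiling in addition to the parameter count. A minimal sketch of that measurement, assuming the ptflops package is installed; the 300x300 input resolution is illustrative for an SSD300 configuration and is not taken from the example.

import torch
from ptflops import get_model_complexity_info


def report_complexity(model, input_res=(3, 300, 300)):
    # Run the flop counter in eval mode so BatchNorm/Dropout behave deterministically.
    model.eval()
    with torch.no_grad():
        macs, params = get_model_complexity_info(
            model, input_res, as_strings=True,
            print_per_layer_stat=False, verbose=False)
    print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
    print('{:<30}  {:<8}'.format('Number of parameters: ', params))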
Example #5
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load(args.ckpt)
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    logging.info('==>Start statistic')
    do_run(cfg, model, distributed=args.distributed)
    logging.info('==>End statistic')

    # Stop activation-statistics collection and switch the (patched) ReLU modules
    # into quantized mode, clipping at running_mean + 3 * running_std.
    for ops in model.modules():
        if isinstance(ops, torch.nn.ReLU):
            ops.collectStats = False
            # ops.c.data = ops.running_mean + (ops.running_b * laplace[args.actBitwidth])
            ops.c.data = ops.running_mean + (3 * ops.running_std)
            ops.quant = True
    torch.cuda.empty_cache()
    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
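
The attributes touched in the loop above (collectStats, running_mean, running_std, c, quant) do not exist on a stock torch.nn.ReLU, so the model presumably swaps in a patched ReLU from a quantization-aware fork. A hypothetical sketch of what such a module could look like, only to make the loop's assumptions concrete:

import torch
import torch.nn as nn


class StatsReLU(nn.ReLU):
    """Hypothetical stats-collecting ReLU matching the attributes used above."""

    def __init__(self, inplace=False):
        super().__init__(inplace)
        self.collectStats = True    # gather activation statistics while True
        self.quant = False          # clamp activations at the clipping value when True
        self.register_buffer("running_mean", torch.zeros(1))
        self.register_buffer("running_std", torch.ones(1))
        self.c = nn.Parameter(torch.ones(1), requires_grad=False)  # clipping value

    def forward(self, x):
        y = super().forward(x)
        if self.collectStats:
            self.running_mean.fill_(y.mean().item())
            self.running_std.fill_(y.std().item())
        elif self.quant:
            y = torch.clamp(y, max=self.c.item())
        return y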
Example #6
def train(cfg: CfgNode,
          args: Namespace,
          output_dir: Path,
          model_manager: Dict[str, Any],
          freeze_non_sigma: bool = False):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    resume_from = checkpointer.get_best_from_experiment_dir(cfg)
    extra_checkpoint_data = checkpointer.load(f=resume_from)
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # Weight freezing test:
    # print_model(model)
    # freeze_weights(model)
    print_model(model)

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args, output_dir,
                     model_manager)
    return model
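
print_model and freeze_weights are not shown in this example. A hypothetical sketch of what the commented-out "weight freezing test" could look like, keeping only parameters whose names match a keep-list trainable; the "sigma" keyword is a guess based on the freeze_non_sigma flag in the signature.

def freeze_weights(model, trainable_keywords=("sigma",)):
    # Freeze everything except parameters whose name contains a keep-list keyword.
    for name, param in model.named_parameters():
        param.requires_grad = any(key in name for key in trainable_keywords)


def print_model(model):
    # One line per parameter: name, trainability and shape.
    for name, param in model.named_parameters():
        print(f"{name:60s} requires_grad={param.requires_grad} shape={tuple(param.shape)}")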
Example #7
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)  # build the model
    device = torch.device(cfg.MODEL.DEVICE)  # note how cfg is organized: the config file is kept separate from args
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
        # model = nn.DataParallel(model)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)  # build the optimizer

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model,
                                optimizer,
                                scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=save_to_disk,
                                logger=logger)
    # Build the checkpoint save/load helper; save_dir tells it where checkpoints are written
    extra_checkpoint_data = checkpointer.load(f='', use_latest=False)  # load the model weights
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])  # build the data loader

    print("dataloader: ", train_loader.batch_size)
    # exit(1232)
    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)  # train
    return model
Example #8
def active_train(cfg, args):
    logger = logging.getLogger("SSD.trainer")
    raw_model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    raw_model.to(device)

    lr = cfg.SOLVER.LR * args.num_gpus
    optimizer = make_optimizer(cfg, raw_model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}

    checkpointer = None
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(raw_model, optimizer, scheduler,
                                args.model_dir, save_to_disk, logger)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus

    is_train = True
    train_transform = build_transforms(cfg, is_train=is_train)
    target_transform = build_target_transform(cfg) if is_train else None
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
    datasets = build_dataset(dataset_list,
                             transform=train_transform,
                             target_transform=target_transform,
                             is_train=is_train)

    logger.info(f'Creating query loader...')
    query_loader = QueryLoader(datasets[0], args, cfg)

    logger.info(f'Creating al model...')
    strategy = get_strategy(args.strategy)
    model = ALModel(raw_model, strategy, optimizer, device, scheduler,
                    arguments, args, checkpointer, cfg)

    logger.info(f'Training on initial data with size {args.init_size}...')
    n_bbox = query_loader.len_annotations()
    t1 = time.time()
    model.fit(query_loader.get_labeled_loader())
    init_time = time.time() - t1
    logger.info(f'Scoring after initial training...')
    score = model.score()
    logger.info(f'SCORE : {score:.4f}')

    fields = [
        args.strategy, {}, 0, score, init_time, 0, init_time,
        len(query_loader), n_bbox
    ]
    save_to_csv(args.filename, fields)

    for step in range(args.query_step):
        logger.info(f'STEP NUMBER {step}')
        logger.info('Querying assets to label')
        t1 = time.time()
        query_idx = model.query(
            unlabeled_loader=query_loader.get_unlabeled_loader(),
            cfg=cfg,
            args=args,
            step=step,
            n_instances=args.query_size,
            length_ds=len(datasets[0]))
        logger.info('Adding labeled samples to train dataset')
        query_loader.add_to_labeled(query_idx, step + 1)
        t2 = time.time()
        logger.info('Fitting with new data...')
        model.fit(query_loader.get_labeled_loader())
        total_time = time.time() - t1
        train_time = time.time() - t2
        active_time = total_time - train_time
        logger.info('Scoring model...')
        score = model.score()
        n_bbox = query_loader.len_annotations()
        fields = [
            args.strategy, {}, step + 1, score, train_time, active_time,
            total_time,
            len(query_loader), n_bbox
        ]
        save_to_csv(args.filename, fields)
        logger.info(f'SCORE : {score:.4f}')

    return model.model
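
save_to_csv is not shown in the example. A minimal sketch consistent with how it is called above, where each row records one active-learning round; the header names are assumptions inferred from the fields lists.

import csv
import os


def save_to_csv(filename, fields):
    # Append one row of active-learning metrics; write the header on first use.
    header = ["strategy", "strategy_params", "step", "score",
              "train_time", "active_time", "total_time",
              "n_labeled", "n_bbox"]
    write_header = not os.path.exists(filename)
    with open(filename, "a", newline="") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(header)
        writer.writerow(fields)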