def _train(dataset_name: str, backbone_name: str, path_to_data_dir: str,
           path_to_checkpoints_dir: str,
           path_to_resuming_checkpoint: Optional[str]):

    dataset = DatasetBase.from_name(dataset_name)(path_to_data_dir,
                                                  DatasetBase.Mode.TRAIN,
                                                  Config.IMAGE_MIN_SIDE,
                                                  Config.IMAGE_MAX_SIDE)
    dataloader = DataLoader(dataset,
                            batch_size=Config.BATCH_SIZE,
                            sampler=DatasetBase.NearestRatioRandomSampler(
                                dataset.image_ratios,
                                num_neighbors=Config.BATCH_SIZE),
                            num_workers=0,
                            collate_fn=DatasetBase.padding_collate_fn,
                            pin_memory=True)

    # num_workers is set to 0 here to make debugging easier
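    # Sketch, not taken from the original repo: the two project-specific helpers above are
    # presumably doing the usual batching work — NearestRatioRandomSampler groups images of
    # similar aspect ratio into the same batch, and padding_collate_fn zero-pads each image
    # (and its bbox/label tensors) to the largest size in the batch so they can be stacked.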

    Log.i('Found {:d} samples'.format(len(dataset)))

    backbone = BackboneBase.from_name(backbone_name)(pretrained=True)

    model = nn.DataParallel(
        Model(backbone,
              dataset.num_classes(),
              pooler_mode=Config.POOLER_MODE,
              anchor_ratios=Config.ANCHOR_RATIOS,
              anchor_sizes=Config.ANCHOR_SIZES,
              rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N,
              rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N,
              anchor_smooth_l1_loss_beta=Config.ANCHOR_SMOOTH_L1_LOSS_BETA,
              proposal_smooth_l1_loss_beta=Config.PROPOSAL_SMOOTH_L1_LOSS_BETA
              ).cuda())

    # For easier debugging: run on a single GPU without DataParallel
    # model =  Model(
    #         backbone, dataset.num_classes(), pooler_mode=Config.POOLER_MODE,
    #         anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES,
    #         rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N,
    #         anchor_smooth_l1_loss_beta=Config.ANCHOR_SMOOTH_L1_LOSS_BETA, proposal_smooth_l1_loss_beta=Config.PROPOSAL_SMOOTH_L1_LOSS_BETA
    #     ).cuda()
    ''' Training parameters:

        IMAGE_MIN_SIDE: float = 600.0
        IMAGE_MAX_SIDE: float = 1000.0

        ANCHOR_RATIOS: List[Tuple[int, int]] = [(1, 2), (1, 1), (2, 1)]
        ANCHOR_SIZES: List[int] = [128, 256, 512]
        POOLER_MODE: Pooler.Mode = Pooler.Mode.ALIGN

        RPN_PRE_NMS_TOP_N: int = 12000
        RPN_POST_NMS_TOP_N: int = 2000

        ANCHOR_SMOOTH_L1_LOSS_BETA: float = 1.0
        PROPOSAL_SMOOTH_L1_LOSS_BETA: float = 1.0

        BATCH_SIZE: int = 1
        LEARNING_RATE: float = 0.001
        MOMENTUM: float = 0.9
        WEIGHT_DECAY: float = 0.0005
        STEP_LR_SIZES: List[int] = [50000, 70000]
        STEP_LR_GAMMA: float = 0.1
        WARM_UP_FACTOR: float = 0.3333
        WARM_UP_NUM_ITERS: int = 500

        NUM_STEPS_TO_DISPLAY: int = 20
        NUM_STEPS_TO_SNAPSHOT: int = 10000
        NUM_STEPS_TO_FINISH: int = 90000
    '''
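    # These defaults follow the original Faster R-CNN setup: the shorter image side is rescaled
    # to 600 (longer side capped at 1000), 3 anchor ratios x 3 anchor sizes give 9 anchors per
    # location, and SGD uses lr 0.001, momentum 0.9, weight decay 0.0005.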
    # Why momentum:
    # 1. It damps the oscillations caused by ill-conditioned curvature of the loss surface.
    # 2. It reduces the variance introduced by stochastic gradients (weight decay also helps with this).

    # Two kinds of decay in the optimization:
    # 1. Weight decay: adds the L2 norm of the weights as an extra term to the total loss.
    # 2. Learning-rate decay: a scheduler shrinks the learning rate as the step count grows,
    #    following some chosen policy.
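    # Schematically, PyTorch's SGD step with momentum and weight decay is:
    #     g = grad + weight_decay * w      # L2 penalty folded into the gradient
    #     v = momentum * v + g             # momentum buffer
    #     w = w - lr * v
    # while learning-rate decay changes lr itself over time via the scheduler below.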

    #optimizer = optim.Adam(model.parameters())

    optimizer = optim.SGD(model.parameters(),
                          lr=Config.LEARNING_RATE,
                          momentum=Config.MOMENTUM,
                          weight_decay=Config.WEIGHT_DECAY)
    scheduler = WarmUpMultiStepLR(optimizer,
                                  milestones=Config.STEP_LR_SIZES,
                                  gamma=Config.STEP_LR_GAMMA,
                                  factor=Config.WARM_UP_FACTOR,
                                  num_iters=Config.WARM_UP_NUM_ITERS)
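    # A rough sketch of what WarmUpMultiStepLR is assumed to compute (project-specific class,
    # behavior inferred from the arguments above):
    #     if step < num_iters:  lr = LEARNING_RATE * (factor + (1 - factor) * step / num_iters)
    #     else:                 lr = LEARNING_RATE * gamma ** (number of milestones already passed)
    # i.e. a linear warm-up from 0.3333x to 1x over 500 steps, then x0.1 at steps 50000 and 70000.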

    step = 0
    time_checkpoint = time.time()
    losses = deque(maxlen=100)
    summary_writer = SummaryWriter(
        os.path.join(path_to_checkpoints_dir, 'summaries'))
    should_stop = False

    num_steps_to_display = Config.NUM_STEPS_TO_DISPLAY
    num_steps_to_snapshot = Config.NUM_STEPS_TO_SNAPSHOT
    num_steps_to_finish = Config.NUM_STEPS_TO_FINISH

    if path_to_resuming_checkpoint is not None:
        step = model.module.load(path_to_resuming_checkpoint, optimizer,
                                 scheduler)
        Log.i(
            f'Model has been restored from file: {path_to_resuming_checkpoint}'
        )

    device_count = torch.cuda.device_count()
    # BATCH_SIZE defaults to 1
    assert Config.BATCH_SIZE % device_count == 0, 'The batch size is not divisible by the device count'
    Log.i('Start training with {:d} GPUs ({:d} batches per GPU)'.format(
        torch.cuda.device_count(),
        Config.BATCH_SIZE // torch.cuda.device_count()))
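    # nn.DataParallel scatters the batch along dim 0, so each GPU receives
    # BATCH_SIZE // device_count samples; hence the divisibility check above.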

    while not should_stop:
        for _, (_, image_batch, _, bboxes_batch,
                labels_batch) in enumerate(dataloader):
            # The training dataset used here is VOC2007
            batch_size = image_batch.shape[0]   # 1
            image_batch = image_batch.cuda()    # (1, 3, h, w)
            bboxes_batch = bboxes_batch.cuda()  # (1, gt_n, 4)
            labels_batch = labels_batch.cuda()  # (1, gt_n)

            anchor_objectness_losses, anchor_transformer_losses, proposal_class_losses, proposal_transformer_losses = \
                model.train().forward(image_batch, bboxes_batch, labels_batch)
            # RPN losses
            anchor_objectness_loss = anchor_objectness_losses.mean()
            anchor_transformer_loss = anchor_transformer_losses.mean()
            # Detection-head losses
            proposal_class_loss = proposal_class_losses.mean()
            proposal_transformer_loss = proposal_transformer_losses.mean()
            loss = anchor_objectness_loss + anchor_transformer_loss + proposal_class_loss + proposal_transformer_loss
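            # Both box-regression losses presumably use smooth L1 with the betas from Config:
            #     smooth_l1(x) = 0.5 * x^2 / beta   if |x| < beta
            #                    |x| - 0.5 * beta   otherwise
            # The total loss is the unweighted sum of the RPN and detection-head terms.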

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            losses.append(loss.item())
            summary_writer.add_scalar('train/anchor_objectness_loss',
                                      anchor_objectness_loss.item(), step)
            summary_writer.add_scalar('train/anchor_transformer_loss',
                                      anchor_transformer_loss.item(), step)
            summary_writer.add_scalar('train/proposal_class_loss',
                                      proposal_class_loss.item(), step)
            summary_writer.add_scalar('train/proposal_transformer_loss',
                                      proposal_transformer_loss.item(), step)
            summary_writer.add_scalar('train/loss', loss.item(), step)
            step += 1

            if step == num_steps_to_finish:
                should_stop = True

            if step % num_steps_to_display == 0:
                elapsed_time = time.time() - time_checkpoint
                time_checkpoint = time.time()
                steps_per_sec = num_steps_to_display / elapsed_time
                samples_per_sec = batch_size * steps_per_sec
                eta = (num_steps_to_finish - step) / steps_per_sec / 3600
                avg_loss = sum(losses) / len(losses)
                lr = scheduler.get_lr()[0]
                #lr = optimizer.param_groups[0]['lr']
                Log.i(
                    f'[Step {step}] Avg. Loss = {avg_loss:.6f}, Learning Rate = {lr} ({samples_per_sec:.2f} samples/sec; ETA {eta:.1f} hrs)'
                )

            # Test: save a checkpoint early (at step 10) to verify that checkpointing works
            if step == 10:
                path_to_checkpoint = model.module.save(path_to_checkpoints_dir,
                                                       step, optimizer,
                                                       scheduler)
                #path_to_checkpoint = model.module.save(path_to_checkpoints_dir, step, optimizer)

                Log.i(f'Model has been saved to {path_to_checkpoint}')

            if step % num_steps_to_snapshot == 0 or should_stop:
                path_to_checkpoint = model.module.save(path_to_checkpoints_dir,
                                                       step, optimizer,
                                                       scheduler)
                #path_to_checkpoint = model.module.save(path_to_checkpoints_dir, step, optimizer)

                Log.i(f'Model has been saved to {path_to_checkpoint}')

            if should_stop:
                break

    Log.i('Done')
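
# A minimal sketch (not part of the example above) of how _train might be invoked;
# the flag names and defaults here are assumptions for illustration only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train Faster R-CNN')
    parser.add_argument('-s', '--dataset', type=str, default='voc2007', help='dataset name')
    parser.add_argument('-b', '--backbone', type=str, default='resnet101', help='backbone name')
    parser.add_argument('-d', '--data_dir', type=str, default='./data', help='path to the data directory')
    parser.add_argument('-o', '--checkpoints_dir', type=str, default='./checkpoints', help='where to write checkpoints and summaries')
    parser.add_argument('-r', '--resume_checkpoint', type=str, default=None, help='checkpoint to resume from')
    args = parser.parse_args()

    _train(args.dataset, args.backbone, args.data_dir, args.checkpoints_dir, args.resume_checkpoint)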
Example 2
def _train(dataset_name: str,
           backbone_name: str,
           path_to_data_dir: str,
           path_to_checkpoints_dir: str,
           path_to_resuming_checkpoint: Optional[str]):
    dataset = DatasetBase.from_name(dataset_name)(path_to_data_dir, DatasetBase.Mode.TRAIN, Config.IMAGE_MIN_SIDE, Config.IMAGE_MAX_SIDE)
    dataloader = DataLoader(dataset,
                            batch_size=Config.BATCH_SIZE,
                            sampler=DatasetBase.NearestRatioRandomSampler(dataset.image_ratios, num_neighbors=Config.BATCH_SIZE),
                            num_workers=8,
                            collate_fn=DatasetBase.padding_collate_fn,
                            pin_memory=True)

    Log.i('Found {:d} samples'.format(len(dataset)))

    backbone = BackboneBase.from_name(backbone_name)(pretrained=True)
    model = nn.DataParallel(
        Model(
            backbone, dataset.num_classes(),
            pooler_mode=Config.POOLER_MODE,
            anchor_ratios=Config.ANCHOR_RATIOS,
            anchor_sizes=Config.ANCHOR_SIZES,
            rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N,
            rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N,
            anchor_smooth_l1_loss_beta=Config.ANCHOR_SMOOTH_L1_LOSS_BETA,
            proposal_smooth_l1_loss_beta=Config.PROPOSAL_SMOOTH_L1_LOSS_BETA
        ).cuda()
    )
    optimizer = optim.SGD(model.parameters(),
                          lr=Config.LEARNING_RATE,
                          momentum=Config.MOMENTUM,
                          weight_decay=Config.WEIGHT_DECAY)
    scheduler = WarmUpMultiStepLR(optimizer,
                                  milestones=Config.STEP_LR_SIZES,
                                  gamma=Config.STEP_LR_GAMMA,
                                  factor=Config.WARM_UP_FACTOR,
                                  num_iters=Config.WARM_UP_NUM_ITERS)

    step = 0
    time_checkpoint = time.time()
    losses = deque(maxlen=100)
    summary_writer = SummaryWriter(os.path.join(path_to_checkpoints_dir, 'summaries'))
    should_stop = False

    num_steps_to_display = Config.NUM_STEPS_TO_DISPLAY
    num_steps_to_snapshot = Config.NUM_STEPS_TO_SNAPSHOT
    num_steps_to_finish = Config.NUM_STEPS_TO_FINISH

    if path_to_resuming_checkpoint is not None:
        step = model.module.load(path_to_resuming_checkpoint, optimizer, scheduler)
        Log.i(f'Model has been restored from file: {path_to_resuming_checkpoint}')

    device_count = torch.cuda.device_count()
    assert Config.BATCH_SIZE % device_count == 0, 'The batch size is not divisible by the device count'
    Log.i('Start training with {:d} GPUs ({:d} batches per GPU)'.format(torch.cuda.device_count(),
                                                                        Config.BATCH_SIZE // torch.cuda.device_count()))

    while not should_stop:
        for _, (_, image_batch, _, bboxes_batch, labels_batch) in enumerate(dataloader):
            batch_size = image_batch.shape[0]
            image_batch = image_batch.cuda()
            bboxes_batch = bboxes_batch.cuda()
            labels_batch = labels_batch.cuda()

            anchor_objectness_losses, anchor_transformer_losses, proposal_class_losses, proposal_transformer_losses = \
                model.train().forward(image_batch, bboxes_batch, labels_batch)
            anchor_objectness_loss = anchor_objectness_losses.mean()
            anchor_transformer_loss = anchor_transformer_losses.mean()
            proposal_class_loss = proposal_class_losses.mean()
            proposal_transformer_loss = proposal_transformer_losses.mean()
            loss = anchor_objectness_loss + anchor_transformer_loss + proposal_class_loss + proposal_transformer_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            losses.append(loss.item())
            summary_writer.add_scalar('train/anchor_objectness_loss', anchor_objectness_loss.item(), step)
            summary_writer.add_scalar('train/anchor_transformer_loss', anchor_transformer_loss.item(), step)
            summary_writer.add_scalar('train/proposal_class_loss', proposal_class_loss.item(), step)
            summary_writer.add_scalar('train/proposal_transformer_loss', proposal_transformer_loss.item(), step)
            summary_writer.add_scalar('train/loss', loss.item(), step)
            step += 1

            if step == num_steps_to_finish:
                should_stop = True

            if step % num_steps_to_display == 0:
                elapsed_time = time.time() - time_checkpoint
                time_checkpoint = time.time()
                steps_per_sec = num_steps_to_display / elapsed_time
                samples_per_sec = batch_size * steps_per_sec
                eta = (num_steps_to_finish - step) / steps_per_sec / 3600
                avg_loss = sum(losses) / len(losses)
                lr = scheduler.get_lr()[0]
                Log.i(f'[Step {step}] Avg. Loss = {avg_loss:.6f}, Learning Rate = {lr:.8f} ({samples_per_sec:.2f} samples/sec; ETA {eta:.1f} hrs)')

            if step % num_steps_to_snapshot == 0 or should_stop:
                path_to_checkpoint = model.module.save(path_to_checkpoints_dir, step, optimizer, scheduler)
                Log.i(f'Model has been saved to {path_to_checkpoint}')

            if should_stop:
                break

    Log.i('Done')
Example 3
    # If pretrained weights are specified, start training from that checkpoint
    # if config.TRAIN.PRETRAINED_WEIGHTS is not None and config.TRAIN.PRETRAINED_WEIGHTS != '':
    #     if config.TRAIN.PRETRAINED_WEIGHTS.endswith(".pth"):
    #         backbone.load_state_dict(torch.load(config.TRAIN.PRETRAINED_WEIGHTS))
    #     else:
    #         backbone.load_darknet_weights(config.TRAIN.PRETRAINED_WEIGHTS)

    # Get dataloader
    dataset = DatasetBase.from_name('tiny-person')(
        config.TRAIN.PATH_TO_IMAGES_DIR, config.TRAIN.PATH_TO_ANNOTATIONS,
        DatasetBase.Mode.TRAIN)

    dataloader = DataLoader(dataset,
                            batch_size=config.TRAIN.BATCH_SIZE,
                            sampler=DatasetBase.NearestRatioRandomSampler(
                                dataset.image_ratios,
                                num_neighbors=config.TRAIN.BATCH_SIZE),
                            num_workers=config.TRAIN.NUM_WORKERS,
                            collate_fn=dataset.collate_fn,
                            pin_memory=True)
    optimizer = torch.optim.Adam(model.parameters())
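    # Note: with no explicit arguments, torch.optim.Adam uses lr=1e-3 and betas=(0.9, 0.999).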

    metrics = [
        "grid_size",
        "loss",
        "x",
        "y",
        "w",
        "h",
        "conf",
        "cls",