Example #1
# NOTE (added for context): this snippet shows only the constructor of an
# nn.Module-based steepest-descent filter learner; the class name and the
# import paths below follow the pytracking/LTR layout and are assumptions.
import math

import torch
import torch.nn as nn

import ltr.models.loss as ltr_losses  # assumed: provides LBHinge
from ltr.models.layers.distance import DistanceMap  # assumed location of DistanceMap


class SteepestDescentLearnFilter(nn.Module):  # assumed class name
    def __init__(self,
                 num_iter=1,
                 filter_size=1,
                 feature_dim=256,
                 feat_stride=16,
                 init_step_length=1.0,
                 init_filter_reg=1e-2,
                 init_gauss_sigma=1.0,
                 num_dist_bins=5,
                 bin_displacement=1.0,
                 mask_init_factor=4.0,
                 test_loss=None):
        super().__init__()

        if test_loss is None:
            test_loss = ltr_losses.LBHinge(threshold=0.05)

        self.log_step_length = nn.Parameter(
            math.log(init_step_length) * torch.ones(1))
        self.num_iter = num_iter
        self.test_loss = test_loss
        self.filter_reg = nn.Parameter(init_filter_reg * torch.ones(1))
        self.feat_stride = feat_stride
        self.distance_map = DistanceMap(num_dist_bins, bin_displacement)

        # Distance coordinates
        d = torch.arange(num_dist_bins, dtype=torch.float32).view(
            1, -1, 1, 1) * bin_displacement
        if init_gauss_sigma == 0:
            init_gauss = torch.zeros_like(d)
            init_gauss[0, 0, 0, 0] = 1
        else:
            init_gauss = torch.exp(-1 / 2 * (d / init_gauss_sigma)**2)

        self.label_map_predictor = nn.Conv2d(num_dist_bins,
                                             1,
                                             kernel_size=1,
                                             bias=False)
        self.label_map_predictor.weight.data = init_gauss - init_gauss.min()

        self.target_mask_predictor = nn.Sequential(
            nn.Conv2d(num_dist_bins, 1, kernel_size=1, bias=False),
            nn.Sigmoid())
        self.target_mask_predictor[
            0].weight.data = mask_init_factor * torch.tanh(2.0 - d)

        self.spatial_weight_predictor = nn.Conv2d(num_dist_bins,
                                                  1,
                                                  kernel_size=1,
                                                  bias=False)
        self.spatial_weight_predictor.weight.data.fill_(1.0)
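
# Illustrative aside (not part of the original module): a minimal sketch of how
# the 1x1-conv label-map predictor above turns distance-bin features into a
# Gaussian-shaped target label. Shapes and values here are assumptions chosen
# only to make the example self-contained and runnable.
import torch
import torch.nn as nn

num_dist_bins, bin_displacement, init_gauss_sigma = 5, 1.0, 1.0
d = torch.arange(num_dist_bins, dtype=torch.float32).view(1, -1, 1, 1) * bin_displacement
init_gauss = torch.exp(-0.5 * (d / init_gauss_sigma) ** 2)

label_map_predictor = nn.Conv2d(num_dist_bins, 1, kernel_size=1, bias=False)
label_map_predictor.weight.data = init_gauss - init_gauss.min()

# Fake one-hot distance-bin encoding of a 4x4 feature map: bin 0 marks the
# pixel at the target centre, the last bin a far-away pixel.
bin_feat = torch.zeros(1, num_dist_bins, 4, 4)
bin_feat[0, 0, 2, 2] = 1.0
bin_feat[0, -1, 0, 0] = 1.0
label_map = label_map_predictor(bin_feat)
print(label_map.shape)  # torch.Size([1, 1, 4, 4])
print(label_map[0, 0, 2, 2] > label_map[0, 0, 0, 0])  # tensor(True): centre scores highest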
Example #2
def run(settings):
    settings.description = 'First training with gradient descent.'
    settings.batch_size = 6
    settings.num_workers = 16
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    settings.print_stats = [
        'Loss/total', 'Loss/iou', 'ClfTrain/init_loss', 'ClfTrain/train_loss',
        'ClfTrain/iter_loss', 'ClfTrain/test_loss', 'ClfTrain/test_init_loss',
        'ClfTrain/test_iter_loss'
    ]

    # Train datasets
    #lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k_i(settings.env.got10k_dir, split='train')
    #trackingnet_train = TrackingNet(settings.env.trackingnet_dir, set_ids=[0, 1, 2, 3])
    #coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    # lasot_val = Lasot(settings.env.lasot_dir, vid_ids=list(range(17, 21)))
    got10k_val = Got10k_i(settings.env.got10k_dir, split='val')

    # Data transform
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    transform_train = torchvision.transforms.Compose([
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std)
    ])

    transform_val = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std)
    ])

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz
    }
    data_processing_train = processing.TrackingProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        transform=transform_train,
        joint_transform=transform_joint)

    data_processing_val = processing.TrackingProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # Train sampler and loader
    dataset_train = sampler.RandomSequenceWithDistractors(
        [got10k_train], [1],
        samples_per_epoch=26000,
        max_gap=30,
        frame_sample_mode='causal',
        num_seq_test_frames=3,
        num_class_distractor_frames=0,
        num_seq_train_frames=3,
        num_class_distractor_train_frames=0,
        processing=data_processing_train)

    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation samplers and loaders
    # dataset_val = sampler.RandomSequence([lasot_val, got10k_val], [1,1], samples_per_epoch=5000, max_gap=100,
    #                                num_test_frames=1, processing=data_processing_val)
    dataset_val = sampler.RandomSequenceWithDistractors(
        [got10k_val], [1],
        samples_per_epoch=5000,
        max_gap=30,
        frame_sample_mode='causal',
        num_seq_test_frames=3,
        num_class_distractor_frames=0,
        num_seq_train_frames=3,
        num_class_distractor_train_frames=0,
        processing=data_processing_val)

    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = optim_tracker_models.steepest_descent_learn_filter_resnet50_newiou(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=10,
        bin_displacement=0.5,
        mask_init_factor=3.0)

    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold)
    }

    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'train_clf': 0,
        'init_clf': 0,
        'test_init_clf': 100,
        'test_iter_clf': 400
    }

    actor = actors.OptimTrackerActor(net=net,
                                     objective=objective,
                                     loss_weight=loss_weight)

    # Optimizer
    optimizer = optim.Adam(
        [{
            'params': actor.net.classifier.filter_initializer.parameters(),
            'lr': 5e-5
        }, {
            'params': actor.net.classifier.filter_optimizer.parameters(),
            'lr': 5e-4
        }, {
            'params': actor.net.classifier.feature_extractor.parameters(),
            'lr': 5e-5
        }, {
            'params': actor.net.bb_regressor.parameters()
        }, {
            'params': actor.net.feature_extractor.parameters(),
            'lr': 2e-5
        }],
        lr=2e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                             step_size=15,
                                             gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(50, load_latest=True, fail_safe=True)
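
# Aside (illustrative, not from the original file): in the Adam call above, a
# param group that omits 'lr' (here the bb_regressor group) falls back to the
# optimizer-level default, i.e. lr=2e-4. A minimal check of that behaviour:
import torch.nn as nn
import torch.optim as optim

m = nn.Linear(2, 2)
opt = optim.Adam([{'params': m.parameters()}], lr=2e-4)
print(opt.param_groups[0]['lr'])  # 0.0002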
Example #3
def run(settings):
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    # settings.print_stats = ['Loss/total', 'Loss/iou', 'ClfTrain/clf_ce', 'ClfTrain/test_loss']
    '''
    Depth Inputs:
        1) raw_depth                X
        2) norm_depth
        3) centered_norm_depth
        4) centered_raw_depth       X

        5) colormap
        6) centered_colormap

    '''
    # depth_inputs = 'norm_depth'
    # depth_inputs = 'colormap'
    depth_inputs = 'hha'

    # Train datasets
    # depthtrack_train = DepthTrack(root=settings.env.depthtrack_dir, split='train', dtype=depth_inputs)
    coco_train = MSCOCOSeq_depth(settings.env.cocodepth_dir,
                                 dtype=depth_inputs)
    # got10k_depth_train = MSCOCOSeq_depth(settings.env.got10kdepth_dir, dtype=depth_inputs)
    lasot_depth_train = Lasot_depth(root=settings.env.lasotdepth_dir,
                                    rgb_root=settings.env.lasot_dir,
                                    dtype=depth_inputs)

    # Validation datasets
    depthtrack_val = DepthTrack(root=settings.env.depthtrack_dir,
                                split='val',
                                dtype=depth_inputs)

    # Data transform
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz
    }
    data_processing_train = processing.DiMPProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        transform=transform_train,
        joint_transform=transform_joint)

    data_processing_val = processing.DiMPProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # Train sampler and loader
    dataset_train = sampler.DiMPSampler([coco_train, lasot_depth_train],
                                        [1, 1],
                                        samples_per_epoch=26000,
                                        max_gap=30,
                                        num_test_frames=3,
                                        num_train_frames=3,
                                        processing=data_processing_train)

    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation samplers and loaders
    dataset_val = sampler.DiMPSampler([depthtrack_val], [1],
                                      samples_per_epoch=5000,
                                      max_gap=30,
                                      num_test_frames=3,
                                      num_train_frames=3,
                                      processing=data_processing_val)

    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold)
    }

    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400
    }

    actor = actors.DiMPActor(net=net,
                             objective=objective,
                             loss_weight=loss_weight)

    # Optimizer
    optimizer = optim.Adam(
        [{
            'params': actor.net.classifier.filter_initializer.parameters(),
            'lr': 5e-5
        }, {
            'params': actor.net.classifier.filter_optimizer.parameters(),
            'lr': 5e-4
        }, {
            'params': actor.net.classifier.feature_extractor.parameters(),
            'lr': 5e-5
        }, {
            'params': actor.net.bb_regressor.parameters()
        }, {
            'params': actor.net.feature_extractor.parameters(),
            'lr': 2e-5
        }],
        lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                             step_size=15,
                                             gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(50, load_latest=True, fail_safe=True)
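
# Hedged usage sketch: in the standard pytracking/LTR layout this run(settings)
# function would live in ltr/train_settings/<module>/<file>.py and be invoked
# by ltr/run_training.py roughly as below. The module and script names here are
# placeholders, and the exact entry point of this fork may differ.
import importlib

import ltr.admin.settings as ws_settings

settings = ws_settings.Settings()      # environment paths come from ltr/admin/local.py
settings.module_name = 'dimp'          # hypothetical folder under ltr/train_settings
settings.script_name = 'dimp50_depth'  # hypothetical settings file name
settings.project_path = 'ltr/{}/{}'.format(settings.module_name, settings.script_name)

expr_module = importlib.import_module('ltr.train_settings.{}.{}'.format(
    settings.module_name, settings.script_name))
expr_module.run(settings)              # runs the training defined above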
Example #4
def run(settings):
    settings.description = 'Default train settings for FCOT with ResNet50 as backbone.'
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.clf_target_filter_sz = 4
    settings.reg_target_filter_sz = 3
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    settings.logging_file = 'fcot_log.txt'

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # Data transform
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    transform_train = torchvision.transforms.Compose([
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std)
    ])

    transform_val = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std)
    ])

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.clf_target_filter_sz
    }
    data_processing_train = processing_fcot.AnchorFreeProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        output_spatial_scale=72 / 288.,
        proposal_params=proposal_params,
        label_function_params=label_params,
        transform=transform_train,
        joint_transform=transform_joint)

    data_processing_val = processing_fcot.AnchorFreeProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        output_spatial_scale=72 / 288.,
        proposal_params=proposal_params,
        label_function_params=label_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # Train sampler and loader
    dataset_train = sampler.FCOTSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [settings.lasot_rate, 1, 1, 1],
        samples_per_epoch=settings.samples_per_epoch,
        max_gap=30,
        num_test_frames=3,
        num_train_frames=3,
        processing=data_processing_train)

    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation samplers and loaders
    dataset_val = sampler.FCOTSampler([got10k_val], [1],
                                      samples_per_epoch=5000,
                                      max_gap=30,
                                      num_test_frames=3,
                                      num_train_frames=3,
                                      processing=data_processing_val)

    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           epoch_interval=5,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           stack_dim=1)

    # Create network
    net = fcotnet.fcotnet(
        clf_filter_size=settings.clf_target_filter_sz,
        reg_filter_size=settings.reg_target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        norm_scale_coef=settings.norm_scale_coef,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
        train_reg_optimizer=settings.train_reg_optimizer,
        train_cls_72_and_reg_init=settings.train_cls_72_and_reg_init,
        train_cls_18=settings.train_cls_18)

    # Load dimp-model as initial weights
    device = torch.device('cuda:{}'.format(settings.devices_id[0])
                          if torch.cuda.is_available() else 'cpu')
    if settings.use_pretrained_dimp:
        assert settings.pretrained_dimp50 is not None
        dimp50 = torch.load(settings.pretrained_dimp50, map_location=device)
        state_dict = collections.OrderedDict()
        for key, v in dimp50['net'].items():
            if key.split('.')[0] == 'feature_extractor':
                state_dict['.'.join(key.split('.')[1:])] = v

        net.feature_extractor.load_state_dict(state_dict)

        state_dict = collections.OrderedDict()
        for key, v in dimp50['net'].items():
            if key.split('.')[0] == 'classifier':
                state_dict['.'.join(key.split('.')[1:])] = v
        net.classifier_18.load_state_dict(state_dict)
        print("loading backbone and Classifier modules from DiMP50 done.")

    # Load fcot-model trained in the previous stage
    if settings.load_model:
        assert settings.fcot_model is not None
        load_dict = torch.load(settings.fcot_model)
        fcot_dict = net.state_dict()
        load_fcotnet_dict = {
            k: v
            for k, v in load_dict['net'].items() if k in fcot_dict
        }
        fcot_dict.update(load_fcotnet_dict)
        net.load_state_dict(fcot_dict)
        print("loading FCOT model done.")

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, device_ids=settings.devices_id, dim=1).to(device)

    # Loss for cls_72, cls_18 and regression
    objective = {
        'test_clf_72': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
        'test_clf_18': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
        'reg_72': REGLoss(dim=4)
    }

    # Create actor and adam-optimizer
    if settings.train_cls_72_and_reg_init and settings.train_cls_18:
        ### train the regression branch and classification branches jointly, except for the regression optimizer (TODO: fix)
        print("train cls_72, cls_18 and reg_init jointly...")
        loss_weight = {
            'test_clf_72': 100,
            'test_init_clf_72': 100,
            'test_iter_clf_72': 400,
            'test_clf_18': 100,
            'test_init_clf_18': 100,
            'test_iter_clf_18': 400,
            'reg_72': 1
        }
        actor = actors.FcotActor(net=net,
                                 objective=objective,
                                 loss_weight=loss_weight,
                                 device=device)
        optimizer = optim.Adam(
            [{
                'params':
                actor.net.classifier_72.filter_initializer.parameters(),
                'lr': 5e-5
            }, {
                'params':
                actor.net.classifier_72.filter_optimizer.parameters(),
                'lr': 5e-4
            }, {
                'params':
                actor.net.classifier_72.feature_extractor.parameters(),
                'lr': 5e-5
            }, {
                'params':
                actor.net.classifier_18.filter_initializer.parameters(),
                'lr': 5e-5
            }, {
                'params':
                actor.net.classifier_18.filter_optimizer.parameters(),
                'lr': 5e-4
            }, {
                'params':
                actor.net.classifier_18.feature_extractor.parameters(),
                'lr': 5e-5
            }, {
                'params': actor.net.regressor_72.parameters()
            }, {
                'params': actor.net.pyramid_first_conv.parameters()
            }, {
                'params': actor.net.pyramid_36.parameters()
            }, {
                'params': actor.net.pyramid_72.parameters()
            }, {
                'params': actor.net.feature_extractor.parameters(),
                'lr': 2e-5
            }],
            lr=2e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                      milestones=[35, 46, 60],
                                                      gamma=0.2)
    elif settings.train_cls_72_and_reg_init:
        # Setting of the first training stage: train backbone, cls_72 and regression (except for regression optimizer) branch.
        print("train cls_72 and reg_init...")
        loss_weight = {
            'test_clf_72': 100,
            'test_init_clf_72': 10,
            'test_iter_clf_72': 400,
            'test_clf_18': 0,
            'test_init_clf_18': 0,
            'test_iter_clf_18': 0,
            'reg_72': 0.3
        }
        actor = actors.FcotCls72AndRegInitActor(net=net,
                                                objective=objective,
                                                loss_weight=loss_weight,
                                                device=device)
        optimizer = optim.Adam(
            [{
                'params':
                actor.net.classifier_72.filter_initializer.parameters(),
                'lr': 5e-5
            }, {
                'params':
                actor.net.classifier_72.filter_optimizer.parameters(),
                'lr': 5e-4
            }, {
                'params':
                actor.net.classifier_72.feature_extractor.parameters(),
                'lr': 5e-5
            }, {
                'params': actor.net.regressor_72.parameters()
            }, {
                'params': actor.net.pyramid_first_conv.parameters()
            }, {
                'params': actor.net.pyramid_36.parameters()
            }, {
                'params': actor.net.pyramid_72.parameters()
            }, {
                'params': actor.net.feature_extractor.parameters(),
                'lr': 2e-5
            }],
            lr=2e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                      milestones=[35, 45, 69],
                                                      gamma=0.2)
    elif settings.train_reg_optimizer:
        # Setting of the second training stage: train regression optimizer.
        print("train regression optimizer...")
        loss_weight = {
            'test_reg_72': 1,
            'test_init_reg_72': 0,
            'test_iter_reg_72': 1
        }
        actor = actors.FcotOnlineRegressionActor(net=net,
                                                 objective=objective,
                                                 loss_weight=loss_weight,
                                                 device=device)
        optimizer = optim.Adam(
            [{
                'params': actor.net.regressor_72.filter_optimizer.parameters()
            }],
            lr=5e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                      milestones=[2],
                                                      gamma=0.2)
    elif settings.train_cls_18:
        print("train cls_18...")
        # Setting of the third training stage: train cls_18 branch.
        loss_weight = {
            'test_clf_18': 100,
            'test_init_clf_18': 100,
            'test_iter_clf_18': 400
        }
        actor = actors.FcotCls18Actor(net=net,
                                      objective=objective,
                                      loss_weight=loss_weight,
                                      device=device)
        optimizer = optim.Adam(
            [{
                'params':
                actor.net.classifier_18.filter_initializer.parameters(),
                'lr': 5e-5
            }, {
                'params':
                actor.net.classifier_18.filter_optimizer.parameters(),
                'lr': 5e-4
            }, {
                'params':
                actor.net.classifier_18.feature_extractor.parameters(),
                'lr': 5e-5
            }],
            lr=2e-4)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                      milestones=[25],
                                                      gamma=0.2)
    else:
        # TODO: train jointly
        raise ValueError(
            "Invalid training-stage settings: enable train_cls_72_and_reg_init, "
            "train_reg_optimizer or train_cls_18.")

    trainer = LTRFcotTrainer(actor, [loader_train, loader_val],
                             optimizer,
                             settings,
                             device,
                             lr_scheduler,
                             logging_file=settings.logging_file)

    trainer.train(settings.total_epochs, load_latest=True, fail_safe=True)
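
# Hedged sketch: the three FCOT training stages selected by the branches above.
# run() reads these flags from the settings object; the combinations below are
# illustrative, and other required attributes (batch_size, devices_id, ...) are omitted.
from types import SimpleNamespace

stage1 = SimpleNamespace(train_cls_72_and_reg_init=True,
                         train_reg_optimizer=False,
                         train_cls_18=False)  # stage 1: backbone + cls_72 + regression init
stage2 = SimpleNamespace(train_cls_72_and_reg_init=False,
                         train_reg_optimizer=True,
                         train_cls_18=False)  # stage 2: online regression optimizer only
stage3 = SimpleNamespace(train_cls_72_and_reg_init=False,
                         train_reg_optimizer=False,
                         train_cls_18=True)   # stage 3: cls_18 branch only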
Example #5
def run(settings):
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 4
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 5
    settings.normalize_mean = [0.485, 0.456, 0.406, 0]
    settings.normalize_std = [0.229, 0.224, 0.225, 1.0]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1/4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    # settings.print_stats = ['Loss/total', 'Loss/iou', 'ClfTrain/init_loss', 'ClfTrain/test_loss']

    # # Train datasets
    # lasot_train = Lasot(settings.env.lasot_dir, split='train')
    # got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    # trackingnet_train = TrackingNet(settings.env.trackingnet_dir, set_ids=list(range(4)))
    # coco_train = MSCOCOSeq(settings.env.coco_dir)
    #
    # # Validation datasets
    # got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # Train datasets
    #lasot_train = Lasot(split='train')
    ptb_train   = PrincetonRGBD(split='validation')
    # stc_train   = StcRGBD(split='train')
    # kevinlai_train=kevinlaiRGBD(split='train')
    #trackingnet_train = TrackingNet(set_ids=list(range(11)))
    #coco_train = MSCOCOSeq()


    # Validation datasets
    #lasot_val = Lasot(split='train')#TrackingNet(set_ids=list(range(11,12)))
    ptb_val   = PrincetonRGBD(split='validation')


    # Data transform
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    transform_train = torchvision.transforms.Compose([dltransforms.ToTensorAndJitter(0.2),
                                                      torchvision.transforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)])

    transform_val = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                    torchvision.transforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)])

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {'min_iou': 0.1, 'boxes_per_frame': 8, 'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]}
    label_params = {'feature_sz': settings.feature_sz, 'sigma_factor': output_sigma, 'kernel_sz': settings.target_filter_sz}
    data_processing_train = processing.DiMPProcessing(search_area_factor=settings.search_area_factor,
                                                      output_sz=settings.output_sz,
                                                      center_jitter_factor=settings.center_jitter_factor,
                                                      scale_jitter_factor=settings.scale_jitter_factor,
                                                      mode='sequence',
                                                      proposal_params=proposal_params,
                                                      label_function_params=label_params,
                                                      transform=transform_train,
                                                      joint_transform=transform_joint)

    data_processing_val = processing.DiMPProcessing(search_area_factor=settings.search_area_factor,
                                                    output_sz=settings.output_sz,
                                                    center_jitter_factor=settings.center_jitter_factor,
                                                    scale_jitter_factor=settings.scale_jitter_factor,
                                                    mode='sequence',
                                                    proposal_params=proposal_params,
                                                    label_function_params=label_params,
                                                    transform=transform_val,
                                                    joint_transform=transform_joint)

    # Train sampler and loader
    dataset_train = sampler.DiMPSampler([ptb_train], [1],
                                        samples_per_epoch=26000, max_gap=30, num_test_frames=3, num_train_frames=3,
                                        processing=data_processing_train)

    loader_train = LTRLoader('train', dataset_train, training=True, batch_size=settings.batch_size, num_workers=settings.num_workers,
                             shuffle=True, drop_last=True, stack_dim=1)

    # Validation samplers and loaders
    # dataset_val = sampler.DiMPSampler([got10k_val], [1], samples_per_epoch=5000, max_gap=30,
    #                                   num_test_frames=3, num_train_frames=3,
    #                                   processing=data_processing_val)
    dataset_val = sampler.DiMPSampler([ptb_val], [1], samples_per_epoch=5000, max_gap=30,
                                      num_test_frames=3, num_train_frames=3,
                                      processing=data_processing_val)

    loader_val = LTRLoader('val', dataset_val, training=False, batch_size=settings.batch_size, num_workers=settings.num_workers,
                           shuffle=False, drop_last=True, epoch_interval=5, stack_dim=1)

    # Create network and actor
    net = dimpnet_rgbd_locc.dimpnet50(filter_size=settings.target_filter_sz, backbone_pretrained=True, optim_iter=5,
                            clf_feat_norm=True, clf_feat_blocks=0, final_conv=True, out_feature_dim=512,
                            optim_init_step=0.9, optim_init_reg=0.1,
                            init_gauss_sigma=output_sigma * settings.feature_sz, num_dist_bins=100,
                            bin_displacement=0.1, mask_init_factor=3.0, target_mask_act='sigmoid', score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {'iou': nn.MSELoss(), 'occ': nn.SmoothL1Loss(), 'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold)}

    loss_weight = {'iou': 1, 'occ':1, 'test_clf': 100, 'test_init_clf': 100, 'test_iter_clf': 400}

    actor = actors.DiMPActor_OCC(net=net, objective=objective, loss_weight=loss_weight)


    # Optimizer
    optimizer = optim.Adam([{'params': actor.net.classifier.filter_initializer.parameters(),      'lr': 0*5e-5},
                            {'params': actor.net.classifier.filter_optimizer.parameters(),        'lr': 0*5e-4},
                            {'params': actor.net.classifier.feature_extractor.parameters(),       'lr': 0*5e-5},
                            {'params': actor.net.occ_classifer.parameters(),                      'lr': 2e-3},
                            {'params': actor.net.bb_regressor.parameters(),                       'lr': 0*2e-4},
                            {'params': actor.net.feature_extractor.parameters(),                  'lr': 0*2e-5},
                            {'params': actor.net.feature_extractor_depth.parameters(),            'lr': 0.1*2e-5}],
                           lr=0.1*2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer, settings, lr_scheduler)

    #trainer.train(10, load_latest=True, fail_safe=True, path_pretrained=None)#'./checkpoints/dimp50.pth')
    #trainer.train(50, load_latest=True, fail_safe=True, path_pretrained=None)
    trainer.train(50, load_latest=True, fail_safe=True, path_pretrained='./checkpoints/dimp50.pth')
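
# Aside (illustrative, not from the original file): the 0*lr param groups above
# keep the RGB classifier, bounding-box regressor and RGB backbone effectively
# frozen, because Adam scales every parameter update by the group learning
# rate. A minimal check of that behaviour:
import torch
import torch.nn as nn
import torch.optim as optim

frozen = nn.Linear(2, 2)
trained = nn.Linear(2, 2)
opt = optim.Adam([{'params': frozen.parameters(), 'lr': 0.0},
                  {'params': trained.parameters(), 'lr': 1e-3}])
loss = (frozen(torch.randn(4, 2)) + trained(torch.randn(4, 2))).sum()
loss.backward()
w_before = frozen.weight.clone()
opt.step()
print(torch.equal(w_before, frozen.weight))  # True: zero-lr group is not updated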
Example #6
def run(settings):
    settings.description = 'Transformer-assisted tracker. Our baseline approach is SuperDiMP.'
    settings.batch_size = 40
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 6.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 22
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 5.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    # settings.print_stats = ['Loss/total', 'Loss/iou', 'ClfTrain/init_loss', 'ClfTrain/test_loss']

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # Data transform
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05),
                                    tfm.RandomHorizontalFlip(probability=0.5))

    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2), tfm.RandomHorizontalFlip(probability=0.5),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0.05, 0.05),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)]
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz
    }
    label_density_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz
    }

    data_processing_train = processing.KLDiMPProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        crop_type='inside_major',
        max_scale_change=1.5,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        label_density_params=label_density_params,
        transform=transform_train,
        joint_transform=transform_joint)

    data_processing_val = processing.KLDiMPProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        crop_type='inside_major',
        max_scale_change=1.5,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        label_density_params=label_density_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # Train sampler and loader
    dataset_train = sampler.DiMPSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [1, 1, 1, 1],
        samples_per_epoch=50000,
        max_gap=500,
        num_test_frames=3,
        num_train_frames=3,
        processing=data_processing_train)

    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation samplers and loaders
    dataset_val = sampler.DiMPSampler([got10k_val], [1],
                                      samples_per_epoch=10000,
                                      max_gap=500,
                                      num_test_frames=3,
                                      num_train_frames=3,
                                      processing=data_processing_val)

    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
        frozen_backbone_layers=['conv1', 'bn1', 'layer1', 'layer2'])

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {
        'bb_ce': klreg_losses.KLRegression(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold)
    }

    loss_weight = {
        'bb_ce': 0.01,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400
    }

    actor = tracking_actors.KLDiMPActor(net=net,
                                        objective=objective,
                                        loss_weight=loss_weight)

    # Optimizer
    optimizer = optim.Adam(
        [{
            'params': actor.net.classifier.filter_initializer.parameters(),
            'lr': 5e-5
        }, {
            'params': actor.net.classifier.filter_optimizer.parameters(),
            'lr': 5e-4
        }, {
            'params': actor.net.classifier.feature_extractor.parameters(),
            'lr': 5e-5
        }, {
            'params': actor.net.classifier.transformer.parameters(),
            'lr': 1e-3
        }, {
            'params': actor.net.bb_regressor.parameters(),
            'lr': 1e-3
        }, {
            'params': actor.net.feature_extractor.layer3.parameters(),
            'lr': 2e-5
        }],
        lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                             step_size=15,
                                             gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(50, load_latest=True, fail_safe=True)