Example #1
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.description = 'ATOM IoUNet with ResNet18 backbone and trained with vid, lasot, coco.'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.num_workers = 4  # Number of workers for image loading
    settings.normalize_mean = [0.485, 0.456, 0.406]  # Normalize mean (default ImageNet values)
    settings.normalize_std = [0.229, 0.224, 0.225]  # Normalize std (default ImageNet values)
    settings.search_area_factor = 5.0  # Image patch size relative to target size
    settings.feature_sz = 18  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
    settings.proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }

    # Train datasets
    vid_train = ImagenetVID()
    lasot_train = Lasot(split='train')
    coco_train = MSCOCOSeq()

    # Validation datasets
    got10k_val = Got10k(split='val')

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = dltransforms.Compose([
        dltransforms.ToArrayAndJitter(0.2),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std)
    ])

    # The augmentation transform applied to the validation set (individually to each image in the pair)
    transform_val = dltransforms.Compose([
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std)
    ])

    # Data processing to do on the training pairs
    data_processing_train = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training
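    # (samples_per_epoch counts sampled pairs: 1000 iterations of batch_size pairs per epoch)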
    dataset_train = sampler.ATOMSampler(
        [vid_train, lasot_train, coco_train], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)

    # The loader for training
    train_loader = loader.LTRLoader('train',
                                    dataset_train,
                                    training=True,
                                    batch_size=settings.batch_size,
                                    num_workers=settings.num_workers,
                                    stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler([got10k_val], [1],
                                      samples_per_epoch=500 * settings.batch_size,
                                      max_gap=50,
                                      processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader('val',
                                  dataset_val,
                                  training=False,
                                  batch_size=settings.batch_size,
                                  epoch_interval=5,
                                  num_workers=settings.num_workers,
                                  stack_dim=1)

    # Create network, set objective, create optimizer, learning rate scheduler and trainer
    with dygraph.guard():
        # Create network
        net = atom_resnet18(backbone_pretrained=True)

        # Freeze backbone
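        # (in dygraph, state_dict values are the parameter tensors themselves,
        # so setting stop_gradient freezes them; names containing "running"
        # are presumably BatchNorm statistics, which need no gradient anyway)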
        state_dicts = net.state_dict()
        for k in state_dicts.keys():
            if 'feature_extractor' in k and 'running' not in k:
                state_dicts[k].stop_gradient = True

        # Set objective
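        # (square_error_cost gives the L2 loss between the predicted and
        # ground-truth IoU of each proposal)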
        objective = fluid.layers.square_error_cost

        # Create actor, which wraps network and objective
        actor = actors.AtomActor(net=net, objective=objective)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        gamma = 0.2
        lr = 1e-3
        # PiecewiseDecay expects one more value than boundaries
        lr_scheduler = fluid.dygraph.PiecewiseDecay(
            [15, 30, 45],
            values=[lr, lr * gamma, lr * gamma**2, lr * gamma**3],
            step=1000,
            begin=0)

        optimizer = fluid.optimizer.Adam(
            parameter_list=net.bb_regressor.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(40, load_latest=False, fail_safe=False)
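
A note on the proposal settings in the example above: ATOM's processing jitters the annotated box with Gaussian noise, drawing one of the sigma_factor values per proposal and keeping only candidates whose overlap with the ground truth is at least min_iou; the resulting (proposal, IoU) pairs supervise the IoU prediction head that square_error_cost trains. A minimal NumPy sketch of the idea follows; iou_xywh and perturb_box are illustrative helpers, not this repo's functions.

import numpy as np

def iou_xywh(a, b):
    # Intersection-over-union of two (x, y, w, h) boxes
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2 = min(a[0] + a[2], b[0] + b[2])
    y2 = min(a[1] + a[3], b[1] + b[3])
    inter = max(0., x2 - x1) * max(0., y2 - y1)
    return inter / (a[2] * a[3] + b[2] * b[3] - inter)

def perturb_box(box, min_iou=0.1, sigma_factor=0.1, max_tries=100):
    # Jitter center and size with Gaussian noise proportional to the box
    # size, retrying until the candidate keeps at least min_iou overlap
    box = np.asarray(box, dtype=np.float64)
    for _ in range(max_tries):
        sigma = sigma_factor * np.sqrt(box[2] * box[3])
        cx = box[0] + box[2] / 2 + np.random.normal(0, sigma)
        cy = box[1] + box[3] / 2 + np.random.normal(0, sigma)
        w = box[2] * np.exp(np.random.normal(0, sigma_factor))
        h = box[3] * np.exp(np.random.normal(0, sigma_factor))
        cand = np.array([cx - w / 2, cy - h / 2, w, h])
        ov = iou_xywh(box, cand)
        if ov >= min_iou:
            return cand, ov
    return box.copy(), 1.0
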
Example #2
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.description = 'SiamFC with Alexnet backbone and trained with vid'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 8  # Batch size
    settings.num_workers = 8  # Number of workers for image loading
    settings.normalize_mean = [0., 0., 0.]  # Normalize mean
    settings.normalize_std = [1 / 255., 1 / 255., 1 / 255.]  # Normalize std
    settings.search_area_factor = {
        'train': 1.0,
        'test': 2.0078740157480315
    }  # roughly the same as SiamFC
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'replicate'

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 0}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.}

    # Train datasets
    vid_train = ImagenetVID()

    # Validation datasets
    got10k_val = Got10k(split='val')

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_exemplar = dltransforms.Compose([
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std)
    ])
    transform_instance = dltransforms.Compose([
        DataAug(),
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std)
    ])

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamFCProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.SiamFCProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        transform=transform_exemplar,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler([vid_train], [1],
                                        samples_per_epoch=6650 * settings.batch_size,
                                        max_gap=100,
                                        processing=data_processing_train)

    # The loader for training
    train_loader = loader.LTRLoader('train',
                                    dataset_train,
                                    training=True,
                                    batch_size=settings.batch_size,
                                    num_workers=settings.num_workers,
                                    stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler([got10k_val], [1],
                                      samples_per_epoch=1000 * settings.batch_size,
                                      max_gap=100,
                                      processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader('val',
                                  dataset_val,
                                  training=False,
                                  batch_size=settings.batch_size,
                                  num_workers=settings.num_workers,
                                  epoch_interval=5,
                                  stack_dim=1)

    # Create network, set objective, create optimizer, learning rate scheduler and trainer
    with dygraph.guard():
        # Create network
        net = siamfc_alexnet()

        # Create actor, which wraps network and objective
        actor = actors.SiamFCActor(net=net,
                                   objective=None,
                                   batch_size=settings.batch_size,
                                   shape=(17, 17),
                                   radius=16,
                                   stride=8)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        lr_scheduler = fluid.layers.exponential_decay(learning_rate=0.01,
                                                      decay_steps=6650,
                                                      decay_rate=0.8685,
                                                      staircase=True)
        regularizer = fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0005)
        optimizer = fluid.optimizer.Momentum(momentum=0.9,
                                             regularization=regularizer,
                                             parameter_list=net.parameters(),
                                             learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(50, load_latest=False, fail_safe=False)
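
The SiamFCActor arguments above (shape=(17, 17), radius=16, stride=8) describe the classic SiamFC training label: a score-map cell is a positive when its offset from the map center, scaled by the network stride, lies within radius pixels. A minimal NumPy sketch, assuming negatives are labeled 0 (some implementations use -1 with a balanced logistic loss):

import numpy as np

def siamfc_label(shape=(17, 17), radius=16, stride=8):
    # Positive where the stride-scaled distance to the map center is
    # within `radius` input pixels; negative elsewhere
    h, w = shape
    y, x = np.ogrid[0:h, 0:w]
    cy, cx = (h - 1) / 2., (w - 1) / 2.
    dist = stride * np.sqrt((x - cx) ** 2 + (y - cy) ** 2)
    return (dist <= radius).astype(np.float32)
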
Example #3
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.description = 'SiamRPN with AlexNet backbone.'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 512  # Batch size
    settings.samples_per_epoch = 600000  # Number of training pairs per epoch
    settings.num_workers = 8  # Number of workers for image loading
    settings.search_area_factor = {'train': 1.0, 'test': 2.0}
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # Settings for the image sample and label generation
    settings.center_jitter_factor = {'train': 0.125, 'test': 2.0}
    settings.scale_jitter_factor = {'train': 0.05, 'test': 0.18}
    settings.label_params = {
        'search_size': 255,
        'output_size': 17,
        'anchor_stride': 8,
        'anchor_ratios': [0.33, 0.5, 1, 2, 3],
        'anchor_scales': [8],
        'num_pos': 16,
        'num_neg': 16,
        'num_total': 64,
        'thr_high': 0.6,
        'thr_low': 0.3
    }
    settings.loss_weights = {'cls': 1., 'loc': 1.2}
    settings.neg = 0.2  # Probability of sampling a negative (non-matching) pair

    # Train datasets
    vos_train = YoutubeVOS()
    vid_train = ImagenetVID()
    coco_train = MSCOCOSeq()
    det_train = ImagenetDET()
    #lasot_train = Lasot(split='train')
    #got10k_train = Got10k(split='train')

    # Validation datasets
    vid_val = ImagenetVID()

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_exemplar = dltransforms.Transpose()
    transform_instance = dltransforms.Compose([
        dltransforms.Color(probability=1.0),
        dltransforms.Blur(probability=0.18),
        dltransforms.Transpose()
    ])
    transform_instance_mask = dltransforms.Transpose()

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        test_mask_transform=transform_instance_mask,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        transform=transform_exemplar,
        joint_transform=transform_joint)

    nums_per_epoch = settings.samples_per_epoch // settings.batch_size
    # The sampler for training
    dataset_train = sampler.MaskSampler(
        [vid_train, coco_train, det_train, vos_train],
        [2, 1, 1, 2],
        samples_per_epoch=nums_per_epoch * settings.batch_size,
        max_gap=100,
        processing=data_processing_train,
        neg=settings.neg)

    # The loader for training
    train_loader = loader.LTRLoader(
        'train',
        dataset_train,
        training=True,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # The sampler for validation
    dataset_val = sampler.MaskSampler(
        [vid_val],
        [1],
        samples_per_epoch=100 * settings.batch_size,
        max_gap=100,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader(
        'val',
        dataset_val,
        training=False,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # Create network, set objective, create optimizer, learning rate scheduler and trainer
    with dygraph.guard():
        # Create network

        def scale_loss(loss):
            total_loss = 0
            for k in settings.loss_weights:
                total_loss += loss[k] * settings.loss_weights[k]
            return total_loss
        
        net = SiamRPN_AlexNet(scale_loss=scale_loss)

        # Define objective
        objective = {
            'cls': select_softmax_with_cross_entropy_loss,
            'loc': weight_l1_loss,
        }

        # Create actor, which wraps network and objective
        actor = actors.SiamActor(net=net, objective=objective)

        # Define optimizer and learning rate
        decayed_lr = fluid.layers.exponential_decay(
            learning_rate=0.01,
            decay_steps=nums_per_epoch,
            decay_rate=0.9407,
            staircase=True)
        lr_scheduler = LinearLrWarmup(
            learning_rate=decayed_lr,
            warmup_steps=5*nums_per_epoch,
            start_lr=0.005,
            end_lr=0.01)
        optimizer = fluid.optimizer.Adam(
            parameter_list=net.rpn_head.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer, settings, lr_scheduler)
        trainer.train(50, load_latest=False, fail_safe=False)
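
The label_params in this example imply the usual SiamRPN anchor grid: a base anchor of area (anchor_stride * anchor_scale)^2 (64 x 64 px here), reshaped to each aspect ratio and replicated at every cell of the 17 x 17 score map. A sketch of one common construction (pysot-style; generate_anchors is an illustrative helper, not this repo's code):

import numpy as np

def generate_anchors(search_size=255, output_size=17, stride=8,
                     ratios=(0.33, 0.5, 1, 2, 3), scales=(8,)):
    # Each ratio r keeps the base area with sides 1/sqrt(r) : sqrt(r)
    shapes = [(stride * s / np.sqrt(r), stride * s * np.sqrt(r))
              for r in ratios for s in scales]
    # One anchor center per score-map cell, centered on the search image
    offset = (search_size - (output_size - 1) * stride) / 2.
    cs = offset + stride * np.arange(output_size)
    cx, cy = np.meshgrid(cs, cs)
    anchors = [np.stack([cx, cy, np.full_like(cx, w), np.full_like(cy, h)],
                        axis=-1) for (w, h) in shapes]
    # (num_anchors, output_size, output_size, 4) in (cx, cy, w, h) form
    return np.stack(anchors)
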
Example #4
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.base_model = ''  # Path to the pretrained base model (must be set before training)
    settings.description = 'SiamMask_sharp with ResNet-50 backbone.'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.samples_per_epoch = 600000  # Number of training pairs per epoch
    settings.num_workers = 8  # Number of workers for image loading
    settings.search_area_factor = {'train': 1.0, 'test': 143. / 127.}
    settings.output_sz = {'train': 127, 'test': 143}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # Settings for the image sample and label generation
    settings.center_jitter_factor = {'train': 0.2, 'test': 0.4}
    settings.scale_jitter_factor = {'train': 0.05, 'test': 0.18}
    settings.label_params = {
        'search_size': 143,
        'output_size': 3,
        'anchor_stride': 8,
        'anchor_ratios': [0.33, 0.5, 1, 2, 3],
        'anchor_scales': [8],
        'num_pos': 16,
        'num_neg': 16,
        'num_total': 64,
        'thr_high': 0.6,
        'thr_low': 0.3
    }
    settings.loss_weights = {'cls': 0., 'loc': 0., 'mask': 1}  # Only the mask loss is trained in this stage
    settings.neg = 0  # No negative pairs in this fine-tuning stage

    # Train datasets
    vos_train = YoutubeVOS()
    coco_train = MSCOCOSeq()

    # Validation datasets
    vos_val = vos_train

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_exemplar = dltransforms.Transpose()
    transform_instance = dltransforms.Compose([
        dltransforms.Color(probability=1.0),
        dltransforms.Blur(probability=0.18),
        dltransforms.Transpose()
    ])
    transform_instance_mask = dltransforms.Transpose()

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        test_mask_transform=transform_instance_mask,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        transform=transform_exemplar,
        joint_transform=transform_joint)

    nums_per_epoch = settings.samples_per_epoch // settings.batch_size
    # The sampler for training
    dataset_train = sampler.MaskSampler([coco_train, vos_train], [1, 1],
                                        samples_per_epoch=nums_per_epoch * settings.batch_size,
                                        max_gap=100,
                                        processing=data_processing_train,
                                        neg=settings.neg)

    # The loader for training
    train_loader = loader.LTRLoader('train',
                                    dataset_train,
                                    training=True,
                                    batch_size=settings.batch_size,
                                    num_workers=settings.num_workers,
                                    stack_dim=0)

    # The sampler for validation
    dataset_val = sampler.MaskSampler([vos_val], [1],
                                      samples_per_epoch=100 * settings.batch_size,
                                      max_gap=100,
                                      processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader('val',
                                  dataset_val,
                                  training=False,
                                  batch_size=settings.batch_size,
                                  num_workers=settings.num_workers,
                                  stack_dim=0)

    # Create network, set objective, create optimizer, learning rate scheduler and trainer
    with dygraph.guard():
        # Create network

        def scale_loss(loss):
            total_loss = 0
            for k in settings.loss_weights:
                total_loss += loss[k] * settings.loss_weights[k]
            return total_loss

        net = SiamMask_ResNet50_sharp(scale_loss=scale_loss)

        # Load parameters from the best_base_model
        if settings.base_model == '':
            raise Exception(
                'The base_model path is not set up. Check settings.base_model in "ltr/train_settings/siammask/siammask_res50_sharp.py".'
            )
        para_dict, _ = fluid.load_dygraph(settings.base_model)
        model_dict = net.state_dict()

        for key in model_dict.keys():
            if key in para_dict.keys():
                model_dict[key] = para_dict[key]

        net.set_dict(model_dict)

        # Define objective
        objective = {
            'cls': select_softmax_with_cross_entropy_loss,
            'loc': weight_l1_loss,
            'mask': select_mask_logistic_loss
        }

        # Create actor, which wraps network and objective
        actor = actors.SiamActor(net=net, objective=objective)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        decayed_lr = fluid.layers.exponential_decay(learning_rate=0.0005,
                                                    decay_steps=nums_per_epoch,
                                                    decay_rate=0.9,
                                                    staircase=True)
        lr_scheduler = LinearLrWarmup(learning_rate=decayed_lr,
                                      warmup_steps=5 * nums_per_epoch,
                                      start_lr=0.0001,
                                      end_lr=0.0005)

        optimizer = fluid.optimizer.Adam(
            parameter_list=net.mask_head.parameters() +
            net.refine_head.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(20, load_latest=False, fail_safe=False)
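
For reference, the schedule built above is a linear warmup from start_lr to end_lr over the first five epochs, followed by a staircase exponential decay of the base rate, one decay step per epoch. A plain-Python sketch, assuming the usual linear_lr_warmup semantics (both warmup and decay are driven by the global step):

def lr_at(step, nums_per_epoch, base_lr=0.0005, decay_rate=0.9,
          start_lr=0.0001, end_lr=0.0005, warmup_epochs=5):
    # Linear warmup for the first warmup_epochs, then staircase decay
    warmup_steps = warmup_epochs * nums_per_epoch
    if step < warmup_steps:
        return start_lr + (end_lr - start_lr) * step / warmup_steps
    return base_lr * decay_rate ** (step // nums_per_epoch)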