def run(settings):
    # Most common settings are assigned in the settings struct
    settings.description = 'ATOM IoUNet with ResNet18 backbone and trained with vid, lasot, coco.'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.num_workers = 4  # Number of workers for image loading
    settings.normalize_mean = [0.485, 0.456, 0.406]  # Normalize mean (default ImageNet values)
    settings.normalize_std = [0.229, 0.224, 0.225]  # Normalize std (default ImageNet values)
    settings.search_area_factor = 5.0  # Image patch size relative to target size
    settings.feature_sz = 18  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
    settings.proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }

    # Train datasets
    vid_train = ImagenetVID()
    lasot_train = Lasot(split='train')
    coco_train = MSCOCOSeq()

    # Validation datasets
    got10k_val = Got10k(split='val')

    # The joint augmentation transform that is applied to the pair jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = dltransforms.Compose([
        dltransforms.ToArrayAndJitter(0.2),
        dltransforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)
    ])

    # The augmentation transform applied to the validation set (individually to each image in the pair)
    transform_val = dltransforms.Compose([
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)
    ])

    # Data processing to do on the training pairs
    data_processing_train = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler(
        [vid_train, lasot_train, coco_train], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)

    # The loader for training
    train_loader = loader.LTRLoader(
        'train',
        dataset_train,
        training=True,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler(
        [got10k_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader(
        'val',
        dataset_val,
        training=False,
        batch_size=settings.batch_size,
        epoch_interval=5,
        num_workers=settings.num_workers,
        stack_dim=1)

    # Create network, set objective, create optimizer, learning rate scheduler, trainer
    with dygraph.guard():
        # Create network
        net = atom_resnet18(backbone_pretrained=True)

        # Freeze the backbone; batch-norm running statistics are skipped
        state_dicts = net.state_dict()
        for k in state_dicts.keys():
            if 'feature_extractor' in k and "running" not in k:
                state_dicts[k].stop_gradient = True

        # Set objective
        objective = fluid.layers.square_error_cost

        # Create actor, which wraps network and objective
        actor = actors.AtomActor(net=net, objective=objective)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        gamma = 0.2
        lr = 1e-3
        lr_scheduler = fluid.dygraph.PiecewiseDecay(
            [15, 30, 45],
            values=[lr, lr * gamma, lr * gamma * gamma],
            step=1000,
            begin=0)

        optimizer = fluid.optimizer.Adam(
            parameter_list=net.bb_regressor.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(40, load_latest=False, fail_safe=False)
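
# Illustration only, not part of the training script: a plain-Python sketch of
# the piecewise schedule above, assuming the scheduler is stepped once per
# epoch so the boundaries [15, 30, 45] act as epoch thresholds. The name
# `piecewise_lr` is hypothetical.
def piecewise_lr(epoch, boundaries=(15, 30, 45), values=(1e-3, 2e-4, 4e-5)):
    """Return the learning rate in effect at a given epoch."""
    for boundary, value in zip(boundaries, values):
        if epoch < boundary:
            return value
    return values[-1]  # keep the last value once all boundaries are passed

assert piecewise_lr(0) == 1e-3   # epochs 0-14: lr
assert piecewise_lr(20) == 2e-4  # epochs 15-29: lr * gamma
assert piecewise_lr(40) == 4e-5  # epochs 30+:  lr * gamma ** 2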
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.description = 'SiamFC with AlexNet backbone and trained with vid'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 8  # Batch size
    settings.num_workers = 8  # Number of workers for image loading
    settings.normalize_mean = [0., 0., 0.]  # Normalize mean
    settings.normalize_std = [1 / 255., 1 / 255., 1 / 255.]  # Normalize std
    settings.search_area_factor = {
        'train': 1.0,
        'test': 2.0078740157480315
    }  # roughly the same as SiamFC
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'replicate'

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 0}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.}

    # Train datasets
    vid_train = ImagenetVID()

    # Validation datasets
    got10k_val = Got10k(split='val')

    # The joint augmentation transform that is applied to the pair jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_exemplar = dltransforms.Compose([
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)
    ])
    transform_instance = dltransforms.Compose([
        DataAug(),
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)
    ])

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamFCProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.SiamFCProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        transform=transform_exemplar,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler(
        [vid_train], [1],
        samples_per_epoch=6650 * settings.batch_size,
        max_gap=100,
        processing=data_processing_train)

    # The loader for training
    train_loader = loader.LTRLoader(
        'train',
        dataset_train,
        training=True,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler(
        [got10k_val], [1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=100,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader(
        'val',
        dataset_val,
        training=False,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        epoch_interval=5,
        stack_dim=1)

    # Create network, set objective, create optimizer, learning rate scheduler, trainer
    with dygraph.guard():
        # Create network
        net = siamfc_alexnet()

        # Create actor, which wraps network and objective
        actor = actors.SiamFCActor(
            net=net,
            objective=None,
            batch_size=settings.batch_size,
            shape=(17, 17),
            radius=16,
            stride=8)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        lr_scheduler = fluid.layers.exponential_decay(
            learning_rate=0.01,
            decay_steps=6650,
            decay_rate=0.8685,
            staircase=True)
        regularizer = fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0005)
        optimizer = fluid.optimizer.Momentum(
            momentum=0.9,
            regularization=regularizer,
            parameter_list=net.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(50, load_latest=False, fail_safe=False)
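
# Illustration only, not part of the training script: with staircase=True,
# fluid's exponential_decay multiplies the base rate by decay_rate once per
# decay_steps iterations. One epoch is 6650 iterations here, so the lr drops
# by a factor of 0.8685 at every epoch boundary. The name `staircase_lr` is
# hypothetical.
def staircase_lr(step, base_lr=0.01, decay_steps=6650, decay_rate=0.8685):
    """Learning rate after `step` training iterations."""
    return base_lr * decay_rate ** (step // decay_steps)

print(staircase_lr(0))          # 0.01 during the first epoch
print(staircase_lr(6650))       # ~0.0087 during the second epoch
print(staircase_lr(49 * 6650))  # ~1e-5 in the final (50th) epoch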
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.description = 'SiamRPN with AlexNet backbone.'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 512  # Batch size
    settings.samples_per_epoch = 600000  # Number of training pairs per epoch
    settings.num_workers = 8  # Number of workers for image loading
    settings.search_area_factor = {'train': 1.0, 'test': 2.0}
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # Settings for the image sample and label generation
    settings.center_jitter_factor = {'train': 0.125, 'test': 2.0}
    settings.scale_jitter_factor = {'train': 0.05, 'test': 0.18}
    settings.label_params = {
        'search_size': 255,
        'output_size': 17,
        'anchor_stride': 8,
        'anchor_ratios': [0.33, 0.5, 1, 2, 3],
        'anchor_scales': [8],
        'num_pos': 16,
        'num_neg': 16,
        'num_total': 64,
        'thr_high': 0.6,
        'thr_low': 0.3
    }
    settings.loss_weights = {'cls': 1., 'loc': 1.2}
    settings.neg = 0.2

    # Train datasets
    vos_train = YoutubeVOS()
    vid_train = ImagenetVID()
    coco_train = MSCOCOSeq()
    det_train = ImagenetDET()
    # lasot_train = Lasot(split='train')
    # got10k_train = Got10k(split='train')

    # Validation datasets
    vid_val = ImagenetVID()

    # The joint augmentation transform that is applied to the pair jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_exemplar = dltransforms.Transpose()
    transform_instance = dltransforms.Compose([
        dltransforms.Color(probability=1.0),
        dltransforms.Blur(probability=0.18),
        dltransforms.Transpose()
    ])
    transform_instance_mask = dltransforms.Transpose()

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        test_mask_transform=transform_instance_mask,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        transform=transform_exemplar,
        joint_transform=transform_joint)

    nums_per_epoch = settings.samples_per_epoch // settings.batch_size

    # The sampler for training
    dataset_train = sampler.MaskSampler(
        [vid_train, coco_train, det_train, vos_train], [2, 1, 1, 2],
        samples_per_epoch=nums_per_epoch * settings.batch_size,
        max_gap=100,
        processing=data_processing_train,
        neg=settings.neg)

    # The loader for training
    train_loader = loader.LTRLoader(
        'train',
        dataset_train,
        training=True,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # The sampler for validation
    dataset_val = sampler.MaskSampler(
        [vid_val], [1],
        samples_per_epoch=100 * settings.batch_size,
        max_gap=100,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader(
        'val',
        dataset_val,
        training=False,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # Create network, set objective, create optimizer, learning rate scheduler, trainer
    with dygraph.guard():
        # Weighted sum of the per-branch losses, using settings.loss_weights
        def scale_loss(loss):
            total_loss = 0
            for k in settings.loss_weights:
                total_loss += loss[k] * settings.loss_weights[k]
            return total_loss

        # Create network
        net = SiamRPN_AlexNet(scale_loss=scale_loss)

        # Define objective
        objective = {
            'cls': select_softmax_with_cross_entropy_loss,
            'loc': weight_l1_loss,
        }

        # Create actor, which wraps network and objective
        actor = actors.SiamActor(net=net, objective=objective)

        # Define optimizer and learning rate
        decayed_lr = fluid.layers.exponential_decay(
            learning_rate=0.01,
            decay_steps=nums_per_epoch,
            decay_rate=0.9407,
            staircase=True)
        lr_scheduler = LinearLrWarmup(
            learning_rate=decayed_lr,
            warmup_steps=5 * nums_per_epoch,
            start_lr=0.005,
            end_lr=0.01)
        optimizer = fluid.optimizer.Adam(
            parameter_list=net.rpn_head.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(50, load_latest=False, fail_safe=False)
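
# Illustration only, not part of the training script: a plain-Python sketch of
# the warmup-plus-decay schedule above, assuming LinearLrWarmup behaves like
# fluid's linear_lr_warmup (linear ramp during warmup, then the wrapped
# schedule evaluated at the global step). The name `siamrpn_lr` is
# hypothetical.
def siamrpn_lr(step, nums_per_epoch, base_lr=0.01, start_lr=0.005,
               warmup_epochs=5, decay_rate=0.9407):
    """Learning rate after `step` iterations, stepped once per iteration."""
    warmup_steps = warmup_epochs * nums_per_epoch
    if step < warmup_steps:
        # Linear ramp from start_lr towards end_lr (= base_lr)
        return start_lr + (base_lr - start_lr) * step / warmup_steps
    # Staircase exponential decay evaluated at the global step, so the lr
    # steps down to ~0.0074 right after warmup rather than resuming at 0.01
    return base_lr * decay_rate ** (step // nums_per_epoch)

n = 600000 // 512  # nums_per_epoch for the settings above (1171)
print(siamrpn_lr(0, n))      # 0.005 at the first iteration
print(siamrpn_lr(5 * n, n))  # ~0.0074: the decay branch takes over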
def run(settings):
    # Most common settings are assigned in the settings struct
    settings.base_model = ''
    settings.description = 'SiamMask_sharp with ResNet-50 backbone.'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.samples_per_epoch = 600000  # Number of training pairs per epoch
    settings.num_workers = 8  # Number of workers for image loading
    settings.search_area_factor = {'train': 1.0, 'test': 143. / 127.}
    settings.output_sz = {'train': 127, 'test': 143}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # Settings for the image sample and label generation
    settings.center_jitter_factor = {'train': 0.2, 'test': 0.4}
    settings.scale_jitter_factor = {'train': 0.05, 'test': 0.18}
    settings.label_params = {
        'search_size': 143,
        'output_size': 3,
        'anchor_stride': 8,
        'anchor_ratios': [0.33, 0.5, 1, 2, 3],
        'anchor_scales': [8],
        'num_pos': 16,
        'num_neg': 16,
        'num_total': 64,
        'thr_high': 0.6,
        'thr_low': 0.3
    }
    settings.loss_weights = {'cls': 0., 'loc': 0., 'mask': 1}
    settings.neg = 0

    # Train datasets
    vos_train = YoutubeVOS()
    coco_train = MSCOCOSeq()

    # Validation datasets
    vos_val = vos_train

    # The joint augmentation transform that is applied to the pair jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_exemplar = dltransforms.Transpose()
    transform_instance = dltransforms.Compose([
        dltransforms.Color(probability=1.0),
        dltransforms.Blur(probability=0.18),
        dltransforms.Transpose()
    ])
    transform_instance_mask = dltransforms.Transpose()

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        test_mask_transform=transform_instance_mask,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        transform=transform_exemplar,
        joint_transform=transform_joint)

    nums_per_epoch = settings.samples_per_epoch // settings.batch_size

    # The sampler for training
    dataset_train = sampler.MaskSampler(
        [coco_train, vos_train], [1, 1],
        samples_per_epoch=nums_per_epoch * settings.batch_size,
        max_gap=100,
        processing=data_processing_train,
        neg=settings.neg)

    # The loader for training
    train_loader = loader.LTRLoader(
        'train',
        dataset_train,
        training=True,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # The sampler for validation
    dataset_val = sampler.MaskSampler(
        [vos_val], [1],
        samples_per_epoch=100 * settings.batch_size,
        max_gap=100,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader(
        'val',
        dataset_val,
        training=False,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # Create network, set objective, create optimizer, learning rate scheduler, trainer
    with dygraph.guard():
        # Weighted sum of the per-branch losses, using settings.loss_weights
        def scale_loss(loss):
            total_loss = 0
            for k in settings.loss_weights:
                total_loss += loss[k] * settings.loss_weights[k]
            return total_loss

        # Create network
        net = SiamMask_ResNet50_sharp(scale_loss=scale_loss)

        # Load parameters from the base model
        if settings.base_model == '':
            raise Exception(
                'The base_model path is not set. Check settings.base_model in '
                '"ltr/train_settings/siammask/siammask_res50_sharp.py".')
        para_dict, _ = fluid.load_dygraph(settings.base_model)
        model_dict = net.state_dict()
        for key in model_dict.keys():
            if key in para_dict.keys():
                model_dict[key] = para_dict[key]
        net.set_dict(model_dict)

        # Define objective
        objective = {
            'cls': select_softmax_with_cross_entropy_loss,
            'loc': weight_l1_loss,
            'mask': select_mask_logistic_loss
        }

        # Create actor, which wraps network and objective
        actor = actors.SiamActor(net=net, objective=objective)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        decayed_lr = fluid.layers.exponential_decay(
            learning_rate=0.0005,
            decay_steps=nums_per_epoch,
            decay_rate=0.9,
            staircase=True)
        lr_scheduler = LinearLrWarmup(
            learning_rate=decayed_lr,
            warmup_steps=5 * nums_per_epoch,
            start_lr=0.0001,
            end_lr=0.0005)
        # Only the mask and refine heads are fine-tuned
        optimizer = fluid.optimizer.Adam(
            parameter_list=net.mask_head.parameters() +
            net.refine_head.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(20, load_latest=False, fail_safe=False)
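
# Illustration only, not part of the training script: how scale_loss combines
# the per-branch losses under the weights above. With
# {'cls': 0., 'loc': 0., 'mask': 1} only the mask branch contributes, which
# matches the fine-tuning setup (only mask_head and refine_head are
# optimized). The loss values below are hypothetical.
loss_weights = {'cls': 0., 'loc': 0., 'mask': 1}
loss = {'cls': 0.61, 'loc': 1.35, 'mask': 0.42}
total = sum(loss[k] * loss_weights[k] for k in loss_weights)
print(total)  # 0.42: the cls and loc terms are zeroed out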