def run(settings):
    """Set up and launch training of the ATOM IoUNet with the default configuration.

    Fills in the hyper-parameters on ``settings``, builds the training and
    validation pipelines (datasets -> processing -> sampler -> loader), creates
    the network, actor, optimizer and learning-rate scheduler, and finally
    hands everything to ``LTRTrainer``.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``lasot_dir``, ``trackingnet_dir`` and
            ``coco_dir``.
    """
    # --- Hyper-parameters stored on the settings struct ---
    settings.description = 'ATOM IoUNet with default settings according to the paper.'
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    env = settings.env

    # --- Datasets: three training sources, one TrackingNet split for validation ---
    lasot = Lasot(env.lasot_dir, split='train')
    trackingnet = TrackingNet(env.trackingnet_dir, set_ids=list(range(11)))
    coco = MSCOCOSeq(env.coco_dir)
    trackingnet_heldout = TrackingNet(env.trackingnet_dir, set_ids=[11])

    # --- Augmentation ---
    # The joint transform is applied to both images of a pair together.
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    # The per-image transforms differ only in the colour jitter used for training.
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )

    # --- Pair processing (identical for train and val apart from the transform) ---
    box_proposals = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=box_proposals,
        joint_transform=joint_tfm,
    )
    train_processing = processing.ATOMProcessing(transform=train_tfm, **processing_kwargs)
    val_processing = processing.ATOMProcessing(transform=val_tfm, **processing_kwargs)

    # --- Samplers and loaders ---
    loader_kwargs = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    train_set = sampler.ATOMSampler(
        [lasot, trackingnet, coco],
        [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=train_processing,
    )
    train_loader = LTRLoader('train', train_set, training=True, shuffle=True, **loader_kwargs)

    val_set = sampler.ATOMSampler(
        [trackingnet_heldout],
        [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=val_processing,
    )
    val_loader = LTRLoader('val', val_set, training=False, shuffle=False,
                           epoch_interval=5, **loader_kwargs)

    # --- Network and actor ---
    actor = actors.AtomActor(
        net=atom_models.atom_resnet18(backbone_pretrained=True),
        objective=nn.MSELoss(),
    )

    # --- Optimisation: only the bounding-box regressor parameters are handed to Adam ---
    adam = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    step_schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    # --- Training (set fail_safe=False if you are debugging) ---
    trainer = LTRTrainer(actor, [train_loader, val_loader], adam, settings, step_schedule)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch the default PrDiMP (ResNet50 backbone) training run.

    Writes the hyper-parameters onto ``settings``, assembles the train and
    validation data pipelines, builds the ``klcedimpnet50`` network with its
    KL-regression objectives and starts ``LTRTrainer``.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``lasot_dir``, ``got10k_dir``,
            ``trackingnet_dir`` and ``coco_dir``.
    """
    # --- Hyper-parameters ---
    settings.description = 'Default train settings for PrDiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1/4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    settings.print_stats = ['Loss/total', 'Loss/bb_ce', 'ClfTrain/clf_ce']

    env = settings.env

    # --- Datasets ---
    lasot = Lasot(env.lasot_dir, split='train')
    got10k = Got10k(env.got10k_dir, split='vottrain')
    trackingnet = TrackingNet(env.trackingnet_dir, set_ids=list(range(4)))
    coco = MSCOCOSeq(env.coco_dir)
    got10k_heldout = Got10k(env.got10k_dir, split='votval')

    # --- Augmentation ---
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )

    # --- Pair processing ---
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    box_proposals = {
        'boxes_per_frame': 128,
        'gt_sigma': (0.05, 0.05),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }
    # The density label uses the same parameters plus the 'normalize' flag.
    label_density_params = dict(label_params, normalize=True)

    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=box_proposals,
        label_function_params=label_params,
        label_density_params=label_density_params,
        joint_transform=joint_tfm,
    )
    train_processing = processing.KLDiMPProcessing(transform=train_tfm, **processing_kwargs)
    val_processing = processing.KLDiMPProcessing(transform=val_tfm, **processing_kwargs)

    # --- Samplers and loaders ---
    sampler_kwargs = dict(max_gap=200, num_test_frames=3, num_train_frames=3)
    loader_kwargs = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    train_set = sampler.DiMPSampler(
        [lasot, got10k, trackingnet, coco],
        [0.25, 1, 1, 1],
        samples_per_epoch=26000,
        processing=train_processing,
        **sampler_kwargs,
    )
    train_loader = LTRLoader('train', train_set, training=True, shuffle=True, **loader_kwargs)

    val_set = sampler.DiMPSampler(
        [got10k_heldout],
        [1],
        samples_per_epoch=5000,
        processing=val_processing,
        **sampler_kwargs,
    )
    val_loader = LTRLoader('val', val_set, training=False, shuffle=False,
                           epoch_interval=5, **loader_kwargs)

    # --- Network ---
    net = dimpnet.klcedimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=1.0,
        optim_init_reg=0.05,
        optim_min_reg=0.05,
        gauss_sigma=output_sigma * settings.feature_sz,
        alpha_eps=0.05,
        normalize_label=True,
        init_initializer='zero',
    )

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # --- Losses and actor ---
    losses = {
        'bb_ce': klreg_losses.KLRegression(),
        'clf_ce': klreg_losses.KLRegressionGrid(),
    }
    weights = {'bb_ce': 0.0025, 'clf_ce': 0.25, 'clf_ce_init': 0.25, 'clf_ce_iter': 1.0}
    actor = tracking_actors.KLDiMPActor(net=net, objective=losses, loss_weight=weights)

    # --- Optimisation: one parameter group per sub-module, each with its own learning rate ---
    model = actor.net
    param_groups = [
        {'params': model.classifier.parameters(), 'lr': 1e-3},
        {'params': model.bb_regressor.parameters(), 'lr': 1e-3},
        {'params': model.feature_extractor.parameters(), 'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=2e-4)
    step_schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], adam, settings, step_schedule)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch DiMP (ResNet50 backbone) training on depth datasets.

    Writes the hyper-parameters onto ``settings``, builds the depth-based
    training data (COCO-depth and LaSOT-depth) and the DepthTrack validation
    data, creates the ``dimpnet50`` network and starts ``LTRTrainer``.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``cocodepth_dir``, ``lasotdepth_dir``,
            ``lasot_dir`` and ``depthtrack_dir``.
    """
    # --- Hyper-parameters ---
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05

    # Depth input encodings listed by the original author:
    #   1) raw_depth X            2) norm_depth
    #   3) centered_norm_depth    4) centered_raw_depth X
    #   5) colormap               6) centered_colormap
    # This configuration uses 'hha'.
    depth_inputs = 'hha'

    env = settings.env

    # --- Datasets ---
    coco_depth = MSCOCOSeq_depth(env.cocodepth_dir, dtype=depth_inputs)
    lasot_depth = Lasot_depth(root=env.lasotdepth_dir, rgb_root=env.lasot_dir, dtype=depth_inputs)
    depthtrack_heldout = DepthTrack(root=env.depthtrack_dir, split='val', dtype=depth_inputs)

    # --- Augmentation ---
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )

    # --- Pair processing ---
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    box_proposals = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=box_proposals,
        label_function_params=label_params,
        joint_transform=joint_tfm,
    )
    train_processing = processing.DiMPProcessing(transform=train_tfm, **processing_kwargs)
    val_processing = processing.DiMPProcessing(transform=val_tfm, **processing_kwargs)

    # --- Samplers and loaders ---
    sampler_kwargs = dict(max_gap=30, num_test_frames=3, num_train_frames=3)
    loader_kwargs = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    train_set = sampler.DiMPSampler(
        [coco_depth, lasot_depth],
        [1, 1],
        samples_per_epoch=26000,
        processing=train_processing,
        **sampler_kwargs,
    )
    train_loader = LTRLoader('train', train_set, training=True, shuffle=True, **loader_kwargs)

    val_set = sampler.DiMPSampler(
        [depthtrack_heldout],
        [1],
        samples_per_epoch=5000,
        processing=val_processing,
        **sampler_kwargs,
    )
    val_loader = LTRLoader('val', val_set, training=False, shuffle=False,
                           epoch_interval=5, **loader_kwargs)

    # --- Network (the backbone is created with backbone_pretrained=True) ---
    net = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
    )

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # --- Losses and actor ---
    losses = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    weights = {'iou': 1, 'test_clf': 100, 'test_init_clf': 100, 'test_iter_clf': 400}
    actor = actors.DiMPActor(net=net, objective=losses, loss_weight=weights)

    # --- Optimisation ---
    model = actor.net
    classifier = model.classifier
    param_groups = [
        {'params': classifier.filter_initializer.parameters(), 'lr': 5e-5},
        {'params': classifier.filter_optimizer.parameters(), 'lr': 5e-4},
        {'params': classifier.feature_extractor.parameters(), 'lr': 5e-5},
        # No explicit 'lr' here: this group uses the optimizer-wide default below.
        {'params': model.bb_regressor.parameters()},
        {'params': model.feature_extractor.parameters(), 'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=2e-4)
    step_schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], adam, settings, step_schedule)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch training of the KYS predictor on top of a pre-trained DiMP.

    Writes the hyper-parameters onto ``settings``, builds sequence-based train
    and validation pipelines, loads ``dimp50.pth`` from
    ``settings.env.pretrained_networks``, copies its backbone, classifier and
    bounding-box regressor weights into a ``kysnet_res50`` network, freezes
    those three sub-modules, and trains only ``net.predictor``.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``lasot_dir``, ``got10k_dir``,
            ``trackingnet_dir`` and ``pretrained_networks``.
    """
    # --- Hyper-parameters ---
    settings.move_data_to_gpu = False
    settings.description = ''
    settings.batch_size = 10
    settings.test_sequence_length = 50
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_param = {
        'train_mode': 'uniform',
        'train_factor': 3.0,
        'train_limit_motion': False,
        'test_mode': 'uniform',
        'test_factor': 4.5,
        'test_limit_motion': True,
    }
    settings.scale_jitter_param = {'train_factor': 0.25, 'test_factor': 0.3}
    settings.hinge_threshold = 0.05
    settings.print_stats = [
        'Loss/total',
        'Loss/raw/test_clf',
        'Loss/raw/test_clf_acc',
        'Loss/raw/dimp_clf_acc',
        'Loss/raw/is_target',
        'Loss/raw/is_target_after_prop',
        'Loss/raw/test_seq_acc',
        'Loss/raw/dimp_seq_acc',
    ]

    env = settings.env

    # --- Datasets ---
    lasot = Lasot(env.lasot_dir, split='train')
    got10k = Got10k(env.got10k_dir, split='vottrain')
    trackingnet = TrackingNet(env.trackingnet_dir, set_ids=[0, 1, 2, 3, 4])
    got10k_heldout = Got10k(env.got10k_dir, split='votval')

    # --- Augmentation ---
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )

    # --- Sequence processing (no box proposals are generated for this setup) ---
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
        'end_pad_if_even': True,
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_param=settings.center_jitter_param,
        scale_jitter_param=settings.scale_jitter_param,
        proposal_params=None,
        label_function_params=label_params,
        joint_transform=joint_tfm,
        min_crop_inside_ratio=0.1,
    )
    train_processing = processing.KYSProcessing(transform=train_tfm, **processing_kwargs)
    val_processing = processing.KYSProcessing(transform=val_tfm, **processing_kwargs)

    # --- Samplers and loaders ---
    sequence_sample_info = {
        'num_train_frames': 3,
        'num_test_frames': settings.test_sequence_length,
        'max_train_gap': 30,
        'allow_missing_target': True,
        'min_fraction_valid_frames': 0.5,
        'mode': 'Sequence',
    }
    loader_kwargs = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    train_set = sampler.KYSSampler(
        [got10k, trackingnet, lasot],
        [0.3, 0.3, 0.25],
        samples_per_epoch=settings.batch_size * 150,
        sequence_sample_info=sequence_sample_info,
        processing=train_processing,
        sample_occluded_sequences=True,
    )
    train_loader = LTRLoader('train', train_set, training=True, shuffle=True, **loader_kwargs)

    val_set = sampler.KYSSampler(
        [got10k_heldout],
        [1],
        samples_per_epoch=1000,
        sequence_sample_info=sequence_sample_info,
        processing=val_processing,
        sample_occluded_sequences=True,
    )
    val_loader = LTRLoader('val', val_set, training=False, shuffle=False,
                           epoch_interval=5, **loader_kwargs)

    # --- Pre-trained base DiMP network ---
    dimp_checkpoint = os.path.join(env.pretrained_networks, 'dimp50.pth')
    base_net, _ = network_loading.load_network(checkpoint=dimp_checkpoint)

    # --- KYS network ---
    net = kysnet_models.kysnet_res50(
        optim_iter=3,
        cv_kernel_size=3,
        cv_max_displacement=9,
        cv_stride=1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        train_feature_extractor=False,
        train_iounet=False,
        detach_length=0,
        state_dim=8,
        representation_predictor_dims=(16, ),
        conf_measure='entropy',
        dimp_thresh=0.05,
    )

    # Move the pre-trained DiMP weights into the matching KYS sub-modules.
    net.backbone_feature_extractor.load_state_dict(base_net.feature_extractor.state_dict())
    net.dimp_classifier.load_state_dict(base_net.classifier.state_dict())
    net.bb_regressor.load_state_dict(base_net.bb_regressor.state_dict())

    # To be safe, explicitly switch off gradients for everything copied from DiMP.
    for frozen in (net.backbone_feature_extractor, net.dimp_classifier, net.bb_regressor):
        for param in frozen.parameters():
            param.requires_grad_(False)

    # --- Losses and actor ---
    hinge = settings.hinge_threshold
    losses = {
        'test_clf': ltr_losses.LBHingev2(threshold=hinge, return_per_sequence=False),
        'dimp_clf': ltr_losses.LBHingev2(threshold=hinge, return_per_sequence=False),
        'is_target': ltr_losses.IsTargetCellLoss(return_per_sequence=False),
        'clf_acc': ltr_losses.TrackingClassificationAccuracy(threshold=0.25),
    }
    weights = {
        'test_clf': 1.0 * 500,
        'test_clf_orig': 50,
        'is_target': 0.1 * 500,
        'is_target_after_prop': 0.1 * 500,
    }
    score_jitter = DiMPScoreJittering(
        distractor_ratio=0.1,
        p_distractor=0.3,
        max_distractor_enhance_factor=1.3,
        min_distractor_enhance_factor=0.8,
    )
    actor = actors.KYSActor(net=net, objective=losses, loss_weight=weights,
                            dimp_jitter_fn=score_jitter)

    # --- Optimisation: only the predictor parameters are handed to Adam ---
    adam = optim.Adam([{'params': actor.net.predictor.parameters(), 'lr': 1e-2}], lr=1e-2)
    step_schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], adam, settings, step_schedule)
    trainer.train(40, load_latest=True, fail_safe=True)
def run(settings):
    """Distil an ATOM ResNet18 teacher into a MobileNet-small student in two stages.

    Stage 1 trains only the student's feature extractor (the bounding-box
    regressor has ``requires_grad`` switched off). Stage 2 re-enables the
    regressor, freezes the feature extractor, swaps the loss weights,
    optimizer and scheduler on the existing trainer, and calls
    ``trainer.train`` again.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``lasot_dir``, ``trackingnet_dir`` and
            ``coco_dir``. If ``settings.teacher_path`` is set it is used as the
            teacher checkpoint location; otherwise the original hard-coded
            path is used, so existing callers behave as before.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'distilled ATOM IoUNet with default settings according to the paper.'
    settings.batch_size = 32
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir, set_ids=list(range(11)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    trackingnet_val = TrackingNet(settings.env.trackingnet_dir, set_ids=list(range(11, 12)))

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std))

    # The augmentation transform applied to the validation set (individually to each image in the pair)
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std))

    # Data processing to do on the training pairs
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }
    data_processing_train = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler(
        [lasot_train, trackingnet_train, coco_train], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)

    # The loader for training
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler([trackingnet_val], [1],
                                      samples_per_epoch=500 * settings.batch_size,
                                      max_gap=50,
                                      processing=data_processing_val)

    # The loader for validation
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Load teacher network.
    # The checkpoint location used to be a fixed absolute path inside one
    # developer's home directory; allow overriding it via settings.teacher_path
    # while keeping that path as the fallback for backward compatibility.
    teacher_net = atom_models.atom_resnet18(backbone_pretrained=True)
    teacher_path = getattr(
        settings, 'teacher_path',
        '/home/ddanier/CFKD/pytracking/networks/atom_default.pth')
    teacher_net = loading.load_weights(teacher_net, teacher_path, strict=True)
    print(
        '*******************Teacher net loaded successfully*******************'
    )

    # Create student network and actor
    student_net = atom_models.atom_mobilenetsmall(backbone_pretrained=False)

    ##########################################################
    ### Distil backbone first, turn off grad for regressor ###
    ##########################################################
    for p in student_net.bb_regressor.parameters():
        p.requires_grad_(False)

    objective = distillation.CFKDLoss(
        reg_loss=nn.MSELoss(),
        w_ts=0.,
        w_ah=0.,
        w_cf=0.01,
        w_fd=100.,
        cf_layers=['conv1', 'layer1', 'layer2', 'layer3'])
    actor = actors.AtomCompressionActor(student_net, teacher_net, objective)

    # Optimizer
    optimizer = optim.Adam(actor.student_net.feature_extractor.parameters(), lr=1e-2)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

    # Create trainer
    trainer = LTRDistillationTrainer(actor, [loader_train, loader_val],
                                     optimizer, settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(50, load_latest=False, fail_safe=True)

    ########################################################
    ## Distil regressor next, turn off grad for backbone ###
    ########################################################
    for p in trainer.actor.student_net.bb_regressor.parameters():
        p.requires_grad_(True)
    for p in trainer.actor.student_net.feature_extractor.parameters():
        p.requires_grad_(False)

    objective = distillation.CFKDLoss(reg_loss=nn.MSELoss(),
                                      w_ts=1.,
                                      w_ah=0.1,
                                      w_cf=0.,
                                      w_fd=0.)
    trainer.actor.objective = objective

    # Optimizer: a fresh Adam/StepLR pair over the regressor replaces the stage-1 pair
    trainer.optimizer = optim.Adam(
        trainer.actor.student_net.bb_regressor.parameters(), lr=1e-2)
    trainer.lr_scheduler = optim.lr_scheduler.StepLR(trainer.optimizer,
                                                     step_size=15,
                                                     gamma=0.1)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(100, load_latest=False, fail_safe=True)
def run(settings):
    """Set up and launch training of the transformer-assisted DiMP tracker.

    Writes the hyper-parameters onto ``settings``, builds the train and
    validation pipelines, creates a ``dimpnet50`` network with
    ``frozen_backbone_layers`` set to conv1/bn1/layer1/layer2, optionally
    wraps it in ``MultiGPU`` and starts ``LTRTrainer``.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``lasot_dir``, ``got10k_dir``,
            ``trackingnet_dir`` and ``coco_dir``.
    """
    # --- Hyper-parameters ---
    settings.description = 'Transformer-assisted tracker. Our baseline approach is SuperDiMP'
    settings.batch_size = 40
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 6.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 22
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 5.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05

    env = settings.env

    # --- Datasets ---
    lasot = Lasot(env.lasot_dir, split='train')
    got10k = Got10k(env.got10k_dir, split='vottrain')
    trackingnet = TrackingNet(env.trackingnet_dir, set_ids=list(range(4)))
    coco = MSCOCOSeq(env.coco_dir)
    got10k_heldout = Got10k(env.got10k_dir, split='votval')

    # --- Augmentation ---
    joint_tfm = tfm.Transform(
        tfm.ToGrayscale(probability=0.05),
        tfm.RandomHorizontalFlip(probability=0.5),
    )
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.RandomHorizontalFlip(probability=0.5),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )

    # --- Pair processing ---
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    box_proposals = {
        'boxes_per_frame': 128,
        'gt_sigma': (0.05, 0.05),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }
    # Same contents as label_params, but kept as a separate dict object.
    label_density_params = dict(label_params)

    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        crop_type='inside_major',
        max_scale_change=1.5,
        mode='sequence',
        proposal_params=box_proposals,
        label_function_params=label_params,
        label_density_params=label_density_params,
        joint_transform=joint_tfm,
    )
    train_processing = processing.KLDiMPProcessing(transform=train_tfm, **processing_kwargs)
    val_processing = processing.KLDiMPProcessing(transform=val_tfm, **processing_kwargs)

    # --- Samplers and loaders ---
    sampler_kwargs = dict(max_gap=500, num_test_frames=3, num_train_frames=3)
    loader_kwargs = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    train_set = sampler.DiMPSampler(
        [lasot, got10k, trackingnet, coco],
        [1, 1, 1, 1],
        samples_per_epoch=50000,
        processing=train_processing,
        **sampler_kwargs,
    )
    train_loader = LTRLoader('train', train_set, training=True, shuffle=True, **loader_kwargs)

    val_set = sampler.DiMPSampler(
        [got10k_heldout],
        [1],
        samples_per_epoch=10000,
        processing=val_processing,
        **sampler_kwargs,
    )
    val_loader = LTRLoader('val', val_set, training=False, shuffle=False,
                           epoch_interval=5, **loader_kwargs)

    # --- Network ---
    net = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
        frozen_backbone_layers=['conv1', 'bn1', 'layer1', 'layer2'],
    )

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # --- Losses and actor ---
    losses = {
        'bb_ce': klreg_losses.KLRegression(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    weights = {'bb_ce': 0.01, 'test_clf': 100, 'test_init_clf': 100, 'test_iter_clf': 400}
    actor = tracking_actors.KLDiMPActor(net=net, objective=losses, loss_weight=weights)

    # --- Optimisation: one parameter group per sub-module, each with its own learning rate ---
    model = actor.net
    classifier = model.classifier
    param_groups = [
        {'params': classifier.filter_initializer.parameters(), 'lr': 5e-5},
        {'params': classifier.filter_optimizer.parameters(), 'lr': 5e-4},
        {'params': classifier.feature_extractor.parameters(), 'lr': 5e-5},
        {'params': classifier.transformer.parameters(), 'lr': 1e-3},
        {'params': model.bb_regressor.parameters(), 'lr': 1e-3},
        {'params': model.feature_extractor.layer3.parameters(), 'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=2e-4)
    step_schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], adam, settings, step_schedule)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch training of the ATOM DeT (RGB + depth colormap) IoUNet.

    Writes the hyper-parameters onto ``settings``, builds the training data
    from LaSOT-depth, three DepthTrack roots and COCO-depth (all loaded with
    ``dtype='rgbcolormap'``), validates on CDTB, creates the
    ``atom_resnet18_DeT`` network with ``merge_type='max'`` and starts
    ``LTRTrainer``.

    Args:
        settings: Mutable settings object. It is modified in place and must
            expose ``settings.env`` with ``cocodepth_dir``, ``lasotdepth_dir``,
            ``depthtrack_dir``, ``depthtrack_horizontal_dir``,
            ``depthtrack_vertical_dir`` and ``cdtb_dir``.
    """
    # --- Hyper-parameters stored on the settings struct ---
    settings.description = 'ATOM IoUNet with default settings, but additionally using GOT10k for training.'
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    env = settings.env
    depth_dtype = 'rgbcolormap'

    # --- Training datasets ---
    coco_depth = MSCOCOSeq_depth(env.cocodepth_dir, dtype=depth_dtype)
    lasot_depth = Lasot_depth(root=env.lasotdepth_dir, dtype=depth_dtype)
    depthtrack = DepthTrack(root=env.depthtrack_dir, dtype=depth_dtype)
    depthtrack_horizontal = DepthTrack(root=env.depthtrack_horizontal_dir, dtype=depth_dtype)
    depthtrack_vertical = DepthTrack(root=env.depthtrack_vertical_dir, dtype=depth_dtype)

    # --- Validation dataset ---
    cdtb_heldout = CDTB(env.cdtb_dir, split='val', dtype=depth_dtype)

    # --- Augmentation ---
    # The joint transform is applied to both images of a pair together.
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    # The per-image transforms differ only in the colour jitter used for training.
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std),
    )

    # --- Pair processing (identical for train and val apart from the transform) ---
    box_proposals = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=box_proposals,
        joint_transform=joint_tfm,
    )
    train_processing = processing.ATOMProcessing(transform=train_tfm, **processing_kwargs)
    val_processing = processing.ATOMProcessing(transform=val_tfm, **processing_kwargs)

    # --- Samplers and loaders ---
    loader_kwargs = dict(
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        drop_last=True,
        stack_dim=1,
    )

    train_sources = [
        lasot_depth,
        depthtrack,
        depthtrack_horizontal,
        depthtrack_vertical,
        coco_depth,
    ]
    train_set = sampler.ATOMSampler(
        train_sources,
        [1, 1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=train_processing,
    )
    train_loader = LTRLoader('train', train_set, training=True, shuffle=True, **loader_kwargs)

    val_set = sampler.ATOMSampler(
        [cdtb_heldout],
        [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=val_processing,
    )
    val_loader = LTRLoader('val', val_set, training=False, shuffle=False,
                           epoch_interval=5, **loader_kwargs)

    # --- Network and actor ---
    # Other merge types named by the original author: 'mean', 'conv', 'weightedSum'.
    net = atom_models.atom_resnet18_DeT(backbone_pretrained=True, merge_type='max')
    actor = actors.AtomActor(net=net, objective=nn.MSELoss())

    # --- Optimisation ---
    model = actor.net
    param_groups = [
        # No explicit 'lr' here: this group uses the optimizer-wide default below.
        {'params': model.bb_regressor.parameters()},
        {'params': model.feature_extractor.parameters(), 'lr': 2e-5},
        {'params': model.feature_extractor_depth.parameters(), 'lr': 2e-5},
    ]
    adam = optim.Adam(param_groups, lr=1e-3)
    step_schedule = optim.lr_scheduler.StepLR(adam, step_size=15, gamma=0.2)

    # --- Training (set fail_safe=False if you are debugging) ---
    trainer = LTRTrainer(actor, [train_loader, val_loader], adam, settings, step_schedule)
    trainer.train(80, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and start training of the ATOM bounding-box regressor with the ML regression objective.

    The function fills in the common fields of ``settings``, builds the training
    and validation datasets, transforms, processing objects, samplers and
    loaders, creates the network/actor/optimizer/scheduler, and finally hands
    everything to ``LTRTrainer`` and calls ``train``.

    Args:
        settings: Settings object that is mutated in place. Its ``env``
            attribute must provide ``lasot_dir``, ``got10k_dir``,
            ``trackingnet_dir`` and ``coco_dir``.
    """
    # Most common settings are assigned in the settings struct.
    # NOTE: the trailing space after 'bounding-box' matters; the two literals
    # are concatenated and would otherwise read 'bounding-boxregression'.
    settings.description = (
        'ATOM using the probabilistic maximum likelihood trained regression model for bounding-box '
        'regression presented in [https://arxiv.org/abs/1909.12297].'
    )
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir, set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std))

    # The augmentation transform applied to the validation set (individually to each image in the pair)
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean, std=settings.normalize_std))

    # Proposal settings shared by the training and validation processing
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0, 0),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
        'add_mean_box': True
    }

    # Data processing to do on the training pairs
    data_processing_train = processing.KLBBregProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.KLBBregProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training (all four datasets are given the same weight of 1)
    dataset_train = sampler.ATOMSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=200,
        processing=data_processing_train)

    # The loader for training
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler([got10k_val], [1],
                                      samples_per_epoch=500 * settings.batch_size,
                                      max_gap=200,
                                      processing=data_processing_val)

    # The loader for validation
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = atom_models.atom_resnet18(backbone_pretrained=True)
    objective = klreg_losses.MLRegression()
    actor = bbreg_actors.AtomBBKLActor(net=net, objective=objective)

    # Optimizer: only the parameters of the bounding-box regressor are passed to Adam
    optimizer = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # Create trainer
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer, settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(50, load_latest=True, fail_safe=True)