def run(settings):
    """Configure and launch training of the full LWL network.

    Populates ``settings`` in place with the hyper-parameters of this run,
    builds the YouTube-VOS / DAVIS data pipeline, creates the network from
    ``lwl_networks.steepest_descent_resnet50``, initialises it from the
    ``lwl_stage1`` checkpoint and trains it for 80 epochs.

    Args:
        settings: Mutable training settings object. It must already expose
            ``settings.env.workspace_dir``; every other field used here is
            assigned by this function.
    """
    settings.description = 'Default train settings for training full network'
    settings.batch_size = 20
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [102.9801, 115.9465, 122.7717]
    settings.normalize_std = [1.0, 1.0, 1.0]

    settings.feature_sz = (52, 30)

    # Crop-generation settings, forwarded to processing.LWLProcessing below.
    # The input image crop is 16x the feature size along each dimension.
    settings.output_sz = tuple(side * 16 for side in settings.feature_sz)
    settings.search_area_factor = 5.0
    settings.crop_type = 'inside_major'
    settings.max_scale_change = None

    settings.center_jitter_factor = {'train': 3, 'test': (5.5, 4.5)}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}

    # Datasets
    ytvos_train = YouTubeVOS(version="2019", multiobj=False, split='jjtrain')
    davis_train = Davis(version='2017', multiobj=False, split='train')
    ytvos_val = YouTubeVOS(version="2019", multiobj=False, split='jjvalid')

    # Transform handed to the processing as `joint_transform`
    transform_joint = tfm.Transform(
        tfm.ToBGR(),
        tfm.ToGrayscale(probability=0.05),
        tfm.RandomHorizontalFlip(probability=0.5),
    )

    # Training transform: affine augmentation + jitter + normalisation
    train_affine = tfm.RandomAffine(p_flip=0.0,
                                    max_rotation=15.0,
                                    max_shear=0.0,
                                    max_ar_factor=0.0,
                                    max_scale=0.2,
                                    pad_amount=0)
    transform_train = tfm.Transform(
        train_affine,
        tfm.ToTensorAndJitter(0.2, normalize=False),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std),
    )

    # Validation transform: tensor conversion (jitter 0.0) + normalisation
    transform_val = tfm.Transform(
        tfm.ToTensorAndJitter(0.0, normalize=False),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std),
    )

    # Train and validation processing differ only in `transform`.
    processing_common = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        crop_type=settings.crop_type,
        max_scale_change=settings.max_scale_change,
        joint_transform=transform_joint,
        new_roll=True,
    )
    data_processing_train = processing.LWLProcessing(
        transform=transform_train, **processing_common)
    data_processing_val = processing.LWLProcessing(
        transform=transform_val, **processing_common)

    # Samplers: frame-selection arguments are shared by train and val.
    frame_sampling = dict(max_gap=100, num_test_frames=3, num_train_frames=1)
    dataset_train = sampler.LWLSampler(
        [ytvos_train, davis_train], [6, 1],
        samples_per_epoch=settings.batch_size * 1000,
        processing=data_processing_train,
        **frame_sampling)
    dataset_val = sampler.LWLSampler(
        [ytvos_val], [1],
        samples_per_epoch=settings.batch_size * 100,
        processing=data_processing_val,
        **frame_sampling)

    # Loaders
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             num_workers=settings.num_workers,
                             stack_dim=1,
                             batch_size=settings.batch_size)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           num_workers=settings.num_workers,
                           epoch_interval=5,
                           stack_dim=1,
                           batch_size=settings.batch_size)

    # Network, initialised from the previously trained stage-1 weights
    net = lwl_networks.steepest_descent_resnet50(
        filter_size=3,
        num_filters=16,
        optim_iter=5,
        backbone_pretrained=True,
        out_feature_dim=512,
        frozen_backbone_layers=['conv1', 'bn1', 'layer1'],
        label_encoder_dims=(16, 32, 64),
        use_bn_in_label_enc=False,
        clf_feat_blocks=0,
        final_conv=True,
        backbone_type='mrcnn')

    stage1_net = network_loading.load_trained_network(
        settings.env.workspace_dir,
        'ltr/lwl/lwl_stage1/LWTLNet_ep0070.pth.tar')
    net.load_state_dict(stage1_net.state_dict())

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # Loss function
    objective = {'segm': LovaszSegLoss(per_image=False)}
    loss_weight = {'segm': 100.0}

    actor = segm_actors.LWLActor(net=net,
                                 objective=objective,
                                 loss_weight=loss_weight,
                                 num_refinement_iter=2,
                                 disable_all_bn=True)

    # Optimizer: one parameter group per sub-module, each with its own
    # learning rate.
    model = actor.net
    module_lrs = [
        (model.target_model.filter_initializer, 5e-5),
        (model.target_model.filter_optimizer, 2e-5),
        (model.target_model.feature_extractor, 2e-5),
        (model.decoder, 2e-5),
        (model.label_encoder, 2e-5),
        (model.feature_extractor.layer2, 2e-5),
        (model.feature_extractor.layer3, 2e-5),
        (model.feature_extractor.layer4, 2e-5),
    ]
    param_groups = [{'params': module.parameters(), 'lr': rate}
                    for module, rate in module_lrs]
    optimizer = optim.Adam(param_groups, lr=2e-4)

    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [25, 75],
                                                  gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(80, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch PrDiMP training with a ResNet50 backbone.

    Populates ``settings`` in place, builds the LaSOT / GOT-10k / TrackingNet
    / COCO training pipeline plus a GOT-10k validation pipeline, creates the
    network from ``dimpnet.klcedimpnet50`` and trains it for 50 epochs.

    Args:
        settings: Mutable training settings object. ``settings.env`` must
            provide ``lasot_dir``, ``got10k_dir``, ``trackingnet_dir`` and
            ``coco_dir``.
    """
    settings.description = 'Default train settings for PrDiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1/4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    settings.print_stats = ['Loss/total', 'Loss/bb_ce', 'ClfTrain/clf_ce']

    env = settings.env

    # Train datasets
    lasot_train = Lasot(env.lasot_dir, split='train')
    got10k_train = Got10k(env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(env.trackingnet_dir, set_ids=[0, 1, 2, 3])
    coco_train = MSCOCOSeq(env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(env.got10k_dir, split='votval')

    # Data transforms
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Parameters of the tracking-pair processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0.05, 0.05),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }
    # Same as the label parameters, with 'normalize' switched on.
    label_density_params = dict(label_params, normalize=True)

    # Train and validation processing differ only in `transform`.
    processing_common = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        label_density_params=label_density_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing.KLDiMPProcessing(
        transform=transform_train, **processing_common)
    data_processing_val = processing.KLDiMPProcessing(
        transform=transform_val, **processing_common)

    # Frame-selection arguments shared by both samplers
    frame_sampling = dict(max_gap=200, num_test_frames=3, num_train_frames=3)

    # Train sampler and loader
    dataset_train = sampler.DiMPSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [0.25, 1, 1, 1],
        samples_per_epoch=26000,
        processing=data_processing_train,
        **frame_sampling)
    loader_train = LTRLoader('train', dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation sampler and loader
    dataset_val = sampler.DiMPSampler(
        [got10k_val], [1],
        samples_per_epoch=5000,
        processing=data_processing_val,
        **frame_sampling)
    loader_val = LTRLoader('val', dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = dimpnet.klcedimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=1.0,
        optim_init_reg=0.05,
        optim_min_reg=0.05,
        gauss_sigma=output_sigma * settings.feature_sz,
        alpha_eps=0.05,
        normalize_label=True,
        init_initializer='zero')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {
        'bb_ce': klreg_losses.KLRegression(),
        'clf_ce': klreg_losses.KLRegressionGrid(),
    }
    loss_weight = {
        'bb_ce': 0.0025,
        'clf_ce': 0.25,
        'clf_ce_init': 0.25,
        'clf_ce_iter': 1.0,
    }

    actor = tracking_actors.KLDiMPActor(net=net,
                                        objective=objective,
                                        loss_weight=loss_weight)

    # Optimizer: one parameter group per sub-module with its own rate
    model = actor.net
    module_lrs = [
        (model.classifier, 1e-3),
        (model.bb_regressor, 1e-3),
        (model.feature_extractor, 2e-5),
    ]
    optimizer = optim.Adam(
        [{'params': module.parameters(), 'lr': rate}
         for module, rate in module_lrs],
        lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15,
                                             gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch ATOM bounding-box regressor training (ML variant).

    Populates ``settings`` in place, builds the LaSOT / GOT-10k / TrackingNet
    / COCO training pipeline and a GOT-10k validation pipeline, creates an
    ``atom_resnet18`` network with the ``MLRegression`` objective and trains
    only the ``bb_regressor`` parameters for 50 epochs.

    Args:
        settings: Mutable training settings object. ``settings.env`` must
            provide ``lasot_dir``, ``got10k_dir``, ``trackingnet_dir`` and
            ``coco_dir``.
    """
    # Most common settings are assigned in the settings struct.
    # The two literals are concatenated by the parser, so the first one must
    # end with a space; otherwise the text reads "bounding-boxregression".
    settings.description = 'ATOM using the probabilistic maximum likelihood trained regression model for bounding-box ' \
                           'regression presented in [https://arxiv.org/abs/1909.12297].'
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied to the training set (individually to
    # each image in the pair)
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # The augmentation transform applied to the validation set (individually
    # to each image in the pair)
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Proposal settings shared by the training and validation processing
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0, 0),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)],
        'add_mean_box': True
    }

    # Data processing to do on the training pairs
    data_processing_train = processing.KLBBregProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.KLBBregProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler(
        [lasot_train, got10k_train, trackingnet_train, coco_train],
        [1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=200,
        processing=data_processing_train)

    # The loader for training
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler(
        [got10k_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=200,
        processing=data_processing_val)

    # The loader for validation (run every 5th epoch via epoch_interval)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = atom_models.atom_resnet18(backbone_pretrained=True)
    objective = klreg_losses.MLRegression()
    actor = bbreg_actors.AtomBBKLActor(net=net, objective=objective)

    # Optimizer: only the bounding-box regressor parameters are optimised
    optimizer = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15,
                                             gamma=0.2)

    # Create trainer
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch SiamRPN (AlexNet backbone) training.

    Populates ``settings`` in place, builds the training and validation data
    pipelines, and then, inside a ``dygraph.guard()`` context, creates the
    network, objective, actor, learning-rate schedule, optimizer and trainer
    and runs training for 50 epochs.

    Args:
        settings: Mutable training settings object; all fields read below are
            assigned by this function.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'SiamRPN with AlexNet backbone.'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 512  # Batch size
    settings.samples_per_epoch = 600000  # Number of training pairs per epoch
    settings.num_workers = 8  # Number of workers for image loading
    settings.search_area_factor = {'train': 1.0, 'test': 2.0}
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # Settings for the image sample and label generation
    settings.center_jitter_factor = {'train': 0.125, 'test': 2.0}
    settings.scale_jitter_factor = {'train': 0.05, 'test': 0.18}
    settings.label_params = {
        'search_size': 255,
        'output_size': 17,
        'anchor_stride': 8,
        'anchor_ratios': [0.33, 0.5, 1, 2, 3],
        'anchor_scales': [8],
        'num_pos': 16,
        'num_neg': 16,
        'num_total': 64,
        'thr_high': 0.6,
        'thr_low': 0.3
    }
    # Per-term weights applied by `scale_loss` below
    settings.loss_weights = {'cls': 1., 'loc': 1.2}
    settings.neg = 0.2

    # Train datasets
    vos_train = YoutubeVOS()
    vid_train = ImagenetVID()
    coco_train = MSCOCOSeq()
    det_train = ImagenetDET()
    #lasot_train = Lasot(split='train')
    #got10k_train = Got10k(split='train')

    # Validation datasets
    vid_val = ImagenetVID()

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # The augmentation transforms applied to the training set (individually to
    # each image in the pair): exemplar, search instance and instance mask
    transform_exemplar = dltransforms.Transpose()
    transform_instance = dltransforms.Compose(
        [
            dltransforms.Color(probability=1.0),
            dltransforms.Blur(probability=0.18),
            dltransforms.Transpose()
        ])
    transform_instance_mask = dltransforms.Transpose()

    # Data processing to do on the training pairs
    data_processing_train = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        test_mask_transform=transform_instance_mask,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    # NOTE(review): unlike the training processing above, this call passes a
    # single `transform=` keyword instead of `train_transform=` /
    # `test_transform=` / `test_mask_transform=` -- confirm that
    # SiamProcessing accepts it.
    data_processing_val = processing.SiamProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        transform=transform_exemplar,
        joint_transform=transform_joint)

    # Number of whole batches per epoch (integer division drops any remainder)
    nums_per_epoch = settings.samples_per_epoch // settings.batch_size

    # The sampler for training
    dataset_train = sampler.MaskSampler(
        [vid_train, coco_train, det_train, vos_train],
        [2, 1, 1, 2],
        samples_per_epoch=nums_per_epoch * settings.batch_size,
        max_gap=100,
        processing=data_processing_train,
        neg=settings.neg)

    # The loader for training
    train_loader = loader.LTRLoader(
        'train',
        dataset_train,
        training=True,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # The sampler for validation
    dataset_val = sampler.MaskSampler(
        [vid_val],
        [1, ],
        samples_per_epoch=100 * settings.batch_size,
        max_gap=100,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader(
        'val',
        dataset_val,
        training=False,
        batch_size=settings.batch_size,
        num_workers=settings.num_workers,
        stack_dim=0)

    # Create network, set objective, create optimizer, learning rate
    # scheduler and trainer
    with dygraph.guard():
        # Create network

        def scale_loss(loss):
            # Weighted sum of the loss terms listed in settings.loss_weights
            total_loss = 0
            for k in settings.loss_weights:
                total_loss += loss[k] * settings.loss_weights[k]
            return total_loss

        net = SiamRPN_AlexNet(scale_loss=scale_loss)

        # Define objective
        objective = {
            'cls': select_softmax_with_cross_entropy_loss,
            'loc': weight_l1_loss,
        }

        # Create actor, which wraps network and objective
        actor = actors.SiamActor(net=net, objective=objective)

        # Define optimizer and learning rate: exponential decay stepped once
        # per `nums_per_epoch` steps, wrapped in a linear warm-up over the
        # first 5 * nums_per_epoch steps
        decayed_lr = fluid.layers.exponential_decay(
            learning_rate=0.01,
            decay_steps=nums_per_epoch,
            decay_rate=0.9407,
            staircase=True)
        lr_scheduler = LinearLrWarmup(
            learning_rate=decayed_lr,
            warmup_steps=5*nums_per_epoch,
            start_lr=0.005,
            end_lr=0.01)
        # Only the parameters of net.rpn_head are handed to the optimizer
        optimizer = fluid.optimizer.Adam(
            parameter_list=net.rpn_head.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer, settings, lr_scheduler)
        trainer.train(50, load_latest=False, fail_safe=False)
def run(settings):
    """Training entry point for the AlphaRefine SEcm module.

    Populates ``settings`` in place, builds a distributed training pipeline
    on the YouTube-VOS mask dataset and a validation pipeline on the LaSOT
    test split, wraps the network in ``DistributedDataParallel`` and trains
    for 40 epochs.

    Args:
        settings: Mutable training settings object. ``settings.local_rank``
            must already be set; ``settings.pretrained`` is optional and,
            when present and not ``None``, is loaded instead of resuming from
            the latest checkpoint.
    """
    # ##### Configure the training parameters for AlphaRefine #####
    # Most common settings are assigned in the settings struct
    settings.description = 'Settings of SEcm module'

    # !!! some important hyperparameters !!!
    settings.batch_size = 8  # Batch size
    settings.search_area_factor = 2.0  # Image patch size relative to target size
    settings.feature_sz = 16  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches
    settings.used_layers = ['layer3']

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 0.25}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.25}
    settings.max_gap = 50
    settings.sample_per_epoch = 400
    settings.save_interval = 5  # the interval of saving the checkpoints

    # Others
    settings.print_interval = 100  # How often to print loss and other info
    settings.num_workers = 4  # Number of workers for image loading
    # Normalize mean / std (default pytorch ImageNet values)
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]

    # ##### Prepare data for training and validation #####
    # 1. Build training dataset and dataloader

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    # The augmentation transform applied to the training set (individually to
    # each image in the pair)
    train_steps = [
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ]
    transform_train = torchvision.transforms.Compose(train_steps)

    # Train and validation processing differ only in `transform`.
    # (Original note: in SEMaskProcessing, zero-padding is used for images
    # and masks.)
    processing_common = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        joint_transform=transform_joint,
    )
    data_processing_train = SEprocessing.SEMaskProcessing(
        transform=transform_train, **processing_common)

    # Train datasets
    # - bbox and corner datasets (disabled)
    # got_10k_train = Got10k(settings.env.got10k_dir, split='train')
    # lasot_train = Lasot(split='train')
    # coco_train = MSCOCOSeq()
    # imagenet_vid = ImagenetVID()
    # imagenet_det = ImagenetDET()
    # - mask datasets
    youtube_vos = Youtube_VOS()
    # saliency = Saliency()

    # The sampler for training (a dataset exposing __getitem__ / __len__).
    # Disabled multi-dataset variant:
    # dataset_train = SEsampler.SEMaskSampler([lasot_train,got_10k_train,coco_train,imagenet_vid,imagenet_det,youtube_vos,saliency],
    #                                         [1, 1, 1, 1, 1, 2, 3],
    #                                         samples_per_epoch= settings.sample_per_epoch * settings.batch_size,
    #                                         max_gap=settings.max_gap,
    #                                         processing=data_processing_train)
    train_samples = settings.sample_per_epoch * settings.batch_size
    dataset_train = SEsampler.SEMaskSampler(
        [youtube_vos], [1],
        samples_per_epoch=train_samples,
        max_gap=settings.max_gap,
        processing=data_processing_train)

    # The loader for training uses a distributed sampler; "sampler" is
    # exclusive with "shuffle", so no shuffle argument is passed.
    train_sampler = DistributedSampler(dataset_train)
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             drop_last=True,
                             stack_dim=1,
                             sampler=train_sampler,
                             pin_memory=False)

    # 2. Build validation dataset and dataloader
    lasot_test = Lasot(split='test')

    # The augmentation transform applied to the validation set (individually
    # to each image in the pair)
    val_steps = [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ]
    transform_val = torchvision.transforms.Compose(val_steps)

    # Data processing to do on the validation pairs
    data_processing_val = SEprocessing.SEMaskProcessing(
        transform=transform_val, **processing_common)

    # The sampler for validation
    dataset_val = SEsampler.SEMaskSampler(
        [lasot_test], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)

    # The loader for validation
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network
    net = SEx.SEcm_resnet34(backbone_pretrained=True,
                            used_layers=settings.used_layers,
                            pool_size=int(settings.feature_sz / 2),
                            unfreeze_layer3=True)

    # Wrap network to a distributed one
    net.cuda()
    net = torch.nn.parallel.DistributedDataParallel(
        net,
        device_ids=[settings.local_rank],
        find_unused_parameters=True)

    # Set objective
    objective = {
        'corner': nn.MSELoss(),  # take average of all elements
        'mask': nn.BCELoss(),  # Basic BCE Loss
    }

    # Create actor, which wraps network and objective
    actor = actors.SEcm_Actor(net=net, objective=objective)

    # Optimizer
    optimizer = optim.Adam(net.parameters(), lr=1e-3)
    # optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)

    # Learning rate scheduler
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8,
                                             gamma=0.5)

    # Create trainer
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Load the specified pre-trained parameters, if any; only resume from the
    # latest checkpoint when nothing was loaded.
    pretrained = getattr(settings, 'pretrained', None)
    resume_latest = pretrained is None
    if not resume_latest:
        trainer.load_pretrained(pretrained)

    # Launch training process
    trainer.train(40, load_latest=resume_latest, fail_safe=False)
def run(settings):
    """Configure and launch the first (gradient descent) tracker training.

    Populates ``settings`` in place, builds GOT-10k training and validation
    pipelines, creates the network from
    ``optim_tracker_models.steepest_descent_learn_filter_resnet50_newiou``
    and trains it for 50 epochs.

    Args:
        settings: Mutable training settings object. ``settings.env`` must
            provide ``got10k_dir``.
    """
    settings.description = 'First training with gradient descent.'
    settings.batch_size = 6
    settings.num_workers = 16
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    settings.print_stats = [
        'Loss/total',
        'Loss/iou',
        'ClfTrain/init_loss',
        'ClfTrain/train_loss',
        'ClfTrain/iter_loss',
        'ClfTrain/test_loss',
        'ClfTrain/test_init_loss',
        'ClfTrain/test_iter_loss',
    ]

    # Train datasets (only GOT-10k is enabled)
    #lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k_i(settings.env.got10k_dir, split='train')
    #trackingnet_train = TrackingNet(settings.env.trackingnet_dir, set_ids=[0, 1, 2, 3])
    #coco_train = MSCOCOSeq(settings.env.coco_dir)

    # Validation datasets
    # lasot_val = Lasot(settings.env.lasot_dir, vid_ids=list(range(17, 21)))
    got10k_val = Got10k_i(settings.env.got10k_dir, split='val')

    # Data transforms
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    train_steps = [
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ]
    transform_train = torchvision.transforms.Compose(train_steps)

    val_steps = [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ]
    transform_val = torchvision.transforms.Compose(val_steps)

    # Parameters of the tracking-pair processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }

    # Train and validation processing differ only in `transform`.
    processing_common = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=transform_joint,
    )
    data_processing_train = processing.TrackingProcessing(
        transform=transform_train, **processing_common)
    data_processing_val = processing.TrackingProcessing(
        transform=transform_val, **processing_common)

    # Frame-selection arguments shared by both samplers
    frame_sampling = dict(
        max_gap=30,
        frame_sample_mode='causal',
        num_seq_test_frames=3,
        num_class_distractor_frames=0,
        num_seq_train_frames=3,
        num_class_distractor_train_frames=0,
    )

    # Train sampler and loader
    dataset_train = sampler.RandomSequenceWithDistractors(
        [got10k_train], [1],
        samples_per_epoch=26000,
        processing=data_processing_train,
        **frame_sampling)
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation sampler and loader
    # dataset_val = sampler.RandomSequence([lasot_val, got10k_val], [1,1], samples_per_epoch=5000, max_gap=100,
    #                                      num_test_frames=1, processing=data_processing_val)
    dataset_val = sampler.RandomSequenceWithDistractors(
        [got10k_val], [1],
        samples_per_epoch=5000,
        processing=data_processing_val,
        **frame_sampling)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create network and actor
    net = optim_tracker_models.steepest_descent_learn_filter_resnet50_newiou(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=10,
        bin_displacement=0.5,
        mask_init_factor=3.0)

    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'train_clf': 0,
        'init_clf': 0,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }

    actor = actors.OptimTrackerActor(net=net,
                                     objective=objective,
                                     loss_weight=loss_weight)

    # Optimizer: one parameter group per sub-module. A rate of None means the
    # group carries no 'lr' key and therefore uses the optimizer default.
    model = actor.net
    module_lrs = [
        (model.classifier.filter_initializer, 5e-5),
        (model.classifier.filter_optimizer, 5e-4),
        (model.classifier.feature_extractor, 5e-5),
        (model.bb_regressor, None),
        (model.feature_extractor, 2e-5),
    ]
    param_groups = []
    for module, rate in module_lrs:
        group = {'params': module.parameters()}
        if rate is not None:
            group['lr'] = rate
        param_groups.append(group)
    optimizer = optim.Adam(param_groups, lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15,
                                             gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch KYS training on top of pre-trained DiMP weights.

    Populates ``settings`` in place, builds the LaSOT / GOT-10k / TrackingNet
    sequence pipelines, creates the network from
    ``kysnet_models.kysnet_res50``, copies the backbone, classifier and
    bounding-box regressor weights from ``dimp50.pth`` and freezes them, then
    trains ``net.predictor`` for 40 epochs.

    Args:
        settings: Mutable training settings object. ``settings.env`` must
            provide ``lasot_dir``, ``got10k_dir``, ``trackingnet_dir`` and
            ``pretrained_networks``.
    """
    settings.move_data_to_gpu = False
    settings.description = ''
    settings.batch_size = 10
    settings.test_sequence_length = 50
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_param = {
        'train_mode': 'uniform',
        'train_factor': 3.0,
        'train_limit_motion': False,
        'test_mode': 'uniform',
        'test_factor': 4.5,
        'test_limit_motion': True,
    }
    settings.scale_jitter_param = {'train_factor': 0.25, 'test_factor': 0.3}
    settings.hinge_threshold = 0.05
    settings.print_stats = [
        'Loss/total',
        'Loss/raw/test_clf',
        'Loss/raw/test_clf_acc',
        'Loss/raw/dimp_clf_acc',
        'Loss/raw/is_target',
        'Loss/raw/is_target_after_prop',
        'Loss/raw/test_seq_acc',
        'Loss/raw/dimp_seq_acc',
    ]

    env = settings.env

    # Train datasets
    lasot_train = Lasot(env.lasot_dir, split='train')
    got10k_train = Got10k(env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(env.trackingnet_dir,
                                    set_ids=[0, 1, 2, 3, 4])

    # Validation datasets
    got10k_val = Got10k(env.got10k_dir, split='votval')

    # Data transforms
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Parameters of the tracking-pair processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = None
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
        'end_pad_if_even': True,
    }

    # Train and validation processing differ only in `transform`.
    processing_common = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_param=settings.center_jitter_param,
        scale_jitter_param=settings.scale_jitter_param,
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=transform_joint,
        min_crop_inside_ratio=0.1,
    )
    data_processing_train = processing.KYSProcessing(
        transform=transform_train, **processing_common)
    data_processing_val = processing.KYSProcessing(
        transform=transform_val, **processing_common)

    # Sequence sampling description shared by the train and val samplers
    sequence_sample_info = {
        'num_train_frames': 3,
        'num_test_frames': settings.test_sequence_length,
        'max_train_gap': 30,
        'allow_missing_target': True,
        'min_fraction_valid_frames': 0.5,
        'mode': 'Sequence',
    }

    # Train sampler and loader
    dataset_train = sampler.KYSSampler(
        [got10k_train, trackingnet_train, lasot_train],
        [0.3, 0.3, 0.25],
        samples_per_epoch=settings.batch_size * 150,
        sequence_sample_info=sequence_sample_info,
        processing=data_processing_train,
        sample_occluded_sequences=True)
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # Validation sampler and loader
    dataset_val = sampler.KYSSampler(
        [got10k_val], [1],
        samples_per_epoch=1000,
        sequence_sample_info=sequence_sample_info,
        processing=data_processing_val,
        sample_occluded_sequences=True)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Load the base DiMP network
    dimp_weights_path = os.path.join(env.pretrained_networks, 'dimp50.pth')
    base_net, _unused = network_loading.load_network(
        checkpoint=dimp_weights_path)

    net = kysnet_models.kysnet_res50(
        optim_iter=3,
        cv_kernel_size=3,
        cv_max_displacement=9,
        cv_stride=1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        train_feature_extractor=False,
        train_iounet=False,
        detach_length=0,
        state_dim=8,
        representation_predictor_dims=(16, ),
        conf_measure='entropy',
        dimp_thresh=0.05)

    # Move the pre-trained DiMP weights into the matching sub-modules
    weight_transfers = (
        (net.backbone_feature_extractor, base_net.feature_extractor),
        (net.dimp_classifier, base_net.classifier),
        (net.bb_regressor, base_net.bb_regressor),
    )
    for target_module, source_module in weight_transfers:
        target_module.load_state_dict(source_module.state_dict())

    # To be safe: explicitly disable gradients for the transferred modules
    for target_module, _source in weight_transfers:
        for param in target_module.parameters():
            param.requires_grad_(False)

    hinge = settings.hinge_threshold
    objective = {
        'test_clf': ltr_losses.LBHingev2(threshold=hinge,
                                         return_per_sequence=False),
        'dimp_clf': ltr_losses.LBHingev2(threshold=hinge,
                                         return_per_sequence=False),
        'is_target': ltr_losses.IsTargetCellLoss(return_per_sequence=False),
        'clf_acc': ltr_losses.TrackingClassificationAccuracy(threshold=0.25),
    }

    loss_weight = {
        'test_clf': 1.0 * 500,
        'test_clf_orig': 50,
        'is_target': 0.1 * 500,
        'is_target_after_prop': 0.1 * 500,
    }

    dimp_jitter_fn = DiMPScoreJittering(distractor_ratio=0.1,
                                        p_distractor=0.3,
                                        max_distractor_enhance_factor=1.3,
                                        min_distractor_enhance_factor=0.8)
    actor = actors.KYSActor(net=net,
                            objective=objective,
                            loss_weight=loss_weight,
                            dimp_jitter_fn=dimp_jitter_fn)

    # Only the predictor parameters are handed to the optimizer
    predictor_group = {'params': actor.net.predictor.parameters(), 'lr': 1e-2}
    optimizer = optim.Adam([predictor_group], lr=1e-2)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15,
                                             gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    trainer.train(40, load_latest=True, fail_safe=True)
def run(settings):
    """Configure and launch training of the segmentation network.

    Populates ``settings`` in place, makes sure the workspace and image output
    directories exist, builds the Vos train/validation loaders, creates a
    ``segm_resnet50`` network with a ``BCEWithLogitsLoss`` objective and hands
    everything to ``LTRTrainer``.

    Args:
        settings: Settings struct filled in by this function. It must already
            provide ``settings.env.workspace_dir``.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'SegmentationNet with default settings.'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.num_workers = 1  # Number of workers for image loading
    settings.normalize_mean = [0.485, 0.456, 0.406]  # Normalize mean (default pytorch ImageNet values)
    settings.normalize_std = [0.229, 0.224, 0.225]  # Normalize std (default pytorch ImageNet values)
    settings.search_area_factor = 4.0  # Image patch size relative to target size
    settings.feature_sz = 24  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 1.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.25}

    settings.segm_topk_pos = 3
    settings.segm_topk_neg = 3
    settings.segm_use_distance = True
    mixer_channels = 3

    # Make sure the workspace and image folders exist. makedirs(exist_ok=True)
    # avoids the check-then-create race of isdir()+mkdir() and also creates
    # missing parent directories.
    os.makedirs(settings.env.workspace_dir, exist_ok=True)
    settings.env.images_dir = os.path.join(settings.env.workspace_dir, 'images')
    os.makedirs(settings.env.images_dir, exist_ok=True)

    # Train datasets
    vos_train = Vos(split='train')

    # Validation datasets
    vos_val = Vos(split='val')

    # No joint (pair-wise) augmentation is used here: a grayscale transform is
    # not wanted since we are doing color segmentation.

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = torchvision.transforms.Compose([
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ])

    # The augmentation transform applied to the validation set (individually to each image in the pair)
    transform_val = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ])

    # Data processing to do on the training pairs
    data_processing_train = segm_processing.SegmProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='pair',
        transform=transform_train,
        use_distance=settings.segm_use_distance)

    # Data processing to do on the validation pairs
    data_processing_val = segm_processing.SegmProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='pair',
        transform=transform_val,
        use_distance=settings.segm_use_distance)

    # The sampler for training
    dataset_train = segm_sampler.SegmSampler(
        [vos_train], [1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)

    # The loader for training
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler for validation
    dataset_val = segm_sampler.SegmSampler(
        [vos_val], [1],
        samples_per_epoch=10 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)

    # The loader for validation
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=10,
                           stack_dim=1)

    # Create network (resnet50 or resnet18)
    net = segm_models.segm_resnet50(backbone_pretrained=True,
                                    topk_pos=settings.segm_topk_pos,
                                    topk_neg=settings.segm_topk_neg,
                                    mixer_channels=mixer_channels)

    # Set objective
    objective = nn.BCEWithLogitsLoss()

    # Create actor, which wraps network and objective
    actor = actors.SegmActor(net=net, objective=objective)

    # Optimizer: only the segmentation predictor parameters are optimized
    optimizer = optim.Adam(actor.net.segm_predictor.parameters(), lr=1e-3)

    # Learning rate scheduler
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # Create trainer
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(40, load_latest=True, fail_safe=False)
def run(settings):
    """Set up and launch training of the transformer-assisted tracker.

    Fills ``settings`` in place, builds the train/validation data pipelines
    (KLDiMPProcessing + DiMPSampler), creates a ``dimpnet50`` network with a
    KL-regression box objective and an LBHinge classification objective, and
    passes everything to ``LTRTrainer``.

    Args:
        settings: Settings struct populated by this function. ``settings.env``
            must provide the lasot, got10k, trackingnet and coco directories.
    """
    # ---- General settings ----
    settings.description = 'Transformer-assisted tracker. Our baseline approach is SuperDiMP'
    settings.batch_size = 40
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 6.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 22
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 5.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    # settings.print_stats = ['Loss/total', 'Loss/iou', 'ClfTrain/init_loss', 'ClfTrain/test_loss']

    # ---- Datasets (training first, then validation) ----
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))
    coco_train = MSCOCOSeq(settings.env.coco_dir)
    got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # ---- Transforms ----
    # Joint transform is shared by the train and validation processing.
    transform_joint = tfm.Transform(
        tfm.ToGrayscale(probability=0.05),
        tfm.RandomHorizontalFlip(probability=0.5))
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.RandomHorizontalFlip(probability=0.5),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # ---- Processing of the tracking pairs ----
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'boxes_per_frame': 128,
        'gt_sigma': (0.05, 0.05),
        'proposal_sigma': [(0.05, 0.05), (0.5, 0.5)]
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz
    }
    label_density_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz
    }

    # Everything except the per-image transform is identical for the train
    # and validation processing.
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        crop_type='inside_major',
        max_scale_change=1.5,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        label_density_params=label_density_params,
        joint_transform=transform_joint)
    data_processing_train = processing.KLDiMPProcessing(
        transform=transform_train, **processing_kwargs)
    data_processing_val = processing.KLDiMPProcessing(
        transform=transform_val, **processing_kwargs)

    # ---- Train sampler and loader ----
    train_sources = [lasot_train, got10k_train, trackingnet_train, coco_train]
    dataset_train = sampler.DiMPSampler(train_sources, [1, 1, 1, 1],
                                        samples_per_epoch=50000,
                                        max_gap=500,
                                        num_test_frames=3,
                                        num_train_frames=3,
                                        processing=data_processing_train)
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # ---- Validation sampler and loader ----
    dataset_val = sampler.DiMPSampler([got10k_val], [1],
                                      samples_per_epoch=10000,
                                      max_gap=500,
                                      num_test_frames=3,
                                      num_train_frames=3,
                                      processing=data_processing_val)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # ---- Network ----
    net = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu',
        frozen_backbone_layers=['conv1', 'bn1', 'layer1', 'layer2'])

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # ---- Objectives and actor ----
    objective = {
        'bb_ce': klreg_losses.KLRegression(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold)
    }
    loss_weight = {
        'bb_ce': 0.01,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400
    }
    actor = tracking_actors.KLDiMPActor(net=net,
                                        objective=objective,
                                        loss_weight=loss_weight)

    # ---- Optimizer: one parameter group per sub-module, each with its own lr ----
    module_lrs = [
        (actor.net.classifier.filter_initializer, 5e-5),
        (actor.net.classifier.filter_optimizer, 5e-4),
        (actor.net.classifier.feature_extractor, 5e-5),
        (actor.net.classifier.transformer, 1e-3),
        (actor.net.bb_regressor, 1e-3),
        (actor.net.feature_extractor.layer3, 2e-5),
    ]
    param_groups = [{'params': module.parameters(), 'lr': group_lr}
                    for module, group_lr in module_lrs]
    optimizer = optim.Adam(param_groups, lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Train the ATOM IoUNet with its default configuration.

    Fills ``settings`` in place, builds the train/validation pipelines
    (ATOMProcessing + ATOMSampler over LaSOT, TrackingNet and COCO), creates
    an ``atom_resnet50`` network with an MSE objective and runs ``LTRTrainer``.

    Args:
        settings: Settings struct populated by this function.
    """
    # ---- Most common settings are assigned in the settings struct ----
    settings.description = 'ATOM IoUNet with default settings.'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.num_workers = 4  # Number of workers for image loading
    settings.normalize_mean = [0.485, 0.456, 0.406]  # Normalize mean (default pytorch ImageNet values)
    settings.normalize_std = [0.229, 0.224, 0.225]  # Normalize std (default pytorch ImageNet values)
    settings.search_area_factor = 5.0  # Image patch size relative to target size
    settings.feature_sz = 18  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches

    # ---- Settings for the image sample and proposal generation ----
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
    settings.proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }

    # ---- Datasets: three for training, one TrackingNet split for validation ----
    lasot_train = Lasot(split='train')
    trackingnet_train = TrackingNet(set_ids=list(range(11)))
    coco_train = MSCOCOSeq()
    trackingnet_val = TrackingNet(set_ids=list(range(11, 12)))

    # ---- Transforms ----
    # Applied jointly to the images of a pair.
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    # Applied individually to each training image.
    train_steps = [
        dltransforms.ToTensorAndJitter(0.2),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ]
    transform_train = torchvision.transforms.Compose(train_steps)

    # Applied individually to each validation image.
    val_steps = [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                         std=settings.normalize_std),
    ]
    transform_val = torchvision.transforms.Compose(val_steps)

    # ---- Data processing; only the per-image transform differs ----
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        joint_transform=transform_joint)
    data_processing_train = processing.ATOMProcessing(
        transform=transform_train, **processing_kwargs)
    data_processing_val = processing.ATOMProcessing(
        transform=transform_val, **processing_kwargs)

    # ---- Training sampler and loader ----
    dataset_train = sampler.ATOMSampler(
        [lasot_train, trackingnet_train, coco_train], [1, 1, 1],
        samples_per_epoch=1800 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # ---- Validation sampler and loader ----
    dataset_val = sampler.ATOMSampler(
        [trackingnet_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # ---- Network, objective and the actor wrapping both ----
    net = atom_models.atom_resnet50(backbone_pretrained=True)
    objective = nn.MSELoss()
    actor = actors.AtomActor(net=net, objective=objective)

    # ---- Optimizer (box regressor parameters only) and lr schedule ----
    optimizer = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # ---- Trainer ----
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(40, load_latest=True, fail_safe=False)
def run(settings):
    """Configure and launch TransT training.

    Populates ``settings`` in place, builds the training data pipeline
    (TransTProcessing + TransTSampler over LaSOT, GOT-10k and TrackingNet),
    creates the ``transt_resnet50`` model and its loss, sets up AdamW with a
    separate learning rate for backbone parameters, and runs ``LTRTrainer``
    on the training loader only (no validation loader is created).

    Args:
        settings: Settings struct populated by this function. ``settings.env``
            must provide the lasot, got10k and trackingnet directories.
    """
    # Most common settings are assigned in the settings struct.
    # Flip ``debug`` to True for a small single-process, single-GPU run.
    debug = False
    if debug:
        settings.batch_size = 4
        settings.num_workers = 0
        settings.multi_gpu = False
    else:
        settings.batch_size = 38
        settings.num_workers = 20
        settings.multi_gpu = True

    settings.device = 'cuda'
    settings.description = 'TransT with default settings.'
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 6.
    settings.template_area_factor = 2.
    settings.search_feature_sz = 32
    settings.template_feature_sz = 16
    settings.search_sz = settings.search_feature_sz * 8
    settings.temp_sz = settings.template_feature_sz * 8
    settings.center_jitter_factor = {'search': 2.0, 'template': 0}
    settings.scale_jitter_factor = {'search': 0.05, 'template': 0}
    settings.sequence_length = 34
    settings.rand = False
    settings.init_ckpt = "pytracking/networks/transt.pth"

    # Transformer
    settings.position_embedding = 'sine'
    settings.hidden_dim = 256
    settings.dropout = 0.1
    settings.nheads = 8
    settings.dim_feedforward = 2048
    settings.featurefusion_layers = 4

    settings.move_data_to_gpu = True

    # Train datasets
    lasot_train = Lasot(settings.env.lasot_dir, split='train')
    got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    trackingnet_train = TrackingNet(settings.env.trackingnet_dir,
                                    set_ids=list(range(4)))

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Data processing to do on the training pairs
    data_processing_train = processing.TransTProcessing(
        search_area_factor=settings.search_area_factor,
        template_area_factor=settings.template_area_factor,
        search_sz=settings.search_sz,
        temp_sz=settings.temp_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        joint=False,  # Whether or not to apply same transform to every image
        transform=transform_train,
        rand=settings.rand,
        label_function_params=None,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.TransTSampler(
        [lasot_train, got10k_train, trackingnet_train], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=100,
        processing=data_processing_train)

    # The loader for training. Pinned memory is requested only when the
    # move_data_to_gpu setting is off.
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=0,
                             pin_memory=not settings.move_data_to_gpu)

    # Create network and actor
    model = transt_models.transt_resnet50(settings)

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        model = MultiGPU(model, dim=0)

    objective = transt_models.transt_loss(settings)
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    actor = actors.TranstActor(net=model, objective=objective)

    # Optimizer: trainable backbone parameters get lr=1e-5, every other
    # trainable parameter uses the optimizer default (1e-4).
    param_dicts = [
        {
            "params": [p for n, p in model.named_parameters()
                       if "backbone" not in n and p.requires_grad]
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": 1e-5,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=1e-4, weight_decay=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 500)

    # Create trainer
    trainer = LTRTrainer(actor, [loader_train], optimizer, settings,
                         lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(1000, load_latest=True, fail_safe=True)
def run(settings):
    """Train the ATOM IoUNet (DeT variant) on depth datasets.

    Populates ``settings`` in place, builds ATOMProcessing/ATOMSampler
    pipelines over the depth datasets (all loaded with
    ``dtype='rgbcolormap'``), creates ``atom_resnet18_DeT`` with an MSE
    objective, and runs ``LTRTrainer``.

    Args:
        settings: Settings struct populated by this function. ``settings.env``
            must provide the cocodepth, lasotdepth, depthtrack (plain,
            horizontal, vertical) and cdtb directories.
    """
    # Most common settings are assigned in the settings struct.
    # NOTE(review): the description mentions GOT10k, but no GOT10k dataset is
    # used below - confirm whether the text should be updated.
    settings.description = 'ATOM IoUNet with default settings, but additionally using GOT10k for training.'
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    # ---- Train datasets ----
    coco_train = MSCOCOSeq_depth(settings.env.cocodepth_dir,
                                 dtype='rgbcolormap')
    lasot_depth_train = Lasot_depth(root=settings.env.lasotdepth_dir,
                                    dtype='rgbcolormap')
    depthtrack_train = DepthTrack(root=settings.env.depthtrack_dir,
                                  dtype='rgbcolormap')
    depthtrack_horizontal_train = DepthTrack(
        root=settings.env.depthtrack_horizontal_dir, dtype='rgbcolormap')
    depthtrack_vertical_train = DepthTrack(
        root=settings.env.depthtrack_vertical_dir, dtype='rgbcolormap')

    # ---- Validation dataset ----
    cdtb_val = CDTB(settings.env.cdtb_dir, split='val', dtype='rgbcolormap')

    # ---- Transforms ----
    # Applied jointly to the images of a pair.
    transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    # Applied individually to each training image.
    transform_train = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))
    # Applied individually to each validation image.
    transform_val = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # ---- Data processing; train and validation differ only in the transform ----
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        joint_transform=transform_joint)
    data_processing_train = processing.ATOMProcessing(
        transform=transform_train, **processing_kwargs)
    data_processing_val = processing.ATOMProcessing(
        transform=transform_val, **processing_kwargs)

    # ---- Training sampler and loader ----
    train_sources = [
        lasot_depth_train, depthtrack_train, depthtrack_horizontal_train,
        depthtrack_vertical_train, coco_train
    ]
    dataset_train = sampler.ATOMSampler(
        train_sources, [1, 1, 1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)
    loader_train = LTRLoader('train',
                             dataset_train,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # ---- Validation sampler and loader ----
    dataset_val = sampler.ATOMSampler(
        [cdtb_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)
    loader_val = LTRLoader('val',
                           dataset_val,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # ---- Network and actor ----
    # Other merge_type options noted by the author: 'mean', 'conv', 'weightedSum'
    net = atom_models.atom_resnet18_DeT(backbone_pretrained=True,
                                        merge_type='max')
    objective = nn.MSELoss()
    actor = actors.AtomActor(net=net, objective=objective)

    # ---- Optimizer ----
    # The box regressor uses the default lr (1e-3); both feature extractors
    # are trained with a smaller lr.
    regressor_group = {'params': actor.net.bb_regressor.parameters()}
    rgb_group = {
        'params': actor.net.feature_extractor.parameters(),
        'lr': 2e-5
    }
    depth_group = {
        'params': actor.net.feature_extractor_depth.parameters(),
        'lr': 2e-5
    }
    optimizer = optim.Adam([regressor_group, rgb_group, depth_group], lr=1e-3)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    # ---- Trainer ----
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(80, load_latest=True, fail_safe=True)
def run(settings):
    """Train the mask/refine heads of SiamMask_sharp (ResNet-50 backbone).

    Populates ``settings`` in place, builds the SiamProcessing/MaskSampler
    pipelines over COCO and YoutubeVOS, then, inside a dygraph guard, creates
    the network, loads the base model weights, and runs ``LTRTrainer``.

    Args:
        settings: Settings struct populated by this function.

    Raises:
        Exception: if ``settings.base_model`` is the empty string. This
            function itself assigns ``''`` at the top, so the path must be
            filled in there before training can start.
    """
    # ---- Most common settings are assigned in the settings struct ----
    settings.base_model = ''
    settings.description = 'SiamMask_sharp with ResNet-50 backbone.'
    settings.print_interval = 100  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.samples_per_epoch = 600000  # Number of training pairs per epoch
    settings.num_workers = 8  # Number of workers for image loading
    settings.search_area_factor = {'train': 1.0, 'test': 143. / 127.}
    settings.output_sz = {'train': 127, 'test': 143}
    settings.scale_type = 'context'
    settings.border_type = 'meanpad'

    # ---- Settings for the image sample and label generation ----
    settings.center_jitter_factor = {'train': 0.2, 'test': 0.4}
    settings.scale_jitter_factor = {'train': 0.05, 'test': 0.18}
    settings.label_params = {
        'search_size': 143,
        'output_size': 3,
        'anchor_stride': 8,
        'anchor_ratios': [0.33, 0.5, 1, 2, 3],
        'anchor_scales': [8],
        'num_pos': 16,
        'num_neg': 16,
        'num_total': 64,
        'thr_high': 0.6,
        'thr_low': 0.3
    }
    settings.loss_weights = {'cls': 0., 'loc': 0., 'mask': 1}
    settings.neg = 0

    # ---- Datasets; validation reuses the YoutubeVOS training dataset object ----
    vos_train = YoutubeVOS()
    coco_train = MSCOCOSeq()
    vos_val = vos_train

    # ---- Transforms ----
    # Applied jointly to the images of a pair.
    transform_joint = dltransforms.ToGrayscale(probability=0.25)

    # Applied individually: exemplar, instance image and instance mask.
    transform_exemplar = dltransforms.Transpose()
    transform_instance = dltransforms.Compose([
        dltransforms.Color(probability=1.0),
        dltransforms.Blur(probability=0.18),
        dltransforms.Transpose()
    ])
    transform_instance_mask = dltransforms.Transpose()

    # ---- Data processing ----
    # Arguments shared by the training and validation processing.
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        label_params=settings.label_params,
        joint_transform=transform_joint)
    data_processing_train = processing.SiamProcessing(
        train_transform=transform_exemplar,
        test_transform=transform_instance,
        test_mask_transform=transform_instance_mask,
        **processing_kwargs)
    data_processing_val = processing.SiamProcessing(
        transform=transform_exemplar, **processing_kwargs)

    # Number of whole batches that fit in one epoch.
    nums_per_epoch = settings.samples_per_epoch // settings.batch_size

    # ---- Training sampler and loader ----
    dataset_train = sampler.MaskSampler(
        [coco_train, vos_train], [1, 1],
        samples_per_epoch=nums_per_epoch * settings.batch_size,
        max_gap=100,
        processing=data_processing_train,
        neg=settings.neg)
    train_loader = loader.LTRLoader('train',
                                    dataset_train,
                                    training=True,
                                    batch_size=settings.batch_size,
                                    num_workers=settings.num_workers,
                                    stack_dim=0)

    # ---- Validation sampler and loader ----
    dataset_val = sampler.MaskSampler(
        [vos_val], [1],
        samples_per_epoch=100 * settings.batch_size,
        max_gap=100,
        processing=data_processing_val)
    val_loader = loader.LTRLoader('val',
                                  dataset_val,
                                  training=False,
                                  batch_size=settings.batch_size,
                                  num_workers=settings.num_workers,
                                  stack_dim=0)

    # Create network, objective, optimizer, learning rate scheduler and trainer
    with dygraph.guard():

        def scale_loss(loss):
            """Return the sum of the entries of ``loss`` weighted by settings.loss_weights."""
            total_loss = 0
            for name, weight in settings.loss_weights.items():
                total_loss += loss[name] * weight
            return total_loss

        # Create network
        net = SiamMask_ResNet50_sharp(scale_loss=scale_loss)

        # Load parameters from the best_base_model
        if settings.base_model == '':
            raise Exception(
                'The base_model path is not setup. Check settings.base_model in "ltr/train_settings/siammask/siammask_res50_sharp.py".'
            )
        para_dict, _ = fluid.load_dygraph(settings.base_model)

        # Overwrite only the entries that exist in both the network and the
        # loaded checkpoint; the remaining network entries are left untouched.
        model_dict = net.state_dict()
        for key in model_dict:
            if key in para_dict:
                model_dict[key] = para_dict[key]
        net.set_dict(model_dict)

        # Define objective
        objective = {
            'cls': select_softmax_with_cross_entropy_loss,
            'loc': weight_l1_loss,
            'mask': select_mask_logistic_loss
        }

        # Create actor, which wraps network and objective
        actor = actors.SiamActor(net=net, objective=objective)

        # Set to training mode
        actor.train()

        # Learning rate: exponential decay wrapped in a linear warm-up
        decayed_lr = fluid.layers.exponential_decay(
            learning_rate=0.0005,
            decay_steps=nums_per_epoch,
            decay_rate=0.9,
            staircase=True)
        lr_scheduler = LinearLrWarmup(learning_rate=decayed_lr,
                                      warmup_steps=5 * nums_per_epoch,
                                      start_lr=0.0001,
                                      end_lr=0.0005)

        # Only the mask head and refine head parameters are optimized
        head_parameters = (net.mask_head.parameters() +
                           net.refine_head.parameters())
        optimizer = fluid.optimizer.Adam(parameter_list=head_parameters,
                                         learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(20, load_latest=False, fail_safe=False)
def run(settings):
    """Train the ATOM IoUNet with a ResNet18 backbone (Paddle dygraph).

    Populates ``settings`` in place, builds ATOMProcessing/ATOMSampler
    pipelines (training on ImagenetVID, LaSOT and COCO; validation on
    GOT-10k), then, inside a dygraph guard, creates ``atom_resnet18``,
    freezes its feature extractor, and runs ``LTRTrainer``.

    Args:
        settings: Settings struct populated by this function.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'ATOM IoUNet with ResNet18 backbone and trained with vid, lasot, coco.'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 64  # Batch size
    settings.num_workers = 4  # Number of workers for image loading
    settings.normalize_mean = [0.485, 0.456, 0.406]  # Normalize mean (default ImageNet values)
    settings.normalize_std = [0.229, 0.224, 0.225]  # Normalize std (default ImageNet values)
    settings.search_area_factor = 5.0  # Image patch size relative to target size
    settings.feature_sz = 18  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
    settings.proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
    }

    # Train datasets
    vid_train = ImagenetVID()
    lasot_train = Lasot(split='train')
    coco_train = MSCOCOSeq()

    # Validation datasets
    got10k_val = Got10k(split='val')

    # The joint augmentation transform, that is applied to the pairs jointly
    transform_joint = dltransforms.ToGrayscale(probability=0.05)

    # The augmentation transform applied to the training set (individually to each image in the pair)
    transform_train = dltransforms.Compose([
        dltransforms.ToArrayAndJitter(0.2),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std)
    ])

    # The augmentation transform applied to the validation set (individually to each image in the pair)
    transform_val = dltransforms.Compose([
        dltransforms.ToArray(),
        dltransforms.Normalize(mean=settings.normalize_mean,
                               std=settings.normalize_std)
    ])

    # Data processing to do on the training pairs
    data_processing_train = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        transform=transform_train,
        joint_transform=transform_joint)

    # Data processing to do on the validation pairs
    data_processing_val = processing.ATOMProcessing(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=settings.proposal_params,
        transform=transform_val,
        joint_transform=transform_joint)

    # The sampler for training
    dataset_train = sampler.ATOMSampler(
        [vid_train, lasot_train, coco_train], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing_train)

    # The loader for training. The worker count comes from the settings
    # struct instead of a duplicated literal, so changing
    # settings.num_workers above actually takes effect.
    train_loader = loader.LTRLoader('train',
                                    dataset_train,
                                    training=True,
                                    batch_size=settings.batch_size,
                                    num_workers=settings.num_workers,
                                    stack_dim=1)

    # The sampler for validation
    dataset_val = sampler.ATOMSampler(
        [got10k_val], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing_val)

    # The loader for validation
    val_loader = loader.LTRLoader('val',
                                  dataset_val,
                                  training=False,
                                  batch_size=settings.batch_size,
                                  epoch_interval=5,
                                  num_workers=settings.num_workers,
                                  stack_dim=1)

    # Create network, objective, optimizer, learning rate scheduler and trainer
    with dygraph.guard():
        # Create network
        net = atom_resnet18(backbone_pretrained=True)

        # Freeze backbone: stop gradients for every feature-extractor entry
        # whose name does not contain "running".
        for name, param in net.state_dict().items():
            if 'feature_extractor' in name and "running" not in name:
                param.stop_gradient = True

        # Set objective
        objective = fluid.layers.square_error_cost

        # Create actor, which wraps network and objective
        actor = actors.AtomActor(net=net, objective=objective)

        # Set to training mode
        actor.train()

        # Define optimizer and learning rate
        # NOTE(review): three boundaries are paired with three values here -
        # confirm that this matches what PiecewiseDecay expects.
        gamma = 0.2
        lr = 1e-3
        lr_scheduler = fluid.dygraph.PiecewiseDecay(
            [15, 30, 45],
            values=[lr, lr * gamma, lr * gamma * gamma],
            step=1000,
            begin=0)

        # Only the box regressor parameters are optimized
        optimizer = fluid.optimizer.Adam(
            parameter_list=net.bb_regressor.parameters(),
            learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(40, load_latest=False, fail_safe=False)
def run(settings):
    """Set up and launch LWL segmentation training with a frozen backbone.

    Populates ``settings`` with the crop, jitter and normalisation
    parameters, builds the YouTube-VOS / DAVIS samplers and loaders, creates
    the ResNet-50 network with every listed backbone layer frozen, loads the
    converted Mask-RCNN weights into its feature extractor and trains for
    70 epochs.

    Args:
        settings: Training settings object. It is mutated in place, and
            ``settings.env.pretrained_networks`` is read to locate the
            backbone weights file.
    """
    settings.description = (
        'Default train settings with backbone weights fixed. We initialize the backbone ResNet with '
        'pre-trained Mask-RCNN weights. These weights can be obtained from '
        'https://drive.google.com/file/d/12pVHmhqtxaJ151dZrXN1dcgUa7TuAjdA/view?usp=sharing. '
        'Download and save these weights in env_settings.pretrained_networks directory'
    )
    settings.batch_size = 20
    settings.num_workers = 8
    settings.multi_gpu = True
    settings.print_interval = 1
    settings.normalize_mean = [102.9801, 115.9465, 122.7717]
    settings.normalize_std = [1.0, 1.0, 1.0]

    settings.feature_sz = (52, 30)

    # Parameters for generating the image crop fed to the network. The
    # original note refers to the LWTLProcessing class in
    # ltr/data/processing.py for details.
    # The input crop is 16 times the feature size along each dimension.
    settings.output_sz = tuple(side * 16 for side in settings.feature_sz)
    settings.search_area_factor = 5.0
    settings.crop_type = 'inside_major'
    settings.max_scale_change = None

    settings.center_jitter_factor = {'train': 3, 'test': (5.5, 4.5)}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}

    # ------------------------------------------------------------------
    # Datasets
    # ------------------------------------------------------------------
    train_sets = [
        YouTubeVOS(version="2019", multiobj=False, split='jjtrain'),
        Davis(version='2017', multiobj=False, split='train'),
    ]
    val_sets = [YouTubeVOS(version="2019", multiobj=False, split='jjvalid')]

    # ------------------------------------------------------------------
    # Transforms
    # ------------------------------------------------------------------
    def make_normalize():
        # A fresh Normalize instance per pipeline, as in the original setup.
        return tfm.Normalize(mean=settings.normalize_mean,
                             std=settings.normalize_std)

    joint_tfm = tfm.Transform(
        tfm.ToBGR(),
        tfm.ToGrayscale(probability=0.05),
        tfm.RandomHorizontalFlip(probability=0.5),
    )
    train_tfm = tfm.Transform(
        tfm.RandomAffine(p_flip=0.0, max_rotation=15.0, max_shear=0.0,
                         max_ar_factor=0.0, max_scale=0.2, pad_amount=0),
        tfm.ToTensorAndJitter(0.2, normalize=False),
        make_normalize(),
    )
    val_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.0, normalize=False),
        make_normalize(),
    )

    # ------------------------------------------------------------------
    # Processing: train and val differ only in the per-image transform
    # ------------------------------------------------------------------
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        crop_type=settings.crop_type,
        max_scale_change=settings.max_scale_change,
        joint_transform=joint_tfm,
        new_roll=True,
    )
    train_processing = processing.LWLProcessing(transform=train_tfm,
                                                **processing_kwargs)
    val_processing = processing.LWLProcessing(transform=val_tfm,
                                              **processing_kwargs)

    # ------------------------------------------------------------------
    # Samplers and loaders
    # ------------------------------------------------------------------
    sampler_kwargs = dict(max_gap=100, num_test_frames=3, num_train_frames=1)
    train_samples = sampler.LWLSampler(
        train_sets, [6, 1],
        samples_per_epoch=settings.batch_size * 1000,
        processing=train_processing,
        **sampler_kwargs)
    val_samples = sampler.LWLSampler(
        val_sets, [1],
        samples_per_epoch=settings.batch_size * 100,
        processing=val_processing,
        **sampler_kwargs)

    loader_kwargs = dict(num_workers=settings.num_workers,
                         stack_dim=1,
                         batch_size=settings.batch_size)
    train_loader = LTRLoader('train', train_samples, training=True,
                             **loader_kwargs)
    val_loader = LTRLoader('val', val_samples, training=False,
                           epoch_interval=5, **loader_kwargs)

    # ------------------------------------------------------------------
    # Network
    # ------------------------------------------------------------------
    frozen_layers = ['conv1', 'bn1', 'layer1', 'layer2', 'layer3', 'layer4']
    net = lwl_networks.steepest_descent_resnet50(
        filter_size=3,
        num_filters=16,
        optim_iter=5,
        backbone_pretrained=True,
        out_feature_dim=512,
        frozen_backbone_layers=frozen_layers,
        label_encoder_dims=(16, 32, 64),
        use_bn_in_label_enc=False,
        clf_feat_blocks=0,
        final_conv=True,
        backbone_type='mrcnn')

    # Load the pre-trained Mask-RCNN weights into the backbone.
    mrcnn_path = os.path.join(settings.env.pretrained_networks,
                              'e2e_mask_rcnn_R_50_FPN_1x_converted.pkl')
    net.feature_extractor.load_state_dict(torch.load(mrcnn_path))

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # ------------------------------------------------------------------
    # Loss and actor
    # ------------------------------------------------------------------
    actor = segm_actors.LWLActor(
        net=net,
        objective={'segm': LovaszSegLoss(per_image=False)},
        loss_weight={'segm': 100.0},
        num_refinement_iter=2,
        disable_all_bn=True)

    # ------------------------------------------------------------------
    # Optimizer: one parameter group per sub-module, each with its own lr
    # ------------------------------------------------------------------
    target_model = actor.net.target_model
    module_lrs = [
        (target_model.filter_initializer, 5e-5),
        (target_model.filter_optimizer, 1e-4),
        (target_model.feature_extractor, 2e-5),
        (actor.net.decoder, 1e-4),
        (actor.net.label_encoder, 2e-4),
    ]
    param_groups = [{'params': module.parameters(), 'lr': rate}
                    for module, rate in module_lrs]
    optimizer = optim.Adam(param_groups, lr=2e-4)

    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                  milestones=[40],
                                                  gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                         settings, lr_scheduler)
    trainer.train(70, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch training of the Siamese selection network.

    Fills ``settings`` with the default hyper-parameters, builds LaSOT and
    TrackingNet samplers that share a single ``SiamSelProcessing`` instance,
    optimises only ``actor.net.selector`` with Adam and trains for
    150 epochs.

    Notes carried over from the original configuration: a larger frame gap
    is used (``max_gap = 50 * 20``), COCO is not part of the training data,
    and LaSOT : TrackingNet are sampled with weights 1 : 3.

    Args:
        settings: Training settings object; mutated in place.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'Siam selection for detection with default settings.'
    settings.print_interval = 1      # How often to print loss and other info
    settings.batch_size = 1          # Batch size
    assert settings.batch_size == 1, "only implement for batch_size 1"
    settings.num_workers = 4         # Number of workers for image loading
    # Default pytorch ImageNet normalisation values
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0    # Image patch size relative to target size
    settings.feature_sz = 18             # Size of feature map
    settings.output_sz = settings.feature_sz * 16    # Size of input image patches

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
    settings.proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }

    # Train datasets (COCO is intentionally left out)
    train_lasot = Lasot(split='train')
    train_trackingnet = TrackingNet(set_ids=list(range(11)))

    # Validation datasets
    val_trackingnet = TrackingNet(set_ids=list(range(11, 12)))

    # A single image transform / processing object serves both the training
    # and the validation sampler.
    image_tfm = ImageTransform(size_divisor=32,
                               mean=[123.675, 116.28, 103.53],
                               std=[58.395, 57.12, 57.375],
                               to_rgb=True)
    pair_processing = processing.SiamSelProcessing(transform=image_tfm)

    frame_gap = 50 * 20

    # The sampler and loader for training
    train_samples = sampler.ATOMSampler(
        [train_lasot, train_trackingnet], [1, 3],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=frame_gap,
        processing=pair_processing)
    train_loader = LTRLoader('train', train_samples,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=1)

    # The sampler and loader for validation
    val_samples = sampler.ATOMSampler(
        [val_trackingnet], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=frame_gap,
        processing=pair_processing)
    val_loader = LTRLoader('val', val_samples,
                           training=False,
                           batch_size=settings.batch_size,
                           num_workers=settings.num_workers,
                           shuffle=False,
                           drop_last=True,
                           epoch_interval=5,
                           stack_dim=1)

    # Create the actor, which wraps the network and the objective
    actor = actors.SiamSelActor(net=SiamSelNet(),
                                objective=nn.BCEWithLogitsLoss())

    # Optimizer (selector parameters only) and learning rate scheduler
    optimizer = optim.Adam(actor.net.selector.parameters(),
                           lr=1e-4,
                           weight_decay=0.0001)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.2)

    # Create trainer
    trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(150, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch DiMP-50 RGBD training with an occlusion head.

    Fills ``settings`` (using 4-channel normalisation statistics), builds
    Princeton RGBD samplers and loaders, creates the
    ``dimpnet_rgbd_locc.dimpnet50`` network and a ``DiMPActor_OCC`` actor,
    and trains for 50 epochs starting from ``./checkpoints/dimp50.pth``.
    Most parameter groups are given a learning rate of zero; only the
    occlusion classifier and the depth feature extractor get a non-zero
    rate.

    Args:
        settings: Training settings object; mutated in place.
    """
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 4
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 5
    # Four entries each: the fourth value is 0 for the mean and 1.0 for the
    # std (presumably the depth channel; confirm against the dataset).
    settings.normalize_mean = [0.485, 0.456, 0.406, 0]
    settings.normalize_std = [0.229, 0.224, 0.225, 1.0]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05

    # Datasets.
    # NOTE(review): training and validation both use the 'validation' split
    # of PrincetonRGBD; confirm this is intended.
    train_ptb = PrincetonRGBD(split='validation')
    val_ptb = PrincetonRGBD(split='validation')

    # Data transforms
    def make_normalize():
        return torchvision.transforms.Normalize(mean=settings.normalize_mean,
                                                std=settings.normalize_std)

    joint_tfm = dltransforms.ToGrayscale(probability=0.05)
    train_tfm = torchvision.transforms.Compose(
        [dltransforms.ToTensorAndJitter(0.2), make_normalize()])
    val_tfm = torchvision.transforms.Compose(
        [torchvision.transforms.ToTensor(), make_normalize()])

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=joint_tfm,
    )
    train_processing = processing.DiMPProcessing(transform=train_tfm,
                                                 **processing_kwargs)
    val_processing = processing.DiMPProcessing(transform=val_tfm,
                                               **processing_kwargs)

    # Samplers and loaders
    sampler_kwargs = dict(max_gap=30, num_test_frames=3, num_train_frames=3)
    loader_kwargs = dict(batch_size=settings.batch_size,
                         num_workers=settings.num_workers,
                         drop_last=True,
                         stack_dim=1)

    train_samples = sampler.DiMPSampler([train_ptb], [1],
                                        samples_per_epoch=26000,
                                        processing=train_processing,
                                        **sampler_kwargs)
    train_loader = LTRLoader('train', train_samples, training=True,
                             shuffle=True, **loader_kwargs)

    val_samples = sampler.DiMPSampler([val_ptb], [1],
                                      samples_per_epoch=5000,
                                      processing=val_processing,
                                      **sampler_kwargs)
    val_loader = LTRLoader('val', val_samples, training=False,
                           shuffle=False, epoch_interval=5, **loader_kwargs)

    # Create network and actor
    net = dimpnet_rgbd_locc.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {
        'iou': nn.MSELoss(),
        'occ': nn.SmoothL1Loss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'occ': 1,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = actors.DiMPActor_OCC(net=net, objective=objective,
                                 loss_weight=loss_weight)

    # Optimizer: the ``0 * x`` factors give the corresponding groups a
    # learning rate of zero.
    model = actor.net
    classifier = model.classifier
    module_lrs = [
        (classifier.filter_initializer, 0 * 5e-5),
        (classifier.filter_optimizer, 0 * 5e-4),
        (classifier.feature_extractor, 0 * 5e-5),
        (model.occ_classifer, 2e-3),
        (model.bb_regressor, 0 * 2e-4),
        (model.feature_extractor, 0 * 2e-5),
        (model.feature_extractor_depth, 0.1 * 2e-5),
    ]
    param_groups = [{'params': module.parameters(), 'lr': rate}
                    for module, rate in module_lrs]
    optimizer = optim.Adam(param_groups, lr=0.1 * 2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                         settings, lr_scheduler)
    trainer.train(50, load_latest=True, fail_safe=True,
                  path_pretrained='./checkpoints/dimp50.pth')
def run(settings):
    """Set up and launch DiMP-50 training on depth-augmented datasets.

    Fills ``settings``, builds the COCO-depth and LaSOT-depth training
    samplers plus a DepthTrack validation sampler (all created with
    ``dtype='hha'``), creates the ``dimpnet.dimpnet50`` network with a
    ``DiMPActor`` and trains for 50 epochs.

    Args:
        settings: Training settings object. It is mutated in place, and the
            dataset roots are read from ``settings.env``.
    """
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 10
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1 / 4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05

    # Depth input variants listed in the original configuration:
    #   1) raw_depth            X
    #   2) norm_depth
    #   3) centered_norm_depth
    #   4) centered_raw_depth   X
    #   5) colormap
    #   6) centered_colormap
    # The value selected here is 'hha'.
    depth_inputs = 'hha'

    # Train datasets
    train_coco = MSCOCOSeq_depth(settings.env.cocodepth_dir, dtype=depth_inputs)
    train_lasot = Lasot_depth(root=settings.env.lasotdepth_dir,
                              rgb_root=settings.env.lasot_dir,
                              dtype=depth_inputs)

    # Validation datasets
    val_depthtrack = DepthTrack(root=settings.env.depthtrack_dir,
                                split='val',
                                dtype=depth_inputs)

    # Data transforms
    def make_normalize():
        return tfm.Normalize(mean=settings.normalize_mean,
                             std=settings.normalize_std)

    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))
    train_tfm = tfm.Transform(tfm.ToTensorAndJitter(0.2), make_normalize())
    val_tfm = tfm.Transform(tfm.ToTensor(), make_normalize())

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 8,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    label_params = {
        'feature_sz': settings.feature_sz,
        'sigma_factor': output_sigma,
        'kernel_sz': settings.target_filter_sz,
    }
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        label_function_params=label_params,
        joint_transform=joint_tfm,
    )
    train_processing = processing.DiMPProcessing(transform=train_tfm,
                                                 **processing_kwargs)
    val_processing = processing.DiMPProcessing(transform=val_tfm,
                                               **processing_kwargs)

    # Samplers and loaders
    sampler_kwargs = dict(max_gap=30, num_test_frames=3, num_train_frames=3)
    loader_kwargs = dict(batch_size=settings.batch_size,
                         num_workers=settings.num_workers,
                         drop_last=True,
                         stack_dim=1)

    train_samples = sampler.DiMPSampler([train_coco, train_lasot], [1, 1],
                                        samples_per_epoch=26000,
                                        processing=train_processing,
                                        **sampler_kwargs)
    train_loader = LTRLoader('train', train_samples, training=True,
                             shuffle=True, **loader_kwargs)

    val_samples = sampler.DiMPSampler([val_depthtrack], [1],
                                      samples_per_epoch=5000,
                                      processing=val_processing,
                                      **sampler_kwargs)
    val_loader = LTRLoader('val', val_samples, training=False,
                           shuffle=False, epoch_interval=5, **loader_kwargs)

    # Create network and actor. The original flags ``backbone_pretrained``
    # with a row of exclamation marks; it is True here.
    net = dimpnet.dimpnet50(
        filter_size=settings.target_filter_sz,
        backbone_pretrained=True,
        optim_iter=5,
        clf_feat_norm=True,
        clf_feat_blocks=0,
        final_conv=True,
        out_feature_dim=512,
        optim_init_step=0.9,
        optim_init_reg=0.1,
        init_gauss_sigma=output_sigma * settings.feature_sz,
        num_dist_bins=100,
        bin_displacement=0.1,
        mask_init_factor=3.0,
        target_mask_act='sigmoid',
        score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {
        'iou': nn.MSELoss(),
        'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold),
    }
    loss_weight = {
        'iou': 1,
        'test_clf': 100,
        'test_init_clf': 100,
        'test_iter_clf': 400,
    }
    actor = actors.DiMPActor(net=net, objective=objective,
                             loss_weight=loss_weight)

    # Optimizer. The bb_regressor group carries no explicit 'lr', so it uses
    # the optimizer-level default passed below.
    model = actor.net
    classifier = model.classifier
    param_groups = [
        {'params': classifier.filter_initializer.parameters(), 'lr': 5e-5},
        {'params': classifier.filter_optimizer.parameters(), 'lr': 5e-4},
        {'params': classifier.feature_extractor.parameters(), 'lr': 5e-5},
        {'params': model.bb_regressor.parameters()},
        {'params': model.feature_extractor.parameters(), 'lr': 2e-5},
    ]
    optimizer = optim.Adam(param_groups, lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                         settings, lr_scheduler)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch training of the box label encoder for VOS.

    Fills ``settings``, builds YouTube-VOS / COCO samplers and loaders,
    creates the box-initialised ResNet-50 network, copies the feature
    extractor, target model, decoder and label encoder weights from the
    checkpoint ``ltr/lwl/lwl_stage2/LWTLNet_ep0080.pth.tar`` and trains for
    50 epochs. Only ``box_label_encoder`` parameters are handed to the
    optimizer.

    Args:
        settings: Training settings object. It is mutated in place, and
            ``settings.env.workspace_dir`` is read to locate the checkpoint.
    """
    settings.description = 'Default train settings for training VOS with box initialization.'
    settings.batch_size = 8
    settings.num_workers = 4
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [102.9801, 115.9465, 122.7717]
    settings.normalize_std = [1.0, 1.0, 1.0]

    settings.feature_sz = (52, 30)
    # The input crop is 16 times the feature size along each dimension.
    settings.output_sz = tuple(side * 16 for side in settings.feature_sz)
    settings.search_area_factor = 5.0
    settings.crop_type = 'inside_major'
    settings.max_scale_change = None
    settings.device = "cuda:0"

    settings.center_jitter_factor = {'train': 3, 'test': (5.5, 4.5)}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}

    settings.min_target_area = 500

    # Datasets
    train_ytvos = YouTubeVOS(version="2019", multiobj=False, split='jjtrain')
    val_ytvos = YouTubeVOS(version="2019", multiobj=False, split='jjvalid')
    train_coco = MSCOCOSeq()

    # Data transforms
    def make_normalize():
        return tfm.Normalize(mean=settings.normalize_mean,
                             std=settings.normalize_std)

    joint_tfm = tfm.Transform(
        tfm.ToBGR(),
        tfm.ToGrayscale(probability=0.05),
        tfm.RandomHorizontalFlip(probability=0.5),
    )
    train_tfm = tfm.Transform(tfm.ToTensorAndJitter(0.2, normalize=False),
                              make_normalize())
    val_tfm = tfm.Transform(tfm.ToTensorAndJitter(0.0, normalize=False),
                            make_normalize())

    # Processing: train and val differ only in the per-image transform
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        crop_type=settings.crop_type,
        max_scale_change=settings.max_scale_change,
        joint_transform=joint_tfm,
        new_roll=True,
    )
    train_processing = processing.LWLProcessing(transform=train_tfm,
                                                **processing_kwargs)
    val_processing = processing.LWLProcessing(transform=val_tfm,
                                              **processing_kwargs)

    # Samplers and loaders
    sampler_kwargs = dict(max_gap=100, num_test_frames=1, num_train_frames=1)
    train_samples = sampler.LWLSampler(
        [train_ytvos, train_coco], [1, 1],
        samples_per_epoch=settings.batch_size * 1000,
        processing=train_processing,
        **sampler_kwargs)
    val_samples = sampler.LWLSampler(
        [val_ytvos], [1],
        samples_per_epoch=settings.batch_size * 100,
        processing=val_processing,
        **sampler_kwargs)

    loader_kwargs = dict(num_workers=settings.num_workers,
                         stack_dim=1,
                         batch_size=settings.batch_size)
    train_loader = LTRLoader('train', train_samples, training=True,
                             **loader_kwargs)
    val_loader = LTRLoader('val', val_samples, training=False,
                           epoch_interval=5, **loader_kwargs)

    # Network
    net = lwt_box_networks.steepest_descent_resnet50(
        filter_size=3,
        num_filters=16,
        optim_iter=5,
        backbone_pretrained=True,
        out_feature_dim=512,
        frozen_backbone_layers=['conv1', 'bn1', 'layer1'],
        label_encoder_dims=(16, 32, 64),
        use_bn_in_label_enc=False,
        clf_feat_blocks=0,
        final_conv=True,
        backbone_type='mrcnn',
        box_label_encoder_dims=(64, 64),
        final_bn=False)

    # Copy the weights of the shared sub-modules from the stage-2 checkpoint.
    stage2_net = network_loading.load_trained_network(
        settings.env.workspace_dir,
        'ltr/lwl/lwl_stage2/LWTLNet_ep0080.pth.tar')
    for part in ('feature_extractor', 'target_model', 'decoder',
                 'label_encoder'):
        source_state = getattr(stage2_net, part).state_dict()
        getattr(net, part).load_state_dict(source_state)

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    objective = {'segm': LovaszSegLoss(per_image=False)}
    loss_weight = {
        'segm': 100.0,
        'segm_box': 10.0,
        'segm_train': 10,
    }
    actor = lwtl_actors.LWLBoxActor(net=net, objective=objective,
                                    loss_weight=loss_weight)

    # Optimizer: only the box label encoder is optimised.
    box_encoder_group = {
        'params': actor.net.box_label_encoder.parameters(),
        'lr': 1e-3,
    }
    optimizer = optim.Adam([box_encoder_group], lr=2e-4)

    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.2)

    trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                         settings, lr_scheduler)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch TransT training on GOT-10k search sequences.

    Fills ``settings`` with the data, transformer and label parameters,
    builds a GOT-10k sampler that draws ``settings.sequence_length`` search
    frames per sample, creates the ``transt_resnet50`` model with a
    ``CircuitTranstActor`` and trains with AdamW for 1000 epochs. There is
    no validation loader.

    Args:
        settings: Training settings object. It is mutated in place, and
            ``settings.env.got10k_dir`` is read to locate the dataset.
    """
    # Most common settings are assigned in the settings struct
    settings.device = 'cuda'
    settings.description = 'TransT with default settings.'
    settings.batch_size = 16  # 38
    settings.num_workers = min(settings.batch_size, 8)
    settings.multi_gpu = False
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 4.0
    settings.template_area_factor = 2.0
    settings.search_feature_sz = 32
    settings.template_feature_sz = 16
    settings.search_sz = settings.search_feature_sz * 8
    settings.temp_sz = settings.template_feature_sz * 8
    settings.center_jitter_factor = {'search': 3, 'template': 0}
    settings.scale_jitter_factor = {'search': 0.25, 'template': 0}
    settings.sequence_length = 16
    settings.search_gap = 8
    settings.init_ckpt = "pytracking/networks/transt.pth"

    # Transformer
    settings.position_embedding = 'sine'
    settings.hidden_dim = 256
    settings.dropout = 0.1
    settings.nheads = 8
    settings.dim_feedforward = 2048
    settings.featurefusion_layers = 4

    settings.sigma = 1 / 4 / 5.
    settings.kernel = 4
    settings.feature = 32  # 18
    settings.output_sz = 256  # settings.feature * 16
    settings.end_pad_if_even = False
    settings.label_function_params = True

    # Train datasets (only GOT-10k is used)
    train_got10k = Got10k(settings.env.got10k_dir, split='vottrain')

    # The joint augmentation transform, that is applied to the pairs jointly
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # The augmentation transform applied individually to each training image
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std),
    )

    # Data processing to do on the training pairs
    train_processing = processing.TransTProcessing(
        search_area_factor=settings.search_area_factor,
        template_area_factor=settings.template_area_factor,
        search_sz=settings.search_sz,
        temp_sz=settings.temp_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        transform=train_tfm,
        label_function_params=settings.label_function_params,
        joint_transform=joint_tfm)

    # The sampler and loader for training
    train_samples = sampler.TransTSampler(
        [train_got10k], [1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=100,
        processing=train_processing,
        num_search_frames=settings.sequence_length)
    train_loader = LTRLoader('train', train_samples,
                             training=True,
                             batch_size=settings.batch_size,
                             num_workers=settings.num_workers,
                             shuffle=True,
                             drop_last=True,
                             stack_dim=0)

    # Create network and actor
    model = transt_models.transt_resnet50(settings)

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        model = MultiGPU(model, dim=0)

    objective = transt_models.transt_loss(settings)
    trainable_count = sum(
        param.numel() for param in model.parameters() if param.requires_grad)
    print('number of params:', trainable_count)

    actor = actors.CircuitTranstActor(net=model, objective=objective)

    # Optimizer: split the trainable parameters into backbone / non-backbone
    # groups; the backbone group gets its own, smaller learning rate.
    other_params = []
    backbone_params = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in name:
            backbone_params.append(param)
        else:
            other_params.append(param)
    param_dicts = [
        {"params": other_params},
        {"params": backbone_params, "lr": 1e-5},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=1e-4, weight_decay=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 500)

    # Create trainer
    trainer = LTRTrainer(actor, [train_loader], optimizer, settings,
                         lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(1000, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and launch SiamFC (AlexNet backbone) training on ImageNet VID.

    Fills ``settings``, builds an ImageNet VID training sampler and a
    GOT-10k validation sampler, then, inside ``dygraph.guard()``, creates
    the network, actor, exponentially decaying learning rate and Momentum
    optimizer and trains for 50 epochs.

    Args:
        settings: Training settings object; mutated in place.
    """
    # Most common settings are assigned in the settings struct
    settings.description = 'SiamFC with Alexnet backbone and trained with vid'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 8      # Batch size
    settings.num_workers = 8     # Number of workers for image loading
    settings.normalize_mean = [0., 0., 0.]      # Normalize mean
    settings.normalize_std = [1 / 255.] * 3     # Normalize std
    # Roughly the same as SiamFC
    settings.search_area_factor = {'train': 1.0, 'test': 2.0078740157480315}
    settings.output_sz = {'train': 127, 'test': 255}
    settings.scale_type = 'context'
    settings.border_type = 'replicate'

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 0}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.}

    # Train datasets
    train_vid = ImagenetVID()

    # Validation datasets
    val_got10k = Got10k(split='val')

    # The joint augmentation transform, that is applied to the pairs jointly
    joint_tfm = dltransforms.ToGrayscale(probability=0.25)

    # Per-image transforms; the instance pipeline additionally starts with
    # DataAug().
    def make_normalize():
        return dltransforms.Normalize(mean=settings.normalize_mean,
                                      std=settings.normalize_std)

    exemplar_tfm = dltransforms.Compose(
        [dltransforms.ToArray(), make_normalize()])
    instance_tfm = dltransforms.Compose(
        [DataAug(), dltransforms.ToArray(), make_normalize()])

    # Arguments common to the training and validation processing
    processing_kwargs = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        scale_type=settings.scale_type,
        border_type=settings.border_type,
        mode='sequence',
        joint_transform=joint_tfm,
    )

    # Data processing to do on the training pairs
    train_processing = processing.SiamFCProcessing(
        train_transform=exemplar_tfm,
        test_transform=instance_tfm,
        **processing_kwargs)

    # Data processing to do on the validation pairs
    val_processing = processing.SiamFCProcessing(
        transform=exemplar_tfm,
        **processing_kwargs)

    loader_kwargs = dict(batch_size=settings.batch_size,
                         num_workers=settings.num_workers,
                         stack_dim=1)

    # The sampler and loader for training
    train_samples = sampler.ATOMSampler(
        [train_vid], [1],
        samples_per_epoch=6650 * settings.batch_size,
        max_gap=100,
        processing=train_processing)
    train_loader = loader.LTRLoader('train', train_samples, training=True,
                                    **loader_kwargs)

    # The sampler and loader for validation
    val_samples = sampler.ATOMSampler(
        [val_got10k], [1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=100,
        processing=val_processing)
    val_loader = loader.LTRLoader('val', val_samples, training=False,
                                  epoch_interval=5, **loader_kwargs)

    # Create network, actor, optimizer, learning rate scheduler and trainer
    with dygraph.guard():
        # Create network
        net = siamfc_alexnet()

        # Create actor, which wraps network and objective
        actor = actors.SiamFCActor(net=net,
                                   objective=None,
                                   batch_size=settings.batch_size,
                                   shape=(17, 17),
                                   radius=16,
                                   stride=8)

        # Set to training mode
        actor.train()

        # Define learning rate schedule and optimizer
        lr_scheduler = fluid.layers.exponential_decay(learning_rate=0.01,
                                                      decay_steps=6650,
                                                      decay_rate=0.8685,
                                                      staircase=True)
        weight_decay = fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0005)
        optimizer = fluid.optimizer.Momentum(momentum=0.9,
                                             regularization=weight_decay,
                                             parameter_list=net.parameters(),
                                             learning_rate=lr_scheduler)

        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
                             settings, lr_scheduler)
        trainer.train(50, load_latest=False, fail_safe=False)
def run(settings):
    """Set up and start ATOM IoUNet training with the paper's default settings.

    The function writes the run's hyper-parameters onto ``settings``, builds
    the training and validation samplers/loaders, creates the ResNet-18 ATOM
    network with an MSE objective, and calls
    ``trainer.train(50, load_latest=True, fail_safe=True)``.

    Args:
        settings: Mutable settings struct. Dataset roots are read from
            ``settings.env``; the attributes assigned below are stored on it,
            and it is passed on to ``LTRTrainer``.
    """
    # ---- Run configuration, stored on the shared settings struct ----
    settings.description = 'ATOM IoUNet with default settings according to the paper.'
    settings.batch_size = 64
    settings.num_workers = 8
    settings.print_interval = 1
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}

    env = settings.env

    # ---- Datasets ----
    # TrackingNet set ids 0-10 are used for training, set id 11 for validation.
    lasot = Lasot(env.lasot_dir, split='train')
    trackingnet = TrackingNet(env.trackingnet_dir, set_ids=list(range(11)))
    coco = MSCOCOSeq(env.coco_dir)
    trackingnet_holdout = TrackingNet(env.trackingnet_dir,
                                      set_ids=list(range(11, 12)))

    # ---- Transforms ----
    # Joint augmentation, handed to the processing as ``joint_transform``.
    joint_tfm = tfm.Transform(tfm.ToGrayscale(probability=0.05))

    # Training images: tensor conversion with jitter, then normalization.
    train_tfm = tfm.Transform(
        tfm.ToTensorAndJitter(0.2),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # Validation images: plain tensor conversion, then normalization.
    val_tfm = tfm.Transform(
        tfm.ToTensor(),
        tfm.Normalize(mean=settings.normalize_mean,
                      std=settings.normalize_std))

    # ---- Data processing ----
    proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }
    # Keyword arguments that the training and validation processing share.
    processing_args = dict(
        search_area_factor=settings.search_area_factor,
        output_sz=settings.output_sz,
        center_jitter_factor=settings.center_jitter_factor,
        scale_jitter_factor=settings.scale_jitter_factor,
        mode='sequence',
        proposal_params=proposal_params,
        joint_transform=joint_tfm)

    train_processing = processing.ATOMProcessing(transform=train_tfm,
                                                 **processing_args)
    val_processing = processing.ATOMProcessing(transform=val_tfm,
                                               **processing_args)

    # ---- Samplers and loaders ----
    loader_args = dict(batch_size=settings.batch_size,
                       num_workers=settings.num_workers,
                       drop_last=True,
                       stack_dim=1)

    train_sampler = sampler.ATOMSampler(
        [lasot, trackingnet, coco], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=train_processing)
    loader_train = LTRLoader('train', train_sampler, training=True,
                             shuffle=True, **loader_args)

    val_sampler = sampler.ATOMSampler(
        [trackingnet_holdout], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=val_processing)
    loader_val = LTRLoader('val', val_sampler, training=False,
                           shuffle=False, epoch_interval=5, **loader_args)

    # ---- Network and actor ----
    net = atom_models.atom_resnet18(backbone_pretrained=True)
    actor = actors.AtomActor(net=net, objective=nn.MSELoss())

    # ---- Optimizer and LR schedule ----
    # Only the parameters of the bb_regressor sub-module are optimized.
    optimizer = optim.Adam(actor.net.bb_regressor.parameters(), lr=1e-3)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15,
                                             gamma=0.2)

    # ---- Trainer ----
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(50, load_latest=True, fail_safe=True)
def run(settings):
    """Set up and start training of the Siam selection network for detection.

    The function writes the run's hyper-parameters onto ``settings``, builds
    the training and validation samplers/loaders around a single
    ``SiamSelProcessing`` instance, creates ``SiamSelNet`` with a
    BCE-with-logits objective, and calls
    ``trainer.train(100, load_latest=True, fail_safe=True)``.

    Args:
        settings: Mutable settings struct. The attributes assigned below are
            stored on it, and it is passed on to ``LTRTrainer``.
    """
    # ---- Run configuration, stored on the shared settings struct ----
    settings.description = 'Siam selection for detection with default settings.'
    settings.print_interval = 1  # How often to print loss and other info
    settings.batch_size = 1
    assert settings.batch_size==1,"only implement for batch_size 1"
    settings.num_workers = 4  # Number of workers for image loading
    # Default pytorch ImageNet normalization values.
    settings.normalize_mean = [0.485, 0.456, 0.406]
    settings.normalize_std = [0.229, 0.224, 0.225]
    settings.search_area_factor = 5.0  # Image patch size relative to target size
    settings.feature_sz = 18  # Size of feature map
    settings.output_sz = settings.feature_sz * 16  # Size of input image patches

    # Settings for the image sample and proposal generation
    settings.center_jitter_factor = {'train': 0, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
    settings.proposal_params = {
        'min_iou': 0.1,
        'boxes_per_frame': 16,
        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
    }

    # ---- Datasets ----
    # TrackingNet set ids 0-10 are used for training, set id 11 for validation.
    lasot = Lasot(split='train')
    trackingnet = TrackingNet(set_ids=list(range(11)))
    coco = MSCOCOSeq()
    trackingnet_holdout = TrackingNet(set_ids=list(range(11, 12)))

    # Disabled ATOM-style transform/processing pipeline, kept for reference:
    #
    # # The joint augmentation transform, that is applied to the pairs jointly
    # transform_joint = dltransforms.ToGrayscale(probability=0.05)
    #
    # # The augmentation transform applied to the training set (individually to each image in the pair)
    # transform_train = torchvision.transforms.Compose([dltransforms.ToTensorAndJitter(0.2),
    #                                                   torchvision.transforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)])
    #
    # # The augmentation transform applied to the validation set (individually to each image in the pair)
    # transform_val = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
    #                                                 torchvision.transforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)])
    #
    # # Data processing to do on the training pairs
    # data_processing_train = processing.ATOMProcessing(search_area_factor=settings.search_area_factor,
    #                                                   output_sz=settings.output_sz,
    #                                                   center_jitter_factor=settings.center_jitter_factor,
    #                                                   scale_jitter_factor=settings.scale_jitter_factor,
    #                                                   mode='sequence',
    #                                                   proposal_params=settings.proposal_params,
    #                                                   transform=transform_train,
    #                                                   joint_transform=transform_joint)
    #
    # # Data processing to do on the validation pairs
    # data_processing_val = processing.ATOMProcessing(search_area_factor=settings.search_area_factor,
    #                                                 output_sz=settings.output_sz,
    #                                                 center_jitter_factor=settings.center_jitter_factor,
    #                                                 scale_jitter_factor=settings.scale_jitter_factor,
    #                                                 mode='sequence',
    #                                                 proposal_params=settings.proposal_params,
    #                                                 transform=transform_val,
    #                                                 joint_transform=transform_joint)

    # ---- Data processing ----
    # One image transform and one processing object serve both training and
    # validation.
    img_transform = ImageTransform(size_divisor=32,
                                   mean=[123.675, 116.28, 103.53],
                                   std=[58.395, 57.12, 57.375],
                                   to_rgb=True)
    data_processing = processing.SiamSelProcessing(transform=img_transform)

    # ---- Samplers and loaders ----
    loader_args = dict(batch_size=settings.batch_size,
                       num_workers=settings.num_workers,
                       drop_last=True,
                       stack_dim=1)

    train_sampler = sampler.ATOMSampler(
        [lasot, trackingnet, coco], [1, 1, 1],
        samples_per_epoch=1000 * settings.batch_size,
        max_gap=50,
        processing=data_processing)
    loader_train = LTRLoader('train', train_sampler, training=True,
                             shuffle=True, **loader_args)

    val_sampler = sampler.ATOMSampler(
        [trackingnet_holdout], [1],
        samples_per_epoch=500 * settings.batch_size,
        max_gap=50,
        processing=data_processing)
    loader_val = LTRLoader('val', val_sampler, training=False,
                           shuffle=False, epoch_interval=5, **loader_args)

    # ---- Network, objective and actor ----
    # net = atom_models.atom_resnet18(backbone_pretrained=True)
    net = SiamSelNet()
    actor = actors.SiamSelActor(net=net, objective=nn.BCEWithLogitsLoss())

    # ---- Optimizer and LR schedule ----
    # Only the parameters of the selector sub-module are optimized.
    optimizer = optim.Adam(actor.net.selector.parameters(), lr=1e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=40,
                                             gamma=0.2)
    # lr_scheduler = WarmupMultiStepLR(optimizer,[50*1000,80*1000])

    # ---- Trainer ----
    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer,
                         settings, lr_scheduler)

    # Run training (set fail_safe=False if you are debugging)
    trainer.train(100, load_latest=True, fail_safe=True)


# Disabled warm-up multi-step LR scheduler, kept for reference together with
# the commented-out WarmupMultiStepLR call in run() above.
#
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
#     def __init__(
#         self,
#         optimizer,
#         milestones,
#         gamma=0.1,
#         warmup_factor=1.0 / 3,
#         warmup_iters=500,
#         warmup_method="linear",
#         last_epoch=-1,
#     ):
#         if not list(milestones) == sorted(milestones):
#             raise ValueError(
#                 "Milestones should be a list of" " increasing integers. Got {}",
#                 milestones,
#             )
#
#         if warmup_method not in ("constant", "linear"):
#             raise ValueError(
#                 "Only 'constant' or 'linear' warmup_method accepted"
#                 "got {}".format(warmup_method)
#             )
#         self.milestones = milestones
#         self.gamma = gamma
#         self.warmup_factor = warmup_factor
#         self.warmup_iters = warmup_iters
#         self.warmup_method = warmup_method
#         super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
#
#     def get_lr(self):
#         warmup_factor = 1
#         if self.last_epoch < self.warmup_iters:
#             if self.warmup_method == "constant":
#                 warmup_factor = self.warmup_factor
#             elif self.warmup_method == "linear":
#                 alpha = float(self.last_epoch) / self.warmup_iters
#                 warmup_factor = self.warmup_factor * (1 - alpha) + alpha
#         return [
#             base_lr
#             * warmup_factor
#             * self.gamma ** bisect_right(self.milestones, self.last_epoch)
#             for base_lr in self.base_lrs
#         ]