def test_bmn():
    """Run BMN (built from an inline config) through a train and a test forward."""
    model_cfg = dict(
        type='BMN',
        temporal_dim=100,
        boundary_ratio=0.5,
        num_samples=32,
        num_samples_per_bin=3,
        feat_dim=400,
        soft_nms_alpha=0.4,
        soft_nms_low_threshold=0.5,
        soft_nms_high_threshold=0.9,
        post_process_top_k=100)
    if torch.cuda.is_available():
        bmn = build_localizer(model_cfg).cuda()
        feats = torch.rand(8, 400, 100).cuda()
        boxes = np.array([[[0.1, 0.3], [0.375, 0.625]]] * 8)
        loss_dict = bmn(feats, boxes)
        assert isinstance(loss_dict, dict)

        # Test forward test: inference mode needs per-video meta information.
        meta = [
            dict(
                video_name='v_test',
                duration_second=100,
                duration_frame=960,
                feature_frame=960)
        ]
        with torch.no_grad():
            single_feat = torch.rand(1, 400, 100).cuda()
            bmn(single_feat, gt_bbox=None, video_meta=meta, return_loss=False)
    else:
        bmn = build_localizer(model_cfg)
        feats = torch.rand(8, 400, 100)
        boxes = torch.Tensor([[[0.1, 0.3], [0.375, 0.625]]] * 8)
        loss_dict = bmn(feats, boxes)
        assert isinstance(loss_dict, dict)

        # Test forward test on CPU.
        meta = [
            dict(
                video_name='v_test',
                duration_second=100,
                duration_frame=960,
                feature_frame=960)
        ]
        with torch.no_grad():
            single_feat = torch.rand(1, 400, 100)
            bmn(single_feat, gt_bbox=None, video_meta=meta, return_loss=False)
def test_bmn():
    """Run BMN (built from the registered config file) through train/test forwards."""
    model_cfg = get_localizer_cfg(
        'bmn/bmn_400x100_2x8_9e_activitynet_feature.py')
    if torch.cuda.is_available():
        bmn = build_localizer(model_cfg.model).cuda()
        feats = torch.rand(8, 400, 100).cuda()
        boxes = np.array([[[0.1, 0.3], [0.375, 0.625]]] * 8)
        loss_dict = bmn(feats, boxes)
        assert isinstance(loss_dict, dict)

        # Test forward test: inference mode needs per-video meta information.
        meta = [
            dict(
                video_name='v_test',
                duration_second=100,
                duration_frame=960,
                feature_frame=960)
        ]
        with torch.no_grad():
            single_feat = torch.rand(1, 400, 100).cuda()
            bmn(single_feat, gt_bbox=None, video_meta=meta, return_loss=False)
    else:
        bmn = build_localizer(model_cfg.model)
        feats = torch.rand(8, 400, 100)
        boxes = torch.Tensor([[[0.1, 0.3], [0.375, 0.625]]] * 8)
        loss_dict = bmn(feats, boxes)
        assert isinstance(loss_dict, dict)

        # Test forward test on CPU.
        meta = [
            dict(
                video_name='v_test',
                duration_second=100,
                duration_frame=960,
                feature_frame=960)
        ]
        with torch.no_grad():
            single_feat = torch.rand(1, 400, 100)
            bmn(single_feat, gt_bbox=None, video_meta=meta, return_loss=False)
def test_config_build_localizer():
    """Test that all mmaction models defined in the configs can be
    initialized."""
    # Build one localizer per config file found under `configs/localization`.
    for config_fpath in _get_config_path_for_localizer():
        config_mod = mmcv.Config.fromfile(config_fpath)
        print(f'Building localizer, config_fpath = {config_fpath!r}')
        if config_mod.get('model', None):
            localizer = build_localizer(config_mod.model)
            assert isinstance(localizer, nn.Module)
def test_tem():
    """Run the BSN TEM (built from its config file) through train/test forwards."""
    model_cfg = get_localizer_cfg(
        'bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py')
    tem = build_localizer(model_cfg.model)
    feats = torch.rand(8, 400, 100)
    boxes = torch.Tensor([[[1.0, 3.0], [3.0, 5.0]]] * 8)
    loss_dict = tem(feats, boxes)
    assert isinstance(loss_dict, dict)

    # Test forward test: feed one clip at a time in inference mode.
    meta = [{'video_name': 'v_test'}]
    with torch.no_grad():
        for single_feat in feats:
            single_feat = single_feat.reshape(1, 400, 100)
            tem(single_feat, video_meta=meta, return_loss=False)
def test_pem():
    """Run the BSN PEM (built from an inline config) through train/test forwards."""
    model_cfg = dict(
        type='PEM',
        pem_feat_dim=32,
        pem_hidden_dim=256,
        pem_u_ratio_m=1,
        pem_u_ratio_l=2,
        pem_high_temporal_iou_threshold=0.6,
        # NOTE(review): 2.2 lies outside the [0, 1] IoU range and above the
        # high threshold; it mirrors the released BSN config — confirm intent.
        pem_low_temporal_iou_threshold=2.2,
        soft_nms_alpha=0.75,
        soft_nms_low_threshold=0.65,
        soft_nms_high_threshold=0.9,
        post_process_top_k=100)
    pem = build_localizer(model_cfg)
    feats = torch.rand(8, 100, 32)
    ref_iou = torch.rand(8, 100)
    loss_dict = pem(feats, ref_iou)
    assert isinstance(loss_dict, dict)

    # Test forward test: inference needs proposal boundaries, their scores
    # and per-video meta information.
    tmin = torch.rand(100)
    tmax = torch.rand(100)
    tmin_score = torch.rand(100)
    tmax_score = torch.rand(100)
    meta = [
        dict(
            video_name='v_test',
            duration_second=100,
            duration_frame=1000,
            annotations=[{
                'segment': [0.3, 0.6],
                'label': 'Rock climbing'
            }],
            feature_frame=900)
    ]
    with torch.no_grad():
        for single_feat in feats:
            single_feat = single_feat.reshape(1, 100, 32)
            pem(
                single_feat,
                tmin=tmin,
                tmax=tmax,
                tmin_score=tmin_score,
                tmax_score=tmax_score,
                video_meta=meta,
                return_loss=False)
def main():
    """Entry point of the localizer training script.

    Parses CLI arguments, merges them into the mmcv config, optionally
    initializes distributed training, then builds the model/dataset and
    launches training.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    if cfg.checkpoint_config is not None:
        # save mmaction version in checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmact_version=__version__, config=cfg.text)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_localizer(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
    train_dataset = get_trimmed_dataset(cfg.data.train)
    train_network(
        model,
        train_dataset,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger)
def test_tem():
    """Run the BSN TEM (built from an inline config) through train/test forwards."""
    model_cfg = dict(
        type='TEM',
        temporal_dim=100,
        boundary_ratio=0.1,
        tem_feat_dim=400,
        tem_hidden_dim=512,
        tem_match_threshold=0.5)
    tem = build_localizer(model_cfg)
    feats = torch.rand(8, 400, 100)
    boxes = torch.Tensor([[[1.0, 3.0], [3.0, 5.0]]] * 8)
    loss_dict = tem(feats, boxes)
    assert isinstance(loss_dict, dict)

    # Test forward test: feed one clip at a time in inference mode.
    meta = [{'video_name': 'v_test'}]
    with torch.no_grad():
        for single_feat in feats:
            single_feat = single_feat.reshape(1, 400, 100)
            tem(single_feat, video_meta=meta, return_loss=False)
def test_pem():
    """Run the BSN PEM (built from its config file) through train/test forwards."""
    model_cfg = get_localizer_cfg(
        'bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py')
    pem = build_localizer(model_cfg.model)
    feats = torch.rand(8, 100, 32)
    ref_iou = torch.rand(8, 100)
    loss_dict = pem(feats, ref_iou)
    assert isinstance(loss_dict, dict)

    # Test forward test: inference needs proposal boundaries, their scores
    # and per-video meta information.
    tmin = torch.rand(100)
    tmax = torch.rand(100)
    tmin_score = torch.rand(100)
    tmax_score = torch.rand(100)
    meta = [
        dict(
            video_name='v_test',
            duration_second=100,
            duration_frame=1000,
            annotations=[{
                'segment': [0.3, 0.6],
                'label': 'Rock climbing'
            }],
            feature_frame=900)
    ]
    with torch.no_grad():
        for single_feat in feats:
            single_feat = single_feat.reshape(1, 100, 32)
            pem(
                single_feat,
                tmin=tmin,
                tmax=tmax,
                tmin_score=tmin_score,
                tmax_score=tmax_score,
                video_meta=meta,
                return_loss=False)
def test_ssn_test():
    """Forward SSN localizers in test mode under several config variants.

    Covers: the base config, max spatial pooling, no regression branch, a
    tuple-valued stpp stage, and a bad (string) stpp stage that must raise.
    """
    test_cfg = mmcv.ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(test_interval=6, batch_size=16),
                evaluater=dict(
                    top_k=2000,
                    nms=0.2,
                    softmax_before_filter=True,
                    cls_score_dict=None,
                    cls_top_k=2))))
    base_model_cfg = dict(
        type='SSN',
        backbone=dict(
            type='ResNet', pretrained=None, depth=18, norm_eval=True),
        spatial_type='avg',
        dropout_ratio=0.8,
        cls_head=dict(
            type='SSNHead',
            dropout_ratio=0.,
            in_channels=512,
            num_classes=20,
            consensus=dict(type='STPPTest', stpp_stage=(1, 1, 1)),
            use_regression=True),
        test_cfg=test_cfg)
    # Variants derived from the base config.
    maxpool_model_cfg = copy.deepcopy(base_model_cfg)
    maxpool_model_cfg['spatial_type'] = 'max'
    non_regression_cfg = copy.deepcopy(base_model_cfg)
    non_regression_cfg['cls_head']['use_regression'] = False
    non_regression_cfg['cls_head']['consensus']['use_regression'] = False
    tuple_stage_cfg = copy.deepcopy(base_model_cfg)
    tuple_stage_cfg['cls_head']['consensus']['stpp_stage'] = (1, (1, 2), 1)
    str_stage_cfg = copy.deepcopy(base_model_cfg)
    str_stage_cfg['cls_head']['consensus']['stpp_stage'] = ('error', )

    # Shared dummy inputs for the test-mode forward calls.
    imgs = torch.rand(1, 8, 3, 224, 224)
    relative_proposal_list = torch.Tensor([[[0.2500, 0.6250],
                                            [0.3750, 0.7500]]])
    scale_factor_list = torch.Tensor([[[1.0000, 1.0000], [1.0000, 0.2661]]])
    proposal_tick_list = torch.LongTensor([[[1, 2, 5, 7], [20, 30, 60, 80]]])
    reg_norm_consts = torch.Tensor([[[-0.0603, 0.0325], [0.0752, 0.1596]]])

    localizer_ssn = build_localizer(base_model_cfg)
    localizer_ssn_maxpool = build_localizer(maxpool_model_cfg)
    localizer_ssn_non_regression = build_localizer(non_regression_cfg)
    localizer_ssn_tuple_stage_cfg = build_localizer(tuple_stage_cfg)
    # A string stpp stage is invalid and must be rejected at build time.
    with pytest.raises(ValueError):
        build_localizer(str_stage_cfg)

    if torch.cuda.is_available():
        localizer_ssn = localizer_ssn.cuda()
        localizer_ssn_maxpool = localizer_ssn_maxpool.cuda()
        localizer_ssn_non_regression = localizer_ssn_non_regression.cuda()
        localizer_ssn_tuple_stage_cfg = localizer_ssn_tuple_stage_cfg.cuda()
        imgs = imgs.cuda()
        relative_proposal_list = relative_proposal_list.cuda()
        scale_factor_list = scale_factor_list.cuda()
        proposal_tick_list = proposal_tick_list.cuda()
        reg_norm_consts = reg_norm_consts.cuda()

    with torch.no_grad():
        # Test normal case
        localizer_ssn(
            imgs,
            relative_proposal_list=relative_proposal_list,
            scale_factor_list=scale_factor_list,
            proposal_tick_list=proposal_tick_list,
            reg_norm_consts=reg_norm_consts,
            return_loss=False)

        # Test SSN model with max spatial pooling
        localizer_ssn_maxpool(
            imgs,
            relative_proposal_list=relative_proposal_list,
            scale_factor_list=scale_factor_list,
            proposal_tick_list=proposal_tick_list,
            reg_norm_consts=reg_norm_consts,
            return_loss=False)

        # Test SSN model without regression
        localizer_ssn_non_regression(
            imgs,
            relative_proposal_list=relative_proposal_list,
            scale_factor_list=scale_factor_list,
            proposal_tick_list=proposal_tick_list,
            reg_norm_consts=reg_norm_consts,
            return_loss=False)

        # Test SSN model with tuple stage cfg.
        localizer_ssn_tuple_stage_cfg(
            imgs,
            relative_proposal_list=relative_proposal_list,
            scale_factor_list=scale_factor_list,
            proposal_tick_list=proposal_tick_list,
            reg_norm_consts=reg_norm_consts,
            return_loss=False)
def test_ssn_train():
    """Forward SSN localizers in train mode and check losses are returned.

    Covers: the base config, a variant moving dropout from the model to the
    head, and a variant without the regression branch.
    """
    train_cfg = mmcv.ConfigDict(
        dict(
            ssn=dict(
                assigner=dict(
                    positive_iou_threshold=0.7,
                    background_iou_threshold=0.01,
                    incomplete_iou_threshold=0.3,
                    background_coverage_threshold=0.02,
                    incomplete_overlap_threshold=0.01),
                sampler=dict(
                    num_per_video=8,
                    positive_ratio=1,
                    background_ratio=1,
                    incomplete_ratio=6,
                    add_gt_as_proposals=True),
                loss_weight=dict(comp_loss_weight=0.1, reg_loss_weight=0.1),
                debug=False)))
    base_model_cfg = dict(
        type='SSN',
        backbone=dict(
            type='ResNet', pretrained=None, depth=18, norm_eval=True),
        spatial_type='avg',
        dropout_ratio=0.8,
        loss_cls=dict(type='SSNLoss'),
        cls_head=dict(
            type='SSNHead',
            dropout_ratio=0.,
            in_channels=512,
            num_classes=20,
            consensus=dict(
                type='STPPTrain',
                stpp_stage=(1, 1, 1),
                num_segments_list=(2, 5, 2)),
            use_regression=True),
        train_cfg=train_cfg)
    # Variants derived from the base config.
    dropout_cfg = copy.deepcopy(base_model_cfg)
    dropout_cfg['dropout_ratio'] = 0
    dropout_cfg['cls_head']['dropout_ratio'] = 0.5
    non_regression_cfg = copy.deepcopy(base_model_cfg)
    non_regression_cfg['cls_head']['use_regression'] = False

    # Shared dummy training inputs.
    imgs = torch.rand(1, 8, 9, 3, 224, 224)
    proposal_scale_factor = torch.Tensor([[[1.0345, 1.0345], [1.0028, 0.0028],
                                           [1.0013, 1.0013], [1.0008, 1.0008],
                                           [0.3357, 1.0006], [1.0006, 1.0006],
                                           [0.0818, 1.0005], [1.0030,
                                                              1.0030]]])
    proposal_type = torch.Tensor([[0, 1, 1, 1, 1, 1, 1, 2]])
    proposal_labels = torch.LongTensor([[8, 8, 8, 8, 8, 8, 8, 0]])
    reg_targets = torch.Tensor([[[0.2929, 0.2694], [0.0000, 0.0000],
                                 [0.0000, 0.0000], [0.0000, 0.0000],
                                 [0.0000, 0.0000], [0.0000, 0.0000],
                                 [0.0000, 0.0000], [0.0000, 0.0000]]])

    localizer_ssn = build_localizer(base_model_cfg)
    localizer_ssn_dropout = build_localizer(dropout_cfg)
    localizer_ssn_non_regression = build_localizer(non_regression_cfg)

    if torch.cuda.is_available():
        localizer_ssn = localizer_ssn.cuda()
        localizer_ssn_dropout = localizer_ssn_dropout.cuda()
        localizer_ssn_non_regression = localizer_ssn_non_regression.cuda()
        imgs = imgs.cuda()
        proposal_scale_factor = proposal_scale_factor.cuda()
        proposal_type = proposal_type.cuda()
        proposal_labels = proposal_labels.cuda()
        reg_targets = reg_targets.cuda()

    # Train normal case
    losses = localizer_ssn(
        imgs,
        proposal_scale_factor=proposal_scale_factor,
        proposal_type=proposal_type,
        proposal_labels=proposal_labels,
        reg_targets=reg_targets)
    assert isinstance(losses, dict)

    # Train SSN without dropout in model, with dropout in head
    losses = localizer_ssn_dropout(
        imgs,
        proposal_scale_factor=proposal_scale_factor,
        proposal_type=proposal_type,
        proposal_labels=proposal_labels,
        reg_targets=reg_targets)
    assert isinstance(losses, dict)

    # Train SSN model without regression
    losses = localizer_ssn_non_regression(
        imgs,
        proposal_scale_factor=proposal_scale_factor,
        proposal_type=proposal_type,
        proposal_labels=proposal_labels,
        reg_targets=reg_targets)
    assert isinstance(losses, dict)
def main():
    """Entry point of the SSN testing/evaluation script.

    Loads the config, rewrites the segmental consensus for reorganized
    STPP inference, runs (single- or multi-GPU) testing, optionally dumps
    raw outputs, then performs regression, NMS and mAP evaluation.
    """
    args = parse_args()

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.data.test.test_mode = True

    # reorganize stpp: replace the training-time consensus with the
    # inference-time "STPPReorganized" layout and compute its feature dims.
    num_classes = (cfg.model.cls_head.num_classes -
                   1 if cfg.model.cls_head.with_bg else
                   cfg.model.cls_head.num_classes)
    stpp_feat_multiplier = 0
    for stpp_subcfg in cfg.model.segmental_consensus.stpp_cfg:
        _, mult = parse_stage_config(stpp_subcfg)
        stpp_feat_multiplier += mult
    cfg.model.segmental_consensus = dict(
        type="STPPReorganized",
        standalong_classifier=cfg.model.segmental_consensus.
        standalong_classifier,
        feat_dim=num_classes + 1 + num_classes * 3 * stpp_feat_multiplier,
        act_score_len=num_classes + 1,
        comp_score_len=num_classes,
        reg_score_len=num_classes * 2,
        stpp_cfg=cfg.model.segmental_consensus.stpp_cfg)

    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
    if args.gpus == 1:
        # Single-GPU path: build, load weights, wrap and run sequentially.
        model = build_localizer(
            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
        load_checkpoint(model, args.checkpoint, strict=True)
        model = MMDataParallel(model, device_ids=[0])

        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            num_gpus=1,
            dist=False,
            shuffle=False)
        outputs = single_test(model, data_loader)
    else:
        # Multi-GPU path: parallel_test builds one model per GPU itself.
        model_args = cfg.model.copy()
        model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
        model_type = getattr(localizers, model_args.pop('type'))
        outputs = parallel_test(
            model_type,
            model_args,
            args.checkpoint,
            dataset,
            _data_func,
            range(args.gpus),
            workers_per_gpu=args.proc_per_gpu)

    if args.out:
        print('writing results to {}'.format(args.out))
        mmcv.dump(outputs, args.out)

    eval_type = args.eval
    if eval_type:
        print('Starting evaluate {}'.format(eval_type))

        detections = results2det(dataset, outputs,
                                 **cfg.test_cfg.ssn.evaluater)

        if not args.no_regression:
            print("Performing location regression")
            for cls in range(len(detections)):
                detections[cls] = {
                    k: perform_regression(v)
                    for k, v in detections[cls].items()
                }
            print("Regression finished")

        print("Performing NMS")
        for cls in range(len(detections)):
            detections[cls] = {
                k: temporal_nms(v, cfg.test_cfg.ssn.evaluater.nms)
                for k, v in detections[cls].items()
            }
        print("NMS finished")

        # IoU thresholds differ per benchmark convention.
        if eval_type == 'activitynet':
            iou_range = np.arange(0.5, 1.0, 0.05)
        elif eval_type == 'thumos14':
            iou_range = np.arange(0.1, 1.0, .1)

        # get gt
        all_gt = pd.DataFrame(
            dataset.get_all_gt(),
            columns=['video-id', 'cls', 't-start', 't-end'])
        gt_by_cls = [
            all_gt[all_gt.cls == cls].reset_index(drop=True).drop('cls', 1)
            for cls in range(len(detections))
        ]
        plain_detections = [
            det2df(detections, cls) for cls in range(len(detections))
        ]

        ap_values = eval_ap_parallel(plain_detections, gt_by_cls, iou_range)
        map_iou = ap_values.mean(axis=0)
        print("Evaluation finished")

        # display: render per-threshold mean AP as an ASCII table.
        display_title = 'Temporal detection performance ({})'.format(args.eval)
        display_data = [['IoU thresh'], ['mean AP']]

        for i in range(len(iou_range)):
            display_data[0].append('{:.02f}'.format(iou_range[i]))
            display_data[1].append('{:.04f}'.format(map_iou[i]))
        table = AsciiTable(display_data, display_title)
        table.justify_columns[-1] = 'right'
        table.inner_footing_row_border = True
        print(table.table)