def test_compose():
    with pytest.raises(TypeError):
        # transform must be callable or a dict
        Compose('LoadImage')

    target_keys = ['img', 'img_metas']

    # test Compose given a data pipeline
    img = np.random.randn(256, 256, 3)
    results = dict(img=img, abandoned_key=None, img_name='test_image.png')
    test_pipeline = [
        dict(type='Collect', keys=['img'], meta_keys=['img_name']),
        dict(type='ImageToTensor', keys=['img'])
    ]
    compose = Compose(test_pipeline)
    compose_results = compose(results)
    assert check_keys_equal(compose_results.keys(), target_keys)
    assert check_keys_equal(compose_results['img_metas'].data.keys(),
                            ['img_name'])

    # test Compose when forward data is None
    results = None
    image_to_tensor = ImageToTensor(keys=[])
    test_pipeline = [image_to_tensor]
    compose = Compose(test_pipeline)
    compose_results = compose(results)
    assert compose_results is None

    assert repr(compose) == compose.__class__.__name__ + \
        f'(\n    {image_to_tensor}\n)'
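# For reference, a minimal sketch (an illustration, not the actual
# mmaction.datasets.pipelines.Compose) of the behaviour the test above relies
# on: dict steps are built from a registry, anything that is neither a dict
# nor a callable raises TypeError, and a transform returning None
# short-circuits the pipeline, which is why compose(None) is None.
from collections.abc import Sequence


class SimpleCompose:

    def __init__(self, transforms, registry=None):
        assert isinstance(transforms, Sequence)
        self.transforms = []
        for transform in transforms:
            if isinstance(transform, dict):
                # the real Compose calls build_from_cfg(transform, PIPELINES);
                # `registry` here is a plain {type_name: class} stand-in
                cfg = dict(transform)
                self.transforms.append(registry[cfg.pop('type')](**cfg))
            elif callable(transform):
                self.transforms.append(transform)
            else:
                raise TypeError('transform must be callable or a dict, '
                                f'but got {type(transform)}')

    def __call__(self, data):
        for t in self.transforms:
            data = t(data)
            if data is None:
                return None
        return data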
Example 2
def main():
    args = parse_args()

    args.device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=args.device)
    data = dict(img_shape=None, modality='RGB', label=-1)
    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.data.test.pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0
    args.sample_length = sample_length
    args.test_pipeline = test_pipeline

    show_results(model, data, label, args)
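# Illustrative sketch (an assumption mirroring what show_results() does in
# these long-video/webcam demos, not code taken from this example) of how the
# prepared pipeline and the partially filled `data` dict are consumed: gather
# `sample_length` frames, complete the dict, run the pipeline and forward the
# recognizer.
import torch
from mmcv.parallel import collate, scatter


def run_on_frames(model, data, test_pipeline, frames, device):
    cur_data = data.copy()
    cur_data['imgs'] = list(frames)  # sample_length HxWx3 arrays
    if cur_data['img_shape'] is None:
        cur_data['img_shape'] = frames[0].shape[:2]

    cur_data = test_pipeline(cur_data)
    cur_data = collate([cur_data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        cur_data = scatter(cur_data, [device])[0]

    with torch.no_grad():
        return model(return_loss=False, **cur_data)[0]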
Example 3
def main():
    global frame_queue, threshold, sample_length, data, test_pipeline, model, \
        out_file, video_path, device, input_step, label, result_queue

    args = parse_args()
    input_step = args.input_step
    threshold = args.threshold
    video_path = args.video
    out_file = args.out_file

    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    data = dict(img_shape=None, modality='RGB', label=-1)
    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    assert sample_length > 0
    frame_queue = deque(maxlen=sample_length)
    result_queue = deque(maxlen=1)
    show_results()
Example 4
def main():
    global label, device, model, test_pipeline, \
        camera, sample_length, average_size, threshold

    args = parse_args()
    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)

    sample_length = args.sample_length
    average_size = args.average_size
    threshold = args.threshold

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            # Remove step to sample frames
            if sample_length == 0:
                sample_length = step['clip_len'] * step['num_clips']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    print('Press "Esc", "q" or "Q" to exit')
    predict_webcam_video()
Example 5
def build_inputs(model, video_path, use_frames=False):
    """build inputs for GradCAM.

    Note that, building inputs for GradCAM is exactly the same as building
    inputs for Recognizer test stage. Codes from `inference_recognizer`.

    Args:
        model (nn.Module): Recognizer model.
        video_path (str): video file/url or rawframes directory.
        use_frames (bool): whether to use rawframes as input.
    Returns:
        dict: Both GradCAM inputs and Recognizer test stage inputs,
            including two keys, ``imgs`` and ``label``.
    """
    if not (osp.exists(video_path) or video_path.startswith('http')):
        raise RuntimeError(f"'{video_path}' is missing")

    if osp.isfile(video_path) and use_frames:
        raise RuntimeError(
            f"'{video_path}' is a video file, not a rawframe directory")
    elif osp.isdir(video_path) and not use_frames:
        raise RuntimeError(
            f"'{video_path}' is a rawframe directory, not a video file")

    cfg = model.cfg
    device = next(model.parameters()).device  # model device

    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    test_pipeline = Compose(test_pipeline)
    # prepare data
    if use_frames:
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)
        data = dict(
            frame_dir=video_path,
            total_frames=len(os.listdir(video_path)),
            # assuming files in ``video_path`` are all named with ``filename_tmpl``  # noqa: E501
            label=-1,
            start_index=start_index,
            filename_tmpl=filename_tmpl,
            modality=modality)
    else:
        start_index = cfg.data.test.get('start_index', 0)
        data = dict(
            filename=video_path,
            label=-1,
            start_index=start_index,
            modality='RGB')
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]

    return data
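# A small usage sketch for build_inputs; the config, checkpoint and video
# paths below are placeholders, not values from this snippet.
def demo_build_inputs():
    from mmaction.apis import init_recognizer

    model = init_recognizer('configs/recognition/tsn/tsn_r50_video.py',
                            'checkpoints/tsn_r50.pth', device='cuda:0')
    inputs = build_inputs(model, 'demo/demo.mp4', use_frames=False)
    # `inputs` carries the same keys the recognizer test stage expects,
    # typically a batched `imgs` tensor plus `label`
    print(sorted(inputs.keys()))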
Example 6
def main():
    global frame_queue, camera, frame, results, threshold, sample_length, \
        data, test_pipeline, model, device, average_size, label, \
        result_queue, drawing_fps, inference_fps

    args = parse_args()
    average_size = args.average_size
    threshold = args.threshold
    drawing_fps = args.drawing_fps
    inference_fps = args.inference_fps

    device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)
    data = dict(img_shape=None, modality='RGB', label=-1)

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.data.test.pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    try:
        frame_queue = deque(maxlen=sample_length)
        result_queue = deque(maxlen=1)
        pw = Thread(target=show_results, args=(), daemon=True)
        pr = Thread(target=inference, args=(), daemon=True)
        pw.start()
        pr.start()
        pw.join()
    except KeyboardInterrupt:
        pass
Example 7
def main():
    global frame_queue, camera, frame, results, threshold, sample_length, \
        data, test_pipeline, model, device, average_size, label, result_queue

    args = parse_args()
    average_size = args.average_size
    threshold = args.threshold

    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)

    camera = cv2.VideoCapture(args.camera_id)
    #camera = cv2.VideoCapture('/home/ww/tools/image/office/2020-12-10_14-54-03.mp4')

    camera.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    camera.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    data = dict(img_shape=None, modality='RGB', label=-1)

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    try:
        frame_queue = deque(maxlen=sample_length)
        result_queue = deque(maxlen=1)
        pw = Thread(target=show_results, args=(), daemon=True)
        pr = Thread(target=inference, args=(), daemon=True)
        pw.start()
        pr.start()
        pw.join()
    except KeyboardInterrupt:
        pass
Example 8
def skeleton_based_action_recognition(args, pose_results, num_frame, h, w):
    fake_anno = dict(frame_dict='',
                     label=-1,
                     img_shape=(h, w),
                     origin_shape=(h, w),
                     start_index=0,
                     modality='Pose',
                     total_frames=num_frame)
    num_person = max([len(x) for x in pose_results])

    num_keypoint = 17
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for i, poses in enumerate(pose_results):
        for j, pose in enumerate(poses):
            pose = pose['keypoints']
            keypoint[j, i] = pose[:, :2]
            keypoint_score[j, i] = pose[:, 2]

    fake_anno['keypoint'] = keypoint
    fake_anno['keypoint_score'] = keypoint_score

    label_map = [x.strip() for x in open(args.label_map).readlines()]
    num_class = len(label_map)

    skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
    skeleton_config.model.cls_head.num_classes = num_class  # for K400 dataset
    skeleton_pipeline = Compose(skeleton_config.test_pipeline)
    skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
    skeleton_imgs = skeleton_imgs.to(args.device)

    # Build skeleton-based recognition model
    skeleton_model = build_model(skeleton_config.model)
    load_checkpoint(skeleton_model,
                    args.skeleton_checkpoint,
                    map_location='cpu')
    skeleton_model.to(args.device)
    skeleton_model.eval()

    with torch.no_grad():
        output = skeleton_model(return_loss=False, imgs=skeleton_imgs)

    action_idx = np.argmax(output)
    skeleton_action_result = label_map[
        action_idx]  # skeleton-based action result for the whole video
    return skeleton_action_result
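# Sketch of the pose_results layout this function assumes (an illustration;
# the shapes match a typical top-down pose estimator output): one list per
# frame, each entry a dict with a (17, 3) `keypoints` array of (x, y, score)
# and, for the spatio-temporal detection variant further below, a `bbox`.
import numpy as np


def fake_pose_results(num_frame, num_person=1, h=480, w=640):
    results = []
    for _ in range(num_frame):
        persons = []
        for _ in range(num_person):
            kpts = np.stack([np.random.rand(17) * w,
                             np.random.rand(17) * h,
                             np.random.rand(17)], axis=-1).astype(np.float32)
            persons.append(dict(keypoints=kpts,
                                bbox=np.array([0., 0., w, h, 1.])))
        results.append(persons)
    return results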
Example 9
def preprocess(video_path, cfg):
    test_pipeline = cfg.data.test.pipeline
    test_pipeline = Compose(test_pipeline)
    # prepare data
    start_index = cfg.data.test.get('start_index', 0)
    data = dict(filename=video_path,
                label=-1,
                start_index=start_index,
                modality='RGB')
    data = test_pipeline(data)
    #data = collate([data], samples_per_gpu=1)
    #if next(model.parameters()).is_cuda:
    # scatter to specified GPU
    #    data = scatter(data, [device])[0]

    return data['imgs']
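# A hedged sketch of the batched variant that the commented-out lines above
# hint at: collate the sample and, when the model lives on a GPU, scatter it
# there before forwarding. `model` is assumed to be an already initialized
# recognizer.
def preprocess_batched(video_path, cfg, model):
    from mmcv.parallel import collate, scatter

    data = dict(filename=video_path,
                label=-1,
                start_index=cfg.data.test.get('start_index', 0),
                modality='RGB')
    data = Compose(cfg.data.test.pipeline)(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        device = next(model.parameters()).device
        data = scatter(data, [device])[0]
    return data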
Example 10
def test_compose_support_torchvision():
    target_keys = ['imgs', 'img_metas']

    # test Compose given a data pipeline
    imgs = [np.random.randn(256, 256, 3)] * 8
    results = dict(imgs=imgs,
                   abandoned_key=None,
                   img_name='test_image.png',
                   clip_len=8,
                   num_clips=1)
    test_pipeline = [
        dict(type='torchvision.Grayscale', num_output_channels=3),
        dict(type='FormatShape', input_format='NCTHW'),
        dict(type='Collect', keys=['imgs'], meta_keys=['img_name']),
        dict(type='ToTensor', keys=['imgs'])
    ]
    compose = Compose(test_pipeline)
    compose_results = compose(results)
    assert assert_keys_equal(compose_results.keys(), target_keys)
    assert assert_keys_equal(compose_results['img_metas'].data.keys(),
                             ['img_name'])
Example 11
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video,
                                                    args.short_side)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)

    test_pipeline = Compose(config.data.test.pipeline)

    # Load label_map
    label_map = [x.strip() for x in open(args.label_map).readlines()]

    # Get Human detection results
    det_results = detection_inference(args, frame_paths)
    torch.cuda.empty_cache()

    pose_results = pose_inference(args, frame_paths, det_results)
    torch.cuda.empty_cache()

    fake_anno = dict(frame_dir='',
                     label=-1,
                     img_shape=(h, w),
                     original_shape=(h, w),
                     start_index=0,
                     modality='Pose',
                     total_frames=num_frame)
    num_person = max([len(x) for x in pose_results])
    # Current PoseC3D models are trained on COCO-keypoints (17 keypoints)
    num_keypoint = 17
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for i, poses in enumerate(pose_results):
        for j, pose in enumerate(poses):
            pose = pose['keypoints']
            keypoint[j, i] = pose[:, :2]
            keypoint_score[j, i] = pose[:, 2]
    fake_anno['keypoint'] = keypoint
    fake_anno['keypoint_score'] = keypoint_score

    imgs = test_pipeline(fake_anno)['imgs'][None]
    imgs = imgs.to(args.device)

    model = build_model(config.model)
    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    with torch.no_grad():
        output = model(return_loss=False, imgs=imgs)

    action_idx = np.argmax(output)
    action_label = label_map[action_idx]

    pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
                                 args.device)
    vis_frames = [
        vis_pose_result(pose_model, frame_paths[i], pose_results[i])
        for i in range(num_frame)
    ]
    for frame in vis_frames:
        cv2.putText(frame, action_label, (10, 30), FONTFACE, FONTSCALE,
                    FONTCOLOR, THICKNESS, LINETYPE)

    cv2.imwrite('frame.jpg', vis_frames[0])
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24)
    vid.write_videofile(args.out_filename, remove_temp=True)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)
Example 12
def main():
    parser = ArgumentParser()
    parser.add_argument('--config', '-c', type=str, required=True)
    parser.add_argument('--checkpoint', '-w', type=str, required=True)
    parser.add_argument('--dataset_name', '-n', type=str, required=True)
    parser.add_argument('--data_dir', '-d', type=str, required=True)
    parser.add_argument('--predictions', '-p', type=str, required=True)
    parser.add_argument('--movements', '-m', type=str, required=True)
    parser.add_argument('--keypoints', '-k', type=str, required=True)
    parser.add_argument('--out_annotation', '-o', type=str, required=True)
    args = parser.parse_args()

    assert exists(args.config)
    assert exists(args.checkpoint)
    assert exists(args.data_dir)
    assert exists(args.predictions)
    assert exists(args.movements)
    assert exists(args.keypoints)
    assert args.dataset_name is not None and args.dataset_name != ''
    assert args.out_annotation is not None and args.out_annotation != ''

    cfg = Config.fromfile(args.config)
    cfg = update_config(cfg, args, trg_name=args.dataset_name)
    cfg = propagate_root_dir(cfg, args.data_dir)

    dataset = build_dataset(cfg.data, 'train', dict(test_mode=True))
    data_pipeline = Compose(dataset.pipeline.transforms[1:])
    print('train dataset:\n' + str(dataset))

    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    load_checkpoint(model, args.checkpoint, strict=False)
    model = MMDataParallel(model, device_ids=[0])
    model.eval()

    annotation_path = join(args.data_dir, cfg.data.train.sources[0],
                           cfg.data.train.ann_file)
    records = load_annotation(annotation_path)
    predictions = load_distributed_data(args.predictions,
                                        parse_predictions_file, 'txt')
    movements = load_distributed_data(args.movements, parse_movements_file,
                                      'txt')
    hand_kpts = load_distributed_data(args.keypoints, parse_kpts_file, 'json')
    print('Loaded records: {}'.format(len(records)))

    invalid_stat = dict()
    all_candidates = []

    ignore_candidates = get_ignore_candidates(records, IGNORE_LABELS)
    all_candidates += ignore_candidates

    static_candidates, static_invalids = get_regular_candidates(
        records,
        predictions,
        movements,
        hand_kpts,
        cfg.data.output.length,
        False,
        STATIC_LABELS,
        NEGATIVE_LABEL,
        NO_MOTION_LABEL,
        min_score=0.9,
        min_length=4,
        max_distance=1)
    all_candidates += static_candidates
    invalid_stat = update_stat(invalid_stat, static_invalids)
    print('Static candidates: {}'.format(len(static_candidates)))

    if len(invalid_stat) > 0:
        print('Ignored records after static analysis:')
        for ignore_label, ignore_values in invalid_stat.items():
            print('   - {}: {}'.format(ignore_label.replace('_', ' '),
                                       len(ignore_values)))

    dynamic_candidates, dynamic_invalids = get_regular_candidates(
        records,
        predictions,
        movements,
        hand_kpts,
        cfg.data.output.length,
        True,
        DYNAMIC_LABELS,
        NEGATIVE_LABEL,
        NO_MOTION_LABEL,
        min_score=0.9,
        min_length=4,
        max_distance=1)
    all_candidates += dynamic_candidates
    invalid_stat = update_stat(invalid_stat, dynamic_invalids)
    print('Dynamic candidates: {}'.format(len(dynamic_candidates)))

    if len(invalid_stat) > 0:
        print('Ignored records after dynamic analysis:')
        for ignore_label, ignore_values in invalid_stat.items():
            print('   - {}: {}'.format(ignore_label.replace('_', ' '),
                                       len(ignore_values)))

    fixed_records, fix_stat = find_best_match(all_candidates, model, dataset,
                                              NEGATIVE_LABEL)
    invalid_stat = update_stat(invalid_stat, fix_stat)
    print('Final records: {}'.format(len(fixed_records)))

    if len(invalid_stat) > 0:
        print('Final ignored records:')
        for ignore_label, ignore_values in invalid_stat.items():
            print('   - {}: {}'.format(ignore_label.replace('_', ' '),
                                       len(ignore_values)))
            for ignored_record in ignore_values:
                print('      - {}'.format(ignored_record.path))

    dump_records(fixed_records, args.out_annotation)
    print('Fixed annotation has been stored at: {}'.format(
        args.out_annotation))
Example 13
def main():
    args = parse_args()

    device = torch.device(args.device)
    use_frames = args.use_frames == "True"
    model = init_recognizer(args.config, device=device, use_frames=use_frames)

    # Target FPGA Zynq UltraScale+ MPSoC ZCU104. Assuming clock frequency of 100 MHz.
    # The actual BRAM size is 11 Mbits (1.375 MBytes). This divided by the 18 Kbits size of each BRAM gives a total of 624 BRAM units.
    # The ZCU104 has also 27 Mbits (3.375 MBytes) of URAM. This divided by the 288 Kbits size of each URAM gives a total of 96 URAM units.
    # The ZCU104 has 20 GTH gigabit transceivers (16.3 Gb/s or 2.03 GB/s) on the PL-size
    feature_maps = ModelFeatureMaps(model=model,
                                    word_length=16,
                                    clock_freq=100,
                                    bram=624,
                                    dsp=1728)
    feature_maps.get_inter_feature_maps()

    random_img = np.random.randn(args.imshape[0], args.imshape[1],
                                 args.imshape[2])

    data = dict(img_shape=None, modality="RGB", label=-1)

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if "SampleFrames" in step["type"]:
            step["num_clips"] = 1
            sample_length = step["clip_len"] * step["num_clips"]
            data["num_clips"] = step["num_clips"]
            data["clip_len"] = step["clip_len"]
            pipeline_.remove(step)
        if step["type"] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    print(test_pipeline)
    assert sample_length > 0

    data_in = []
    for _ in range(data["clip_len"]):
        data_in.append(random_img)

    data["imgs"] = data_in
    if data["img_shape"] is None:
        data["img_shape"] = random_img.shape[:2]

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        data = scatter(data, [device])[0]

    with torch.no_grad():
        scores = model(return_loss=False, **data)[0]

    feature_maps.get_info()

    feature_maps.get_conv_layers(file_name=args.model_name)
Example 14
def main():
    args = parse_args()
    args.is_rgb = args.modality == 'RGB'
    args.clip_len = 1 if args.is_rgb else 5
    args.input_format = 'NCHW' if args.is_rgb else 'NCHW_Flow'
    rgb_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_bgr=False)
    flow_norm_cfg = dict(mean=[128, 128], std=[128, 128])
    args.img_norm_cfg = rgb_norm_cfg if args.is_rgb else flow_norm_cfg
    args.f_tmpl = 'img_{:05d}.jpg' if args.is_rgb else 'flow_{}_{:05d}.jpg'
    args.in_channels = args.clip_len * (3 if args.is_rgb else 2)
    # max batch_size for one forward
    args.batch_size = 200

    # define the data pipeline for Untrimmed Videos
    data_pipeline = [
        dict(type='UntrimmedSampleFrames',
             clip_len=args.clip_len,
             frame_interval=args.frame_interval,
             start_index=0),
        dict(type='FrameSelector'),
        dict(type='Resize', scale=(-1, 256)),
        dict(type='CenterCrop', crop_size=256),
        dict(type='Normalize', **args.img_norm_cfg),
        dict(type='FormatShape', input_format=args.input_format),
        dict(type='Collect', keys=['imgs'], meta_keys=[]),
        dict(type='ToTensor', keys=['imgs'])
    ]
    data_pipeline = Compose(data_pipeline)
    # define TSN R50 model, the model is used as the feature extractor
    model_cfg = dict(type='Recognizer2D',
                     backbone=dict(type='ResNet',
                                   depth=50,
                                   in_channels=args.in_channels,
                                   norm_eval=False),
                     cls_head=dict(type='TSNHead',
                                   num_classes=400,
                                   in_channels=2048,
                                   spatial_type='avg',
                                   consensus=dict(type='AvgConsensus', dim=1)),
                     test_cfg=dict(average_clips=None))
    model = build_model(model_cfg)
    # load pretrained weight into the feature extractor
    state_dict = torch.load(args.ckpt)['state_dict']
    model.load_state_dict(state_dict)
    model = model.cuda()
    model.eval()

    data = open(args.data_list).readlines()
    data = [x.strip() for x in data]
    data = data[args.part::args.total]

    # enumerate Untrimmed videos, extract feature from each of them
    prog_bar = mmcv.ProgressBar(len(data))
    if not osp.exists(args.output_prefix):
        os.system(f'mkdir -p {args.output_prefix}')

    for item in data:
        frame_dir, length, label = item.split()
        output_file = osp.basename(frame_dir) + '.pkl'
        frame_dir = osp.join(args.data_prefix, frame_dir)
        output_file = osp.join(args.output_prefix, output_file)
        assert output_file.endswith('.pkl')
        length = int(length)

        # prepare a pseudo sample
        tmpl = dict(frame_dir=frame_dir,
                    total_frames=length,
                    filename_tmpl=args.f_tmpl,
                    start_index=0,
                    modality=args.modality)
        sample = data_pipeline(tmpl)
        imgs = sample['imgs']
        shape = imgs.shape
        # the original shape is N_seg * C * H * W; reshape it to
        # N_seg * 1 * C * H * W so that the network returns a feature for
        # each segment (no score averaging across segments)
        imgs = imgs.reshape((shape[0], 1) + shape[1:])
        imgs = imgs.cuda()

        def forward_data(model, data):
            # chop large data into pieces and extract feature from them
            results = []
            start_idx = 0
            num_clip = data.shape[0]
            while start_idx < num_clip:
                with torch.no_grad():
                    part = data[start_idx:start_idx + args.batch_size]
                    feat = model.forward(part, return_loss=False)
                    results.append(feat)
                    start_idx += args.batch_size
            return np.concatenate(results)

        feat = forward_data(model, imgs)
        with open(output_file, 'wb') as fout:
            pickle.dump(feat, fout)
        prog_bar.update()
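# Illustrative helpers (assumptions inferred from item.split() and the pickle
# dump above): each line of args.data_list is expected to be whitespace
# separated as "<frame_dir> <total_frames> <label>", and the dumped features
# can be read back with pickle.
def parse_data_list_line(line):
    frame_dir, length, label = line.strip().split()
    return frame_dir, int(length), int(label)


def load_dumped_feature(pkl_path):
    import pickle
    with open(pkl_path, 'rb') as fin:
        return pickle.load(fin)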
Example 15
def inference_recognizer(model,
                         video_path,
                         label_path,
                         use_frames=False,
                         outputs=None,
                         as_tensor=True):
    """Inference a video with the detector.

    Args:
        model (nn.Module): The loaded recognizer.
        video_path (str): The video file path/url or the rawframes directory
            path. If ``use_frames`` is set to True, it should be rawframes
            directory path. Otherwise, it should be video file path.
        label_path (str): The label file path.
        use_frames (bool): Whether to use rawframes as input. Default: False.
        outputs (list(str) | tuple(str) | str | None) : Names of layers whose
            outputs need to be returned, default: None.
        as_tensor (bool): Same as that in ``OutputHook``. Default: True.

    Returns:
        list[tuple(str, float)]: Top-5 recognition results as (label, score)
            tuples.
        dict[torch.Tensor | np.ndarray]: Output feature maps from layers
            specified in ``outputs``, returned only when ``outputs`` is given.
    """
    if not (osp.exists(video_path) or video_path.startswith('http')):
        raise RuntimeError(f"'{video_path}' is missing")

    if osp.isfile(video_path) and use_frames:
        raise RuntimeError(
            f"'{video_path}' is a video file, not a rawframe directory")
    if osp.isdir(video_path) and not use_frames:
        raise RuntimeError(
            f"'{video_path}' is a rawframe directory, not a video file")

    if isinstance(outputs, str):
        outputs = (outputs, )
    assert outputs is None or isinstance(outputs, (tuple, list))

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # construct label map
    with open(label_path, 'r') as f:
        label = [line.strip() for line in f]
    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    test_pipeline = Compose(test_pipeline)
    # prepare data
    if use_frames:
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)
        data = dict(
            frame_dir=video_path,
            total_frames=len(os.listdir(video_path)),
            # assuming files in ``video_path`` are all named with ``filename_tmpl``  # noqa: E501
            label=-1,
            start_index=start_index,
            filename_tmpl=filename_tmpl,
            modality=modality)
    else:
        start_index = cfg.data.test.get('start_index', 0)
        data = dict(filename=video_path,
                    label=-1,
                    start_index=start_index,
                    modality='RGB')
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]

    # forward the model
    with OutputHook(model, outputs=outputs, as_tensor=as_tensor) as h:
        with torch.no_grad():
            scores = model(return_loss=False, **data)[0]
        returned_features = h.layer_outputs if outputs else None

    score_tuples = tuple(zip(label, scores))
    score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)

    top5_label = score_sorted[:5]
    if outputs:
        return top5_label, returned_features
    return top5_label
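# Usage sketch for this inference_recognizer variant; the config, checkpoint,
# video and label-file paths are placeholders, not values from this snippet.
def demo_inference():
    from mmaction.apis import init_recognizer

    model = init_recognizer('configs/recognition/tsn/tsn_r50_video.py',
                            'checkpoints/tsn_r50.pth', device='cuda:0')
    top5 = inference_recognizer(model, 'demo/demo.mp4', 'demo/label_map.txt')
    for name, score in top5:
        print(f'{name}: {score:.4f}')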
Example 16
def skeleton_based_stdet(args, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
    num_class = max(label_map.keys()) + 1  # for AVA dataset (81)
    skeleton_config.model.cls_head.num_classes = num_class
    skeleton_pipeline = Compose(skeleton_config.test_pipeline)
    skeleton_stdet_model = build_model(skeleton_config.model)
    load_checkpoint(skeleton_stdet_model,
                    args.skeleton_stdet_checkpoint,
                    map_location='cpu')
    skeleton_stdet_model.to(args.device)
    skeleton_stdet_model.eval()

    skeleton_predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:  # no people detected
            skeleton_predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        num_frame = len(frame_inds)  # 30

        pose_result = [pose_results[ind] for ind in frame_inds]

        skeleton_prediction = []
        for i in range(proposal.shape[0]):  # num_person
            skeleton_prediction.append([])

            fake_anno = dict(frame_dict='',
                             label=-1,
                             img_shape=(h, w),
                             origin_shape=(h, w),
                             start_index=0,
                             modality='Pose',
                             total_frames=num_frame)
            num_person = 1

            num_keypoint = 17
            keypoint = np.zeros(
                (num_person, num_frame, num_keypoint, 2))  # M T V 2
            keypoint_score = np.zeros(
                (num_person, num_frame, num_keypoint))  # M T V

            # pose matching
            person_bbox = proposal[i][:4]
            area = expand_bbox(person_bbox, h, w)

            for j, poses in enumerate(pose_result):  # num_frame
                max_iou = float('-inf')
                index = -1
                if len(poses) == 0:
                    continue
                for k, per_pose in enumerate(poses):
                    iou = cal_iou(per_pose['bbox'][:4], area)
                    if max_iou < iou:
                        index = k
                        max_iou = iou
                keypoint[0, j] = poses[index]['keypoints'][:, :2]
                keypoint_score[0, j] = poses[index]['keypoints'][:, 2]

            fake_anno['keypoint'] = keypoint
            fake_anno['keypoint_score'] = keypoint_score

            skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
            skeleton_imgs = skeleton_imgs.to(args.device)

            with torch.no_grad():
                output = skeleton_stdet_model(return_loss=False,
                                              imgs=skeleton_imgs)
                output = output[0]
                for k in range(len(output)):  # 81
                    if k not in label_map:
                        continue
                    if output[k] > args.action_score_thr:
                        skeleton_prediction[i].append(
                            (label_map[k], output[k]))

        skeleton_predictions.append(skeleton_prediction)
        prog_bar.update()

    return timestamps, skeleton_predictions
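# The cal_iou helper called above is not part of this snippet; a standard
# box-IoU implementation such as the following would fit its usage (boxes
# given as [x1, y1, x2, y2, ...]). This is an assumption, not the original
# helper.
def cal_iou(box_a, box_b):
    xa1, ya1, xa2, ya2 = box_a[:4]
    xb1, yb1, xb2, yb2 = box_b[:4]
    inter_w = max(0., min(xa2, xb2) - max(xa1, xb1))
    inter_h = max(0., min(ya2, yb2) - max(ya1, yb1))
    inter = inter_w * inter_h
    area_a = max(0., xa2 - xa1) * max(0., ya2 - ya1)
    area_b = max(0., xb2 - xb1) * max(0., yb2 - yb1)
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.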
Example 17
def inference_recognizer(model, video, outputs=None, as_tensor=True, **kwargs):
    """Inference a video with the recognizer.

    Args:
        model (nn.Module): The loaded recognizer.
        video (str | dict | ndarray): The video file path / url or the
            rawframes directory path / results dictionary (the input of
            pipeline) / a 4D array T x H x W x 3 (The input video).
        outputs (list(str) | tuple(str) | str | None) : Names of layers whose
            outputs need to be returned, default: None.
        as_tensor (bool): Same as that in ``OutputHook``. Default: True.

    Returns:
        list[tuple(int, float)]: Top-5 recognition results as
            (class index, score) tuples.
        dict[torch.Tensor | np.ndarray]: Output feature maps from layers
            specified in ``outputs``, returned only when ``outputs`` is given.
    """
    if 'use_frames' in kwargs:
        warnings.warn('The argument `use_frames` is deprecated PR #1191. '
                      'Now you can use models trained with frames or videos '
                      'arbitrarily. ')
    if 'label_path' in kwargs:
        warnings.warn('The argument `label_path` is deprecated PR #1191. '
                      'Now the label file is not needed in '
                      'inference_recognizer. ')

    input_flag = None
    if isinstance(video, dict):
        input_flag = 'dict'
    elif isinstance(video, np.ndarray):
        assert len(video.shape) == 4, 'The shape should be T x H x W x C'
        input_flag = 'array'
    elif isinstance(video, str) and video.startswith('http'):
        input_flag = 'video'
    elif isinstance(video, str) and osp.exists(video):
        if osp.isfile(video):
            input_flag = 'video'
        if osp.isdir(video):
            input_flag = 'rawframes'
    else:
        raise RuntimeError('The type of argument video is not supported: '
                           f'{type(video)}')

    if isinstance(outputs, str):
        outputs = (outputs, )
    assert outputs is None or isinstance(outputs, (tuple, list))

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    # Alter data pipelines & prepare inputs
    if input_flag == 'dict':
        data = video
    if input_flag == 'array':
        modality_map = {2: 'Flow', 3: 'RGB'}
        modality = modality_map.get(video.shape[-1])
        data = dict(total_frames=video.shape[0],
                    label=-1,
                    start_index=0,
                    array=video,
                    modality=modality)
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='ArrayDecode')
    if input_flag == 'video':
        data = dict(filename=video, label=-1, start_index=0, modality='RGB')
        if 'Init' not in test_pipeline[0]['type']:
            test_pipeline = [dict(type='OpenCVInit')] + test_pipeline
        else:
            test_pipeline[0] = dict(type='OpenCVInit')
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='OpenCVDecode')
    if input_flag == 'rawframes':
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)

        # count the number of frames that match the format of `filename_tmpl`
        # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$
        # Flow pattern example: {}_{:05d}.jpg -> ^x_\d+.jpg$
        pattern = f'^{filename_tmpl}$'
        if modality == 'Flow':
            pattern = pattern.replace('{}', 'x')
        pattern = pattern.replace(
            pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+')
        total_frames = len(
            list(
                filter(lambda x: re.match(pattern, x) is not None,
                       os.listdir(video))))
        data = dict(frame_dir=video,
                    total_frames=total_frames,
                    label=-1,
                    start_index=start_index,
                    filename_tmpl=filename_tmpl,
                    modality=modality)
        if 'Init' in test_pipeline[0]['type']:
            test_pipeline = test_pipeline[1:]
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='RawFrameDecode')

    test_pipeline = Compose(test_pipeline)
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]

    # forward the model
    with OutputHook(model, outputs=outputs, as_tensor=as_tensor) as h:
        with torch.no_grad():
            scores = model(return_loss=False, **data)[0]
        returned_features = h.layer_outputs if outputs else None

    num_classes = scores.shape[-1]
    score_tuples = tuple(zip(range(num_classes), scores))
    score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)

    top5_label = score_sorted[:5]
    if outputs:
        return top5_label, returned_features
    return top5_label
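# Usage sketch for the refactored signature above, feeding a raw video array
# instead of a file path; the shape is a placeholder, `model` is assumed to be
# an already initialized recognizer, and the config's decode step is assumed
# to handle the injected `array` key (the loop above swaps Decode steps for
# ArrayDecode).
def demo_array_inference(model):
    import numpy as np

    video = (np.random.rand(16, 240, 320, 3) * 255).astype(np.uint8)  # TxHxWxC
    top5 = inference_recognizer(model, video)
    # this variant returns (class_index, score) tuples; map the indices
    # through a label file yourself if readable names are needed
    for idx, score in top5:
        print(idx, float(score))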
Example 18
def main():
    parser = ArgumentParser()
    parser.add_argument('--config',
                        type=str,
                        required=True,
                        help='Test config file path')
    parser.add_argument('--checkpoint',
                        type=str,
                        required=True,
                        help='Checkpoint file')
    parser.add_argument('--data_dir',
                        type=str,
                        required=True,
                        help='The dir with dataset')
    parser.add_argument('--out_dir',
                        type=str,
                        required=True,
                        help='Output directory')
    parser.add_argument('--dataset',
                        type=str,
                        required=True,
                        help='Dataset name')
    parser.add_argument('--gpus',
                        default=1,
                        type=int,
                        help='GPU number used for annotating')
    parser.add_argument('--proc_per_gpu',
                        default=2,
                        type=int,
                        help='Number of processes per GPU')
    parser.add_argument('--mode',
                        choices=['train', 'val', 'test'],
                        default='train')
    args = parser.parse_args()

    assert exists(args.config)
    assert exists(args.checkpoint)
    assert exists(args.data_dir)

    cfg = Config.fromfile(args.config)
    cfg = update_config(cfg, args, trg_name=args.dataset)
    cfg = propagate_root_dir(cfg, args.data_dir)

    dataset = build_dataset(cfg.data, args.mode, dict(test_mode=True))
    data_pipeline = Compose(dataset.pipeline.transforms[1:])
    print('{} dataset:\n'.format(args.mode) + str(dataset))

    tasks = prepare_tasks(dataset, cfg.input_clip_length)
    print('Prepared tasks: {}'.format(sum([len(v) for v in tasks.values()])))

    if not exists(args.out_dir):
        makedirs(args.out_dir)

    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    load_checkpoint(model, args.checkpoint, strict=False)

    batch_size = 4 * cfg.data.videos_per_gpu
    if args.gpus == 1:
        model = MMDataParallel(model, device_ids=[0])
        model.eval()

        process_tasks(tasks, dataset, model, args.out_dir, batch_size,
                      cfg.input_clip_length, data_pipeline)
    else:
        raise NotImplementedError
Example 19
        return False
    return True


if __name__ == '__main__':
    from pathlib import Path
    from mmcv import Config
    from tqdm import tqdm
    from mmaction.datasets.pipelines import Compose
    from railway.utils import utils
    import shutil
    import sys
    import time

    infer_pipeline_config = Config.fromfile('infer_pipeline.py')
    base_pipeline = Compose(infer_pipeline_config.base_decode_pipeline)
    hand_watch_pipleline = Compose(infer_pipeline_config.hand_watch_pipleline)

    input_dir = sys.argv[1]
    output_dir = sys.argv[2]
    video_paths = Path(input_dir).glob('**/*.mp4')
    utils.mkdir(output_dir)
    all_video_count = 0
    still_count = 0
    check_times = []
    for video_path in tqdm(list(video_paths)):
        all_video_count += 1
        video_path = str(video_path)
        _d = dict(filename=video_path, label=-1, start_index=0, modality='RGB')
        _base_data = base_pipeline(_d)
        img_data = _base_data['imgs']