Example #1
def test_source_time(video_mp4):
    sources = dict(
        file=video_mp4,
        # cam=0,
        # rtsp_qb="rtsp://*****:*****@[email protected]:554/cam/realmonitor?channel=1&subtype=1",
        # rtsp_ernie="rtsp://*****:*****@50.211.198.158:558/LiveChannel/0/media.smp",
        # nuuo_latham="nuuo://*****:*****@73.222.32.72:5250", # Cam 1
        # kvs_awscam="aws_cam-5302",
    )
    import av
    from time import time, localtime, strftime
    from datetime import datetime
    for src, path in sources.items():
        s = av.open(path)
        s_start = s.start_time  # us

        vs = s.streams[0]
        v_start = vs.start_time  # pts
        vtb = vs.time_base  # per frame time base in pts
        rates = (float(vs.guessed_rate), float(vs.average_rate),
                 float(vs.base_rate))

        cc = vs.codec_context
        fps = cc.framerate
        tps = cc.ticks_per_frame
        ctb = cc.time_base  # per frame time base in secs

        v = s.demux()
        pkts = [next(v) for _ in range(3)]
        frames = [p.decode()[0] for p in pkts]

        pkt_ts = [(p.dts, p.pts) for p in pkts]
        pkt_rts = [(f"{float(p.dts * vtb):.3f}", f"{float(p.pts * vtb):.3f}")
                   for p in pkts]
        pkt_durations = [(p.duration, round(float(p.duration * vtb), 3))
                         for p in pkts]
        frame_ts = [(f.dts, f.pts) for f in frames]
        frame_rts = [(f"{float(f.dts * vtb):.3f}s",
                      f"{float(f.pts * vtb):.3f}s") for f in frames]

        #assert s.start_time == vs.start_time
        #assert vs.time_base == cc.time_base

        s_timestamp = strftime("%X %x", localtime(s_start / 1e6))
        v_timestamp = strftime("%X %x", localtime(float(v_start * vtb)))
        print()
        print(f'##### {src}={path} #####')
        print(f'stream start: {s_timestamp}, {s_start/1e6:.3f}s, {s_start}us')
        print(
            f'video start: {v_timestamp}, {float(v_start * vtb):.3f}s, {v_start}pts'
        )
        print(
            f"FPS={fps}({1 / (ctb * tps)}), rates={rates}, time_base={vtb}({ctb}), ticks_per_frame={tps}"
        )
        print(f"pkt time: {pkt_ts}, {pkt_rts}")
        print(f"pkt duration: {list(zip(*pkt_durations))}")
        print(f"frame time: {frame_ts}, {frame_rts}")
Example #2
def av_dump(src, count=60, last=None, **options):
    print()
    print(f'========== av: {src} ============')
    s = av.open(src, options=options)
    v = s.demux(video=0)
    pkts = [next(v) for _ in range(count)][last:]
    print('key:', [pkt.is_keyframe for pkt in pkts])
    print('corrupt:', [pkt.is_corrupt for pkt in pkts])
    print('dts:', [pkt.dts for pkt in pkts])
    print('pts:', [pkt.pts for pkt in pkts])
    print('duration:', [pkt.duration for pkt in pkts])
    print('size:', [pkt.size for pkt in pkts])
    s.close()
    return pkts
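A typical call, assuming a local MP4 (path hypothetical): demux 60 packets and print statistics for the last 10. Extra keyword arguments flow into av.open as FFmpeg demuxer options:

pkts = av_dump('video.mp4', count=60, last=-10)
# e.g. for RTSP sources:
# pkts = av_dump('rtsp://host/stream', count=30, rtsp_transport='tcp')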
Example #3
def test_motion_vectors(bitstream_short):
    s = av.open(bitstream_short)
    codec = s.streams.video[0].codec_context
    codec.options = dict(flags2='+export_mvs')
    v = s.decode(video=0)
    frames = [next(v) for _ in range(15)]
    #print([list(f.side_data.keys()) for f in frames[:3]])
    MVs = [f.side_data.get('MOTION_VECTORS') for f in frames]
    print()
    for i, mv in enumerate(MVs):
        if mv is None:
            continue
        print(f"frame[{i}]:", len(mv), 'MVs')
        print(mv.to_ndarray()[:10])
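The side data wraps FFmpeg's AVMotionVector records. A sketch of summarizing one frame's vectors (the structured field names below follow PyAV's dtype and should be treated as an assumption):

import numpy as np

arr = mv.to_ndarray()                        # one record per motion vector
dx = arr['motion_x'] / arr['motion_scale']   # horizontal motion in pixels
dy = arr['motion_y'] / arr['motion_scale']   # vertical motion in pixels
print(f"mean |MV| = {np.hypot(dx, dy).mean():.2f}px over {len(arr)} vectors")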
Example #4
def test_youtube_live_av(url):
    source = AVSource.create(url)
    assert type(source) is youtube.YTSource
    assert source.url == url
    hls = youtube.yt_hls_url(url)
    options = dict(
        rtsp_transport='http',  # required for VPN
        rtsp_flags='prefer_tcp',
        stimeout='2000000')  # in case of network down
    logging.info(f"av.open({hls}, {options})")
    s = av.open(hls, options=options, timeout=5.0 * 2)
    v = s.demux(video=0)
    for i in range(100):
        pkt = next(v)  # demux() yields packets, not decoded frames
        pts = float(pkt.pts * pkt.time_base)
        duration = float(pkt.duration * pkt.time_base)
        logging.info(
            f"pkt[{i}] time={pts:.3f}s, duration={duration:.3f}s, now={time():.3f}s"
        )
Example #5
    def open(self):
        if self.stream is not None:
            logging.warning("Already opened")
            return False

        self.encoder = av.open(ENCODERS[self.ch])
        self.video = self.encoder.streams[0]
        self.codec = self.video.codec_context
        self.stream = self.encoder.demux()

        self.duration = float(self.codec.time_base *
                              self.codec.ticks_per_frame)
        self.fps = 1 / self.duration  # nominal FPS
        self.rate = float(self.codec.rate)  # average FPS

        self.started = False
        self.start = None
        self.time = None
        self.frames = 0
        return True
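The nominal duration above follows from the codec clock: one frame spans time_base * ticks_per_frame seconds. With illustrative H.264 values:

from fractions import Fraction

time_base = Fraction(1, 50)                    # illustrative codec time base
ticks_per_frame = 2                            # H.264 commonly ticks twice per frame
duration = float(time_base * ticks_per_frame)  # 0.04s per frame
fps = 1 / duration                             # 25.0 nominal FPS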
Example #6
def track_video():
    """
    Usage:
        # images: /zdata/projects/shared/datasets/kinetics400/frames-5fps/val/abseiling/GwlcmI36imo_000127_000137
        track_video /path/to/video.mp4 [--det-tag v3.1] [--render-all]
        track_video /path/to/image.jpg [--det-tag v3.1] [--render-all]
        track_video /path/to/images/ [--det-tag v3.1] [--render-all]

    Options:
        --det-chkpt-url s3://latest-sinet-checkpoints/detector/yolo/yolo5x-custom_rama_new-v3.1.pt \
        --det-chkpt yolov5x_custom \
        --det-classes 5 \
        --render-all \
        --reload
    """
    parser = argparse.ArgumentParser(
        'Track a video or image sequence with a YOLOv5 detector and Deep Sort')
    parser.add_argument('path',
                        help='Path to a video file, image, or image directory')
    parser.add_argument('-o',
                        '--output',
                        default='export',
                        help='Path to output visualizations')

    parser.add_argument(
        '-b',
        '--batch-size',
        default=24,
        type=int,
        help='Batch size to perform object detection inference')
    parser.add_argument('--fps',
                        default=5,
                        type=int,
                        help='Frames per second in the video')
    parser.add_argument('--reload',
                        action='store_true',
                        help='Force reloading the checkpoint')
    parser.add_argument('--det-amp',
                        action='store_true',
                        help='Inference in AMP')
    parser.add_argument('--det-chkpt',
                        default='yolov5x',
                        choices=['yolov5x', 'yolov5x_custom'],
                        help='Checkpoint name to save locally')
    parser.add_argument('--det-backend',
                        default=None,
                        choices=['trt'],
                        help='Inference backend to use')
    parser.add_argument('--det-trt-fp16',
                        action='store_true',
                        help='TRT FP16 enabled or not')
    parser.add_argument('--det-trt-int8',
                        action='store_true',
                        help='TRT INT8 enabled or not')
    parser.add_argument('--det-chkpt-url',
                        help='S3 URL to download checkpoint')
    parser.add_argument('--det-classes',
                        type=int,
                        default=80,
                        help='Number of classes to detect by the model')
    parser.add_argument('--det-tag',
                        default='v6.0',
                        help='Object detector code base git tag')
    parser.add_argument('--det-resize',
                        default=[720, 1280],
                        nargs=2,
                        type=int,
                        help='Resize frames to (height, width) before detection')
    parser.add_argument('--det-scales',
                        default=640,
                        type=int,
                        choices=[608, 640, 672, 736],
                        help='Size to rescale input for object detection')
    parser.add_argument('--det-cls-thres',
                        default=0.4,
                        type=float,
                        help='Object class confidence threshold')
    parser.add_argument('--det-nms-thres',
                        default=0.5,
                        type=float,
                        help='NMS IoU threshold')
    parser.add_argument('--det-pooling',
                        default=1,
                        type=int,
                        help='Object feature pooling size for tracking')
    parser.add_argument('--trk-cls-person',
                        nargs='+',
                        default=[0],
                        help='One or more person classes to track')
    parser.add_argument('--trk-max-iou-dist',
                        default=0.8,
                        type=float,
                        help='max (1 - IoU) distance to track')
    parser.add_argument('--trk-max-feat-dist',
                        default=0.1395,
                        type=float,
                        help='max (1 - feature similarity) distance to track')
    parser.add_argument('--trk-gating-kf',
                        default='iain',
                        choices=['org', 'iain', False],
                        help='KF gating for Deep Sort')
    parser.add_argument('--trk-gating-thrd',
                        default=50,
                        type=float,
                        help='KF gating threshold')
    parser.add_argument('--trk-gating-alpha',
                        default=0.2,
                        type=float,
                        help='KF gating parameter')
    parser.add_argument('--render-all',
                        action='store_true',
                        help='Render all objects or person only')
    cfg = parser.parse_args()
    print(cfg)

    from ml.vision.models import yolo5x, yolo5
    from ml.vision.models.tracking.dsort import DSTracker
    from ml.vision.ops import dets_select
    from torchvision.transforms import functional as TF
    from torchvision.io import write_jpeg, read_image
    from ml import av, hub, logging, time
    import numpy as np
    import torch as th
    path = Path(cfg.path)
    fps = cfg.fps
    src = None
    if path.suffix in ['.mp4', '.avi']:
        # path to video
        src = av.open(cfg.path)
        v = src.decode(video=0)
        codec = src.streams[0].codec_context
        fps = round(codec.framerate)
        logging.info(f"Tracking video@{float(fps):.2f}fps in {path}")
    else:
        # path to image or a directory of subsampled images
        if path.is_file():
            paths = [path]
        elif path.is_dir():
            paths = sorted([f for f in path.iterdir() if f.is_file()])

        def framer():
            for p in paths:
                # Load each image as a CHW uint8 tensor, matching the
                # detector's feature-extraction pipeline
                yield read_image(str(p))

        v = framer()
        logging.info(f"Tracking {len(paths)} frames@{cfg.fps}fps in {path}")

    dev = th.cuda.default_stream().device if th.cuda.is_available() else 'cpu'
    if cfg.det_chkpt_url:
        model = yolo5
        spec = hub.parse(cfg.det_chkpt_url)
        s3 = spec['scheme'] == 's3://' and spec or None
        # detector = model(chkpt=cfg.det_chkpt, tag=cfg.det_tag, pretrained=True, classes=cfg.det_classes, fuse=True, pooling=cfg.det_pooling, force_reload=cfg.reload, s3=s3).to(dev)
        # detector = model(chkpt='yolov5x-v2.0', s3=dict(bucket='eigen-pretrained', key='detection/yolo/yolov5x-v2.0.pt'), pretrained=True, fuse=True, pooling=cfg.det_pooling, force_reload=cfg.reload).to(dev)
        # detector = model(chkpt='yolov5x-v1.0', s3=dict(bucket='eigen-pretrained', key='detection/yolo/yolov5x-v1.0.pt'), tag='v1.0', pretrained=True, fuse=True, pooling=cfg.det_pooling, force_reload=cfg.reload).to(dev)
        # detector = model(chkpt='yolov5x-store-v1.0', s3=dict(bucket='eigen-pretrained', key='detection/yolo/yolov5x-store-v1.0.pt'), tag='v1.0', pretrained=True, fuse=True, pooling=cfg.det_pooling, force_reload=cfg.reload).to(dev)
        # detector = model(chkpt='yolov5x-retail81-v1.0', s3=dict(bucket='eigen-pretrained', key='detection/yolo/yolov5x-retail81-v1.0.pt'), tag='v1.0', pretrained=True, fuse=True, pooling=cfg.det_pooling, force_reload=cfg.reload).to(dev)
        detector = model(classes=cfg.det_classes,
                         pretrained=True,
                         chkpt=cfg.det_chkpt,
                         tag=cfg.det_tag,
                         s3=s3,
                         fuse=True,
                         pooling=cfg.det_pooling,
                         force_reload=cfg.reload)
    else:
        model = yolo5x
        detector = model(tag=cfg.det_tag,
                         pretrained=True,
                         classes=cfg.det_classes,
                         fuse=True,
                         pooling=cfg.det_pooling,
                         force_reload=cfg.reload).to(dev)

    if cfg.det_backend in ['trt']:
        import math
        # XXX Deployment by batch size and minimal preprocessed shape
        amp = cfg.det_trt_fp16
        bs = cfg.batch_size
        scale = cfg.det_scales
        H, W = cfg.det_resize
        if W > H:
            spec = (3, 32 * math.ceil(H / W * scale / 32), scale)
        else:
            spec = (3, scale, 32 * math.ceil(W / H * scale / 32))
        logging.info(
            f"Deploying runtime={cfg.det_backend} with batch_size={bs}, spec={spec}, fp16={amp}"
        )
        detector.deploy('yolov5x',
                        batch_size=bs,
                        spec=spec,
                        fp16=amp,
                        int8=cfg.det_trt_int8,
                        backend=cfg.det_backend,
                        reload=False)

    tracker = DSTracker(max_feat_dist=cfg.trk_max_feat_dist,
                        max_iou_dist=cfg.trk_max_iou_dist,
                        max_age=cfg.fps * 2,
                        n_init=3,
                        nn_budget=cfg.fps * 2,
                        gating_kf=cfg.trk_gating_kf,
                        gating_thrd=cfg.trk_gating_thrd,
                        gating_alpha=cfg.trk_gating_alpha)

    export_path = Path(
        f'{cfg.output}/{path.stem}-{cfg.det_chkpt}_{cfg.det_tag}_{cfg.det_scales}_{cfg.det_cls_thres}_{cfg.det_nms_thres}'
    )
    # create frame path
    export_frame = export_path / 'rendered_frames'
    export_frame.mkdir(parents=True, exist_ok=True)
    assert export_frame.exists()

    TRACK_INFO = 'x1, y1, x2, y2, score, class, origin_x, origin_y, velocity_x, velocity_y'
    DETECTION_INFO = 'x1, y1, x2, y2, score, class'
    annot_dict = defaultdict(dict)

    def render_frame(idx, frame, dets, tracks=False):
        from torchvision.utils import draw_bounding_boxes
        from ml.vision.utils import rgb, COLORS91
        from ml.vision.datasets.coco import COCO80_CLASSES

        if tracks:
            tids, dets = list(zip(*dets))
            dets = th.stack(dets)
            labels = [f"[{int(c)}][{tid}]" for tid, c in zip(tids, dets[:, 5])]
            colors = [rgb(tid, integral=True) for tid in tids]
            annot_dict[idx]['tracks'] = {
                tid: det
                for tid, det in zip(tids, dets.tolist())
            }
        else:
            labels = [COCO80_CLASSES[i] for i in dets[:, -1].int()]
            colors = [COLORS91[i] for i in dets[:, 5].int()]
            annot_dict[idx]['detections'] = dets.tolist()
        # get boxes: x1, y1, x2, y2
        boxes = dets[:, :4]
        # draw bounding boxes
        frame = draw_bounding_boxes(frame,
                                    boxes=boxes,
                                    labels=labels,
                                    colors=colors,
                                    fill=True,
                                    width=3,
                                    font_size=25)
        return frame

    logging.info(f"Saving tracked video and frames to {export_path}")
    media = av.open(f"{export_path}/{path.stem}-tracking.mp4", 'w')
    stream = media.add_stream('h264', cfg.fps)
    stream.bit_rate = 2000000

    def track_frames(frames, start, step):
        frames = th.stack(frames)
        # Track person only
        with th.cuda.amp.autocast(enabled=cfg.det_amp):
            dets, features = detector.detect(frames,
                                             size=cfg.det_scales,
                                             conf_thres=cfg.det_cls_thres,
                                             iou_thres=cfg.det_nms_thres,
                                             batch_preprocess=True)
        persons = dets_select(dets, cfg.trk_cls_person)
        objs = [
            dets_f[~persons_f].cpu()
            for dets_f, persons_f in zip(dets, persons)
        ]
        ppls = [
            dets_f[persons_f].cpu()
            for dets_f, persons_f in zip(dets, persons)
        ]
        ppl_feats = [
            feats_f[persons_f].cpu()
            for feats_f, persons_f in zip(features, persons)
        ]
        for j, (objs_f, ppls_f,
                ppl_feats_f) in enumerate(zip(objs, ppls, ppl_feats), start):
            logging.info(
                f"[{start + (j - start) * step}] objs: {tuple(objs_f.shape)}, ppls: {tuple(ppls_f.shape)}, feats: {tuple(ppl_feats_f.shape)}"
            )
            assert objs_f.shape[1] == 4 + 1 + 1
            assert ppls_f.shape[1] == 4 + 1 + 1
            assert len(ppls_f) == len(ppl_feats_f)
            # assert ppl_feats.shape[1] == 256 + 512 + 1024
            assert ppl_feats_f.shape[1] == 320 + 640 + 1280
            matches = tracker.update(
                ppls_f,
                ppl_feats_f.view(len(ppl_feats_f),
                                 np.prod(ppl_feats_f.shape[1:])))
            snapshot = tracker.snapshot()
            tracks = []
            for tid, info in snapshot:
                tracks.append([tid, info])
            logging.debug(f"matches[{start + (j - start) * step}]: {matches}")
            logging.debug(
                f"snapshot[{start + (j - start) * step}]: {snapshot}")

            # Render both dets and tracks side by side
            frame_det = frames[j - start]
            frame_trk = frame_det.clone()
            C, H, W = frame_det.shape
            idx = f'{start + (j - start) * step:03d}'
            if cfg.render_all:
                dets = th.cat([ppls_f, objs_f])
                frame_det = render_frame(idx, frame_det, dets, False)
            else:
                frame_det = render_frame(idx, frame_det, ppls_f, False)
            if tracks:
                frame_trk = render_frame(idx, frame_trk, tracks, True)

            frame = th.zeros((C, H, 2 * W), dtype=th.uint8)
            frame[:, :, :W] = frame_det
            frame[:, :, W:] = frame_trk
            write_jpeg(frame, str(export_frame / f"frame{idx}.jpg"))
            if media is not None:
                frame = av.VideoFrame.from_ndarray(frame.permute(1, 2,
                                                                 0).numpy(),
                                                   format='rgb24')
                packets = stream.encode(frame)
                media.mux(packets)
                logging.debug(f'Encoded: {len(packets)} {packets}, {frame}')

    frames = []
    BS = cfg.batch_size
    step = 1 if src is None else max(1, fps // cfg.fps)  # guard against fps < cfg.fps
    t = time.time()
    for i, frame in enumerate(v):
        if isinstance(frame, av.VideoFrame):
            frame = th.as_tensor(
                np.ascontiguousarray(frame.to_rgb().to_ndarray())).permute(
                    2, 0, 1)
            if cfg.det_resize and cfg.det_backend in ['trt']:
                frame = TF.resize(frame, cfg.det_resize, antialias=True)
            frame = frame.contiguous()  # permute() yields a non-contiguous view
        if i == 0:
            stream.height = frame.shape[1]
            stream.width = frame.shape[2] * 2
        if src is not None and i % step != 0:
            continue
        frames.append(frame)
        if len(frames) < BS:
            continue
        assert len(frames) == BS
        track_frames(frames, i - (BS - 1) * step, step)
        frames.clear()

    if frames:
        track_frames(frames, i - (len(frames) - 1) * step, step)
    if media is not None:
        packets = stream.encode(None)
        if packets:
            media.mux(packets)
        media.close()
    if src is not None:
        src.close()

    # write annotations to file
    annotations = {
        'info': {
            'track': TRACK_INFO,
            'detection': DETECTION_INFO
        },
        'annotations': annot_dict
    }
    with open(f'{export_path}/annotations.json', 'w') as f:
        json.dump(annotations, f, indent=4)

    elapse = time.time() - t
    logging.info(
        f"Done tracking {path.name} in {elapse:.3f}s at {(i + 1) / step / elapse:.2f}fps"
    )
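Stripped of detection and tracking, the muxing done above is the usual PyAV encode-and-flush pattern; a standalone sketch with illustrative path, rate, and dimensions:

import av

media = av.open('tmp/out.mp4', 'w')         # illustrative output path
stream = media.add_stream('h264', rate=15)
stream.width, stream.height = 1280, 720
for frame in frames:                        # frames: an iterable of av.VideoFrame
    media.mux(stream.encode(frame))
media.mux(stream.encode(None))              # flush delayed packets
media.close()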
Example #7
def test_rtsp(rtsp):
    workaround = True
    s = av.open(rtsp, options=dict(rtsp_transport='http'))
    v = s.demux(video=0)
    codec = s.streams[0].codec_context
    if True:
        pkts = []
        for _ in range(15 * 2):
            pkt = next(v)
            NALUs = []
            for (pos, _, _, type), nalu in NALUParser(memoryview(pkt),
                                                      workaround=False):
                NALUs.append(nalu[-1] == 0x00 and nalu[:-1] or nalu)
            packet = av.Packet(b''.join(NALUs))
            packet.dts = pkt.dts
            packet.pts = pkt.pts
            packet.time_base = pkt.time_base
            pkt = packet
            pkts.append(pkt)
    else:
        pkts = [next(v) for _ in range(15 * 2)]
    print(pkts)
    #   frames = [pkt.decode()[0] for pkt in pkts]

    print()
    if codec.extradata is not None:
        for (pos, _, _, type), nalu in NALUParser(codec.extradata):
            logging.info(
                f"CPD {NALU_t(type).name} at {pos}: {nalu[:8]} ending with {nalu[-1:]}"
            )
        with open(f"tmp/cpd.264", 'wb') as f:
            f.write(codec.extradata)
    for i, pkt in enumerate(pkts):
        print(f"pkt[{i}] {pkt.is_keyframe and 'key ' or ''}{pkt}")
        for (pos, _, _, type), nalu in NALUParser(memoryview(pkt),
                                                  workaround=False):
            logging.info(
                f"frame[{i}] {NALU_t(type).name} at {pos}: {nalu[:8].tobytes()} ending with {nalu[-1:].tobytes()}"
            )
        with open(f"tmp/frame{i:02d}.264", 'wb') as f:
            f.write(pkt)
#    for i, frame in enumerate(frames):
#        print(f"frame[{i}] {frame.key_frame and 'key' or ''}{frame}")

    decoded = []
    h264 = av.CodecContext.create('h264', 'r')
    for i, pkt in enumerate(pkts):
        res = h264.decode(pkt)
        print(f"pkt[{i}] {len(res)} frames decoded")
        if res:
            assert len(res) == 1
            decoded.append(res[0])
    res = h264.decode()
    if res:
        print(f"pkt[-1] {len(res)} frames decoded")
        assert len(res) == 1
        decoded.append(res[0])

    if workaround:
        print(decoded)
        from ml import cv
        for i, f in enumerate(decoded):
            cv.save(f.to_rgb().to_ndarray()[:, :, ::-1],
                    f'tmp/images/frame{i:02d}.jpg')
    else:
        # NOTE: this comparison needs the commented-out `frames` decoding above
        for i, f in enumerate(decoded):
            assert (f.to_ndarray() == frames[i].to_ndarray()).all()
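NALUParser is project-internal and not shown here. For orientation, a minimal Annex B splitter that approximates what such a parser yields (a sketch, not the project's implementation; it treats 4-byte start codes as 3-byte ones preceded by a zero):

import re

def split_annexb(buf: bytes):
    # Yield (offset, nal_unit_type, payload) per 00 00 01 start code
    starts = [m.start() for m in re.finditer(b'\x00\x00\x01', buf)]
    for a, b in zip(starts, starts[1:] + [len(buf)]):
        nalu = buf[a + 3:b]
        yield a, nalu[0] & 0x1F, nalu   # low 5 bits = H.264 nal_unit_type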
Example #8
def openAV(src, decoding=False, with_audio=False, **kwargs):
    try:
        format = None
        options = None
        fps = float(kwargs.get('fps', 10))
        if str(src).startswith('rtsp'):
            # ffmpeg RTSP options:
            # rtsp_transport: tcp, http, udp_multicast, udp
            # rtsp_flags: prefer_tcp, filter_src, listen, none
            # allowed_media_types: video, audio, data
            # stimeout: socket TCP I/O timeout in us
            # RTSP/HTTP required for VPN but not necessarily supported
            options = dict(
                rtsp_transport=kwargs.get('rtsp_transport', 'tcp'),
                rtsp_flags='prefer_tcp',
                stimeout=kwargs.get('stimeout',
                                    '5000000'))  # in case of network down

            # NOTE: retry with different rtsp transport types if unspecified
            if options and options.get('rtsp_transport', None):
                source = None
                for transport in ['tcp', 'http']:
                    try:
                        options['rtsp_transport'] = transport
                        source = av.open(src,
                                         format=format,
                                         options=options,
                                         timeout=(15, 5))
                    except Exception as e:
                        logging.warning(
                            f'Failed with rtsp_transport={transport}: {e}')
                    else:
                        options['rtsp_transport'] = transport
                        break
                assert source is not None, f"Failed to open RTSP source over TCP/HTTP"
            else:
                source = av.open(src,
                                 format=format,
                                 options=options,
                                 timeout=(15, 5))
        else:
            if isinstance(src, int) or (isinstance(src, str)
                                        and src.startswith('/dev/video')):
                # XXX webcam: high FPS with MJPG
                import platform
                system = platform.system()
                resolution = av.resolution_str(
                    *kwargs.get('resolution', ['720p']))
                options = {
                    'framerate': str(fps),
                    'video_size': resolution,
                    'input_format': 'mjpeg'
                }
                decoding = True
                if system == 'Darwin':
                    src = str(src)
                    format = 'avfoundation'
                elif system == 'Linux':
                    src = f"/dev/video{src}" if isinstance(src, int) else src
                else:
                    raise ValueError(f"Webcam unsupported on {system}")
            source = av.open(src, format=format, options=options)

        # timeout: maximum timeout (in secs) to wait for incoming connections and socket reading
        # XXX HLS connection potential time out for taking more than 5s
        logging.info(
            f"av.open({src}, format={format}, options={options}, timeout=(15, 5))"
        )
    except Exception as e:
        logging.error(e)
        raise e
    else:
        '''
        H.264 NALU formats:
        Annex B:
            RTSP/RTP: rtsp, 'RTSP input', set()
            bitstream: h264, 'raw H.264 video', {'h26l', 'h264', '264', 'avc'}
            webcam: /dev/videoX, ...
            DeepLens: /opt/.../...out
            NUUO/NVR: N/A
        AVCC:
            avi: avi, 'AVI (Audio Video Interleaved)', {'avi'}
            mp4: 'mov,mp4,m4a,3gp,3g2,mj2', 'QuickTime / MOV', {'m4a', 'mov', 'mp4', 'mj2', '3gp', '3g2'}
            webm/mkv: 'matroska,webm', 'Matroska / WebM', {'mks', 'mka', 'mkv', 'mk3d'}
            DASH/KVS(StreamBody): file-like obj
        '''
        now = time.time()
        start_time = source.start_time / 1e6  # container start time: us -> s
        # A start time more than ~30 days away from now is a relative/logical
        # timestamp rather than an absolute wall-clock time
        relative = abs(start_time - now) > 60 * 60 * 24 * 30
        # Treat anything that is not a regular local file as real-time
        rt = not (isinstance(src, str) and os.path.isfile(src))
        if rt:
            logging.info(f"Assume real-time source: {src}")
        else:
            logging.info(f"Simulating local source as real-time: {src}")

        # XXX start_time may be negative (webcam), zero if unavailable, or a small logical timestamp
        session = dict(
            src=src,
            streams=source,
            format=source.format.name,
            decoding=decoding,
            start=relative and now or start_time,
            rt=rt,
        )
        session_start_local = strftime('%X', localtime(session['start']))
        source_start_local = strftime('%X', localtime(start_time))
        logging.info(
            f"Session start: {session['start']:.3f}s({session_start_local}), source start: {start_time:.3f}s({source_start_local})"
        )
        if source.streams.video:
            # FIXME RTSP FPS might be unavailable or incorrectly set
            video0 = source.streams.video[0]
            codec = video0.codec_context
            FPS = 1 / (codec.time_base * codec.ticks_per_frame)
            fps = FPS > 60 and (codec.framerate
                                and float(codec.framerate)) or fps
            session['video'] = dict(
                stream=source.demux(video=0),
                start=video0.start_time,  # same as 1st frame in pts
                codec=codec,
                format=video0.name,
                width=video0.width,
                height=video0.height,
                fps=fps,
                count=0,
                time=0,  # pts in secs
                duration=None,  # frame duration in secs
                drifting=False,
                adaptive=kwargs.get('adaptive', True),
                workaround=kwargs.get('workaround', True),
                thresholds=dict(drifting=10, ),
                prev=None,
            )
            logging.info(
                f"codec.framerate={codec.framerate}, codec.time_base={codec.time_base}, codec.ticks_per_frame={codec.ticks_per_frame}, fps={session['video']['fps']}, FPS={FPS}"
            )
        if source.streams.audio:
            audio0 = source.streams.audio[0]
            codec = audio0.codec_context
            logging.warning(f"No audio streaming supported yet")
            '''
            if codec.name == 'aac':
                logging.warning(f"AAC is not supported yet")
            else:
                session['audio'] = dict(
                    stream=decoding and source.decode(audio=0) or source.demux(audio=0),
                    start=audio0.start_time,        # same as 1st frame in pts
                    format=audio0.name,
                    codec=codec,
                    sample_rate=codec.sample_rate,
                    channels=len(codec.layout.channels),
                    count=0,
                    time=0,
                )
            '''
        return session
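The docstring above distinguishes Annex B sources (start-code delimited) from AVCC containers (length-prefixed NALUs). A rough detection heuristic over a packet's first bytes (an assumption-level sketch, not part of this module):

def looks_annexb(first_packet: bytes) -> bool:
    # Annex B opens with a 3- or 4-byte start code; AVCC opens with a
    # big-endian 4-byte NALU length instead
    return first_packet.startswith((b'\x00\x00\x01', b'\x00\x00\x00\x01'))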
Example #9
def test_fragments(stream, total, inf):
    kvs = boto3.client("kinesisvideo")
    info = kvs.describe_stream(StreamName=stream)['StreamInfo']
    assert stream == info['StreamName']
    print(f"{stream} info:")
    for key, value in info.items():
        print(f'\t{key}:', value)

    media, codec = info['MediaType'].split('/')
    assert media == 'video'
    assert codec == 'h264'
    dataEndpoint = kvs.get_data_endpoint(StreamName=stream,
                                         APIName='GET_MEDIA')['DataEndpoint']
    print(f"{stream} endpoint: {dataEndpoint}")

    kvm = boto3.client('kinesis-video-media',
                       endpoint_url=dataEndpoint,
                       region_name='us-east-1')
    while True:
        now = elapse = tic = time()
        media = kvm.get_media(
            StreamName=stream,
            StartSelector=dict(StartSelectorType='NOW'),
            #StartSelector=dict(StartSelectorType='PRODUCER_TIMESTAMP',
            #StartSelector=dict(StartSelectorType='SERVER_TIMESTAMP',
            #                   StartTimestamp=time())
        )

        contentType = media['ContentType']
        payload = media['Payload']
        print(f"Received {contentType} stream payload")

        webm = av.open(payload)
        video = webm.streams.video[0]
        codec = video.codec_context
        print(webm.format, f"{webm.start_time / 1000:.3f}")  # start time in ms
        print(video, video.type, video.time_base,
              f"{video.start_time / 1000:.3f}s", video.base_rate,
              video.average_rate, video.guessed_rate)  # start time in s
        print(f"{codec.type}/{codec.name}", codec.time_base,
              codec.ticks_per_frame)  # codec type/name, frame duration, FPS

        start = video.start_time
        duration_cc = float(codec.time_base * codec.ticks_per_frame)
        fps = 1 / duration_cc
        started = False
        X = sys.x_available()
        print(
            f"Streaming {codec.type}/{codec.name} since {start/1000:.3f}s(now={now:.3f}s, diff={now-start/1000:.3f}s) at {fps:.2f}FPS with frame duration of {duration_cc:.3f}s"
        )
        try:
            for i, packet in enumerate(webm.demux(video=0), 1):
                frame = packet.decode()[0]
                now = time()
                pts = frame.time
                duration_pkt = float(packet.duration *
                                     packet.time_base)  # FIXME 0 -> 1/FPS
                frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
                print(
                    f"{packet.is_keyframe and 'key ' or ''}frame[{i}]{frame.shape} of {frame.nbytes} bytes with pts={pts:.3f} at {now:.3f}s and duration={duration_pkt:.3f}s({packet.time_base}, {packet.duration})"
                )

                elapse += duration_pkt
                slack = elapse - now
                if slack > 0:
                    print(f"Sleep for {slack:.3f}s")
                    sleep(slack)

                if X:
                    cv.imshow('LIVE', frame)
                    cv.waitKey(1)
                if i == total:
                    print(f"RT FPS={total/(now-tic)}")
                    break
        except Exception as e:
            print(f"Failed to decode: {e}")
            webm.close()
        if not inf:
            break
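The pacing logic above is the standard play-at-source-rate trick: accumulate nominal frame durations and sleep off any positive slack against the wall clock. In isolation (frame_durations assumed known):

from time import time, sleep

elapse = time()
for duration in frame_durations:   # per-frame durations in seconds
    elapse += duration
    slack = elapse - time()
    if slack > 0:
        sleep(slack)               # ahead of the source clock: wait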
Example #10
def test_rfcn_deep_sort(video):
    import numpy as np
    from ml.vision.models.tracking.dsort import DeepSort
    model, size = rfcn, 608
    detector = model(pooling=2,
                     model_dir="/tmp/checkpoints",
                     force_reload=not True)
    tracker = DeepSort(
        max_feat_dist=0.2,
        nn_budget=100,
        max_iou_dist=0.7,  # 0.7
        max_age=15,  # 30 (FPS)
        n_init=3)  # 3

    from ml import av
    s = av.open(video)
    v = s.decode()
    video = Path(video)
    export = Path(f'export/{video.stem}-{model.__name__}')
    export.mkdir(parents=True, exist_ok=True)
    assert export.exists()

    print(f"Tracking video: {video}")
    print(f"Saving to {export / 'tracking.mp4'}")
    media = av.open(f"{export}/tracking.mp4", 'w')
    stream = media.add_stream('h264', 15)
    stream.bit_rate = 2000000
    for i, frame in enumerate(v):
        if i == 0:
            stream.height = frame.height
            stream.width = frame.width

        frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
        dets, features = detector.detect([frame], size=size)
        if True:
            # Track person only
            person = dets[0][:, -1] == 0
            dets[0] = dets[0][person]
            features[0] = features[0][person]

        assert len(dets) == 1
        assert len(dets[0]) == features[0].shape[0]
        assert dets[0].shape[1] == 4 + 1 + 1
        # assert features[0].shape[1] == 256+512+1024
        assert features[0].shape[1] == 1024

        if len(dets[0]) > 0:
            # Flatten pooled features; avoids shadowing the container `s` above
            D = int(np.prod(features[0].shape[1:]))
            tracker.update(dets[0], features[0].view(len(features[0]), D))
            if i < 60:
                logging.info(
                    f"[{i}] dets[0]: {dets[0].shape}, feats: {[tuple(feats.shape) for feats in features]}"
                )
                detector.render(frame,
                                dets[0],
                                path=export / 'dets' / f"frame{i:03d}.jpg")
            else:
                break

        snapshot = tracker.snapshot()
        logging.info(
            f"[{i}] snapshot[0]: {snapshot and list(zip(*snapshot))[0] or len(snapshot)}"
        )
        frame = detector.render(
            frame,
            snapshot,
            path=
            f"export/{video.stem}-{model.__name__}/tracking/frame{i:03d}.jpg")
        #frame = detector.render(frame, snapshot)

        if media is not None:
            frame = av.VideoFrame.from_ndarray(frame, format='bgr24')
            packets = stream.encode(frame)
            print('encoded:', packets, frame)
            media.mux(packets)
    if media is not None:
        packets = stream.encode(None)
        media.mux(packets)
        media.close()
Example #11
def test_yolo5x_store_deep_sort(video):
    import numpy as np
    from ml.vision.models.tracking.dsort import DeepSort
    from ml.vision.datasets.widerperson import WIDERPERSON_CLASSES
    WIDERPERSON_CLASSES[0] = 'object'
    model, size = yolo5, 736
    detector = model(name='yolov5x-store',
                     pretrained=True,
                     bucket='eigen-pretrained',
                     key='detection/yolo/yolov5x-store.pt',
                     classes=len(WIDERPERSON_CLASSES),
                     pooling=True,
                     fuse=True,
                     model_dir=None,
                     force_reload=not True)
    pooler = MultiScaleFusionRoIAlign(3)
    tracker = DeepSort(
        max_feat_dist=0.2,
        nn_budget=100,
        max_iou_dist=0.7,  # 0.7
        max_age=15,  # 30 (FPS)
        n_init=3)  # 3

    from ml import av
    s = av.open(video)
    v = s.decode()
    video = Path(video)
    export = Path(f'export/{video.stem}-{model.__name__}')
    export.mkdir(parents=True, exist_ok=True)
    assert export.exists()

    print(f"Tracking video: {video}")
    print(f"Saving to {export / 'tracking.mp4'}")
    media = av.open(f"{export}/tracking.mp4", 'w')
    stream = media.add_stream('h264', 15)
    stream.bit_rate = 2000000
    for i, frame in enumerate(v):
        if i == 0:
            stream.height = frame.height
            stream.width = frame.width

        frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
        dets, features = detector.detect([frame], size=size)

        # Track person only
        person = (0 < dets[0][:, -1]) & (dets[0][:, -1] < 4)
        persons = dets[0][person]
        features[0] = features[0][person]

        assert len(dets) == 1
        assert len(persons) == features[0].shape[0]
        assert dets[0].shape[1] == 4 + 1 + 1
        assert features[0].shape[1] == 320 + 640 + 1280
        if len(dets[0]) > 0:
            # Flatten pooled features; avoids shadowing the container `s` above
            D = int(np.prod(features[0].shape[1:]))
            tracker.update(persons, features[0].view(len(features[0]), D))
            if i < 60:
                logging.info(
                    f"[{i}] dets[0]: {dets[0].shape}, feats: {[tuple(feats.shape) for feats in features]}"
                )
                cv.render(frame,
                          dets[0],
                          classes=WIDERPERSON_CLASSES,
                          path=export / 'dets' / f"frame{i:03d}.jpg")
            else:
                break

        snapshot = tracker.snapshot()
        logging.info(
            f"[{i}] snapshot[0]: {snapshot and list(zip(*snapshot))[0] or len(snapshot)}"
        )
        frame = cv.render(
            frame,
            snapshot,
            classes=WIDERPERSON_CLASSES,
            path=
            f"export/{video.stem}-{model.__name__}/tracking/frame{i:03d}.jpg")
        #frame = detector.render(frame, snapshot)

        if media is not None:
            frame = av.VideoFrame.from_ndarray(frame, format='bgr24')
            packets = stream.encode(frame)
            print('encoded:', packets, frame)
            media.mux(packets)
    if media is not None:
        packets = stream.encode(None)
        media.mux(packets)
        media.close()
Example #12
def test_yolo_deep_sort(video):
    import numpy as np
    from ml.vision.models.tracking.dsort import DeepSort
    from ml import av
    # model, size = yolo4, 608
    model, size = yolo5x, 736
    detector = model(pretrained=True, fuse=True, pooling=True)
    pooler = MultiScaleFusionRoIAlign(3)
    tracker = DeepSort(
        max_feat_dist=0.2,
        nn_budget=100,
        max_iou_dist=0.7,  # 0.7
        max_age=15,  # 30 (FPS)
        n_init=3)  # 3

    video = Path(video)
    if video.suffix in ['.mp4', '.avi']:
        s = av.open(video)
        v = s.decode(video=0)
        print(f"Tracking video: {video}")
    else:
        s = None
        if video.is_file():
            files = [video]
        elif video.is_dir():
            files = sorted([f for f in video.iterdir() if f.is_file()])
        v = [cv.imread(f) for f in files]
        print(f"Tracking {len(files)} frames in {video}")
    export = Path(f'export/{video.stem}-{model.__name__}')
    export.mkdir(parents=True, exist_ok=True)
    assert export.exists()

    print(f"Saving to {export / 'tracking.mp4'}")
    media = av.open(f"{export}/tracking.mp4", 'w')
    stream = media.add_stream('h264', 15)
    stream.bit_rate = 2000000
    for i, frame in enumerate(v):
        if not isinstance(frame, np.ndarray):
            frame = frame.to_rgb().to_ndarray()[:, :, ::-1]

        if i == 0:
            stream.height = frame.shape[0]
            stream.width = frame.shape[1]
        dets, features = detector.detect([frame], size=size)

        # Track person only
        person = dets[0][:, -1] == 0
        persons = dets[0][person]
        features[0] = features[0][person]

        assert len(dets) == 1
        assert len(persons) == features[0].shape[0]
        assert dets[0].shape[1] == 4 + 1 + 1
        # assert features[0].shape[1] == 256+512+1024
        assert features[0].shape[1] == 320 + 640 + 1280

        if len(dets[0]) > 0:
            # Flatten pooled features; avoids shadowing the container `s` above
            D = int(np.prod(features[0].shape[1:]))
            tracker.update(persons, features[0].view(len(features[0]), D))
            if i < 60:
                logging.info(
                    f"[{i}] dets[0]: {dets[0].shape}, feats: {[tuple(feats.shape) for feats in features]}"
                )
                cv.render(frame,
                          dets[0],
                          path=export / 'dets' / f"frame{i:03d}.jpg")
            else:
                break

        snapshot = tracker.snapshot()
        logging.info(
            f"[{i}] snapshot[0]: {snapshot and list(zip(*snapshot))[0] or len(snapshot)}"
        )
        frame = cv.render(
            frame,
            snapshot,
            path=
            f"export/{video.stem}-{model.__name__}/tracking/frame{i:03d}.jpg")
        if media is not None:
            frame = av.VideoFrame.from_ndarray(frame, format='bgr24')
            packets = stream.encode(frame)
            print('encoded:', packets, frame)
            media.mux(packets)
    if media is not None:
        packets = stream.encode(None)
        media.mux(packets)
        media.close()
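Examples #10-#12 share one Deep Sort loop: detect, filter to person classes, flatten the pooled features, update the tracker, then render its snapshot. Schematically (names follow the examples; shapes and APIs are as the examples assume them):

for i, frame in enumerate(frames):
    dets, feats = detector.detect([frame], size=size)   # boxes + pooled features
    keep = dets[0][:, -1] == 0                          # person class only
    persons, pfeats = dets[0][keep], feats[0][keep]
    if len(persons):
        tracker.update(persons, pfeats.view(len(pfeats), -1))
    snapshot = tracker.snapshot()                       # [(track_id, state), ...]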