def test_source_time(video_mp4):
    """Print container, stream, packet and frame timing details of each source.

    For every configured source the first three demuxed packets are decoded
    and their timestamps are printed both in raw ticks and in seconds, next to
    the container/stream start times and the frame rates reported by PyAV.

    Args:
        video_mp4: path of a local MP4 file (test fixture).
    """
    sources = dict(
        file=video_mp4,
        # Other source kinds that can be enabled manually:
        # cam=0,
        # rtsp_qb="rtsp://*****:*****@[email protected]:554/cam/realmonitor?channel=1&subtype=1",
        # rtsp_ernie="rtsp://*****:*****@50.211.198.158:558/LiveChannel/0/media.smp",
        # nuuo_latham="nuuo://*****:*****@73.222.32.72:5250",  # Cam 1
        # kvs_awscam="aws_cam-5302",
    )
    import av
    from time import localtime, strftime

    for src, path in sources.items():
        s = av.open(path)
        try:
            s_start = s.start_time  # us
            vs = s.streams[0]
            v_start = vs.start_time  # pts
            vtb = vs.time_base  # per frame time base in pts
            rates = (float(vs.guessed_rate), float(vs.average_rate),
                     float(vs.base_rate))
            cc = vs.codec_context
            fps = cc.framerate
            tps = cc.ticks_per_frame
            ctb = cc.time_base  # per frame time base in secs

            v = s.demux()
            pkts = [next(v) for _ in range(3)]
            frames = [p.decode()[0] for p in pkts]
            pkt_ts = [(p.dts, p.pts) for p in pkts]
            pkt_rts = [(f"{float(p.dts * vtb):.3f}", f"{float(p.pts * vtb):.3f}")
                       for p in pkts]
            pkt_durations = [(p.duration, round(float(p.duration * vtb), 3))
                             for p in pkts]
            frame_ts = [(f.dts, f.pts) for f in frames]
            frame_rts = [(f"{float(f.dts * vtb):.3f}s",
                          f"{float(f.pts * vtb):.3f}s") for f in frames]

            s_timestamp = strftime("%X %x", localtime(s_start / 1e6))
            v_timestamp = strftime("%X %x", localtime(float(v_start * vtb)))
            print()
            print(f'##### {src}={path} #####')
            print(
                f'stream start: {s_timestamp}, {s_start/1e6:.3f}s, {s_start}us')
            print(
                f'video start: {v_timestamp}, {float(v_start * vtb):.3f}s, {v_start}pts'
            )
            print(
                f"FPS={fps}({1 / (ctb * tps)}), rates={rates}, time_base={vtb}({ctb}), ticks_per_frame={tps}"
            )
            print(f"pkt time: {pkt_ts}, {pkt_rts}")
            print(f"pkt duration: {list(zip(*pkt_durations))}")
            print(f"frame time: {frame_ts}, {frame_rts}")
        finally:
            # Release the demuxer even if a source fails half way through.
            s.close()
def av_dump(src, count=60, last=None, **options):
    """Print per-packet metadata for the leading video packets of a source.

    Args:
        src: source handed to ``av.open``.
        count: maximum number of packets to demux from video stream 0.
        last: slice start applied to the demuxed packets (``pkts[last:]``);
            a negative value keeps only the trailing packets, ``None`` keeps
            all of them.
        **options: forwarded to ``av.open`` as its ``options`` dict.

    Returns:
        The list of retained packets. It is shorter than requested when the
        stream ends before ``count`` packets were read.
    """
    print()
    print(f'========== av: {src} ============')
    s = av.open(src, options=options)
    try:
        # Pairing with range() ends quietly at end of stream instead of
        # leaking StopIteration out of the comprehension; range() comes first
        # so no extra packet is consumed once `count` is reached.
        pkts = [pkt for _, pkt in zip(range(count), s.demux(video=0))][last:]
        print('key:', [pkt.is_keyframe for pkt in pkts])
        print('corrupt:', [pkt.is_corrupt for pkt in pkts])
        print('dts:', [pkt.dts for pkt in pkts])
        print('pts:', [pkt.pts for pkt in pkts])
        print('duration:', [pkt.duration for pkt in pkts])
        print('size:', [pkt.size for pkt in pkts])
    finally:
        s.close()
    return pkts
def test_motion_vectors(bitstream_short):
    """Decode up to 15 frames with motion-vector export enabled and print them.

    Args:
        bitstream_short: path of a short video bitstream (test fixture).
    """
    s = av.open(bitstream_short)
    try:
        codec = s.streams.video[0].codec_context
        # Ask the decoder to attach motion vectors as frame side data.
        codec.options = dict(flags2='+export_mvs')
        # Bounded by range() so a short clip ends the loop instead of raising
        # StopIteration from the comprehension.
        frames = [f for _, f in zip(range(15), s.decode(video=0))]
        MVs = [f.side_data.get('MOTION_VECTORS') for f in frames]
        print()
        for i, mv in enumerate(MVs):
            if mv is None:
                # This frame carries no MOTION_VECTORS side data.
                continue
            print(f"frame[{i}]:", len(mv), 'MVs')
            print(mv.to_ndarray()[:10])
    finally:
        s.close()
def test_youtube_live_av(url):
    """Resolve a YouTube live URL to HLS and log timing of 100 video packets.

    Args:
        url: YouTube URL expected to resolve to a ``youtube.YTSource``.
    """
    source = AVSource.create(url)
    assert type(source) is youtube.YTSource
    assert source.url == url

    hls = youtube.yt_hls_url(url)
    options = {
        'rtsp_transport': 'http',  # required for VPN
        'rtsp_flags': 'prefer_tcp',
        'stimeout': '2000000',  # in case of network down
    }
    logging.info(f"av.open({hls}, {options})")

    container = av.open(hls, options=options, timeout=5.0 * 2)
    packets = container.demux(video=0)
    for i in range(100):
        pkt = next(packets)
        # Convert tick counts to seconds through the packet time base.
        pts = float(pkt.pts * pkt.time_base)
        duration = float(pkt.duration * pkt.time_base)
        message = f"frame[{i}] time={pts:.3f}s, duration={duration:.3f}s, now={time():.3f}s"
        logging.info(message)
def open(self):
    """Open the encoder for channel ``self.ch`` and reset playback state.

    Returns:
        False (after logging a warning) when a stream is already attached,
        True once the encoder has been opened.
    """
    already_open = self.stream is not None
    if already_open:
        logging.warning("Already opened")
        return False

    encoder = av.open(ENCODERS[self.ch])
    self.encoder = encoder
    video = encoder.streams[0]
    self.video = video
    codec = video.codec_context
    self.codec = codec
    self.stream = encoder.demux()

    # Seconds per frame as reported by the codec context.
    seconds_per_frame = float(codec.time_base * codec.ticks_per_frame)
    self.duration = seconds_per_frame
    self.fps = 1 / seconds_per_frame  # nominal FPS
    self.rate = float(codec.rate)  # average FPS

    # Nothing has been read yet.
    self.started = False
    self.start = None
    self.time = None
    self.frames = 0
    return True
def track_video():
    """Detect objects and track people in a video or image set from the CLI.

    Each processed frame is rendered twice (detections on the left, tracks on
    the right), written as a JPEG and appended to an H.264 MP4; all detections
    and tracks are also dumped to ``annotations.json`` in the export folder.

    Usage:
        # images: /zdata/projects/shared/datasets/kinetics400/frames-5fps/val/abseiling/GwlcmI36imo_000127_000137
        track_video /path/to/video.mp4 [--det-tag v3.1] [--render-all]
        track_video /path/to/image.jpg [--det-tag v3.1] [--render-all]
        track_video /path/to/images/ [--det-tag v3.1] [--render-all]

    Options (example):
        --det-chkpt-url s3://latest-sinet-checkpoints/detector/yolo/yolo5x-custom_rama_new-v3.1.pt
        --det-chkpt yolov5x_custom
        --det-classes 5
        --render-all
        --reload

    Raises:
        FileNotFoundError: if the given path is neither a video, a file nor a
            directory.
    """
    # The first positional argument of ArgumentParser is `prog`, so the
    # summary must be passed as `description`.
    parser = argparse.ArgumentParser(
        description='Detect objects with YOLOv5 and track people with Deep SORT '
        'in a video, an image or a directory of images')
    parser.add_argument(
        'path', help='Path to a video, an image or a directory of images')
    parser.add_argument('-o',
                        '--output',
                        default='export',
                        help='Path to output visualizations')
    parser.add_argument(
        '-b',
        '--batch-size',
        default=24,
        type=int,
        help='Batch size to perform object detection inference')
    parser.add_argument('--fps',
                        default=5,
                        type=int,
                        help='Frames per second in the video')
    parser.add_argument('--reload',
                        action='store_true',
                        help='Force to reload checkpoint')
    parser.add_argument('--det-amp',
                        action='store_true',
                        help='Inference in AMP')
    parser.add_argument('--det-chkpt',
                        default='yolov5x',
                        choices=['yolov5x', 'yolov5x_custom'],
                        help='Checkpoint name to save locally')
    parser.add_argument('--det-backend',
                        default=None,
                        choices=['trt'],
                        help='Inference backend to use')
    parser.add_argument('--det-trt-fp16',
                        action='store_true',
                        help='TRT FP16 enabled or not')
    parser.add_argument('--det-trt-int8',
                        action='store_true',
                        help='TRT INT8 enabled or not')
    parser.add_argument('--det-chkpt-url',
                        help='S3 URL to download checkpoint')
    parser.add_argument('--det-classes',
                        type=int,
                        default=80,
                        help='Number of classes to detect by the model')
    parser.add_argument('--det-tag',
                        default='v6.0',
                        help='Object detector code base git tag')
    # nargs=2 keeps a command-line value a [H, W] pair like the default.
    parser.add_argument('--det-resize',
                        default=[720, 1280],
                        type=int,
                        nargs=2,
                        metavar=('H', 'W'),
                        help='Resize frame')
    parser.add_argument('--det-scales',
                        default=640,
                        type=int,
                        choices=[608, 640, 672, 736],
                        help='Size to rescale input for object detection')
    parser.add_argument('--det-cls-thres',
                        default=0.4,
                        type=float,
                        help='Object class confidence threshold')
    parser.add_argument('--det-nms-thres',
                        default=0.5,
                        type=float,
                        help='NMS IoU threshold')
    parser.add_argument('--det-pooling',
                        default=1,
                        type=int,
                        help='Object feature pooling size for tracking')
    # type=int so class ids given on the command line match the int default.
    parser.add_argument('--trk-cls-person',
                        nargs='+',
                        type=int,
                        default=[0],
                        help='One or more person classes to track')
    parser.add_argument('--trk-max-iou-dist',
                        default=0.8,
                        type=float,
                        help='max (1 - IoU) distance to track ')
    parser.add_argument('--trk-max-feat-dist',
                        default=0.1395,
                        type=float,
                        help='max (1 - feature similarity) distance to track')
    parser.add_argument('--trk-gating-kf',
                        default='iain',
                        choices=['org', 'iain', False],
                        help='KF gating for Deep Sort')
    parser.add_argument('--trk-gating-thrd',
                        default=50,
                        type=float,
                        help='KF gating threshold')
    parser.add_argument('--trk-gating-alpha',
                        default=0.2,
                        type=float,
                        help='KF gating parameter')
    parser.add_argument('--render-all',
                        action='store_true',
                        help='Render all objects or person only')
    cfg = parser.parse_args()
    print(cfg)

    from ml.vision.models import yolo5x, yolo5
    from ml.vision.models.tracking.dsort import DSTracker
    from ml.vision.ops import dets_select
    from ml.vision.utils import rgb, COLORS91
    from ml.vision.datasets.coco import COCO80_CLASSES
    from torchvision.transforms import functional as TF
    from torchvision.io import write_jpeg, read_image
    from torchvision.utils import draw_bounding_boxes
    from ml import av, hub, logging, time
    import numpy as np
    import torch as th

    path = Path(cfg.path)
    fps = cfg.fps
    src = None
    if path.suffix in ['.mp4', '.avi']:
        # path to video
        src = av.open(cfg.path)
        v = src.decode(video=0)
        # Use the video stream explicitly: stream 0 may be an audio track.
        codec = src.streams.video[0].codec_context
        fps = round(codec.framerate)
        logging.info(f"Tracking video@{float(fps):.2f}fps in {path}")
    else:
        # path to image or a directory of subsampled images
        if path.is_file():
            paths = [path]
        elif path.is_dir():
            paths = sorted([f for f in path.iterdir() if f.is_file()])
        else:
            raise FileNotFoundError(f"No video, image or directory at {path}")

        def framer():
            # Images are read with torchvision's read_image, one per file.
            for p in paths:
                yield read_image(str(p))

        v = framer()
        logging.info(f"Tracking {len(paths)} frames@{cfg.fps}fps in {path}")

    dev = th.cuda.default_stream().device if th.cuda.is_available() else 'cpu'
    if cfg.det_chkpt_url:
        model = yolo5
        spec = hub.parse(cfg.det_chkpt_url)
        s3 = spec if spec['scheme'] == 's3://' else None
        # Moved to `dev` like the stock checkpoint below.
        detector = model(classes=cfg.det_classes,
                         pretrained=True,
                         chkpt=cfg.det_chkpt,
                         tag=cfg.det_tag,
                         s3=s3,
                         fuse=True,
                         pooling=cfg.det_pooling,
                         force_reload=cfg.reload).to(dev)
    else:
        model = yolo5x
        detector = model(tag=cfg.det_tag,
                         pretrained=True,
                         classes=cfg.det_classes,
                         fuse=True,
                         pooling=cfg.det_pooling,
                         force_reload=cfg.reload).to(dev)

    if cfg.det_backend in ['trt']:
        import math

        # XXX Deployment by batch size and minimal preprocessed shape
        amp = cfg.det_trt_fp16
        bs = cfg.batch_size
        scale = cfg.det_scales
        H, W = cfg.det_resize
        # Longer side becomes `scale`; shorter side is rounded up to a
        # multiple of 32.
        if W > H:
            spec = (3, 32 * math.ceil(H / W * scale / 32), scale)
        else:
            spec = (3, scale, 32 * math.ceil(W / H * scale / 32))
        logging.info(
            f"Deploying runtime={cfg.det_backend} with batch_size={bs}, spec={spec}, fp16={amp}"
        )
        detector.deploy('yolov5x',
                        batch_size=bs,
                        spec=spec,
                        fp16=amp,
                        int8=cfg.det_trt_int8,
                        backend=cfg.det_backend,
                        reload=False)

    tracker = DSTracker(max_feat_dist=cfg.trk_max_feat_dist,
                        max_iou_dist=cfg.trk_max_iou_dist,
                        max_age=cfg.fps * 2,
                        n_init=3,
                        nn_budget=cfg.fps * 2,
                        gating_kf=cfg.trk_gating_kf,
                        gating_thrd=cfg.trk_gating_thrd,
                        gating_alpha=cfg.trk_gating_alpha)

    export_path = Path(
        f'{cfg.output}/{path.stem}-{cfg.det_chkpt}_{cfg.det_tag}_{cfg.det_scales}_{cfg.det_cls_thres}_{cfg.det_nms_thres}'
    )
    # create frame path
    export_frame = export_path / 'rendered_frames'
    export_frame.mkdir(parents=True, exist_ok=True)
    assert export_frame.exists()

    TRACK_INFO = 'x1, y1, x2, y2, score, class, origin_x, origin_y, velocity_x, velocity_y'
    DETECTION_INFO = 'x1, y1, x2, y2, score, class'
    annot_dict = defaultdict(dict)

    def render_frame(idx, frame, dets, tracks=False):
        """Draw boxes on `frame` and record them in `annot_dict[idx]`."""
        if tracks:
            # `dets` is a non-empty list of [track id, track tensor] pairs.
            tids, dets = list(zip(*dets))
            dets = th.stack(dets)
            labels = [f"[{int(c)}][{tid}]" for tid, c in zip(tids, dets[:, 5])]
            colors = [rgb(tid, integral=True) for tid in tids]
            annot_dict[idx]['tracks'] = {
                tid: det
                for tid, det in zip(tids, dets.tolist())
            }
        else:
            labels = [COCO80_CLASSES[i] for i in dets[:, -1].int()]
            colors = [COLORS91[i] for i in dets[:, 5].int()]
            annot_dict[idx]['detections'] = dets.tolist()
        # get boxes: x1, y1, x2, y2
        boxes = dets[:, :4]
        # draw bounding boxes
        return draw_bounding_boxes(frame,
                                   boxes=boxes,
                                   labels=labels,
                                   colors=colors,
                                   fill=True,
                                   width=3,
                                   font_size=25)

    logging.info(f"Saving tracked video and frames to {export_path}")
    media = av.open(f"{export_path}/{path.stem}-tracking.mp4", 'w')
    stream = media.add_stream('h264', cfg.fps)
    stream.bit_rate = 2000000

    def track_frames(frames, start, step):
        """Detect, track and render one batch whose first frame is `start`."""
        frames = th.stack(frames)
        with th.cuda.amp.autocast(enabled=cfg.det_amp):
            dets, features = detector.detect(frames,
                                             size=cfg.det_scales,
                                             conf_thres=cfg.det_cls_thres,
                                             iou_thres=cfg.det_nms_thres,
                                             batch_preprocess=True)
        # Track person only; everything else is kept for rendering.
        persons = dets_select(dets, cfg.trk_cls_person)
        objs = [
            dets_f[~persons_f].cpu() for dets_f, persons_f in zip(dets, persons)
        ]
        ppls = [
            dets_f[persons_f].cpu() for dets_f, persons_f in zip(dets, persons)
        ]
        ppl_feats = [
            feats_f[persons_f].cpu()
            for feats_f, persons_f in zip(features, persons)
        ]
        for k, (objs_f, ppls_f,
                ppl_feats_f) in enumerate(zip(objs, ppls, ppl_feats)):
            frame_no = start + k * step  # index in the source sequence
            logging.info(
                f"[{frame_no}] objs: {tuple(objs_f.shape)}, ppls: {tuple(ppls_f.shape)}, feats: {tuple(ppl_feats_f.shape)}"
            )
            assert objs_f.shape[1] == 4 + 1 + 1
            assert ppls_f.shape[1] == 4 + 1 + 1
            assert len(ppls_f) == len(ppl_feats_f)
            assert ppl_feats_f.shape[1] == 320 + 640 + 1280
            matches = tracker.update(
                ppls_f,
                ppl_feats_f.view(len(ppl_feats_f),
                                 np.prod(ppl_feats_f.shape[1:])))
            snapshot = tracker.snapshot()
            tracks = [[tid, info] for tid, info in snapshot]
            logging.debug(f"matches[{frame_no}]: {matches}")
            logging.debug(f"snapshot[{frame_no}]: {snapshot}")

            # Render both dets and tracks side by side
            frame_det = frames[k]
            frame_trk = frame_det.clone()
            C, H, W = frame_det.shape
            idx = f'{frame_no:03d}'
            if cfg.render_all:
                shown = th.cat([ppls_f, objs_f])
            else:
                shown = ppls_f
            frame_det = render_frame(idx, frame_det, shown, False)
            if tracks:
                frame_trk = render_frame(idx, frame_trk, tracks, True)
            frame = th.zeros((C, H, 2 * W), dtype=th.uint8)
            frame[:, :, :W] = frame_det
            frame[:, :, W:] = frame_trk
            write_jpeg(frame, str(export_frame / f"frame{idx}.jpg"))

            frame = av.VideoFrame.from_ndarray(frame.permute(1, 2, 0).numpy(),
                                               format='rgb24')
            packets = stream.encode(frame)
            media.mux(packets)
            logging.debug(f'Encoded: {len(packets)} {packets}, {frame}')

    frames = []
    BS = cfg.batch_size
    # Never let the subsampling step reach 0 when the video is slower than
    # the requested rate.
    step = 1 if src is None else max(1, fps // cfg.fps)
    t = time.time()
    i = -1  # stays -1 when the source yields nothing
    batch_start = 0
    try:
        for i, frame in enumerate(v):
            if isinstance(frame, av.VideoFrame):
                frame = th.as_tensor(
                    np.ascontiguousarray(
                        frame.to_rgb().to_ndarray())).permute(2, 0, 1)
            if cfg.det_resize and cfg.det_backend in ['trt']:
                frame = TF.resize(frame, cfg.det_resize, antialias=True)
            if i == 0:
                # Output holds two renderings side by side.
                stream.height = frame.shape[1]
                stream.width = frame.shape[2] * 2
            if src is not None and i % step != 0:
                continue
            if not frames:
                # Remember the real index of the first frame in the batch;
                # the last enumerated index may belong to a skipped frame.
                batch_start = i
            frames.append(frame)
            if len(frames) < BS:
                continue
            track_frames(frames, batch_start, step)
            frames.clear()
        if frames:
            track_frames(frames, batch_start, step)
        # Flush the encoder.
        packets = stream.encode(None)
        if packets:
            media.mux(packets)
    finally:
        media.close()
        if src is not None:
            src.close()

    # write annotations to file
    annotations = {
        'info': {
            'track': TRACK_INFO,
            'detection': DETECTION_INFO
        },
        'annotations': annot_dict
    }
    with open(f'{export_path}/annotations.json', 'w') as f:
        json.dump(annotations, f, indent=4)
    elapse = time.time() - t
    logging.info(
        f"Done tracking {path.name} in {elapse:.3f}s at {(i + 1) / step / elapse:.2f}fps"
    )
def test_rtsp(rtsp):
    """Demux 30 RTSP video packets, dump their NAL units and decode them.

    Every packet is rebuilt from its NAL units (dropping one trailing zero
    byte per unit), written to ``tmp/frameNN.264`` and then decoded with a
    standalone H.264 codec context. Decoded frames are saved as JPEGs under
    ``tmp/images``.

    Args:
        rtsp: source passed to ``av.open`` with ``rtsp_transport='http'``.
    """
    workaround = True
    s = av.open(rtsp, options=dict(rtsp_transport='http'))
    v = s.demux(video=0)
    codec = s.streams[0].codec_context
    if True:
        # Rebuild each packet from its parsed NAL units.
        pkts = []
        for _ in range(15 * 2):
            pkt = next(v)
            NALUs = []
            # NOTE: `_` and `type` are rebound by this unpacking; the outer
            # range() iteration is not affected by it.
            for (pos, _, _, type), nalu in NALUParser(memoryview(pkt),
                                                      workaround=False):
                # Drop one trailing 0x00 byte; the and/or idiom falls back to
                # the full unit when the stripped slice is empty.
                NALUs.append(nalu[-1] == 0x00 and nalu[:-1] or nalu)
            packet = av.Packet(b''.join(NALUs))
            # Carry the timing of the original packet over.
            packet.dts = pkt.dts
            packet.pts = pkt.pts
            packet.time_base = pkt.time_base
            pkt = packet
            pkts.append(pkt)
    else:
        pkts = [next(v) for _ in range(15 * 2)]
    print(pkts)
    # frames = [pkt.decode()[0] for pkt in pkts]
    print()
    if codec.extradata is not None:
        # Log and save the codec extradata.
        for (pos, _, _, type), nalu in NALUParser(codec.extradata):
            logging.info(
                f"CPD {NALU_t(type).name} at {pos}: {nalu[:8]} ending with {nalu[-1:]}"
            )
        with open(f"tmp/cpd.264", 'wb') as f:
            f.write(codec.extradata)
    for i, pkt in enumerate(pkts):
        print(f"pkt[{i}] {pkt.is_keyframe and 'key ' or ''}{pkt}")
        for (pos, _, _, type), nalu in NALUParser(memoryview(pkt),
                                                  workaround=False):
            logging.info(
                f"frame[{i}] {NALU_t(type).name} at {pos}: {nalu[:8].tobytes()} ending with {nalu[-1:].tobytes()}"
            )
        # One file per packet.
        with open(f"tmp/frame{i:02d}.264", 'wb') as f:
            f.write(pkt)
    # for i, frame in enumerate(frames):
    #     print(f"frame[{i}] {frame.key_frame and 'key' or ''}{frame}")
    decoded = []
    # Decode with a fresh codec context instead of the one of the stream.
    h264 = av.CodecContext.create('h264', 'r')
    for i, pkt in enumerate(pkts):
        res = h264.decode(pkt)
        print(f"pkt[{i}] {len(res)} frames decoded")
        if res:
            assert len(res) == 1
            decoded.append(res[0])
    # Decode without a packet to collect what the decoder still holds
    # (presumably a flush - confirm against the PyAV API).
    res = h264.decode()
    if res:
        print(f"pkt[-1] {len(res)} frames decoded")
        assert len(res) == 1
        decoded.append(res[0])
    if workaround:
        print(decoded)
        from ml import cv
        for i, f in enumerate(decoded):
            # Reverse the channel axis of the RGB array before saving.
            cv.save(f.to_rgb().to_ndarray()[:, :, ::-1],
                    f'tmp/images/frame{i:02d}.jpg')
    else:
        # NOTE(review): unreachable while `workaround` is True; `frames` is
        # only defined by the commented-out line above.
        for i, f in enumerate(decoded):
            assert (f.to_ndarray() == frames[i].to_ndarray()).all()
def openAV(src, decoding=False, with_audio=False, **kwargs):
    """Open an A/V source with PyAV and describe it as a session dict.

    Args:
        src: RTSP URL, webcam index or ``/dev/videoX`` path, or any other
            source accepted by ``av.open``.
        decoding: stored in the session; forced to True for webcams.
        with_audio: currently unused, audio streaming is not supported yet.
        **kwargs: optional settings:
            fps: fallback frame rate (default 10).
            rtsp_transport: RTSP transport tried first (default 'tcp'); the
                remaining ones of tcp/http are tried afterwards.
            stimeout: socket TCP I/O timeout in us (default '5000000').
            resolution: webcam resolution arguments (default ['720p']).
            adaptive, workaround: copied into the video entry (default True).

    Returns:
        dict with keys ``src``, ``streams``, ``format``, ``decoding``,
        ``start``, ``rt`` and, when the source has a video stream, ``video``.

    Raises:
        AssertionError: if no RTSP transport could open the source.
        ValueError: for a webcam on an unsupported platform.
        Exception: whatever ``av.open`` raises, after logging it.
    """
    try:
        format = None
        options = None
        fps = float(kwargs.get('fps', 10))
        if str(src).startswith('rtsp'):
            # ffmpeg RTSP options:
            #   rtsp_transport: tcp, http, udp_multicast, udp
            #   rtsp_flags: prefer_tcp, filter_src, listen, none
            #   allowed_media_types: video, audio, data
            #   stimeout: socket TCP I/O timeout in us
            # RTSP/HTTP required for VPN but not necessarily supported
            preferred = kwargs.get('rtsp_transport', 'tcp')
            options = dict(
                rtsp_transport=preferred,
                rtsp_flags='prefer_tcp',
                stimeout=kwargs.get('stimeout',
                                    '5000000'))  # in case of network down
            # Honor the requested transport first, then fall back to the
            # others; with the default this is the usual tcp -> http order.
            transports = [preferred] if preferred else []
            transports += [t for t in ('tcp', 'http') if t != preferred]
            source = None
            failure = None
            for transport in transports:
                options['rtsp_transport'] = transport
                try:
                    source = av.open(src,
                                     format=format,
                                     options=options,
                                     timeout=(15, 5))
                except Exception as e:
                    failure = e
                    logging.warning(
                        f'Failed with rtsp_transport={transport}: {e}')
                else:
                    break
            if source is None:
                # Raised explicitly so the check survives `python -O`; the
                # exception type is unchanged for existing callers.
                raise AssertionError(
                    "Failed to open RTSP source over TCP/HTTP") from failure
        else:
            if isinstance(src, int) or (isinstance(src, str)
                                        and src.startswith('/dev/video')):
                # XXX webcam: high FPS with MJPG
                import platform
                system = platform.system()
                resolution = av.resolution_str(
                    *kwargs.get('resolution', ['720p']))
                options = {
                    'framerate': str(fps),
                    'video_size': resolution,
                    'input_format': 'mjpeg'
                }
                decoding = True
                if system == 'Darwin':
                    src = str(src)
                    format = 'avfoundation'
                elif system == 'Linux':
                    src = f"/dev/video{src}" if isinstance(src, int) else src
                else:
                    raise ValueError(f"Webcam unsupported on {system}")
            source = av.open(src, format=format, options=options)
        # timeout: maximum timeout (in secs) to wait for incoming connections
        # and socket reading
        # XXX HLS connection potential time out for taking more than 5s
        logging.info(
            f"av.open({src}, format={format}, options={options}, timeout=(15, 5))"
        )
    except Exception as e:
        logging.error(e)
        raise  # bare raise keeps the original traceback

    # H.264 NALU formats:
    #   Annex b.:
    #     RTSP/RTP: rtsp, 'RTSP input', set()
    #     bitstream: h264, 'raw H.264 video', {'h26l', 'h264', '264', 'avc'}
    #     webcam: /dev/videoX, ...
    #     DeepLens: /opt/.../...out
    #     NUUO/NVR: N/A
    #   AVCC:
    #     avi: avi, 'AVI (Audio Video Interleaved)', {'avi'}
    #     mp4: 'mov,mp4,m4a,3gp,3g2,mj2', 'QuickTime / MOV',
    #          {'m4a', 'mov', 'mp4', 'mj2', '3gp', '3g2'}
    #     webm/mkv: 'matroska,webm', 'Matroska / WebM',
    #               {'mks', 'mka', 'mkv', 'mk3d'}
    #     DASH/KVS(StreamBody): file-like obj
    now = time.time()
    # XXX start_time may be negative (webcam), zero if unavailable, or a small
    # logical timestamp; a missing value (None) is treated as zero.
    start_time = (source.start_time or 0) / 1e6  # us -> s
    # More than 30 days away from now: too small to be absolute.
    relative = abs(start_time - now) > 60 * 60 * 24 * 30
    rt = not (isinstance(src, str) and os.path.isfile(src)
              )  # regular file or not
    if rt:
        logging.info(f"Assume real-time source: {src}")
    else:
        logging.info(f"Simulating local source as real-time: {src}")

    session = dict(
        src=src,
        streams=source,
        format=source.format.name,
        decoding=decoding,
        start=now if relative else start_time,
        rt=rt,
    )
    session_start_local = strftime('%X', localtime(session['start']))
    source_start_local = strftime('%X', localtime(start_time))
    logging.info(
        f"Session start: {session['start']:.3f}s({session_start_local}), source start: {start_time:.3f}s({source_start_local})"
    )

    if source.streams.video:
        # FIXME RTSP FPS might be unavailable or incorrectly set
        video0 = source.streams.video[0]
        codec = video0.codec_context
        time_base = codec.time_base
        ticks = codec.ticks_per_frame
        # Nominal FPS is only defined for a non-zero frame duration.
        FPS = 1 / (time_base * ticks) if time_base and ticks else None
        if FPS is not None and FPS > 60 and codec.framerate:
            # Implausible nominal rate: trust the codec frame rate instead of
            # the caller's fallback.
            fps = float(codec.framerate)
        session['video'] = dict(
            stream=source.demux(video=0),
            start=video0.start_time,  # same as 1st frame in pts
            codec=codec,
            format=video0.name,
            width=video0.width,
            height=video0.height,
            fps=fps,
            count=0,
            time=0,  # pts in secs
            duration=None,  # frame duration in secs
            drifting=False,
            adaptive=kwargs.get('adaptive', True),
            workaround=kwargs.get('workaround', True),
            thresholds=dict(drifting=10, ),
            prev=None,
        )
        logging.info(
            f"codec.framerate={codec.framerate}, codec.time_base={codec.time_base}, codec.ticks_per_frame={codec.ticks_per_frame}, fps={session['video']['fps']}, FPS={FPS}"
        )

    if source.streams.audio:
        # TODO: add session['audio'] (stream, start, format, codec,
        # sample_rate, channels, count, time) once audio is supported; AAC
        # was called out as unsupported in the earlier sketch.
        logging.warning(f"No audio streaming supported yet")
    return session
def test_fragments(stream, total, inf):
    """Play H.264 media of a Kinesis Video stream and print frame timing.

    Args:
        stream: Kinesis Video stream name.
        total: number of frames after which the real-time FPS is printed and
            the current GetMedia session ends.
        inf: when falsy, stop after one GetMedia session; otherwise keep
            requesting new sessions.
    """
    kvs = boto3.client("kinesisvideo")
    info = kvs.describe_stream(StreamName=stream)['StreamInfo']
    assert stream == info['StreamName']
    print(f"{stream} info:")
    for key, value in info.items():
        print(f'\t{key}:', value)
    # Only video/h264 streams are handled below.
    media, codec = info['MediaType'].split('/')
    assert media == 'video'
    assert codec == 'h264'
    dataEndpoint = kvs.get_data_endpoint(StreamName=stream,
                                         APIName='GET_MEDIA')['DataEndpoint']
    print(f"{stream} endpoint: {dataEndpoint}")
    kvm = boto3.client('kinesis-video-media',
                       endpoint_url=dataEndpoint,
                       region_name='us-east-1')
    while True:
        # `elapse` accumulates packet durations on top of the session start
        # and is compared with the wall clock to pace playback.
        now = elapse = tic = time()
        media = kvm.get_media(
            StreamName=stream,
            StartSelector=dict(StartSelectorType='NOW'),
            #StartSelector=dict(StartSelectorType='PRODUCER_TIMESTAMP',
            #StartSelector=dict(StartSelectorType='SERVER_TIMESTAMP',
            #                   StartTimestamp=time())
        )
        contentType = media['ContentType']
        payload = media['Payload']
        print(f"Received {contentType} stream payload")
        # The payload object is handed to PyAV directly.
        webm = av.open(payload)
        video = webm.streams.video[0]
        codec = video.codec_context
        print(webm.format, f"{webm.start_time / 1000:.3f}")  # start time in ms
        print(video, video.type, video.time_base,
              f"{video.start_time / 1000:.3f}s", video.base_rate,
              video.average_rate, video.guessed_rate)  # start time in s
        print(f"{codec.type}/{codec.name}", codec.time_base,
              codec.ticks_per_frame)  # codec type/name, frame duration, FPS
        start = video.start_time
        duration_cc = float(codec.time_base * codec.ticks_per_frame)
        fps = 1 / duration_cc
        started = False
        # NOTE(review): stdlib `sys` has no x_available(); presumably a project
        # module reporting whether a display is available - confirm.
        X = sys.x_available()
        print(
            f"Streaming {codec.type}/{codec.name} since {start/1000:.3f}s(now={now:.3f}s, diff={now-start/1000:.3f}s) at {fps:.2f}FPS with frame duration of {duration_cc:.3f}s"
        )
        try:
            for i, packet in enumerate(webm.demux(video=0), 1):
                frame = packet.decode()[0]
                now = time()
                pts = frame.time
                duration_pkt = float(packet.duration *
                                     packet.time_base)  # FIXME 0 -> 1/FPS
                # Reverse the channel axis of the RGB array.
                frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
                print(
                    f"{packet.is_keyframe and 'key ' or ''}frame[{i}]{frame.shape} of {frame.nbytes} bytes with pts={pts:.3f} at {now:.3f}s and duration={duration_pkt:.3f}s({packet.time_base}, {packet.duration})"
                )
                elapse += duration_pkt
                # Positive slack: media time is ahead of the wall clock.
                slack = elapse - now
                if slack > 0:
                    print(f"Sleep for {slack:.3f}s")
                    sleep(slack)
                if X:
                    cv.imshow('LIVE', frame)
                    cv.waitKey(1)
                if i == total:
                    print(f"RT FPS={total/(now-tic)}")
                    break
        except Exception as e:
            # Any failure inside the loop ends this session with a message.
            print(f"Failed to decode: {e}")
        webm.close()
        if not inf:
            break
def test_rfcn_deep_sort(video):
    """Track people in the first 60 frames of a video with RFCN + Deep SORT.

    Detections are rendered to ``export/<stem>-<model>/dets`` and tracks to
    ``export/<stem>-<model>/tracking``; the track renderings are also encoded
    into ``export/<stem>-<model>/tracking.mp4``.

    Args:
        video: path of the video to track.
    """
    from ml.vision.models.tracking.dsort import DeepSort
    from ml import av

    model, size = rfcn, 608
    detector = model(pooling=2,
                     model_dir="/tmp/checkpoints",
                     force_reload=False)
    tracker = DeepSort(
        max_feat_dist=0.2,
        nn_budget=100,
        max_iou_dist=0.7,  # 0.7
        max_age=15,  # 30 (FPS)
        n_init=3)  # 3

    source = av.open(video)
    media = None
    try:
        # Video frames only: a bare decode() would also yield audio frames.
        v = source.decode(video=0)
        video = Path(video)
        export = Path(f'export/{video.stem}-{model.__name__}')
        # parents=True: the top-level export directory may not exist yet.
        export.mkdir(parents=True, exist_ok=True)
        assert export.exists()
        print(f"Tracking video: {video}")
        print(f"Saving to {export / 'tracking.mp4'}")
        media = av.open(f"{export}/tracking.mp4", 'w')
        stream = media.add_stream('h264', 15)
        stream.bit_rate = 2000000
        for i, frame in enumerate(v):
            if i == 0:
                stream.height = frame.height
                stream.width = frame.width
            frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
            dets, features = detector.detect([frame], size=size)

            # Track person only
            person = dets[0][:, -1] == 0
            dets[0] = dets[0][person]
            features[0] = features[0][person]
            assert len(dets) == 1
            assert len(dets[0]) == features[0].shape[0]
            assert dets[0].shape[1] == 4 + 1 + 1
            assert features[0].shape[1] == 1024
            if len(dets[0]) > 0:
                # Flatten every feature to one vector per detection.
                D = 1
                for dim in features[0].shape[1:]:
                    D *= dim
                tracker.update(dets[0], features[0].view(len(features[0]), D))

            if i >= 60:
                break
            logging.info(
                f"[{i}] dets[0]: {dets[0].shape}, feats: {[tuple(feats.shape) for feats in features]}"
            )
            detector.render(frame,
                            dets[0],
                            path=export / 'dets' / f"frame{i:03d}.jpg")

            snapshot = tracker.snapshot()
            logging.info(
                f"[{i}] snapshot[0]: {snapshot and list(zip(*snapshot))[0] or len(snapshot)}"
            )
            frame = detector.render(
                frame,
                snapshot,
                path=
                f"export/{video.stem}-{model.__name__}/tracking/frame{i:03d}.jpg")
            frame = av.VideoFrame.from_ndarray(frame, format='bgr24')
            packets = stream.encode(frame)
            print('encoded:', packets, frame)
            media.mux(packets)

        # Flush the encoder.
        packets = stream.encode(None)
        media.mux(packets)
    finally:
        if media is not None:
            media.close()
        source.close()
def test_yolo5x_store_deep_sort(video):
    """Track people in the first 60 frames with the store YOLOv5x + Deep SORT.

    Classes 1-3 of the detector output are treated as persons. Detections are
    rendered to ``export/<stem>-<model>/dets`` and tracks to
    ``export/<stem>-<model>/tracking``; the track renderings are also encoded
    into ``export/<stem>-<model>/tracking.mp4``.

    Args:
        video: path of the video to track.
    """
    from ml.vision.models.tracking.dsort import DeepSort
    from ml.vision.datasets.widerperson import WIDERPERSON_CLASSES
    from ml import av

    WIDERPERSON_CLASSES[0] = 'object'
    model, size = yolo5, 736
    detector = model(name='yolov5x-store',
                     pretrained=True,
                     bucket='eigen-pretrained',
                     key='detection/yolo/yolov5x-store.pt',
                     classes=len(WIDERPERSON_CLASSES),
                     pooling=True,
                     fuse=True,
                     model_dir=None,
                     force_reload=False)
    pooler = MultiScaleFusionRoIAlign(3)  # NOTE(review): unused below
    tracker = DeepSort(
        max_feat_dist=0.2,
        nn_budget=100,
        max_iou_dist=0.7,  # 0.7
        max_age=15,  # 30 (FPS)
        n_init=3)  # 3

    source = av.open(video)
    media = None
    try:
        # Video frames only: a bare decode() would also yield audio frames.
        v = source.decode(video=0)
        video = Path(video)
        export = Path(f'export/{video.stem}-{model.__name__}')
        # parents=True: the top-level export directory may not exist yet.
        export.mkdir(parents=True, exist_ok=True)
        assert export.exists()
        print(f"Tracking video: {video}")
        print(f"Saving to {export / 'tracking.mp4'}")
        media = av.open(f"{export}/tracking.mp4", 'w')
        stream = media.add_stream('h264', 15)
        stream.bit_rate = 2000000
        for i, frame in enumerate(v):
            if i == 0:
                stream.height = frame.height
                stream.width = frame.width
            frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
            dets, features = detector.detect([frame], size=size)

            # Track person only
            person = (0 < dets[0][:, -1]) & (dets[0][:, -1] < 4)
            persons = dets[0][person]
            features[0] = features[0][person]
            assert len(dets) == 1
            assert len(persons) == features[0].shape[0]
            assert dets[0].shape[1] == 4 + 1 + 1
            assert features[0].shape[1] == 320 + 640 + 1280
            # Guard on the filtered persons: the frame may contain objects
            # but nobody to track.
            if len(persons) > 0:
                # Flatten every feature to one vector per person.
                D = 1
                for dim in features[0].shape[1:]:
                    D *= dim
                tracker.update(persons, features[0].view(len(features[0]), D))

            if i >= 60:
                break
            logging.info(
                f"[{i}] dets[0]: {dets[0].shape}, feats: {[tuple(feats.shape) for feats in features]}"
            )
            cv.render(frame,
                      dets[0],
                      classes=WIDERPERSON_CLASSES,
                      path=export / 'dets' / f"frame{i:03d}.jpg")

            snapshot = tracker.snapshot()
            logging.info(
                f"[{i}] snapshot[0]: {snapshot and list(zip(*snapshot))[0] or len(snapshot)}"
            )
            frame = cv.render(
                frame,
                snapshot,
                classes=WIDERPERSON_CLASSES,
                path=
                f"export/{video.stem}-{model.__name__}/tracking/frame{i:03d}.jpg")
            frame = av.VideoFrame.from_ndarray(frame, format='bgr24')
            packets = stream.encode(frame)
            print('encoded:', packets, frame)
            media.mux(packets)

        # Flush the encoder.
        packets = stream.encode(None)
        media.mux(packets)
    finally:
        if media is not None:
            media.close()
        source.close()
def test_yolo_deep_sort(video):
    """Track people in up to 60 frames of a video or image set with YOLOv5x.

    Detections are rendered to ``export/<stem>-<model>/dets`` and tracks to
    ``export/<stem>-<model>/tracking``; the track renderings are also encoded
    into ``export/<stem>-<model>/tracking.mp4``.

    Args:
        video: path of a ``.mp4``/``.avi`` video, a single image, or a
            directory of images.

    Raises:
        FileNotFoundError: if the path is neither a video, a file nor a
            directory.
    """
    import numpy as np
    from ml.vision.models.tracking.dsort import DeepSort
    from ml import av

    # yolo4 at size 608 was the earlier configuration.
    model, size = yolo5x, 736
    detector = model(pretrained=True, fuse=True, pooling=True)
    pooler = MultiScaleFusionRoIAlign(3)  # NOTE(review): unused below
    tracker = DeepSort(
        max_feat_dist=0.2,
        nn_budget=100,
        max_iou_dist=0.7,  # 0.7
        max_age=15,  # 30 (FPS)
        n_init=3)  # 3

    video = Path(video)
    source = None
    if video.suffix in ['.mp4', '.avi']:
        # Pass a plain string: not every PyAV version accepts Path objects.
        source = av.open(str(video))
        v = source.decode(video=0)
        print(f"Tracking video: {video}")
    else:
        if video.is_file():
            files = [video]
        elif video.is_dir():
            files = sorted([f for f in video.iterdir() if f.is_file()])
        else:
            raise FileNotFoundError(f"No video, image or directory at {video}")
        v = [cv.imread(f) for f in files]
        print(f"Tracking {len(files)} frames in {video}")

    media = None
    try:
        export = Path(f'export/{video.stem}-{model.__name__}')
        export.mkdir(parents=True, exist_ok=True)
        assert export.exists()
        print(f"Saving to {export / 'tracking.mp4'}")
        media = av.open(f"{export}/tracking.mp4", 'w')
        stream = media.add_stream('h264', 15)
        stream.bit_rate = 2000000
        for i, frame in enumerate(v):
            if not isinstance(frame, np.ndarray):
                # Decoded video frame: convert to an array with the channel
                # axis reversed.
                frame = frame.to_rgb().to_ndarray()[:, :, ::-1]
            if i == 0:
                stream.height = frame.shape[0]
                stream.width = frame.shape[1]
            dets, features = detector.detect([frame], size=size)

            # Track person only
            person = dets[0][:, -1] == 0
            persons = dets[0][person]
            features[0] = features[0][person]
            assert len(dets) == 1
            assert len(persons) == features[0].shape[0]
            assert dets[0].shape[1] == 4 + 1 + 1
            assert features[0].shape[1] == 320 + 640 + 1280
            # Guard on the filtered persons: the frame may contain objects
            # but nobody to track.
            if len(persons) > 0:
                # Flatten every feature to one vector per person.
                D = 1
                for dim in features[0].shape[1:]:
                    D *= dim
                tracker.update(persons, features[0].view(len(features[0]), D))

            if i >= 60:
                break
            logging.info(
                f"[{i}] dets[0]: {dets[0].shape}, feats: {[tuple(feats.shape) for feats in features]}"
            )
            cv.render(frame,
                      dets[0],
                      path=export / 'dets' / f"frame{i:03d}.jpg")

            snapshot = tracker.snapshot()
            logging.info(
                f"[{i}] snapshot[0]: {snapshot and list(zip(*snapshot))[0] or len(snapshot)}"
            )
            frame = cv.render(
                frame,
                snapshot,
                path=
                f"export/{video.stem}-{model.__name__}/tracking/frame{i:03d}.jpg")
            frame = av.VideoFrame.from_ndarray(frame, format='bgr24')
            packets = stream.encode(frame)
            print('encoded:', packets, frame)
            media.mux(packets)

        # Flush the encoder.
        packets = stream.encode(None)
        media.mux(packets)
    finally:
        if media is not None:
            media.close()
        if source is not None:
            source.close()