def test_keyframe_reading(self, test_video, config):
    full_path = os.path.join(VIDEO_DIR, test_video)

    av_reader = av.open(full_path)
    # reduce streams to only keyframes
    av_stream = av_reader.streams.video[0]
    av_stream.codec_context.skip_frame = "NONKEY"

    av_keyframes = []
    vr_keyframes = []
    if av_reader.streams.video:
        # get all keyframes using pyav. Then, seek randomly into the video reader
        # and assert that all the returned values are in AV_KEYFRAMES
        for av_frame in av_reader.decode(av_stream):
            av_keyframes.append(float(av_frame.pts * av_frame.time_base))

    if len(av_keyframes) > 1:
        video_reader = VideoReader(full_path, "video")
        for i in range(1, len(av_keyframes)):
            seek_val = (av_keyframes[i] + av_keyframes[i - 1]) / 2
            data = next(video_reader.seek(seek_val, True))
            vr_keyframes.append(data["pts"])

        data = next(video_reader.seek(config.duration, True))
        vr_keyframes.append(data["pts"])

        assert len(av_keyframes) == len(vr_keyframes)

        # NOTE: this video gets a different keyframe with different
        # loaders (0.333 for pyav, 0.666 for us)
        if test_video != "TrumanShow_wave_f_nm_np1_fr_med_26.avi":
            for i in range(len(av_keyframes)):
                assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001)
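# The parametrized tests in this file assume a `test_videos` mapping from file
# name to a ground-truth config exposing fields such as `duration`, `video_fps`,
# and `check_aframes`. A minimal sketch of what that fixture data might look
# like; the `GroundTruth` name, the file names, and all values below are
# illustrative assumptions, not the actual test configuration:
import collections

GroundTruth = collections.namedtuple("GroundTruth", ["duration", "video_fps", "check_aframes"])

test_videos = {
    "example_a.avi": GroundTruth(duration=2.0, video_fps=30.0, check_aframes=False),
    "example_b.mp4": GroundTruth(duration=11.0, video_fps=29.97, check_aframes=True),
}

# Tests that take (test_video, config) arguments would then be parametrized
# roughly as: @pytest.mark.parametrize("test_video, config", test_videos.items())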
def test_fate_suite(self):
    video_path = fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)
    vr = VideoReader(video_path)
    metadata = vr.get_metadata()

    self.assertIsNotNone(metadata["subtitles"]["duration"])
    os.remove(video_path)
def test_frame_reading(self):
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)

        # compare video frames against pyav
        av_reader = av.open(full_path)
        if av_reader.streams.video:
            video_reader = VideoReader(full_path, "video")
            for av_frame in av_reader.decode(av_reader.streams.video[0]):
                vr_frame = next(video_reader)

                assert float(av_frame.pts * av_frame.time_base) == approx(vr_frame["pts"], abs=0.1)

                av_array = torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1)
                vr_array = vr_frame["data"]
                mean_delta = torch.mean(torch.abs(av_array.float() - vr_array.float()))
                # on average the difference is very small and caused
                # by decoding (around 1%)
                # TODO: assess empirically how to set this; atm it's 1%
                # averaged over all frames
                assert mean_delta.item() < 2.5

        # compare audio frames against pyav
        av_reader = av.open(full_path)
        if av_reader.streams.audio:
            video_reader = VideoReader(full_path, "audio")
            for av_frame in av_reader.decode(av_reader.streams.audio[0]):
                vr_frame = next(video_reader)
                assert float(av_frame.pts * av_frame.time_base) == approx(vr_frame["pts"], abs=0.1)

                av_array = torch.tensor(av_frame.to_ndarray()).permute(1, 0)
                vr_array = vr_frame["data"]
                max_delta = torch.max(torch.abs(av_array.float() - vr_array.float()))
                # ensure the decoded audio signals never differ by more than 1e-3
                assert max_delta.item() < 0.001
def test_frame_reading(self, test_video):
    full_path = os.path.join(VIDEO_DIR, test_video)

    # test video reading against pyav
    with av.open(full_path) as av_reader:
        if av_reader.streams.video:
            av_frames, vr_frames = [], []
            av_pts, vr_pts = [], []
            # get av frames
            for av_frame in av_reader.decode(av_reader.streams.video[0]):
                av_frames.append(torch.tensor(av_frame.to_rgb().to_ndarray(), dtype=torch.uint8).permute(2, 0, 1))
                av_pts.append(av_frame.pts * av_frame.time_base)

            # get vr frames
            video_reader = VideoReader(full_path, "video")
            for vr_frame in video_reader:
                vr_frames.append(vr_frame["data"])
                vr_pts.append(vr_frame["pts"])

            # same number of frames
            assert len(vr_frames) == len(av_frames)
            assert len(vr_pts) == len(av_pts)

            # compare the frames and pts values
            for i in range(len(vr_frames)):
                assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)

                mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
                # on average the difference is very small and caused
                # by decoding (around 1%)
                # TODO: assess empirically how to set this; atm it's 1%
                # averaged over all frames
                assert mean_delta.item() < 2.55

            del vr_frames, av_frames, vr_pts, av_pts

    # test audio reading against pyav; the context manager closes the
    # container, so no explicit av_reader.close() is needed
    with av.open(full_path) as av_reader:
        if av_reader.streams.audio:
            av_frames, vr_frames = [], []
            av_pts, vr_pts = [], []
            # get av frames
            for av_frame in av_reader.decode(av_reader.streams.audio[0]):
                av_frames.append(torch.tensor(av_frame.to_ndarray(), dtype=torch.float32).permute(1, 0))
                av_pts.append(av_frame.pts * av_frame.time_base)

            # get vr frames
            video_reader = VideoReader(full_path, "audio")
            for vr_frame in video_reader:
                vr_frames.append(vr_frame["data"])
                vr_pts.append(vr_frame["pts"])

            # same number of frames
            assert len(vr_frames) == len(av_frames)
            assert len(vr_pts) == len(av_pts)

            # compare the frames and pts values
            for i in range(len(vr_frames)):
                assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)

                max_delta = torch.max(torch.abs(av_frames[i].float() - vr_frames[i].float()))
                # ensure the decoded audio signals never differ by more than 1e-3
                assert max_delta.item() < 0.001
def test_metadata(self, video_file):
    full_path = os.path.join(VIDEO_DIR, video_file)
    decoder = VideoReader(full_path, device="cuda")
    video_metadata = decoder.get_metadata()["video"]
    with av.open(full_path) as container:
        video = container.streams.video[0]
        av_duration = float(video.duration * video.time_base)
        assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2)
        assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2)
def test_metadata(self, test_video, config):
    """
    Test that the metadata returned via pyav
    corresponds to the one returned by the new video decoder API
    """
    full_path = os.path.join(VIDEO_DIR, test_video)
    reader = VideoReader(full_path, "video")
    reader_md = reader.get_metadata()
    assert config.video_fps == approx(reader_md["video"]["fps"][0], abs=0.0001)
    assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
def test_fate_suite(self):
    # TODO: remove the try-except statement once the connectivity issues are resolved
    try:
        video_path = fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)
    except (urllib.error.URLError, ConnectionError) as error:
        pytest.skip(f"Skipping due to connectivity issues: {error}")
    vr = VideoReader(video_path)
    metadata = vr.get_metadata()

    assert metadata["subtitles"]["duration"] is not None
    os.remove(video_path)
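# `fate` above fetches a sample from FFmpeg's FATE suite into VIDEO_DIR. A
# minimal sketch of such a helper, assuming the samples are served from
# https://fate-suite.ffmpeg.org/; the actual helper used by the tests may
# differ (e.g. it may go through torchvision's download utilities):
import os
import urllib.request


def fate(name, path="."):
    """Download a FATE suite sample and return its local path."""
    url = f"https://fate-suite.ffmpeg.org/{name}"
    local_path = os.path.join(path, os.path.basename(name))
    if not os.path.exists(local_path):
        urllib.request.urlretrieve(url, local_path)
    return local_path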
def test_seek_reading(self, keyframes, full_path, duration):
    decoder = VideoReader(full_path, device="cuda")
    time = duration / 2
    decoder.seek(time, keyframes_only=keyframes)
    with av.open(full_path) as container:
        # with no stream given, pyav expects the seek offset in AV_TIME_BASE
        # (microsecond) units, hence the factor of 1e6
        container.seek(int(time * 1000000), any_frame=not keyframes, backward=False)
        for av_frame in container.decode(container.streams.video[0]):
            av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
            vision_frames = next(decoder)["data"]
            mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float()))
            assert mean_delta < 0.75
def test_frame_reading(self):
    for test_video in test_videos:
        full_path = os.path.join(VIDEO_DIR, test_video)
        decoder = VideoReader(full_path, device="cuda:0")
        with av.open(full_path) as container:
            for av_frame in container.decode(container.streams.video[0]):
                av_frames = torch.tensor(av_frame.to_ndarray().flatten())
                vision_frames = next(decoder)["data"]
                mean_delta = torch.mean(torch.abs(av_frames.float() - decoder._reformat(vision_frames).float()))
                assert mean_delta < 0.1
def test_metadata(self):
    """
    Test that the metadata returned via pyav
    corresponds to the one returned by the new video decoder API
    """
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        reader = VideoReader(full_path, "video")
        reader_md = reader.get_metadata()
        self.assertAlmostEqual(config.video_fps, reader_md["video"]["fps"][0], delta=0.0001)
        self.assertAlmostEqual(config.duration, reader_md["video"]["duration"][0], delta=0.5)
def test_video_reading_fn(self):
    """
    Test that the outputs of the pyav and ffmpeg decoders are mostly the same
    """
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)

        ref_result = _decode_frames_by_av_module(full_path)

        reader = VideoReader(full_path, "video")
        newapi_result = _template_read_video(reader)

        # First we check if the frames are approximately the same
        # (note that every codec context has signature artefacts which
        # make a direct comparison infeasible)
        if newapi_result.vframes.numel() > 0 and ref_result.vframes.numel() > 0:
            mean_delta = torch.mean(torch.abs(newapi_result.vframes.float() - ref_result.vframes.float()))
            self.assertAlmostEqual(mean_delta, 0, delta=8.0)

        # Just a sanity check: are the two of the correct size?
        self.assertEqual(newapi_result.vframes.size(), ref_result.vframes.size())

        # Lastly, we compare the resulting audio streams
        if config.check_aframes and newapi_result.aframes.numel() > 0 and ref_result.aframes.numel() > 0:
            # the audio stream is available and the decoder is expected to return audio frames
            is_same = torch.all(torch.eq(newapi_result.aframes, ref_result.aframes)).item()
            self.assertTrue(is_same)
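# `_decode_frames_by_av_module` and `_template_read_video` above are helpers
# that decode a whole file and return an object carrying `vframes` and
# `aframes` tensors. A minimal sketch of the second one under that assumption;
# the `DecoderResult` name and the audio handling here are illustrative, not
# the real helper's implementation:
import collections

import torch

DecoderResult = collections.namedtuple("DecoderResult", ["vframes", "aframes"])


def _template_read_video(reader):
    # drain the video stream into an (N, C, H, W) tensor
    vframes = [frame["data"] for frame in reader]
    vframes = torch.stack(vframes, 0) if vframes else torch.empty(0)
    # the real helper presumably also switches to the audio stream and
    # collects aframes; omitted here for brevity
    return DecoderResult(vframes=vframes, aframes=torch.empty(0))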
def test_accurateseek_middle(self):
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        stream = "video"
        video_reader = VideoReader(full_path, stream)
        md = video_reader.get_metadata()
        duration = md[stream]["duration"][0]
        if duration is not None:
            num_frames = 0
            for frame in video_reader:
                num_frames += 1

            video_reader.seek(duration / 2)
            middle_num_frames = 0
            for frame in video_reader:
                middle_num_frames += 1

            assert middle_num_frames < num_frames
            assert middle_num_frames == approx(num_frames // 2, abs=1)

            video_reader.seek(duration / 2)
            frame = next(video_reader)
            lb = duration / 2 - 1 / md[stream]["fps"][0]
            ub = duration / 2 + 1 / md[stream]["fps"][0]
            assert lb <= frame["pts"] <= ub
def test_accurateseek_middle(self):
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        stream = "video"
        video_reader = VideoReader(full_path, stream)
        md = video_reader.get_metadata()
        duration = md[stream]["duration"][0]
        if duration is not None:
            num_frames = 0
            for frame in video_reader:
                num_frames += 1

            video_reader.seek(duration / 2)
            middle_num_frames = 0
            for frame in video_reader:
                middle_num_frames += 1

            self.assertTrue(middle_num_frames < num_frames)
            self.assertAlmostEqual(middle_num_frames, num_frames // 2, delta=1)

            video_reader.seek(duration / 2)
            frame = next(video_reader)
            lb = duration / 2 - 1 / md[stream]["fps"][0]
            ub = duration / 2 + 1 / md[stream]["fps"][0]
            self.assertTrue(lb <= frame["pts"] <= ub)
def test_frame_reading(self, video_file):
    full_path = os.path.join(VIDEO_DIR, video_file)
    decoder = VideoReader(full_path, device="cuda")
    with av.open(full_path) as container:
        for av_frame in container.decode(container.streams.video[0]):
            av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
            vision_frames = next(decoder)["data"]
            mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float()))
            assert mean_delta < 0.75
def test_seek_start(self):
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        video_reader = VideoReader(full_path, "video")
        num_frames = 0
        for frame in video_reader:
            num_frames += 1

        # now seek the container to 0 and count the frames again;
        # seeking to the start is often imprecise and may not land
        # exactly at 0
        video_reader.seek(0)
        start_num_frames = 0
        for frame in video_reader:
            start_num_frames += 1

        assert start_num_frames == num_frames

        # now seek the container to < 0 to check for unexpected behaviour
        video_reader.seek(-1)
        start_num_frames = 0
        for frame in video_reader:
            start_num_frames += 1

        assert start_num_frames == num_frames
def test_read_video_tensor(self):
    """
    Check that reading the video using the `next`-based API yields
    the same-sized tensors as the pyav alternative.
    """
    torchvision.set_video_backend("pyav")
    for test_video, config in test_videos.items():
        full_path = os.path.join(VIDEO_DIR, test_video)
        # pass 1: decode all frames using the existing torchvision decoder
        tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
        tv_result = tv_result.permute(0, 3, 1, 2)
        # pass 2: decode all frames using the new API
        reader = VideoReader(full_path, "video")
        frames = []
        for frame in reader:
            frames.append(frame["data"])
        new_api = torch.stack(frames, 0)
        self.assertEqual(tv_result.size(), new_api.size())
def video_gen(video_path, S):
    reader = VideoReader(video_path)
    for frame in reader:
        yield frame["data"]
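# Example use of the generator above. Note that the `S` parameter is unused in
# the function as written; if it was meant to cap the number of yielded frames,
# itertools.islice achieves that without changing the generator. The file name
# here is a placeholder:
import itertools

for frame in itertools.islice(video_gen("example.mp4", S=None), 16):
    print(frame.shape)  # e.g. torch.Size([3, H, W]) for each decoded frame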