Python _read_video_from_memoryの例

プログラミング言語: Python

名前空間/パッケージ名: torchvision.io

メソッド/関数: _read_video_from_memory

hotexamples.comのコード掲載数: 3

Python _read_video_from_memory - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのtorchvision.io._read_video_from_memoryの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def pyav_decode(
    container,
    sampling_rate,
    num_frames,
    clip_idx,
    num_clips_uniform=10,
    target_fps=30,
    use_offset=False,
    min_delta=-math.inf,
    max_delta=math.inf,
):
    """
    Decode frames from a PyAV container and return them as a stacked tensor.

    The function reads fps, frame count and duration from the first video
    stream. If the duration is available it computes per-clip frame windows
    with `get_multiple_start_end_idx` and decodes each window through
    `io._read_video_from_memory`; if the duration is missing, or a window
    decodes to nothing, it decodes the whole video instead. Finally it decodes
    the range `[video_start_pts, video_end_pts]` with `pyav_decode_stream` and
    stacks the resulting RGB frames.

    NOTE(review): the body reads `video_meta`, `video_tensor`, `modalities`,
    `max_spatial_scale` and `clip_size`, none of which is a parameter or a
    local of this function. Unless they exist at module scope, the lines that
    use them raise NameError. This looks like TorchVision decoding code merged
    into the PyAV decoder -- confirm against the upstream project before
    relying on this function.

    Args:
        container (container): pyav container.
        sampling_rate: frame sampling rates (interval between two sampled
            frames). Indexed per clip (`sampling_rate[i]`), so a sequence is
            expected.
        num_frames: numbers of frames to sample. Indexed per clip
            (`num_frames[i]`), so a sequence is expected.
        clip_idx (int): if clip_idx is -1, perform random temporal sampling. If
            clip_idx is larger than -1, uniformly split the video into
            num_clips_uniform clips and select the clip_idx-th video clip.
        num_clips_uniform (int): overall number of clips to uniformly sample
            from the given video.
        target_fps (int): the input video may have a different fps; clip sizes
            are rescaled from this fps to the fps of the video.
        use_offset (bool): forwarded to `get_start_end_idx` (only reached
            inside the "decoded empty video" branch).
        min_delta (int): minimum distance between clips when sampling multiple.
        max_delta (int): max distance between clips when sampling multiple.
    Returns:
        frames (list or None): a one-element list holding the tensor stacked
            from the decoded RGB frames, or None if the container has no video
            stream.
        fps (float): the number of frames per second of the video.
        decode_all_video (bool): if True, the entire video was decoded.
        start_end_delta_time: always None, because it is reset right before
            the return statement.
    """
    # Try to fetch the decoding information from the video head. Some
    # videos do not support fetching the decoding information; in that case
    # the duration is None.
    fps = float(container.streams.video[0].average_rate)
    frames_length = container.streams.video[0].frames
    duration = container.streams.video[0].duration

    if duration is None:
        # If failed to fetch the decoding information, decode the entire video.
        decode_all_video = True
        # These values are overwritten with (0, -1) in the decode-all branch
        # below, because decode_all_video is True here.
        video_start_pts, video_end_pts = 0, math.inf
    else:
        # Perform selective decoding.
        decode_all_video = False
        # clip_size = np.maximum(
        #     1.0, np.ceil(sampling_rate[0] * (num_frames[0] - 1) / target_fps * fps)
        # )
        # One clip size per (sampling_rate, num_frames) pair, rescaled from
        # target_fps to the fps of the video and clamped to at least 1.0.
        clip_sizes = [
            np.maximum(
                1.0,
                np.ceil(sampling_rate[i] * (num_frames[i] - 1) / target_fps *
                        fps),
            ) for i in range(len(sampling_rate))
        ]
        # NOTE(review): `duration` comes straight from the stream; whether
        # `fps * duration` is a frame count depends on the unit of the stream
        # duration -- TODO confirm.
        start_end_delta_time = get_multiple_start_end_idx(
            fps * duration,
            clip_sizes,
            clip_idx,
            num_clips_uniform,
            min_delta=min_delta,
            max_delta=max_delta,
        )

        frames_out = [None] * len(num_frames)
        for k in range(len(num_frames)):
            # NOTE(review): `video_meta` is not defined in this function.
            pts_per_frame = (video_meta["video_denominator"] / fps)
            video_start_pts = int(start_end_delta_time[k, 0] * pts_per_frame)
            video_end_pts = int(start_end_delta_time[k, 1] * pts_per_frame)

            # Decode the raw video with the tv decoder.
            # NOTE(review): `video_tensor`, `modalities` and
            # `max_spatial_scale` are not defined in this function.
            v_frames, _ = io._read_video_from_memory(
                video_tensor,
                seek_frame_margin=1.0,
                read_video_stream="visual" in modalities,
                video_width=0,
                video_height=0,
                video_min_dimension=max_spatial_scale,
                video_pts_range=(video_start_pts, video_end_pts),
                video_timebase_numerator=video_meta["video_numerator"],
                video_timebase_denominator=video_meta["video_denominator"],
                read_audio_stream=0,
            )
            # An absent or empty result for any clip aborts selective decoding
            # and falls through to the decode-all branch below.
            if v_frames is None or v_frames.shape == torch.Size([0]):
                decode_all_video = True
                logger.info("TV decode FAILED try decode all")
                break
            frames_out[k] = v_frames

    if decode_all_video:
        # failed selective decoding
        decode_all_video = True
        # The range (0, -1) is also what pyav_decode_stream receives below
        # whenever this branch runs.
        video_start_pts, video_end_pts = 0, -1
        start_end_delta_time = None
        # NOTE(review): same undefined names as in the selective branch.
        v_frames, _ = io._read_video_from_memory(
            video_tensor,
            seek_frame_margin=1.0,
            read_video_stream="visual" in modalities,
            video_width=0,
            video_height=0,
            video_min_dimension=max_spatial_scale,
            video_pts_range=(video_start_pts, video_end_pts),
            video_timebase_numerator=video_meta["video_numerator"],
            video_timebase_denominator=video_meta["video_denominator"],
            read_audio_stream=0,
        )
        if v_frames.shape == torch.Size([0]):
            v_frames = None
            logger.info("TV decode FAILED try cecode all")

        frames_out = [v_frames]

    # NOTE(review): if v_frames was set to None just above, `t.shape` raises
    # AttributeError here. Also, a shape dimension is presumably never
    # negative, so `< 0` looks like it can never be true -- TODO confirm the
    # intended comparison.
    if any([t.shape[0] < 0 for t in frames_out]):
        frames_out = [None]
        logger.info("TV decode FAILED: Decoded empty video")

        # NOTE(review): `clip_size` only appears in the commented-out code
        # above and is not defined; `duration` may be None on this path.
        start_idx, end_idx, fraction = get_start_end_idx(
            frames_length,
            clip_size,
            clip_idx,
            num_clips_uniform,
            use_offset=use_offset,
        )
        timebase = duration / frames_length
        video_start_pts = int(start_idx * timebase)
        video_end_pts = int(end_idx * timebase)

    frames = None
    # If video stream was found, fetch video frames from the video.
    if container.streams.video:
        # Decodes with whichever video_start_pts / video_end_pts were assigned
        # last above; `max_pts` is not used afterwards.
        video_frames, max_pts = pyav_decode_stream(
            container,
            video_start_pts,
            video_end_pts,
            container.streams.video[0],
            {"video": 0},
        )
        container.close()

        # Convert each decoded frame to an RGB ndarray, then stack them into
        # a single tensor wrapped in a one-element list.
        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
        frames = [torch.as_tensor(np.stack(frames))]

    # NOTE(review): `frames_out` is never returned, and start_end_delta_time
    # is discarded here, so the fourth return value is always None.
    start_end_delta_time = None
    return frames, fps, decode_all_video, start_end_delta_time

コード例 #2

ファイルを表示

def torchvision_decode(
    video_handle,
    sampling_rate,
    num_frames,
    clip_idx,
    video_meta,
    num_clips=10,
    target_fps=30,
    modalities=("visual", ),
    max_spatial_scale=0,
):
    """
    Decode one clip from in-memory video bytes with the TorchVision decoder.

    When `video_meta` is empty it is filled in place from
    `io._probe_video_from_memory`. If the metadata reports a video stream with
    a positive timebase denominator and a positive duration, only the pts
    window of the selected clip is decoded; otherwise the whole video is
    decoded. An empty decoding result triggers one more decode of the whole
    video.

    Args:
        video_handle (bytes): raw bytes of the video file.
        sampling_rate (int): frame sampling rate (interval between two sampled
            frames).
        num_frames (int): number of frames to sample.
        clip_idx (int): forwarded to `get_start_end_idx`; -1 means random
            temporal sampling, a larger value selects the clip_idx-th of
            `num_clips` uniformly split clips.
        video_meta (dict): cached video metadata; mutated in place when empty.
            Details can be found at
            `pytorch/vision/torchvision/io/_video_opt.py`.
        num_clips (int): overall number of clips to uniformly sample from the
            given video.
        target_fps (int): fps the clip size is expressed in before it is
            rescaled to the fps of the video.
        modalities (tuple): modalities to decode; the video stream is read
            only when `"visual"` is present.
        max_spatial_scale (int): passed to the decoder as
            `video_min_dimension`.
    Returns:
        frames (tensor): decoded frames from the video.
        fps (float): the number of frames per second of the video.
        decode_all_video (bool): if True, the entire video was decoded.
    """
    # View the raw bytes as a uint8 tensor for the decoder.
    byte_array = np.frombuffer(video_handle, dtype=np.uint8)
    video_tensor = torch.from_numpy(byte_array)

    def _decode(pts_range):
        # Single home for the decoder call; only the pts range varies between
        # the selective attempt and the whole-video fallback. The second
        # return value of the decoder is discarded.
        decoded, _ = io._read_video_from_memory(
            video_tensor,
            seek_frame_margin=1.0,
            read_video_stream="visual" in modalities,
            video_width=0,
            video_height=0,
            video_min_dimension=max_spatial_scale,
            video_pts_range=pts_range,
            video_timebase_numerator=video_meta["video_numerator"],
            video_timebase_denominator=video_meta["video_denominator"],
        )
        return decoded

    # (0, -1) is the pts range used for whole-video decoding.
    whole_video = (0, -1)
    pts_range = whole_video
    decode_all_video = True

    if len(video_meta) == 0:
        # No cached metadata: probe the raw video and remember the result so
        # later calls can decode selectively.
        meta = io._probe_video_from_memory(video_tensor)
        video_meta["video_timebase"] = meta.video_timebase
        video_meta["video_numerator"] = meta.video_timebase.numerator
        video_meta["video_denominator"] = meta.video_timebase.denominator
        video_meta["has_video"] = meta.has_video
        video_meta["video_duration"] = meta.video_duration
        video_meta["video_fps"] = meta.video_fps
        video_meta["audio_timebas"] = meta.audio_timebase
        video_meta["audio_numerator"] = meta.audio_timebase.numerator
        video_meta["audio_denominator"] = meta.audio_timebase.denominator
        video_meta["has_audio"] = meta.has_audio
        video_meta["audio_duration"] = meta.audio_duration
        video_meta["audio_sample_rate"] = meta.audio_sample_rate

    fps = video_meta["video_fps"]

    selective = (
        video_meta["has_video"]
        and video_meta["video_denominator"] > 0
        and video_meta["video_duration"] > 0
    )
    if selective:
        decode_all_video = False
        clip_size = sampling_rate * num_frames / target_fps * fps
        total_frames = fps * video_meta["video_duration"]
        start_idx, end_idx = get_start_end_idx(
            total_frames, clip_size, clip_idx, num_clips)
        # Frame indices -> presentation timestamps.
        pts_per_frame = video_meta["video_denominator"] / fps
        pts_range = (int(start_idx * pts_per_frame),
                     int(end_idx * pts_per_frame))

    v_frames = _decode(pts_range)

    if v_frames.shape == torch.Size([0]):
        # Nothing was decoded: fall back to decoding the whole video.
        decode_all_video = True
        v_frames = _decode(whole_video)

    return v_frames, fps, decode_all_video

コード例 #3

ファイルを表示

def torchvision_decode(
    video_handle,
    sampling_rate,
    num_frames,
    clip_idx,
    video_meta,
    num_clips_uniform=10,
    target_fps=30,
    modalities=("visual", ),
    max_spatial_scale=0,
    use_offset=False,
    min_delta=-math.inf,
    max_delta=math.inf,
):
    """
    Decode one or more clips from in-memory video bytes with TorchVision.

    When `video_meta` is empty it is filled in place from
    `io._probe_video_from_memory`. If the metadata describes a video that is
    long enough for all requested clips, each clip's pts window is decoded
    separately (selective decoding). If the metadata is unusable, or any
    window decodes to nothing, the whole video is decoded instead.

    Args:
        video_handle (bytes): raw bytes of the video file.
        sampling_rate (sequence of int): per-clip frame sampling rate
            (interval between two sampled frames).
        num_frames (sequence of int): per-clip number of frames to sample.
        clip_idx (int): if clip_idx is -1, perform random temporal
            sampling. If clip_idx is larger than -1, uniformly split the
            video to num_clips_uniform clips, and select the clip_idx-th
            video clip.
        video_meta (dict): cached video metadata; mutated in place when empty.
            Details can be found at
            `pytorch/vision/torchvision/io/_video_opt.py`.
        num_clips_uniform (int): overall number of clips to uniformly sample
            from the given video.
        target_fps (int): the input video may have a different fps; clip sizes
            are rescaled from this fps to the fps of the video.
        modalities (tuple): modalities to decode; the video stream is read
            only when `"visual"` is present.
        max_spatial_scale (int): passed to the decoder as
            `video_min_dimension`.
        use_offset (bool): accepted for signature compatibility; not used by
            this function.
        min_delta (int): minimum distance between clips when sampling multiple.
        max_delta (int): max distance between clips when sampling multiple.
    Returns:
        frames_out (list): one decoded tensor per clip when selective decoding
            succeeded, a one-element list with the whole video otherwise, or
            `[None]` when decoding produced no frames.
        fps (float): the number of frames per second of the video.
        decode_all_video (bool): if True, the entire video was decoded.
        start_end_delta_time: the per-clip windows returned by
            `get_multiple_start_end_idx`, or None when the entire video was
            decoded.
    """
    # Convert the bytes to a tensor.
    video_tensor = torch.from_numpy(np.frombuffer(video_handle,
                                                  dtype=np.uint8))

    def _decode(start_pts, end_pts):
        # Single home for the decoder call; only the pts range varies. The
        # second return value of the decoder is discarded.
        decoded, _ = io._read_video_from_memory(
            video_tensor,
            seek_frame_margin=1.0,
            read_video_stream="visual" in modalities,
            video_width=0,
            video_height=0,
            video_min_dimension=max_spatial_scale,
            video_pts_range=(start_pts, end_pts),
            video_timebase_numerator=video_meta["video_numerator"],
            video_timebase_denominator=video_meta["video_denominator"],
            read_audio_stream=0,
        )
        return decoded

    def _is_empty(decoded):
        # The decoder signals "nothing decoded" with a 1-D empty tensor; None
        # is treated the same way.
        return decoded is None or decoded.shape == torch.Size([0])

    decode_all_video = True
    start_end_delta_time = None
    frames_out = [None]

    # The video_meta is empty, fetch the meta data from the raw video.
    if len(video_meta) == 0:
        # Tracking the meta info for selective decoding in the future.
        meta = io._probe_video_from_memory(video_tensor)
        video_meta["video_timebase"] = meta.video_timebase
        video_meta["video_numerator"] = meta.video_timebase.numerator
        video_meta["video_denominator"] = meta.video_timebase.denominator
        video_meta["has_video"] = meta.has_video
        video_meta["video_duration"] = meta.video_duration
        video_meta["video_fps"] = meta.video_fps
        video_meta["audio_timebas"] = meta.audio_timebase
        video_meta["audio_numerator"] = meta.audio_timebase.numerator
        video_meta["audio_denominator"] = meta.audio_timebase.denominator
        video_meta["has_audio"] = meta.has_audio
        video_meta["audio_duration"] = meta.audio_duration
        video_meta["audio_sample_rate"] = meta.audio_sample_rate

    fps = video_meta["video_fps"]

    # Selective decoding needs usable metadata and a video with more frames
    # than the sum of all requested clip spans.
    if (video_meta["has_video"] and video_meta["video_denominator"] > 0
            and video_meta["video_duration"] > 0
            and fps * video_meta["video_duration"] > sum(
                T * tau for T, tau in zip(num_frames, sampling_rate))):
        decode_all_video = False  # try selective decoding

        clip_sizes = [
            np.maximum(
                1.0,
                np.ceil(sampling_rate[i] * (num_frames[i] - 1) / target_fps *
                        fps),
            ) for i in range(len(sampling_rate))
        ]
        start_end_delta_time = get_multiple_start_end_idx(
            fps * video_meta["video_duration"],
            clip_sizes,
            clip_idx,
            num_clips_uniform,
            min_delta=min_delta,
            max_delta=max_delta,
        )
        # Loop-invariant factor converting a frame index to pts.
        pts_per_frame = video_meta["video_denominator"] / video_meta[
            "video_fps"]
        frames_out = [None] * len(num_frames)
        for k in range(len(num_frames)):
            video_start_pts = int(start_end_delta_time[k, 0] * pts_per_frame)
            video_end_pts = int(start_end_delta_time[k, 1] * pts_per_frame)

            # Decode the raw video with the tv decoder.
            v_frames = _decode(video_start_pts, video_end_pts)
            if _is_empty(v_frames):
                decode_all_video = True
                logger.info("TV decode FAILED try decode all")
                break
            frames_out[k] = v_frames

    if decode_all_video:
        # Selective decoding was not possible or failed: decode everything.
        start_end_delta_time = None
        v_frames = _decode(0, -1)
        if _is_empty(v_frames):
            v_frames = None
            logger.info("TV decode FAILED try cecode all")

        frames_out = [v_frames]

    # Guard against None entries (previously an AttributeError on `.shape`)
    # and against tensors with no frames; a dimension is never negative, so
    # the former `< 0` comparison could not detect an empty result.
    if any(t is None or t.shape[0] == 0 for t in frames_out):
        frames_out = [None]
        logger.info("TV decode FAILED: Decoded empty video")

    return frames_out, fps, decode_all_video, start_end_delta_time