Example #1
    def __read_video_with_lintel(self, sample_name, indices=None):
        file = self.rgb_directory + '/' + sample_name + '_rgb.avi'
        with open(file, 'rb') as fin:
            video = fin.read()
        # Frame dimensions are fixed here; lintel needs them to size the output buffer.
        Dataset = namedtuple('Dataset', 'width height num_frames')
        dataset = Dataset(1920, 1080, None)
        if indices:
            # Decode only the requested frame indices (must be strictly increasing).
            video = lintel.loadvid_frame_nums(video,
                                              frame_nums=indices,
                                              width=dataset.width,
                                              height=dataset.height)
        else:
            # Decode a clip starting from a random seek point.
            video, seek_distance = lintel.loadvid(video,
                                                  should_random_seek=True,
                                                  width=dataset.width,
                                                  height=dataset.height)
        # lintel returns raw bytes; view them as a (num_frames, H, W, 3) uint8 array.
        video = np.frombuffer(video, dtype=np.uint8)
        video = np.reshape(video,
                           newshape=(-1, dataset.height, dataset.width, 3))
        result = []
        if self.image_transforms:
            for frame in video:
                result.append(self.image_transforms(frame))
        return torch.stack(result)
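The final `torch.stack(result)` requires each transformed frame to be a tensor. The original class definition is not shown, so the following is only a sketch of a compatible `image_transforms` pipeline, assuming torchvision is used; the sizes are arbitrary placeholders:

import torchvision.transforms as T

# Hypothetical transform pipeline: each decoded (H, W, 3) uint8 frame becomes a
# (C, H, W) float tensor, so torch.stack() yields a (T, C, H, W) clip tensor.
image_transforms = T.Compose([
    T.ToPILImage(),     # numpy uint8 frame -> PIL image
    T.Resize(256),      # resize the shorter side to 256 pixels
    T.CenterCrop(224),
    T.ToTensor(),       # PIL image -> float tensor in [0, 1]
])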
Example #2
def lintel_loader(
        file: Union[str, Path, IO[bytes]],
        frames_idx: Union[slice, List[slice],
                          List[int]]) -> Iterator[Image.Image]:
    import lintel

    if isinstance(file, str):
        file = Path(file)
    if isinstance(file, Path):
        _LOG.debug("Loading data from {}".format(file))
        with file.open("rb") as f:
            video = f.read()
    else:
        video = file.read()

    frames_idx = np.array(frame_idx_to_list(frames_idx))
    assert isinstance(frames_idx, np.ndarray)
    load_idx, reconstruction_idx = np.unique(frames_idx, return_inverse=True)
    _LOG.debug("Converted frames_idx {} to load_idx {}".format(
        frames_idx, load_idx))
    frames_data, width, height = lintel.loadvid_frame_nums(video,
                                                           frame_nums=load_idx,
                                                           should_seek=False)
    frames = np.frombuffer(frames_data, dtype=np.uint8)
    # TODO: Support 1 channel grayscale video
    frames = np.reshape(frames, newshape=(len(load_idx), height, width, 3))
    frames = frames[reconstruction_idx]
    return (Image.fromarray(frame) for frame in frames)
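A minimal usage sketch for the loader above; the file name is a placeholder, and `frame_idx_to_list` plus `_LOG` are assumed to be available in the surrounding module, as in the original source:

# Decode the first 16 frames of a clip and materialize them as PIL images.
frames = list(lintel_loader("clip.mp4", list(range(16))))
print(len(frames), frames[0].size)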
    def _load_clips(self, video_file, video_info):
        # load a set of clips from the video
        all_frame_idx = self._select_k_clips(
            [0, video_info['num_frames'] - 1 - self.protect_frames])
        with open(video_file, 'rb') as f:
            encoded_video = f.read()

        # Reading all frames at once
        frame_idx = [
            int(item) for sublist in all_frame_idx for item in sublist
        ]
        frame_idx = sorted(list(set(frame_idx)))
        decoded_frames = lintel.loadvid_frame_nums(encoded_video,
                                                   frame_nums=frame_idx,
                                                   width=video_info['width'],
                                                   height=video_info['height'])
        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        frame_chunk = np.reshape(decoded_frames,
                                 newshape=(len(frame_idx),
                                           video_info['height'],
                                           video_info['width'], 3))

        # tricky! We need to re-map the frames back into clips
        all_clips = []
        for clip_idx in range(self.k_clips):
            clip_frame_idx = all_frame_idx[clip_idx]
            clip_frame_mapping = []
            for cur_frame_idx in clip_frame_idx:
                clip_frame_mapping.append(frame_idx.index(cur_frame_idx))
            clip = frame_chunk[clip_frame_mapping, :, :, :]
            # pad -> expand dim -> add to list
            clip = self._pad_to_length(clip)
            all_clips.append(clip)

        return all_clips
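The dedupe-and-remap bookkeeping above (building a sorted `frame_idx` list and then calling `frame_idx.index(...)` per clip frame) can also be expressed with `np.unique(..., return_inverse=True)`, as Example #2 does. A minimal sketch of the equivalence, using made-up indices:

import numpy as np

requested = [4, 4, 10, 7, 10]                       # frame indices, possibly repeated/unordered
load_idx, inverse = np.unique(requested, return_inverse=True)
# load_idx -> [ 4  7 10]   (sorted, unique: what lintel.loadvid_frame_nums accepts)
# inverse  -> [0 0 2 1 2]  (positions to map decoded frames back to the request order)
print(load_idx, inverse)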
def _load_action_frame_nums_to_4darray(video, frame_nums, width, height):
    """Decodes a specific set of frames from `video` to a 4D numpy array.
    Args:
        video: Encoded video.
        dataset: Dataset meta-info, e.g., width and height.
        frame_nums: Indices of specific frame indices to decode, e.g.,
            [1, 10, 30, 35] will return four frames: the first, 10th, 30th and
            35 frames in `video`. Indices must be in strictly increasing order.
    Returns:
        A numpy array, loaded from the byte array returned by
        `lintel.loadvid_frame_nums`, containing the specified frames, decoded.
    """
    decoded_frames = lintel.loadvid_frame_nums(video,
                                               frame_nums=frame_nums,
                                               width=width,
                                               height=height)
    decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
    decoded_frames = np.reshape(decoded_frames,
                                newshape=(len(frame_nums), height, width, 3))

    return decoded_frames
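A minimal usage sketch for the helper above; the file name and dimensions are placeholders, not taken from the original code:

with open('action_clip.avi', 'rb') as f:        # hypothetical path
    encoded = f.read()
# Frame indices must be unique and strictly increasing.
frames = _load_action_frame_nums_to_4darray(encoded, frame_nums=[0, 8, 16, 24],
                                            width=320, height=240)
print(frames.shape)  # -> (4, 240, 320, 3)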


# import torchvision
# # read dataset from pytorch API
# import classy_vision
#
# def dataset_config(args):
#     if args.dataset == 'kinetics':
#         dataset = classy_vision.dataset.Kinetics400Dataset(split: 'train', batchsize_per_replica: 8,
#         shuffle: True, transform: None, num_samples: None,
#         frames_per_clip: 8, video_width: 256, video_height: 340,
#         video_min_dimension: 256, audio_samples: 0, audio_channels: 0,
#         step_between_clips: 4, frame_rate:0,
#         clips_per_video: int, video_dir: str,
#         extensions: 'avi', metadata_filepath: str)
#
#     elif args.dataset == 'hmdb51':
#         torchvision.datasets.HMDB51(root, annotation_path, frames_per_clip,
#                                     step_between_clips=1, frame_rate=None, fold=1,
#                                     train=True, transform=None, _precomputed_metadata=None,
#                                     num_workers=1, _video_width=0, _video_height=0,
#                                     _video_min_dimension=0, _audio_samples=0)
#     elif args.dataset == 'ucf101':
#         torchvision.datasets.UCF101(root, annotation_path, frames_per_clip,
#                                     step_between_clips=1, frame_rate=None, fold=1,
#                                     train=True, transform=None, _precomputed_metadata=None,
#                                     num_workers=1, _video_width=0, _video_height=0,
#                                     _video_min_dimension=0, _audio_samples=0)
#     else:
#         Exception("wrong dataset")
    def __call__(self, clip_info, frames):

        unique_frames = sorted(list(set(frames)))
        frame_inds = [unique_frames.index(f) for f in frames]

        with open(clip_info["path"], "rb") as f:
            vid = f.read()
        decoded_frames, width, height = lintel.loadvid_frame_nums(
            vid, frame_nums=unique_frames, should_seek=True)
        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        decoded_frames = np.reshape(decoded_frames,
                                    newshape=(-1, height, width, 3))

        decoded_frames = decoded_frames[frame_inds]
        return decoded_frames
Example #6
def _loadvid_test_frame_nums(filename, width, height, start_frame,
                             should_seek):
    """Tests loadvid_frame_nums Python extension.

    `loadvid_frame_nums` takes a list of (strictly increasing, and not
    repeated) frame indices to decode from the encoded video corresponding to
    `filename`.

    This function randomly selects frames to decode, in a loop, decodes the
    chosen frames with `loadvid_frame_nums`, and visualizes the resulting
    frames (all of them) using `matplotlib.pyplot`.
    """
    with open(filename, 'rb') as f:
        encoded_video = f.read()

    num_frames = 32
    for _ in range(10):
        start = time.perf_counter()

        i = start_frame
        frame_nums = []
        for _ in range(num_frames):
            frame_nums.append(i)
            i += int(random.uniform(1, 4))

        result = lintel.loadvid_frame_nums(encoded_video,
                                           frame_nums=frame_nums,
                                           width=width,
                                           height=height,
                                           should_seek=should_seek)

        if (width == 0) and (height == 0):
            decoded_frames, width, height = result
        else:
            decoded_frames = result

        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        decoded_frames = np.reshape(decoded_frames,
                                    newshape=(num_frames, height, width, 3))
        end = time.perf_counter()

        print('time: {}'.format(end - start))
        for i in range(num_frames):
            plt.imshow(decoded_frames[i, ...])
            plt.show()
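A hedged usage sketch: passing width=0 and height=0 makes lintel probe the video dimensions itself and return them alongside the frame data, which is the branch the function handles explicitly (the file name is a placeholder):

# Decode and display 10 random batches of 32 frames, letting lintel report width/height.
_loadvid_test_frame_nums('test_video.mp4', width=0, height=0,
                         start_frame=0, should_seek=False)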
Example #7
    def extract_frames(self, video_file, timesteps):

        with open(video_file, 'rb') as f:
            encoded_video = f.read()

            decoded_frames = lintel.loadvid_frame_nums(encoded_video,
                                                       frame_nums=timesteps,
                                                       width=self.real_w,
                                                       height=self.real_h)
            try:
                np_clip = np.frombuffer(decoded_frames, dtype=np.uint8)
                np_clip = np.reshape(np_clip,
                                     newshape=(self.t, self.real_h, self.real_w, 3))
                np_clip = np_clip.transpose([3, 0, 1, 2])
                np_clip = np.float32(np_clip)
            except Exception as e:
                np_clip = decoded_frames
        return np_clip
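The transpose above reorders the decoded clip from (T, H, W, C) to (C, T, H, W), the channel-first layout commonly expected by 3D-convolution models; a tiny sketch with dummy data:

import numpy as np

clip = np.zeros((8, 240, 320, 3), dtype=np.uint8)   # (T, H, W, C) dummy clip
print(clip.transpose([3, 0, 1, 2]).shape)           # -> (3, 8, 240, 320)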
Example #8
    def get(self, record, indices):

        images = list()
        debug_info = []
        if self.read_mode == 'video':
            video_data = {}
            t1 = time.time()
            finish_flag = False
            full_indices = self._get_full_indices(indices, record.num_frames)
            with open(os.path.join(self.root_path, record.path), 'rb') as f:
                enc_vid = f.read()
            df, w, h = lintel.loadvid_frame_nums(enc_vid,
                                                 frame_nums=full_indices)
            df = np.reshape(df, (len(full_indices), h, w, 3))
            for i in range(len(full_indices)):
                video_data[full_indices[i]] = df[i]
            t2 = time.time()
            debug_info.append('read video: {:.4f}s'.format(t2 - t1))

        t1 = time.time()
        for seg_ind in indices:
            p = int(seg_ind)
            for i in range(self.new_length):
                if self.read_mode == 'video':
                    seg_imgs = self._load_image_from_video(video_data, p)
                else:
                    seg_imgs = self._load_image(
                        os.path.join(self.root_path, record.path), p)
                images.extend(seg_imgs)
                if p < record.num_frames - 1:
                    p += 1 + self.skip
                if p >= record.num_frames:
                    p = record.num_frames - 1
        t2 = time.time()
        debug_info.append('load image: {:.4f}s'.format(t2 - t1))

        t1 = time.time()
        process_data = self.transform(images)
        t2 = time.time()
        debug_info.append('transform data: {:.4f}s'.format(t2 - t1))
        if DEBUG_FLAG:
            print(debug_info)
        return process_data, record.label
    def _load_frames(self, video_file, video_info):
        # load video frames using lintel
        frame_idx = self._sample_frames(
            [0, video_info['num_frames'] - 1 - self.protect_frames])
        # TODO: This part is problematic for large videos (e.g., action detection).
        #       It might be possible to read a chunk of the video instead.
        with open(video_file, 'rb') as f:
            encoded_video = f.read()

        decoded_frames = lintel.loadvid_frame_nums(encoded_video,
                                                   frame_nums=frame_idx,
                                                   width=video_info['width'],
                                                   height=video_info['height'])
        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        clip = np.reshape(decoded_frames,
                          newshape=(len(frame_idx), video_info['height'],
                                    video_info['width'], 3))
        # pad to fixed length
        clip = self._pad_to_length(clip)
        return clip
def _load_frame_nums_to_4darray(video, dataset, frame_nums):
    """Decodes a specific set of frames from `video` to a 4D numpy array.
    Args:
        video: Encoded video.
        dataset: Dataset meta-info, e.g., width and height.
        frame_nums: Indices of specific frame indices to decode, e.g.,
            [1, 10, 30, 35] will return four frames: the first, 10th, 30th and
            35 frames in `video`. Indices must be in strictly increasing order.
    Returns:
        A numpy array, loaded from the byte array returned by
        `lintel.loadvid_frame_nums`, containing the specified frames, decoded.
    """
    decoded_frames = lintel.loadvid_frame_nums(video,
                                               frame_nums=frame_nums,
                                               width=dataset.width,
                                               height=dataset.height)
    decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
    decoded_frames = np.reshape(decoded_frames,
                                newshape=(dataset.num_frames, dataset.height,
                                          dataset.width, 3))

    return decoded_frames
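A minimal usage sketch; note that the reshape above uses `dataset.num_frames`, so the meta-info must describe exactly the set of frames being requested. The names, path, and dimensions below are placeholders:

from collections import namedtuple

Dataset = namedtuple('Dataset', 'width height num_frames')

with open('clip.avi', 'rb') as f:               # hypothetical path
    encoded = f.read()
frame_nums = [0, 4, 8, 12]
meta = Dataset(width=320, height=240, num_frames=len(frame_nums))
frames = _load_frame_nums_to_4darray(encoded, meta, frame_nums)
print(frames.shape)  # -> (4, 240, 320, 3)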
def test_lintel_frames_num(filename, width=0, height=0):
    # When width and height are left at 0, lintel probes the video and
    # returns the dimensions along with the decoded frame data.

    with open(filename, 'rb') as f:
        encoded_video = f.read()

        num_frames = 47
        start = time.perf_counter()

        i = 0
        frame_nums = []
        for _ in range(num_frames):
            frame_nums.append(i)
            i += 1

        result = lintel.loadvid_frame_nums(encoded_video,
                                           frame_nums=frame_nums,
                                           width=width,
                                           height=height,
                                           should_seek=False)

        if (width == 0) and (height == 0):
            decoded_frames, width, height = result
        else:
            decoded_frames = result

        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        decoded_frames = np.reshape(decoded_frames,
                                    newshape=(num_frames, height, width, 3))

        for idx, frame in enumerate(decoded_frames):
            cv2.imwrite("frame{}.jpg".format(idx),
                        cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

        end = time.perf_counter()

        print('time: {}'.format(end - start))
# video, width, height, seek_index = lintel.loadvid(open(v_path, 'rb').read(), should_random_seek=False)
# video = np.reshape(np.frombuffer(video, dtype=np.uint8), (-1, height, width, 3))
# num_frames = video.shape
# print(video_frames_num, num_frames)
#
# videodata = skvideo.io.vread(v_path, inputdict={'-r': '4'})
# print(videodata.shape)
print(video_frames_num)
with open(v_path, 'rb') as f:
    video = f.read()

# ffmpeg's frame count is 1 frame fewer than cv2's
frame_nums = [0, 201, 280, 295, 296]
decoded_frames = lintel.loadvid_frame_nums(video,
                                           frame_nums=frame_nums,
                                           width=width,
                                           height=height)
decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
decoded_frames = np.reshape(decoded_frames,
                            newshape=(len(frame_nums), height, width, 3))

print(np.shape(decoded_frames)[0])

# The Kinetics dataset in torchvision can be adapted with small changes to also return audio, but it cannot constrain the starting index
# self.video_clips = VideoClips(
#             video_list,
#             frames_per_clip,
#             step_between_clips,
#             frame_rate,
#             _precomputed_metadata,
#             num_workers=num_workers,