def __read_video_with_lintel(self, sample_name, indices=None,
                             width=1920, height=1080):
    """Decode an RGB video with lintel and return its frames as a tensor.

    Args:
        sample_name: Sample identifier. The video is read from
            ``<self.rgb_directory>/<sample_name>_rgb.avi``.
        indices: Optional sequence of frame numbers to decode. When it is
            ``None`` or empty, ``lintel.loadvid`` is used with a random seek
            instead.
        width: Frame width passed to lintel and used to reshape the buffer.
        height: Frame height passed to lintel and used to reshape the buffer.

    Returns:
        ``torch.stack`` of ``self.image_transforms(frame)`` over the decoded
        frames. When ``self.image_transforms`` is falsy, the untransformed
        frames are returned as a uint8 tensor shaped
        ``(num_frames, height, width, 3)`` instead of stacking an empty list.
    """
    path = self.rgb_directory + '/' + sample_name + '_rgb.avi'
    # The context manager closes the file even if reading fails.
    with open(path, 'rb') as fin:
        encoded = fin.read()

    # Explicit length check: `if indices:` raises for multi-element arrays.
    if indices is not None and len(indices) > 0:
        decoded = lintel.loadvid_frame_nums(encoded,
                                            frame_nums=indices,
                                            width=width,
                                            height=height)
    else:
        decoded, _seek_distance = lintel.loadvid(encoded,
                                                 should_random_seek=True,
                                                 width=width,
                                                 height=height)

    frames = np.frombuffer(decoded, dtype=np.uint8)
    frames = frames.reshape(-1, height, width, 3)

    if not self.image_transforms:
        # np.frombuffer yields a read-only view; copy so the tensor owns
        # writable memory.
        return torch.from_numpy(frames.copy())

    return torch.stack([self.image_transforms(frame) for frame in frames])
def lintel_loader(
        file: Union[str, Path, IO[bytes]],
        frames_idx: Union[slice, List[slice], List[int]]
) -> Iterator[Image.Image]:
    """Decode the requested frames of a video with lintel.

    Args:
        file: Path to the video (``str`` or ``Path``) or an already opened
            binary file object whose ``read()`` returns the encoded video.
        frames_idx: Frame selection; it is expanded to a list of frame numbers
            by ``frame_idx_to_list``.

    Returns:
        A lazy iterator of ``PIL.Image`` frames, one per requested index and
        in the requested order (repeated indices yield repeated frames).
    """
    import lintel

    if isinstance(file, str):
        file = Path(file)

    if isinstance(file, Path):
        _LOG.debug("Loading data from {}".format(file))
        with file.open("rb") as handle:
            encoded = handle.read()
    else:
        encoded = file.read()

    requested = np.array(frame_idx_to_list(frames_idx))
    # Decode every distinct frame once; `reconstruction_idx` maps the decoded
    # frames back onto the (possibly repeated / unordered) request.
    load_idx, reconstruction_idx = np.unique(requested, return_inverse=True)
    _LOG.debug("Converted frames_idx {} to load_idx {}".format(
        requested, load_idx))

    frames_data, width, height = lintel.loadvid_frame_nums(
        encoded, frame_nums=load_idx, should_seek=False)

    # TODO: Support 1 channel grayscale video
    frames = np.frombuffer(frames_data, dtype=np.uint8)
    # Shape is passed positionally: the `newshape` keyword of np.reshape is
    # deprecated in recent NumPy releases.
    frames = frames.reshape(len(load_idx), height, width, 3)
    frames = frames[reconstruction_idx]
    return (Image.fromarray(frame) for frame in frames)
def _load_clips(self, video_file, video_info):
    """Load ``self.k_clips`` clips from a video with a single decode call.

    Args:
        video_file: Path of the encoded video.
        video_info: Mapping providing ``'num_frames'``, ``'width'`` and
            ``'height'`` for the video.

    Returns:
        A list with one entry per clip; each entry is the clip's frames after
        ``self._pad_to_length``.
    """
    # Select the frame indices of every clip.
    all_frame_idx = self._select_k_clips(
        [0, video_info['num_frames'] - 1 - self.protect_frames])

    with open(video_file, 'rb') as f:
        encoded_video = f.read()

    # Read all frames at once: decode the sorted union of every clip's frames.
    frame_idx = sorted(
        {int(item) for sublist in all_frame_idx for item in sublist})
    decoded_frames = lintel.loadvid_frame_nums(encoded_video,
                                               frame_nums=frame_idx,
                                               width=video_info['width'],
                                               height=video_info['height'])
    frame_chunk = np.frombuffer(decoded_frames, dtype=np.uint8).reshape(
        len(frame_idx), video_info['height'], video_info['width'], 3)

    # Map each frame number to its row in `frame_chunk`. A dict lookup replaces
    # the per-frame `list.index` scan, and casting with int() matches how
    # `frame_idx` itself was built.
    row_of_frame = {frame: row for row, frame in enumerate(frame_idx)}

    all_clips = []
    for clip_idx in range(self.k_clips):
        rows = [row_of_frame[int(frame)] for frame in all_frame_idx[clip_idx]]
        clip = frame_chunk[rows, :, :, :]
        all_clips.append(self._pad_to_length(clip))
    return all_clips
def _load_action_frame_nums_to_4darray(video, frame_nums, width, height):
    """Decodes a specific set of frames from `video` to a 4D numpy array.

    Args:
        video: Encoded video.
        frame_nums: Indices of specific frame indices to decode, e.g.,
            [1, 10, 30, 35] will return four frames: the first, 10th, 30th and
            35 frames in `video`. Indices must be in strictly increasing
            order.
        width: Frame width passed to lintel and used to reshape the result.
        height: Frame height passed to lintel and used to reshape the result.

    Returns:
        A numpy array of shape (len(frame_nums), height, width, 3), loaded
        from the byte array returned by `lintel.loadvid_frame_nums`,
        containing the specified frames, decoded.
    """
    raw = lintel.loadvid_frame_nums(video,
                                    frame_nums=frame_nums,
                                    width=width,
                                    height=height)
    # Shape is passed positionally: the `newshape` keyword of np.reshape is
    # deprecated in recent NumPy releases.
    return np.frombuffer(raw, dtype=np.uint8).reshape(
        len(frame_nums), height, width, 3)


# import torchvision
# # read dataset from pytorch API
# import classy_vision
#
# def dataset_config(args):
#     if args.dataset == 'kinetics':
#         dataset = classy_vision.dataset.Kinetics400Dataset(split: 'train', batchsize_per_replica: 8,
#                                                            shuffle: True, transform: None, num_samples: None,
#                                                            frames_per_clip: 8, video_width: 256, video_height: 340,
#                                                            video_min_dimension: 256, audio_samples: 0, audio_channels: 0,
#                                                            step_between_clips: 4, frame_rate:0,
#                                                            clips_per_video: int, video_dir: str,
#                                                            extensions: 'avi', metadata_filepath: str)
#
#     elif args.dataset == 'hmdb51':
#         torchvision.datasets.HMDB51(root, annotation_path, frames_per_clip,
#                                     step_between_clips=1, frame_rate=None, fold=1,
#                                     train=True, transform=None, _precomputed_metadata=None,
#                                     num_workers=1, _video_width=0, _video_height=0,
#                                     _video_min_dimension=0, _audio_samples=0)
#     elif args.dataset == 'ucf101':
#         torchvision.datasets.UCF101(root, annotation_path, frames_per_clip,
#                                     step_between_clips=1, frame_rate=None, fold=1,
#                                     train=True, transform=None, _precomputed_metadata=None,
#                                     num_workers=1, _video_width=0, _video_height=0,
#                                     _video_min_dimension=0, _audio_samples=0)
#     else:
#
# NOTE(review): this bare expression builds an Exception and discards it; it
# never raises. It looks like the tail of the commented-out `dataset_config`
# sketch that precedes it (its final `else:` branch) -- confirm whether it was
# meant to be a `raise` inside that function or should simply be removed.
Exception("wrong dataset")
def __call__(self, clip_info, frames):
    """Decode the requested frames of a clip with lintel.

    Args:
        clip_info: Mapping whose ``"path"`` entry is the video file to read.
        frames: Frame numbers to return; may contain repeats and need not be
            sorted.

    Returns:
        A uint8 numpy array of shape ``(len(frames), height, width, 3)`` whose
        rows follow the order of ``frames``; width and height are the values
        reported by lintel.
    """
    # lintel is given each distinct frame once, in increasing order.
    unique_frames = sorted(set(frames))
    # Row of every distinct frame in the decoded buffer; a dict lookup avoids
    # the quadratic cost of calling `list.index` per requested frame.
    row_of_frame = {frame: row for row, frame in enumerate(unique_frames)}
    frame_inds = [row_of_frame[frame] for frame in frames]

    with open(clip_info["path"], "rb") as f:
        vid = f.read()

    raw, width, height = lintel.loadvid_frame_nums(
        vid, frame_nums=unique_frames, should_seek=True)
    decoded_frames = np.frombuffer(raw, dtype=np.uint8).reshape(
        -1, height, width, 3)
    # Expand back to the requested order (including repeats).
    return decoded_frames[frame_inds]
def _loadvid_test_frame_nums(filename, width, height, start_frame,
                             should_seek):
    """Tests loadvid_frame_nums Python extension.

    `loadvid_frame_nums` takes a list of (strictly increasing, and not
    repeated) frame indices to decode from the encoded video corresponding to
    `filename`.

    This function randomly selects frames to decode, in a loop, decodes the
    chosen frames with `loadvid_frame_nums`, and visualizes the resulting
    frames (all of them) using `matplotlib.pyplot`.

    Args:
        filename: Path of the encoded video.
        width: Requested frame width; 0 together with `height` == 0 means the
            size is taken from the decoder's result.
        height: Requested frame height (see `width`).
        start_frame: First frame index of every randomly generated selection.
        should_seek: Forwarded to `lintel.loadvid_frame_nums`.
    """
    with open(filename, 'rb') as f:
        encoded_video = f.read()

    num_frames = 32
    # Decided once from the caller's arguments. The parameters are never
    # overwritten, so every iteration exercises the same (dynamic or fixed
    # size) decode path instead of only the first one.
    dynamic_size = (width == 0) and (height == 0)

    for _ in range(10):
        start = time.perf_counter()

        # Strictly increasing indices with random gaps of 1 to 3 frames.
        frame_nums = []
        next_frame = start_frame
        for _unused in range(num_frames):
            frame_nums.append(next_frame)
            next_frame += int(random.uniform(1, 4))

        result = lintel.loadvid_frame_nums(encoded_video,
                                           frame_nums=frame_nums,
                                           width=width,
                                           height=height,
                                           should_seek=should_seek)
        if dynamic_size:
            # Dynamic size: result is a (frames, width, height) tuple.
            decoded_frames, frame_width, frame_height = result
        else:
            decoded_frames, frame_width, frame_height = result, width, height

        decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
        decoded_frames = decoded_frames.reshape(
            num_frames, frame_height, frame_width, 3)
        end = time.perf_counter()

        print('time: {}'.format(end - start))

        for frame in decoded_frames:
            plt.imshow(frame)
            plt.show()
def extract_frames(self, video_file, timesteps):
    """Decode the frames at ``timesteps`` into a channel-first float clip.

    Args:
        video_file: Path of the encoded video.
        timesteps: Frame numbers passed to ``lintel.loadvid_frame_nums``.

    Returns:
        A float32 numpy array of shape
        ``(3, self.t, self.real_h, self.real_w)``. If the decoded buffer
        cannot be interpreted with that shape, the raw value returned by
        lintel is handed back unchanged (best-effort fallback kept from the
        original implementation).
    """
    with open(video_file, 'rb') as f:
        encoded_video = f.read()

    decoded_frames = lintel.loadvid_frame_nums(encoded_video,
                                               frame_nums=timesteps,
                                               width=self.real_w,
                                               height=self.real_h)
    try:
        np_clip = np.frombuffer(decoded_frames, dtype=np.uint8)
        np_clip = np_clip.reshape(self.t, self.real_h, self.real_w, 3)
    except (TypeError, ValueError):
        # Only the failures frombuffer/reshape raise for an unusable buffer
        # or a size mismatch are handled; anything else propagates.
        return decoded_frames

    # (T, H, W, C) -> (C, T, H, W)
    np_clip = np_clip.transpose([3, 0, 1, 2])
    return np.float32(np_clip)
def get(self, record, indices):
    """Load, assemble and transform the frames of one record.

    Args:
        record: Object providing ``path``, ``num_frames`` and ``label``.
        indices: Start frame index of every segment.

    Returns:
        A ``(process_data, label)`` tuple, where ``process_data`` is
        ``self.transform`` applied to the collected images and ``label`` is
        ``record.label``.
    """
    images = []
    debug_info = []
    video_mode = self.read_mode == 'video'
    record_path = os.path.join(self.root_path, record.path)

    if video_mode:
        t1 = time.time()
        full_indices = self._get_full_indices(indices, record.num_frames)
        with open(record_path, 'rb') as f:
            enc_vid = f.read()
        df, w, h = lintel.loadvid_frame_nums(enc_vid, frame_nums=full_indices)
        # lintel hands back a raw byte buffer; np.reshape cannot interpret
        # that directly, so view it as uint8 first.
        if not isinstance(df, np.ndarray):
            df = np.frombuffer(df, dtype=np.uint8)
        df = np.reshape(df, (len(full_indices), h, w, 3))
        # Frame number -> decoded frame.
        video_data = dict(zip(full_indices, df))
        t2 = time.time()
        debug_info.append('read video: {:.4f}s'.format(t2 - t1))

    t1 = time.time()
    last_frame = record.num_frames - 1
    for seg_ind in indices:
        p = int(seg_ind)
        for _ in range(self.new_length):
            if video_mode:
                seg_imgs = self._load_image_from_video(video_data, p)
            else:
                seg_imgs = self._load_image(record_path, p)
            images.extend(seg_imgs)
            # Advance by the stride, clamping to the last available frame.
            if p < last_frame:
                p += 1 + self.skip
            if p >= record.num_frames:
                p = last_frame
    t2 = time.time()
    debug_info.append('load image: {:.4f}s'.format(t2 - t1))

    t1 = time.time()
    process_data = self.transform(images)
    t2 = time.time()
    debug_info.append('transform data: {:.4f}s'.format(t2 - t1))

    if DEBUG_FLAG:
        print(debug_info)
    return process_data, record.label
def _load_frames(self, video_file, video_info):
    """Load a sampled set of frames from a video using lintel.

    Args:
        video_file: Path of the encoded video.
        video_info: Mapping providing ``'num_frames'``, ``'width'`` and
            ``'height'`` for the video.

    Returns:
        The decoded frames after ``self._pad_to_length``; before padding the
        clip has shape ``(len(frame_idx), height, width, 3)``.
    """
    frame_idx = self._sample_frames(
        [0, video_info['num_frames'] - 1 - self.protect_frames])
    frame_height = video_info['height']
    frame_width = video_info['width']

    # TODO: This part is problematic for large videos (e.g., action
    # detection). It might be possible to read a chunk of the video.
    with open(video_file, 'rb') as f:
        encoded_video = f.read()

    raw = lintel.loadvid_frame_nums(encoded_video,
                                    frame_nums=frame_idx,
                                    width=frame_width,
                                    height=frame_height)
    # Shape is passed positionally: the `newshape` keyword of np.reshape is
    # deprecated in recent NumPy releases.
    clip = np.frombuffer(raw, dtype=np.uint8).reshape(
        len(frame_idx), frame_height, frame_width, 3)

    # pad to fixed length
    return self._pad_to_length(clip)
def _load_frame_nums_to_4darray(video, dataset, frame_nums):
    """Decodes a specific set of frames from `video` to a 4D numpy array.

    Args:
        video: Encoded video.
        dataset: Dataset meta-info, e.g., width and height.
        frame_nums: Indices of specific frame indices to decode, e.g.,
            [1, 10, 30, 35] will return four frames: the first, 10th, 30th and
            35 frames in `video`. Indices must be in strictly increasing
            order.

    Returns:
        A numpy array of shape
        (len(frame_nums), dataset.height, dataset.width, 3), loaded from the
        byte array returned by `lintel.loadvid_frame_nums`, containing the
        specified frames, decoded.
    """
    raw = lintel.loadvid_frame_nums(video,
                                    frame_nums=frame_nums,
                                    width=dataset.width,
                                    height=dataset.height)
    # The buffer holds one frame per requested index, so its leading dimension
    # is len(frame_nums). Relying on dataset.num_frames breaks whenever that
    # field differs from the request or is left unset.
    return np.frombuffer(raw, dtype=np.uint8).reshape(
        len(frame_nums), dataset.height, dataset.width, 3)
def test_lintel_frames_num(filename, width=0, height=0):
    """Decode the first 47 frames of a video and dump them as JPEG files.

    Frames are written to ``frame<idx>.jpg`` in the current directory and the
    elapsed time (decode plus writing) is printed.

    Args:
        filename: Path of the encoded video.
        width: Requested frame width; with both `width` and `height` equal to
            0 (the default) the size is taken from lintel's result.
        height: Requested frame height (see `width`).
    """
    # The arguments are used as given. The original re-assigned `height = 0`
    # and a misspelled `witdh = 0`, which silently dropped only the caller's
    # height and broke any call with an explicit size.
    with open(filename, 'rb') as f:
        encoded_video = f.read()

    num_frames = 47
    start = time.perf_counter()
    frame_nums = list(range(num_frames))

    result = lintel.loadvid_frame_nums(encoded_video,
                                       frame_nums=frame_nums,
                                       width=width,
                                       height=height,
                                       should_seek=False)
    if (width == 0) and (height == 0):
        # Dynamic size: result is a (frames, width, height) tuple.
        decoded_frames, width, height = result
    else:
        decoded_frames = result

    decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
    decoded_frames = decoded_frames.reshape(num_frames, height, width, 3)

    for idx, frame in enumerate(decoded_frames):
        # Frames are converted RGB -> BGR before being handed to OpenCV.
        cv2.imwrite("frame{}.jpg".format(idx),
                    cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    end = time.perf_counter()

    print('time: {}'.format(end - start))
# video, width, height, seek_index = lintel.loadvid(open(v_path, 'rb').read(), should_random_seek=False)
# video = np.reshape(np.frombuffer(video, dtype=np.uint8), (-1, height, width, 3))
# num_frames = video.shape
# print(video_frames_num, num_frames)
#
# videodata = skvideo.io.vread(v_path, inputdict={'-r': '4'})
# print(videodata.shape)
print(video_frames_num)

# Read the whole encoded video; the context manager closes the file even if
# reading fails.
with open(v_path, 'rb') as f:
    video = f.read()

# NOTE: ffmpeg's frame count is one frame fewer than cv2's.
frame_nums = [0, 201, 280, 295, 296]
decoded_frames = lintel.loadvid_frame_nums(video,
                                           frame_nums=frame_nums,
                                           width=width,
                                           height=height)
decoded_frames = np.frombuffer(decoded_frames, dtype=np.uint8)
# Shape is passed positionally: the `newshape` keyword of np.reshape is
# deprecated in recent NumPy releases.
decoded_frames = decoded_frames.reshape(len(frame_nums), height, width, 3)
print(np.shape(decoded_frames)[0])

# The Kinetics dataset in PyTorch vision can be adapted with small changes and
# can return audio, but it cannot restrict the starting index.
# self.video_clips = VideoClips(
#     video_list,
#     frames_per_clip,
#     step_between_clips,
#     frame_rate,
#     _precomputed_metadata,
#     num_workers=num_workers,