import os

import numpy as np
from PIL import Image

# `camera`, `uniform_frame_sample`, MODEL_TEMPORAL_STRIDE and
# MODEL_TEMPORAL_DEPENDENCY are assumed to be provided elsewhere in this package.


def extract_frames(video_path, inference_engine, path_frames=None, return_frames=True):
    save_frames = path_frames is not None and not os.path.exists(path_frames)
    if not save_frames and not return_frames:
        # Nothing to do
        return None

    # Read frames from the video, rescaled to the input size expected by the model
    video_source = camera.VideoSource(size=inference_engine.expected_frame_size,
                                      filename=video_path)
    video_fps = video_source.get_fps()
    frames = []
    while True:
        images = video_source.get_image()
        if images is None:
            break
        image, image_rescaled = images
        frames.append(image_rescaled)

    # Resample the frames so that their effective rate matches the fps expected by the model
    frames = uniform_frame_sample(np.array(frames), inference_engine.fps / video_fps)

    # Save frames if a path was provided and no frames were previously cached there
    if save_frames:
        os.makedirs(path_frames)
        for idx, frame in enumerate(frames[::MODEL_TEMPORAL_STRIDE]):
            # Frames are BGR; reverse the channel axis to get RGB before saving
            Image.fromarray(frame[:, :, ::-1]).resize((400, 300)).save(
                os.path.join(path_frames, f'{idx}.jpg'), quality=50)

    return frames
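
# `uniform_frame_sample` is not defined in this section. For reference, a
# minimal sketch of a compatible helper -- an illustrative assumption, not the
# package's actual implementation -- covering the usual case where the model
# fps is lower than the video fps:
def _uniform_frame_sample_sketch(frames, sample_rate):
    """Uniformly subsample `frames` (shape [T, H, W, C]) by `sample_rate`."""
    num_frames = frames.shape[0]
    if sample_rate < 1.0:
        # Pick evenly spaced fractional indices, then shift them so the
        # selection is centered within the video before rounding down
        indices = np.arange(0, num_frames, 1.0 / sample_rate)
        offset = (num_frames - 1 - indices[-1]) / 2
        return frames[(indices + offset).astype(np.int32)]
    # Model fps at or above video fps: keep every frame
    return frames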
def compute_features(video_path, path_out, inference_engine, num_timesteps=1, path_frames=None,
                     batch_size=None):
    # Read frames from the video, rescaled to the input size expected by the model
    video_source = camera.VideoSource(camera_id=None,
                                      size=inference_engine.expected_frame_size,
                                      filename=video_path)
    video_fps = video_source.get_fps()
    frames = []
    while True:
        images = video_source.get_image()
        if images is None:
            break
        image, image_rescaled = images
        frames.append(image_rescaled)

    # Resample the frames so that their effective rate matches the fps expected by the model
    frames = uniform_frame_sample(np.array(frames), inference_engine.fps / video_fps)

    # Compute how many frames to pad on the left. Padding "warms up" the model on the first
    # image -- flushing predictions left in the internal states by the previous video -- and
    # ensures the video has enough frames. It also ensures that the first non-padding frame
    # produces a feature.
    frames_to_add = MODEL_TEMPORAL_STRIDE * (MODEL_TEMPORAL_DEPENDENCY // MODEL_TEMPORAL_STRIDE + 1) - 1
    # Possible improvement: investigate whether symmetric or reflect padding would work better
    # for temporal annotation prediction than repeating the static first frame
    frames = np.pad(frames, ((frames_to_add, 0), (0, 0), (0, 0), (0, 0)), mode='edge')

    # Inference
    clip = frames[None].astype(np.float32)

    # Run the model on the padded frames to flush the state coming from the previous video
    pre_features = inference_engine.infer(clip[:, 0:frames_to_add + 1], batch_size=batch_size)

    # Depending on the number of layers being fine-tuned, keep as many features from the
    # padding as the temporal dependency of the model requires
    temporal_dependency_features = np.array(pre_features)[-num_timesteps:]

    # Predictions for the actual video frames
    predictions = inference_engine.infer(clip[:, frames_to_add + 1:], batch_size=batch_size)
    predictions = np.concatenate([temporal_dependency_features, predictions], axis=0)

    features = np.array(predictions)
    os.makedirs(os.path.dirname(path_out), exist_ok=True)
    np.save(path_out, features)

    if path_frames is not None:
        # Create the frames directory itself, not just its parent, so the images can be written
        os.makedirs(path_frames, exist_ok=True)

        # Remove the padded frames and keep one frame per model step, starting at the first
        # real frame (the one producing the first feature)
        frames_to_save = []
        for e, frame in enumerate(frames[frames_to_add:]):
            if e % MODEL_TEMPORAL_STRIDE == 0:
                frames_to_save.append(frame)

        for e, frame in enumerate(frames_to_save):
            # Frames are BGR; reverse the channel axis to get RGB before saving
            Image.fromarray(frame[:, :, ::-1]).resize((400, 300)).save(
                os.path.join(path_frames, f'{e}.jpg'), quality=50)
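
# Worked example of the left-padding arithmetic used above. The constant values
# below are illustrative assumptions, not the package's real
# MODEL_TEMPORAL_STRIDE / MODEL_TEMPORAL_DEPENDENCY:
def _frames_to_add_example(stride=4, dependency=45):
    frames_to_add = stride * (dependency // stride + 1) - 1  # 4 * (11 + 1) - 1 = 47
    # 47 padded frames plus the first real frame give the model 48 frames
    # before it emits that frame's feature: enough to cover the 45-frame
    # temporal dependency, and a multiple of the 4-frame stride, so the first
    # real frame lands exactly on a model output step.
    assert frames_to_add + 1 >= dependency
    assert (frames_to_add + 1) % stride == 0
    return frames_to_add  # -> 47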