Example 1
# Assumed imports, reconstructed from the realtimenet package this snippet
# belongs to (uniform_frame_sample is a helper defined alongside this function)
import glob
import os

import numpy as np

from realtimenet import camera
from realtimenet import engine


def extract_features(path_in, net, num_layers_finetune, use_gpu, minimum_frames=45):

    # Create inference engine
    inference_engine = engine.InferenceEngine(net, use_gpu=use_gpu)

    # extract features
    for dataset in ["train", "valid"]:
        videos_dir = os.path.join(path_in, f"videos_{dataset}")
        features_dir = os.path.join(path_in, f"features_{dataset}_num_layers_to_finetune={num_layers_finetune}")
        video_files = glob.glob(os.path.join(videos_dir, "*", "*.mp4"))

        print(f"\nFound {len(video_files)} videos to process in the {dataset}set")

        for video_index, video_path in enumerate(video_files):
            print(f"\rExtract features from video {video_index + 1} / {len(video_files)}",
                  end="")
            path_out = video_path.replace(videos_dir, features_dir).replace(".mp4", ".npy")

            if os.path.isfile(path_out):
                print("\n\tSkipped - feature was already precomputed.")
            else:
                # Read all frames
                video_source = camera.VideoSource(camera_id=None,
                                                  size=inference_engine.expected_frame_size,
                                                  filename=video_path)
                video_fps = video_source.get_fps()
                frames = []
                while True:
                    images = video_source.get_image()
                    if images is None:
                        break
                    else:
                        image, image_rescaled = images
                        frames.append(image_rescaled)
                frames = uniform_frame_sample(np.array(frames), inference_engine.fps / video_fps)

                if frames.shape[0] < minimum_frames:
                    print(f"\nVideo too short: {video_path} - first frame will be duplicated")
                    num_missing_frames = minimum_frames - frames.shape[0]
                    frames = np.pad(frames, ((num_missing_frames, 0), (0, 0), (0, 0), (0, 0)),
                                    mode='edge')
                # Inference
                clip = frames[None].astype(np.float32)
                predictions = inference_engine.infer(clip)
                features = np.array(predictions)
                os.makedirs(os.path.dirname(path_out), exist_ok=True)
                np.save(path_out, features)

        print('\n')
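A hedged usage sketch for Example 1 follows; the dataset path, the backbone object and the layer count below are placeholders, not part of the original listing:

# Hypothetical call; path_in must contain videos_train/ and videos_valid/
# subfolders of .mp4 files, as the glob patterns in the function above expect.
extract_features(path_in='/data/my_gesture_dataset',   # placeholder path
                 net=my_backbone_net,                  # a preloaded feature extractor
                 num_layers_finetune=9,                # arbitrary illustrative value
                 use_gpu=True)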
Example 2

# This snippet assumes the usual realtimenet script imports (feature_extractors,
# camera, engine, Pipe, LogisticRegression, PostprocessClassificationOutput,
# realtimenet.display) and the INT2LAB label mapping.

# Load feature extractor (assumed: the listing starts mid-script, so the
# instantiation line below is reconstructed from the realtimenet codebase)
feature_extractor = feature_extractors.StridedInflatedEfficientNet()
checkpoint = engine.load_weights('resources/backbone/strided_inflated_efficientnet.ckpt')
feature_extractor.load_state_dict(checkpoint)
feature_extractor.eval()

# Load a logistic regression classifier
gesture_classifier = LogisticRegression(num_in=feature_extractor.feature_dim,
                                        num_out=30)
checkpoint = engine.load_weights('resources/gesture_detection/efficientnet_logistic_regression.ckpt')
gesture_classifier.load_state_dict(checkpoint)
gesture_classifier.eval()

# Concatenate feature extractor and gesture classifier
net = Pipe(feature_extractor, gesture_classifier)

# Create inference engine, video streaming and display instances
# (use_gpu, camera_id and path_in come from the script's command-line arguments)
inference_engine = engine.InferenceEngine(net, use_gpu=use_gpu)

video_source = camera.VideoSource(camera_id=camera_id,
                                  size=inference_engine.expected_frame_size,
                                  filename=path_in)

video_stream = camera.VideoStream(video_source,
                                  inference_engine.fps)

postprocessor = [
    PostprocessClassificationOutput(INT2LAB, smoothing=4)
]

display_ops = [
    realtimenet.display.DisplayTopKClassificationOutputs(top_k=1, threshold=0.5),
]
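The listing stops after constructing the pieces. Below is a hypothetical glue loop, not realtimenet's actual run loop, that wires them together: it reuses the video_source.get_image() pattern from Example 1 and assumes the post-processors and display operators are plain callables.

# Hypothetical wiring, sketched under the assumptions stated above
import numpy as np

while True:
    images = video_source.get_image()        # None signals end of stream
    if images is None:
        break
    image, image_rescaled = images
    # Single-frame clip of shape (1, 1, H, W, C); the real script buffers
    # frames through video_stream instead.
    clip = image_rescaled[None, None].astype(np.float32)
    predictions = inference_engine.infer(clip)
    for post_op in postprocessor:
        predictions = post_op(predictions)   # assumed callable interface
    for display_op in display_ops:
        display_op(image, predictions)       # assumed callable interface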
Example 3
def _setup_inference_engine(self):
    self.inference_engine = engine.InferenceEngine(self.net, use_gpu=True)
    video_source = camera.VideoSource(
        camera_id=0, size=self.inference_engine.expected_frame_size)
    self.frame_grabber = camera.VideoStream(video_source,
                                            self.inference_engine.fps)
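Example 3 is a method cut out of a larger class, so it assumes self.net already exists. A minimal hypothetical wrapper showing the context it needs follows; the class name and constructor are illustrative, and only the method body comes from the listing:

# Hypothetical enclosing class for Example 3; engine and camera are the
# same realtimenet modules used in the examples above.
class GestureRecognizer:
    def __init__(self, net):
        self.net = net                     # e.g. a Pipe as built in Example 2
        self._setup_inference_engine()

    def _setup_inference_engine(self):
        self.inference_engine = engine.InferenceEngine(self.net, use_gpu=True)
        video_source = camera.VideoSource(
            camera_id=0, size=self.inference_engine.expected_frame_size)
        self.frame_grabber = camera.VideoStream(video_source,
                                                self.inference_engine.fps)


recognizer = GestureRecognizer(net)        # net from Example 2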