Example #1
def recognize_hand(frame, detector, estimator, out_frame=None):
    img256, _, scale, pad = bhut.resize_pad(frame[:, :, ::-1])
    input_data = img256.astype('float32') / 255.
    input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

    # inference
    # Palm detection
    preds = detector.predict([input_data])
    detections = bhut.detector_postprocess(
        preds, anchor_path="../../hand_recognition/blazehand/anchors.npy")

    # display bbox
    if args.bbox:
        detections2 = bhut.denormalize_detections(detections[0].copy(), scale,
                                                  pad)
        display_hand_box(out_frame, detections2)

    # Hand landmark estimation
    presence = [0, 0]  # [left, right]
    if detections[0].size != 0:
        img, affine, _ = bhut.estimator_preprocess(frame, detections, scale,
                                                   pad)
        estimator.set_input_shape(img.shape)
        flags, handedness, normalized_landmarks = estimator.predict([img])

        # postprocessing
        landmarks = bhut.denormalize_landmarks(normalized_landmarks, affine)
        for i in range(len(flags)):
            landmark, flag, handed = landmarks[i], flags[i], handedness[i]
            if flag > 0.75:
                if handed > 0.5:
                    presence[0] = 1
                else:
                    presence[1] = 1
                draw_landmarks_hand(out_frame,
                                    landmark[:, :2],
                                    bhut.HAND_CONNECTIONS,
                                    size=4)
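The function above only handles a single frame and leaves the surrounding setup to the caller. The sketch below shows one way such a driver loop might look; it is an assumption, not part of the original script: the model path constants and the webcam capture are placeholders modeled on the later examples, and `args`, `display_hand_box` and `draw_landmarks_hand` are still expected to exist in the enclosing module.

import cv2
import ailia

def main():
    # hypothetical driver, not part of the original script
    # load the palm detector and the hand landmark estimator (paths are placeholders)
    env_id = ailia.get_gpu_environment_id()
    detector = ailia.Net(DETECTION_MODEL_PATH, DETECTION_WEIGHT_PATH, env_id=env_id)
    estimator = ailia.Net(LANDMARK_MODEL_PATH, LANDMARK_WEIGHT_PATH, env_id=env_id)

    capture = cv2.VideoCapture(0)  # webcam; any cv2-readable source works
    while True:
        ret, frame = capture.read()
        if not ret or (cv2.waitKey(1) & 0xFF == ord('q')):
            break
        out_frame = frame.copy()
        recognize_hand(frame, detector, estimator, out_frame=out_frame)
        cv2.imshow('blazehand', out_frame)

    capture.release()
    cv2.destroyAllWindows()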
Example #2
def recognize_from_image():
    # prepare input data
    src_img = cv2.imread(args.input)
    img256, _, scale, pad = but.resize_pad(src_img[:, :, ::-1])
    input_data = img256.astype('float32') / 255.
    input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    detector = ailia.Net(DETECTION_MODEL_PATH,
                         DETECTION_WEIGHT_PATH,
                         env_id=env_id)
    estimator = ailia.Net(LANDMARK_MODEL_PATH,
                          LANDMARK_WEIGHT_PATH,
                          env_id=env_id)

    # inference
    print('Start inference...')
    if args.benchmark:
        print('BENCHMARK mode')
        for _ in range(5):
            start = int(round(time.time() * 1000))
            # Palm detection
            preds = detector.predict([input_data])
            detections = but.detector_postprocess(preds)

            # Hand landmark estimation
            presence = [0, 0]  # [left, right]
            if detections[0].size != 0:
                imgs, affines, _ = but.estimator_preprocess(
                    src_img, detections, scale, pad)
                estimator.set_input_shape(imgs.shape)
                flags, handedness, normalized_landmarks = estimator.predict(
                    [imgs])

                # postprocessing
                landmarks = but.denormalize_landmarks(normalized_landmarks,
                                                      affines)
                for i in range(len(flags)):
                    landmark, flag, handed = landmarks[i], flags[
                        i], 1 - handedness[i]
                    if flag > 0.75:
                        if handed < 0.5:  # Right handedness when not flipped camera input
                            presence[0] = 1
                        else:
                            presence[1] = 1
                        draw_landmarks(src_img,
                                       landmark[:, :2],
                                       but.HAND_CONNECTIONS,
                                       size=2)
            end = int(round(time.time() * 1000))
            print(f'\tailia processing time {end - start} ms')
    else:
        # Palm detection
        preds = detector.predict([input_data])
        detections = but.detector_postprocess(preds)

        # Hand landmark estimation
        presence = [0, 0]  # [left, right]
        if detections[0].size != 0:
            imgs, affines, _ = but.estimator_preprocess(
                src_img, detections, scale, pad)
            estimator.set_input_shape(imgs.shape)
            flags, handedness, normalized_landmarks = estimator.predict([imgs])

            # postprocessing
            landmarks = but.denormalize_landmarks(normalized_landmarks,
                                                  affines)
            for i in range(len(flags)):
                landmark, flag, handed = landmarks[i], flags[
                    i], 1 - handedness[i]
                if flag > 0.75:
                    if handed < 0.5:  # Right handedness when not flipped camera input
                        presence[0] = 1
                    else:
                        presence[1] = 1
                    draw_landmarks(src_img,
                                   landmark[:, :2],
                                   but.HAND_CONNECTIONS,
                                   size=2)

    if presence[0] and presence[1]:
        hand_presence = 'Left and right'
    elif presence[0]:
        hand_presence = 'Left'
    elif presence[1]:
        hand_presence = 'Right'
    else:
        hand_presence = 'No hand'
    print(f'Hand presence: {hand_presence}')
    cv2.imwrite(args.savepath, src_img)
    print('Script finished successfully.')
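`draw_landmarks` itself is not defined in this snippet. As a rough sketch of what it could do, assuming `but.HAND_CONNECTIONS` is a sequence of `(start, end)` landmark index pairs and `points` is an `(N, 2)` array of pixel coordinates, a minimal stand-in might look like this; the real helper in the project may differ:

import cv2

def draw_landmarks(img, points, connections, size=2):
    # rough stand-in for the project's own draw_landmarks helper
    # draw the hand skeleton: one line per (start, end) landmark index pair
    for start, end in connections:
        x0, y0 = points[start]
        x1, y1 = points[end]
        cv2.line(img, (int(x0), int(y0)), (int(x1), int(y1)), (0, 255, 0), size)
    # draw the landmark points on top of the skeleton
    for x, y in points:
        cv2.circle(img, (int(x), int(y)), size, (255, 0, 255), -1)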
Example #3
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    detector = ailia.Net(DETECTION_MODEL_PATH,
                         DETECTION_WEIGHT_PATH,
                         env_id=env_id)
    estimator = ailia.Net(LANDMARK_MODEL_PATH,
                          LANDMARK_WEIGHT_PATH,
                          env_id=env_id)

    capture = get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        save_h, save_w = webcamera_utils.calc_adjust_fsize(
            f_h, f_w, IMAGE_HEIGHT, IMAGE_WIDTH)
        writer = webcamera_utils.get_writer(args.savepath, save_h, save_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break
        # mirror the frame horizontally so the preview matches a selfie view
        frame = np.ascontiguousarray(frame[:, ::-1, :])

        img256, _, scale, pad = but.resize_pad(frame[:, :, ::-1])
        input_data = img256.astype('float32') / 255.
        input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

        # inference
        # Palm detection
        preds = detector.predict([input_data])
        detections = but.detector_postprocess(preds)

        # Hand landmark estimation
        presence = [0, 0]  # [left, right]
        if detections[0].size != 0:
            img, affine, _ = but.estimator_preprocess(frame, detections, scale,
                                                      pad)
            estimator.set_input_shape(img.shape)
            flags, handedness, normalized_landmarks = estimator.predict([img])

            # postprocessing
            landmarks = but.denormalize_landmarks(normalized_landmarks, affine)
            for i in range(len(flags)):
                landmark, flag, handed = landmarks[i], flags[i], handedness[i]
                if flag > 0.75:
                    if handed > 0.5:
                        presence[0] = 1
                    else:
                        presence[1] = 1
                    draw_landmarks(frame,
                                   landmark[:, :2],
                                   but.HAND_CONNECTIONS,
                                   size=2)

        if presence[0] and presence[1]:
            text = 'Left and right'
        elif presence[0]:
            text = 'Left'
        elif presence[1]:
            text = 'Right'
        else:
            text = 'No hand'
        cv2.putText(frame, text, (8, 24), cv2.FONT_HERSHEY_SIMPLEX, 1,
                    (255, 0, 255), 2)
        cv2.imshow('frame', frame)

        # save results
        if writer is not None:
            writer.write(frame)

    capture.release()
    if writer is not None:
        writer.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
Example #4
    def recognize_hand(self, frame, detector, estimator, out_frame=None):
        img256, _, scale, pad = bhut.resize_pad(frame[:, :, ::-1])
        input_data = img256.astype('float32') / 255.
        input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

        # inference
        # Perform palm detection on 1st frame and if at least 1 hand has low
        # confidence (not detected)
        if np.any(self.tracked_hands < HAND_DETECTION_THRESHOLD):
            self.tracking = False
            # Palm detection
            preds = detector.predict([input_data])
            detections = bhut.detector_postprocess(
                preds,
                anchor_path="../../hand_recognition/blazehand/anchors.npy")
            if detections[0].size > 0:
                self.tracking = True
                self.roi_imgs, self.affines, _ = bhut.estimator_preprocess(
                    frame, detections[0][:self.num_hands], scale, pad)
        else:
            for i, roi in enumerate(self.rois):
                xc, yc, scale, theta = roi
                roi_img, affine, _ = bhut.extract_roi(frame, xc, yc, theta,
                                                      scale)
                self.roi_imgs[i] = roi_img[0]
                self.affines[i] = affine[0]

        # display bbox (only available on frames where palm detection ran)
        if args.bbox and detections is not None and detections[0].size > 0:
            detections2 = bhut.denormalize_detections(detections[0].copy(),
                                                      scale, pad)
            display_hand_box(out_frame, detections2)

        # Hand landmark estimation
        presence = [0, 0]  # [left, right]
        if self.tracking:
            # img, affine, _ = bhut.estimator_preprocess(frame, detections, scale, pad)
            estimator.set_input_shape(self.roi_imgs.shape)
            hand_flags, handedness, normalized_landmarks = estimator.predict(
                [self.roi_imgs])

            # postprocessing
            landmarks = bhut.denormalize_landmarks(normalized_landmarks,
                                                   self.affines)

            self.tracked_hands[:] = 0
            n_imgs = len(hand_flags)
            for i in range(n_imgs):
                landmark, hand_flag, handed = landmarks[i], hand_flags[
                    i], handedness[i]
                if hand_flag > HAND_LANDMARK_THRESHOLD:
                    if handed > 0.5:
                        presence[0] = 1  # Left hand
                    else:
                        presence[1] = 1  # Right hand
                    draw_landmarks_hand(out_frame,
                                        landmark[:, :2],
                                        bhut.HAND_CONNECTIONS,
                                        size=4)

                    self.rois[i] = bhut.landmarks2roi(normalized_landmarks[i],
                                                      self.affines[i])
                self.tracked_hands[i] = hand_flag
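This variant is a method, so it depends on per-object tracking state that is not shown here. Judging from the attributes it reads and writes (and from the module-level variables in the next example), the owning class could be initialized roughly as follows; the class name, defaults and comments are assumptions:

import numpy as np

class HandTracker:
    # Hypothetical container for the state recognize_hand() expects.
    def __init__(self, num_hands=2):
        self.num_hands = num_hands
        self.tracking = False
        # per-hand landmark confidence; zeros force palm detection on the first frame
        self.tracked_hands = np.zeros(num_hands)
        self.rois = [None] * num_hands  # (xc, yc, scale, theta) for each tracked hand
        self.roi_imgs = None            # hand crops from estimator_preprocess / extract_roi
        self.affines = None             # affine transforms mapping crops back to frame coordinates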
Example #5
def recognize_from_video():
    # net initialize
    detector = ailia.Net(DETECTION_MODEL_PATH,
                         DETECTION_WEIGHT_PATH,
                         env_id=args.env_id)
    estimator = ailia.Net(LANDMARK_MODEL_PATH,
                          LANDMARK_WEIGHT_PATH,
                          env_id=args.env_id)
    num_hands = args.hands
    thresh = 0.5
    tracking = False
    tracked_hands = np.array([0.0] * num_hands)
    rois = [None] * num_hands

    capture = get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        writer = get_writer(args.savepath, f_h, f_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        img256, _, scale, pad = but.resize_pad(frame[:, :, ::-1])
        input_data = img256.astype('float32') / 255.
        input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

        # inference
        # Run palm detection on the first frame and whenever at least one
        # tracked hand has low confidence (i.e. the hand was lost).
        if np.any(tracked_hands < thresh):
            tracking = False
            # Palm detection
            preds = detector.predict([input_data])
            detections = but.detector_postprocess(preds)
            if detections[0].size > 0:
                tracking = True
                roi_imgs, affines, _ = but.estimator_preprocess(
                    frame, detections[0][:num_hands], scale, pad)
        else:
            for i, roi in enumerate(rois):
                xc, yc, scale, theta = roi
                roi_img, affine, _ = but.extract_roi(frame, xc, yc, theta,
                                                     scale)
                roi_imgs[i] = roi_img[0]
                affines[i] = affine[0]

        # Hand landmark estimation
        presence = [0, 0]  # [right, left] (matches the text labels assigned below)
        if tracking:
            estimator.set_input_shape(roi_imgs.shape)
            hand_flags, handedness, normalized_landmarks = estimator.predict(
                [roi_imgs])

            # postprocessing
            landmarks = but.denormalize_landmarks(normalized_landmarks,
                                                  affines)

            tracked_hands[:] = 0
            n_imgs = len(hand_flags)
            for i in range(n_imgs):
                landmark, hand_flag, handed = landmarks[i], hand_flags[
                    i], handedness[i]
                if hand_flag > thresh:
                    if handed > 0.5:  # Right handedness when not flipped camera input
                        presence[0] = 1
                    else:
                        presence[1] = 1
                    draw_landmarks(frame,
                                   landmark[:, :2],
                                   but.HAND_CONNECTIONS,
                                   size=2)

                    rois[i] = but.landmarks2roi(normalized_landmarks[i],
                                                affines[i])
                tracked_hands[i] = hand_flag

        if presence[0] and presence[1]:
            text = 'Left and right'
        elif presence[0]:
            text = 'Right'
        elif presence[1]:
            text = 'Left'
        else:
            text = 'No hand'

        visual_img = frame
        if args.video == '0':  # Flip horizontally if camera
            visual_img = np.ascontiguousarray(frame[:, ::-1, :])

        cv2.putText(visual_img, text, (8, 24), cv2.FONT_HERSHEY_SIMPLEX, 1,
                    (255, 0, 255), 2)
        cv2.imshow('frame', visual_img)

        # save results
        if writer is not None:
            cv2.putText(frame, text, (8, 24), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (255, 0, 255), 2)
            writer.write(frame)

    capture.release()
    if writer is not None:
        writer.release()
    cv2.destroyAllWindows()
    logger.info('Script finished successfully.')
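All of these examples read their settings from a module-level `args` object. Below is a hedged reconstruction of the parser they imply, using only the attribute names that actually appear above; the flags, defaults and help text are guesses, and the real project may build its parser differently (e.g. through a shared helper).

import argparse

# reconstructed from the args.* attributes used in the examples; flags and defaults are guesses
parser = argparse.ArgumentParser(description='BlazeHand palm detection and hand landmark estimation')
parser.add_argument('-i', '--input', default='input.jpg', help='input image path')
parser.add_argument('-v', '--video', default=None, help='video file path, or "0" for the webcam')
parser.add_argument('-s', '--savepath', default='output.png', help='path for the saved image or video')
parser.add_argument('--hands', type=int, default=2, help='maximum number of hands to track')
parser.add_argument('--bbox', action='store_true', help='also draw the palm detection boxes')
parser.add_argument('-b', '--benchmark', action='store_true', help='measure ailia inference time')
parser.add_argument('-e', '--env_id', type=int, default=-1, help='ailia execution environment id')
args = parser.parse_args()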