def recognize_hand(frame, detector, estimator, out_frame=None):
    img256, _, scale, pad = bhut.resize_pad(frame[:, :, ::-1])
    input_data = img256.astype('float32') / 255.
    input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

    # inference
    # Palm detection
    preds = detector.predict([input_data])
    detections = bhut.detector_postprocess(
        preds, anchor_path="../../hand_recognition/blazehand/anchors.npy")

    # display bbox
    if args.bbox:
        detections2 = bhut.denormalize_detections(
            detections[0].copy(), scale, pad)
        display_hand_box(out_frame, detections2)

    # Hand landmark estimation
    presence = [0, 0]  # [left, right]
    if detections[0].size != 0:
        img, affine, _ = bhut.estimator_preprocess(frame, detections, scale, pad)
        estimator.set_input_shape(img.shape)
        flags, handedness, normalized_landmarks = estimator.predict([img])

        # postprocessing
        landmarks = bhut.denormalize_landmarks(normalized_landmarks, affine)
        for i in range(len(flags)):
            landmark, flag, handed = landmarks[i], flags[i], handedness[i]
            if flag > 0.75:
                if handed > 0.5:
                    presence[0] = 1
                else:
                    presence[1] = 1
                draw_landmarks_hand(out_frame, landmark[:, :2],
                                    bhut.HAND_CONNECTIONS, size=4)
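# A minimal sketch of how recognize_hand above might be driven from a
# capture loop. Illustrative only: the model setup, window name, and
# camera index are assumptions, and args/display_hand_box are expected
# to be defined elsewhere in the script.
import ailia
import cv2

detector = ailia.Net(DETECTION_MODEL_PATH, DETECTION_WEIGHT_PATH)
estimator = ailia.Net(LANDMARK_MODEL_PATH, LANDMARK_WEIGHT_PATH)
capture = cv2.VideoCapture(0)
while True:
    ret, frame = capture.read()
    if not ret or (cv2.waitKey(1) & 0xFF == ord('q')):
        break
    out = frame.copy()  # landmarks and boxes are drawn onto out_frame
    recognize_hand(frame, detector, estimator, out_frame=out)
    cv2.imshow('hand', out)
capture.release()
cv2.destroyAllWindows()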
def recognize_from_image():
    # prepare input data
    src_img = cv2.imread(args.input)
    img256, _, scale, pad = but.resize_pad(src_img[:, :, ::-1])
    input_data = img256.astype('float32') / 255.
    input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    detector = ailia.Net(
        DETECTION_MODEL_PATH, DETECTION_WEIGHT_PATH, env_id=env_id)
    estimator = ailia.Net(
        LANDMARK_MODEL_PATH, LANDMARK_WEIGHT_PATH, env_id=env_id)

    # inference
    print('Start inference...')
    if args.benchmark:
        print('BENCHMARK mode')
        for _ in range(5):
            start = int(round(time.time() * 1000))

            # Palm detection
            preds = detector.predict([input_data])
            detections = but.detector_postprocess(preds)

            # Hand landmark estimation
            presence = [0, 0]  # [left, right]
            if detections[0].size != 0:
                imgs, affines, _ = but.estimator_preprocess(
                    src_img, detections, scale, pad)
                estimator.set_input_shape(imgs.shape)
                flags, handedness, normalized_landmarks = \
                    estimator.predict([imgs])

                # postprocessing
                landmarks = but.denormalize_landmarks(
                    normalized_landmarks, affines)
                for i in range(len(flags)):
                    # handedness is inverted because the image input is
                    # not flipped (mirrored) like a selfie camera
                    landmark, flag, handed = \
                        landmarks[i], flags[i], 1 - handedness[i]
                    if flag > 0.75:
                        if handed > 0.5:
                            presence[0] = 1
                        else:
                            presence[1] = 1
                        draw_landmarks(src_img, landmark[:, :2],
                                       but.HAND_CONNECTIONS, size=2)

            end = int(round(time.time() * 1000))
            print(f'\tailia processing time {end - start} ms')
    else:
        # Palm detection
        preds = detector.predict([input_data])
        detections = but.detector_postprocess(preds)

        # Hand landmark estimation
        presence = [0, 0]  # [left, right]
        if detections[0].size != 0:
            imgs, affines, _ = but.estimator_preprocess(
                src_img, detections, scale, pad)
            estimator.set_input_shape(imgs.shape)
            flags, handedness, normalized_landmarks = estimator.predict([imgs])

            # postprocessing
            landmarks = but.denormalize_landmarks(normalized_landmarks, affines)
            for i in range(len(flags)):
                # handedness is inverted because the image input is not
                # flipped (mirrored) like a selfie camera
                landmark, flag, handed = \
                    landmarks[i], flags[i], 1 - handedness[i]
                if flag > 0.75:
                    if handed > 0.5:
                        presence[0] = 1
                    else:
                        presence[1] = 1
                    draw_landmarks(src_img, landmark[:, :2],
                                   but.HAND_CONNECTIONS, size=2)

    if presence[0] and presence[1]:
        hand_presence = 'Left and right'
    elif presence[0]:
        hand_presence = 'Left'
    elif presence[1]:
        hand_presence = 'Right'
    else:
        hand_presence = 'No hand'
    print(f'Hand presence: {hand_presence}')
    cv2.imwrite(args.savepath, src_img)
    print('Script finished successfully.')
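# A hedged sketch of the usual entry point for this style of script:
# dispatch to video mode when a --video source is given, otherwise process
# a single image. The args attribute names are taken from the functions
# above; the dispatcher itself is not part of this excerpt.
def main():
    if args.video is not None:
        recognize_from_video()
    else:
        recognize_from_image()


if __name__ == '__main__':
    main()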
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    detector = ailia.Net(
        DETECTION_MODEL_PATH, DETECTION_WEIGHT_PATH, env_id=env_id)
    estimator = ailia.Net(
        LANDMARK_MODEL_PATH, LANDMARK_WEIGHT_PATH, env_id=env_id)

    capture = get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        save_h, save_w = webcamera_utils.calc_adjust_fsize(
            f_h, f_w, IMAGE_HEIGHT, IMAGE_WIDTH)
        writer = webcamera_utils.get_writer(args.savepath, save_h, save_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break
        # mirror the frame so the preview behaves like a selfie camera
        frame = np.ascontiguousarray(frame[:, ::-1, :])

        img256, _, scale, pad = but.resize_pad(frame[:, :, ::-1])
        input_data = img256.astype('float32') / 255.
        input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

        # inference
        # Palm detection
        preds = detector.predict([input_data])
        detections = but.detector_postprocess(preds)

        # Hand landmark estimation
        presence = [0, 0]  # [left, right]
        if detections[0].size != 0:
            img, affine, _ = but.estimator_preprocess(
                frame, detections, scale, pad)
            estimator.set_input_shape(img.shape)
            flags, handedness, normalized_landmarks = estimator.predict([img])

            # postprocessing
            landmarks = but.denormalize_landmarks(normalized_landmarks, affine)
            for i in range(len(flags)):
                landmark, flag, handed = landmarks[i], flags[i], handedness[i]
                if flag > 0.75:
                    if handed > 0.5:
                        presence[0] = 1
                    else:
                        presence[1] = 1
                    draw_landmarks(frame, landmark[:, :2],
                                   but.HAND_CONNECTIONS, size=2)

        if presence[0] and presence[1]:
            text = 'Left and right'
        elif presence[0]:
            text = 'Left'
        elif presence[1]:
            text = 'Right'
        else:
            text = 'No hand'
        cv2.putText(frame, text, (8, 24),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 255), 2)
        cv2.imshow('frame', frame)

        # save results
        if writer is not None:
            writer.write(frame)

    capture.release()
    if writer is not None:
        writer.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
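# The functions above rely on module-level setup outside this excerpt.
# A sketch of the imports and constants they reference; the model
# filenames below are placeholders (assumptions), not the repository's
# actual values.
import time

import ailia
import cv2
import numpy as np

import blazehand_utils as but  # assumed source of the 'but' alias
import webcamera_utils
from webcamera_utils import get_capture, get_writer

DETECTION_WEIGHT_PATH = 'blazepalm.onnx'          # placeholder
DETECTION_MODEL_PATH = 'blazepalm.onnx.prototxt'  # placeholder
LANDMARK_WEIGHT_PATH = 'blazehand.onnx'           # placeholder
LANDMARK_MODEL_PATH = 'blazehand.onnx.prototxt'   # placeholder
SAVE_IMAGE_PATH = 'output.png'                    # placeholder
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256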
def recognize_hand(self, frame, detector, estimator, out_frame=None):
    img256, _, scale, pad = bhut.resize_pad(frame[:, :, ::-1])
    input_data = img256.astype('float32') / 255.
    input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

    # inference
    # Run palm detection on the first frame, and again whenever at least
    # one tracked hand has low confidence (i.e. the hand was lost)
    if np.any(self.tracked_hands < HAND_DETECTION_THRESHOLD):
        self.tracking = False

        # Palm detection
        preds = detector.predict([input_data])
        detections = bhut.detector_postprocess(
            preds, anchor_path="../../hand_recognition/blazehand/anchors.npy")
        if detections[0].size > 0:
            self.tracking = True
            self.roi_imgs, self.affines, _ = bhut.estimator_preprocess(
                frame, detections[0][:self.num_hands], scale, pad)

        # display bbox (only meaningful right after a detector pass)
        if args.bbox:
            detections2 = bhut.denormalize_detections(
                detections[0].copy(), scale, pad)
            display_hand_box(out_frame, detections2)
    else:
        # track: re-extract each ROI from the previous frame's landmarks
        # instead of re-running the detector
        for i, roi in enumerate(self.rois):
            xc, yc, scale, theta = roi
            roi_img, affine, _ = bhut.extract_roi(frame, xc, yc, theta, scale)
            self.roi_imgs[i] = roi_img[0]
            self.affines[i] = affine[0]

    # Hand landmark estimation
    presence = [0, 0]  # [left, right]
    if self.tracking:
        estimator.set_input_shape(self.roi_imgs.shape)
        hand_flags, handedness, normalized_landmarks = estimator.predict(
            [self.roi_imgs])

        # postprocessing
        landmarks = bhut.denormalize_landmarks(
            normalized_landmarks, self.affines)
        self.tracked_hands[:] = 0
        n_imgs = len(hand_flags)
        for i in range(n_imgs):
            landmark, hand_flag, handed = \
                landmarks[i], hand_flags[i], handedness[i]
            if hand_flag > HAND_LANDMARK_THRESHOLD:
                if handed > 0.5:
                    presence[0] = 1  # Left hand
                else:
                    presence[1] = 1  # Right hand
                draw_landmarks_hand(out_frame, landmark[:, :2],
                                    bhut.HAND_CONNECTIONS, size=4)
                # feed the landmarks back as the next frame's ROI
                self.rois[i] = bhut.landmarks2roi(
                    normalized_landmarks[i], self.affines[i])
            self.tracked_hands[i] = hand_flag
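# recognize_hand above reads its tracking state off self. A minimal sketch
# of how that state might be initialized, inferred from the attributes the
# method touches; the actual constructor is not part of this excerpt.
import numpy as np


class HandTracker:
    def __init__(self, num_hands=2):
        self.num_hands = num_hands
        self.tracking = False
        # per-hand landmark confidence from the previous frame; starting
        # at zero forces a detector pass on the first frame
        self.tracked_hands = np.zeros(num_hands)
        # per-hand (xc, yc, scale, theta) ROIs fed back from the landmarks
        self.rois = [None] * num_hands
        self.roi_imgs = None  # set by the first successful detection
        self.affines = None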
def recognize_from_video():
    # net initialize
    detector = ailia.Net(
        DETECTION_MODEL_PATH, DETECTION_WEIGHT_PATH, env_id=args.env_id)
    estimator = ailia.Net(
        LANDMARK_MODEL_PATH, LANDMARK_WEIGHT_PATH, env_id=args.env_id)

    num_hands = args.hands
    thresh = 0.5
    tracking = False
    tracked_hands = np.zeros(num_hands)
    rois = [None] * num_hands

    capture = get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        writer = get_writer(args.savepath, f_h, f_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        img256, _, scale, pad = but.resize_pad(frame[:, :, ::-1])
        input_data = img256.astype('float32') / 255.
        input_data = np.expand_dims(np.moveaxis(input_data, -1, 0), 0)

        # inference
        # Run palm detection on the first frame, and again whenever at
        # least one tracked hand has low confidence (i.e. was lost)
        if np.any(tracked_hands < thresh):
            tracking = False

            # Palm detection
            preds = detector.predict([input_data])
            detections = but.detector_postprocess(preds)
            if detections[0].size > 0:
                tracking = True
                roi_imgs, affines, _ = but.estimator_preprocess(
                    frame, detections[0][:num_hands], scale, pad)
        else:
            # track: re-extract each ROI from the previous frame's landmarks
            for i, roi in enumerate(rois):
                xc, yc, scale, theta = roi
                roi_img, affine, _ = but.extract_roi(frame, xc, yc, theta, scale)
                roi_imgs[i] = roi_img[0]
                affines[i] = affine[0]

        # Hand landmark estimation
        presence = [0, 0]  # [right, left]
        if tracking:
            estimator.set_input_shape(roi_imgs.shape)
            hand_flags, handedness, normalized_landmarks = estimator.predict(
                [roi_imgs])

            # postprocessing
            landmarks = but.denormalize_landmarks(normalized_landmarks, affines)
            tracked_hands[:] = 0
            n_imgs = len(hand_flags)
            for i in range(n_imgs):
                landmark, hand_flag, handed = \
                    landmarks[i], hand_flags[i], handedness[i]
                if hand_flag > thresh:
                    # Right handedness when camera input is not flipped
                    if handed > 0.5:
                        presence[0] = 1
                    else:
                        presence[1] = 1
                    draw_landmarks(frame, landmark[:, :2],
                                   but.HAND_CONNECTIONS, size=2)
                    rois[i] = but.landmarks2roi(
                        normalized_landmarks[i], affines[i])
                tracked_hands[i] = hand_flag

        if presence[0] and presence[1]:
            text = 'Left and right'
        elif presence[0]:
            text = 'Right'
        elif presence[1]:
            text = 'Left'
        else:
            text = 'No hand'

        visual_img = frame
        if args.video == '0':
            # Flip horizontally if camera input
            visual_img = np.ascontiguousarray(frame[:, ::-1, :])

        cv2.putText(visual_img, text, (8, 24),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 255), 2)
        cv2.imshow('frame', visual_img)

        # save results
        if writer is not None:
            cv2.putText(frame, text, (8, 24),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 255), 2)
            writer.write(frame)

    capture.release()
    if writer is not None:
        writer.release()
    cv2.destroyAllWindows()
    logger.info('Script finished successfully.')
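# Design note: because the ROIs are fed back via landmarks2roi, the palm
# detector only runs when a tracked hand's confidence drops below thresh;
# on steady frames only the landmark estimator executes, which is the main
# speedup of this tracking variant over the per-frame detection script.
#
# A hedged invocation example; the flag names are inferred from the args
# attributes used above, and the script filename is an assumption:
#
#   $ python3 blazehand.py --video 0 --hands 2 --savepath output.mp4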