def face_thread(self):
    face_nn = self.device.getOutputQueue("face_nn")
    landmark_in = self.device.getInputQueue("landmark_in")
    pose_in = self.device.getInputQueue("pose_in")

    while self.running:
        if self.frame is None:
            continue
        try:
            bboxes = np.array(face_nn.get().getFirstLayerFp16())
        except RuntimeError as ex:
            continue
        bboxes = bboxes.reshape((bboxes.size // 7, 7))
        self.bboxes = bboxes[bboxes[:, 2] > 0.7][:, 3:7]

        for raw_bbox in self.bboxes:
            bbox = frame_norm(self.frame, raw_bbox)
            det_frame = self.frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

            land_data = depthai.NNData()
            land_data.setLayer("0", to_planar(det_frame, (48, 48)))
            landmark_in.send(land_data)

            pose_data = depthai.NNData()
            pose_data.setLayer("data", to_planar(det_frame, (60, 60)))
            pose_in.send(pose_data)

            self.face_box_q.put(bbox)
def forward(self, data):
    results = []
    if data is not None:
        for sample in data[0]:
            nn_data = dai.NNData()
            nn_data.setLayer(self.input_name, sample)
            self.data_in.send(nn_data)
            assert wait_for_results(self.data_out)
            results.append(self.data_out.get())
        data[0] = results
    else:
        assert wait_for_results(self.data_out)
        results.append(self.data_out.get())
        data = [
            results,
            [
                DataInfo(
                    scales=(1.0, 1.0),
                    pads=(0, 0),
                    original_width=self.input_width,
                    original_height=self.input_height,
                ),
            ],
        ]
    return data
def forward(self, in_queue, out_queue, data):
    results = []
    if data is not None:
        for sample in data[0]:
            nn_data = dai.NNData()
            nn_data.setLayer("data", sample)
            in_queue.send(nn_data)
            assert wait_for_results(out_queue)
            results.append(out_queue.get())
        data[0] = results
    else:
        assert wait_for_results(out_queue)
        results.append(out_queue.get())
        data = [
            results,
            [
                DataInfo(
                    scales=(
                        self.input_width / self.video_width,
                        self.input_height / self.video_height,
                    ),
                    pads=(0, 0),
                    original_width=self.video_width,
                    original_height=self.video_height,
                ),
            ],
        ]
    return data
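# The two forward() variants above return a DataInfo record alongside the raw
# NN packets.  Its definition is not part of this section; the sketch below is
# an assumption based only on the fields used above (scales, pads,
# original_width, original_height), not the original class.
from dataclasses import dataclass
from typing import Tuple


@dataclass
class DataInfo:
    scales: Tuple[float, float]        # input_size / original_size per axis
    pads: Tuple[int, int]              # padding applied before inference
    original_width: int
    original_height: int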
def run(self):
    threading.Thread(target=self.det_thread, daemon=True).start()
    threading.Thread(target=self.reid_thread, daemon=True).start()

    while self.cap.isOpened():
        read_correctly, self.frame = self.cap.read()
        if not read_correctly:
            break

        self.fps.update()
        self.debug_frame = self.frame.copy()

        nn_data = depthai.NNData()
        nn_data.setLayer("input", to_planar(self.frame, (544, 320)))
        self.detection_in.send(nn_data)

        aspect_ratio = self.frame.shape[1] / self.frame.shape[0]
        cv2.imshow(
            "Camera_view",
            cv2.resize(self.debug_frame, (int(900), int(900 / aspect_ratio))))

        if cv2.waitKey(1) == ord('q'):
            cv2.destroyAllWindows()
            break

    self.fps.stop()
    print("FPS: {:.2f}".format(self.fps.fps()))
    self.cap.release()
def run_nn(x_in, x_out, in_dict):
    nn_data = depthai.NNData()
    for key in in_dict:
        nn_data.setLayer(key, in_dict[key])
    x_in.send(nn_data)
    has_results = wait_for_results(x_out)
    if not has_results:
        raise RuntimeError("No data from nn!")
    return x_out.get()
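# run_nn() and most snippets in this section rely on two helpers that are not
# defined here.  The sketches below are assumptions inferred from the call
# sites (to_planar(frame, (w, h)) feeding NNData.setLayer, and
# wait_for_results(queue) returning a bool), not the original implementations.
import time

import cv2


def to_planar(arr, shape):
    # Resize to the NN input size and flatten HWC (BGR) into planar CHW order.
    resized = cv2.resize(arr, shape)
    return resized.transpose(2, 0, 1).flatten()


def wait_for_results(queue, timeout=1.0):
    # Poll the output queue until a packet arrives or the timeout expires.
    start = time.monotonic()
    while not queue.has():
        if time.monotonic() - start > timeout:
            return False
        time.sleep(0.001)
    return True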
def forward(in_queue, out_queue, data):
    results = []
    for sample in data[0]:
        nn_data = dai.NNData()
        nn_data.setLayer("data", sample)
        in_queue.send(nn_data)
        assert wait_for_results(out_queue)
        results.append(out_queue.get())
    data[0] = results
    return data
def forward(self, data):
    results = []
    for sample in data[0]:
        nn_data = dai.NNData()
        nn_data.setLayer("data", sample)
        self.data_in.send(nn_data)
        assert wait_for_results(self.data_out)
        results.append(self.data_out.get())
    data[0] = results
    return data
def forward(self, data, stage="age-gender"):
    results = []
    for sample in data[0]:
        sample_results = []
        for face in sample:
            nn_data = dai.NNData()
            nn_data.setLayer("data", face)
            self.age_gender_in.send(nn_data)
            assert wait_for_results(self.age_gender_out)
            sample_results.append(self.age_gender_out.get())
        results.append(sample_results)
    data[0] = results
    return data
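# The packets collected by forward() above still need decoding.  A minimal
# sketch, reusing the 'age_conv3' and 'prob' layer names that appear in the
# age-gender branch later in this section; treat it as illustrative rather
# than the original post-processing code:
import numpy as np


def decode_age_gender(det):
    age = int(float(np.squeeze(np.array(det.getLayerFp16('age_conv3')))) * 100)
    gender = np.squeeze(np.array(det.getLayerFp16('prob')))
    gender_str = "female" if gender[0] > gender[1] else "male"
    return age, gender_str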
def run_nn(x_in, x_out, in_dict):
    """
    :param x_in: X_link_in
    :param x_out: X_link_out
    :param in_dict:
    :return:
    """
    nn_data = depthai.NNData()
    for key in in_dict:
        nn_data.setLayer(key, in_dict[key])
    x_in.send(nn_data)
    # has_results = wait_for_results(x_out)
    # if not has_results:
    #     raise RuntimeError("No data from nn!")
    return x_out.tryGet()
def det_thread(self):
    detection_nn = self.device.getOutputQueue("detection_nn")

    while True:
        bboxes = np.array(detection_nn.get().getFirstLayerFp16())
        bboxes = bboxes[:np.where(bboxes == -1)[0][0]]
        bboxes = bboxes.reshape((bboxes.size // 7, 7))
        bboxes = bboxes[bboxes[:, 2] > 0.5][:, 3:7]

        for raw_bbox in bboxes:
            bbox = frame_norm(self.frame, raw_bbox)
            det_frame = self.frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

            nn_data = depthai.NNData()
            nn_data.setLayer("data", to_planar(det_frame, (48, 96)))
            self.reid_in.send(nn_data)
            self.reid_bbox_q.put(bbox)
def run_face(self):
    face_nn = self.device.getOutputQueue("face_nn")
    land68_in = self.device.getInputQueue("land68_in", 4, False)

    while self.running:
        if self.frame is None:
            continue
        bboxes = np.array(face_nn.get().getFirstLayerFp16())
        bboxes = bboxes.reshape((bboxes.size // 7, 7))
        self.bboxes = bboxes[bboxes[:, 2] > 0.7][:, 3:7]

        for raw_bbox in self.bboxes:
            bbox = frame_norm(self.frame, raw_bbox)
            det_frame = self.frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

            land68_data = depthai.NNData()
            land68_data.setLayer("data", to_planar(det_frame, (160, 160)))
            land68_in.send(land68_data)
            self.face_box_q.put(bbox)
def advanced_main(self): pipeline = create_advanced_pipeline() with dai.Device(pipeline) as device: # Create output queues vidQ = device.getOutputQueue(name="cam", maxSize=1, blocking=False) depthQ = device.getOutputQueue(name="depth", maxSize=1, blocking=False) palmQ = device.getOutputQueue(name="palm_nn", maxSize=1, blocking=False) faceQ = device.getOutputQueue("face_nn",maxSize=1, blocking=False) pose_inQ = device.getInputQueue("pose_in",maxSize=1, blocking=False) pose_outQ = device.getOutputQueue(name="pose_nn", maxSize=1, blocking=False) palmDetection = PalmDetection() depthFrame = None frame = None head_loc = None print("Main loop init") self.fps.start() while rclpy.ok(): in_rgb = vidQ.tryGet() if in_rgb is not None: frame = crop_to_rect(in_rgb.getCvFrame()) debug_frame = frame.copy() in_depth = depthQ.tryGet() if in_depth is not None: depthFrame = crop_to_rect(in_depth.getFrame()) in_face = faceQ.tryGet() head_bbox=None if in_face is not None and frame is not None and depthFrame is not None: bboxes = bbox_face_extraction(in_face) color=(143, 184, 77) for raw_bbox in bboxes: bbox = frame_norm(frame, raw_bbox) det_frame = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]] pose_data = dai.NNData() pose_data.setLayer("data", to_planar(det_frame, (60, 60))) pose_inQ.send(pose_data) draw_bbox(debug_frame,bbox,color) head_bbox=bbox head_loc = calc_spatials(bbox,depthFrame,RED_RATIO_FACE,filter="median") palm_in = palmQ.tryGet() if palm_in is not None and frame is not None and depthFrame is not None: #perform computation and output drawing palm_coords = palmDetection.run_palm(debug_frame, palm_in) # Calculate and draw spatial coordinates of the palm spatialCoords = draw_palm_detection(debug_frame, palm_coords, depthFrame) #publish palm transform if spatialCoords is not None: self.publish_palm_transform(spatialCoords) ###### IMSHOW FOR DEPTH AND FRAME #cv2.imshow("debug", debug_frame) #show_depth(depthFrame) head_or = pose_outQ.tryGet() if head_or is not None: pose = [val[0][0] for val in to_tensor_result(head_or).values()] if head_loc[2] is not np.nan: self.publish_head_transform(head_loc,pose) #print("Loc:({0},{1},{2}) , Or: ({3},{4},{5})".format(head_loc[0],head_loc[1],head_loc[2],pose[0],pose[1],pose[2])) #draw_3d_axis(debug_frame,pose,(int(head_bbox[0]),int(head_bbox[1])),100) draw_pose_data(debug_frame,pose,head_loc, head_bbox,color=(143, 184, 77)) #publish detection image cvb = CvBridge() stamp = self.get_clock().now().to_msg() image_msg = cvb.cv2_to_imgmsg(debug_frame, encoding='bgr8') image_msg.header.stamp = stamp image_msg.header.frame_id = 'oak-d_frame' self.pub_rectified_img.publish(image_msg) self.fps.update() if cv2.waitKey(1) == ord('q'): cv2.destroyAllWindows() self.fps.stop() print("CAM FPS: {0} P-FPS:{1}".format(CAM_FPS,self.fps.fps())) self.destroy_node()
        return True, np.ascontiguousarray(new_frame)
    else:
        return cap.read()


class_names = ['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips']
result = None

while should_run():
    read_correctly, frame = get_frame()
    if not read_correctly:
        break

    if not camera:
        nn_data = dai.NNData()
        nn_data.setLayer("input", to_planar(frame, (180, 180)))
        detection_in.send(nn_data)

    in_nn = q_nn.tryGet()
    if in_nn is not None:
        data = softmax(in_nn.getFirstLayerFp16())
        result_conf = np.max(data)
        if result_conf > 0.5:
            result = {
                "name": class_names[np.argmax(data)],
                "conf": round(100 * result_conf, 2)
            }
        else:
            result = None
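# softmax() above is assumed to be a small local helper applied to the raw
# FP16 logits rather than a library call; a minimal sketch of what it would
# need to do:
import numpy as np


def softmax(x):
    # Numerically stable softmax over a 1-D array of logits.
    e_x = np.exp(np.array(x) - np.max(x))
    return e_x / e_x.sum()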
def process(self, context): video_frame = context["frame"][OAK_Stage.COLOR] if video_frame is None: if self.videoframe is None: return else: video_frame = self.videoframe else: self.videoframe = video_frame self.HandTracker.video_size = video_frame.shape[0] pd_inference = context["output_queues"]["palm_detector"].tryGet() if pd_inference is not None: self.HandTracker.pd_postprocess(pd_inference) results_palm = [] for r in self.HandTracker.regions: box = (np.array(r.pd_box) * self.HandTracker.video_size).astype(int) ponto = np.array([[box[0], box[1]], [box[0] + box[2], box[1] + box[3]]]) result = ObjectDetection("hand", ponto, context["frame_id"]) results_palm.append(result) self._setOutput(results_palm, 'palm_detection_list') bodyposes = [] gestures = [] for i, r in enumerate(self.HandTracker.regions): img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.HandTracker.lm_input_length, self.HandTracker.lm_input_length) nn_data = dai.NNData() nn_data.setLayer( "input_1", to_planar(img_hand, (self.HandTracker.lm_input_length, self.HandTracker.lm_input_length))) context["input_queues"]['hand_lm_in'].send(nn_data) inference = context["output_queues"]['hand_lm'].get() self.HandTracker.lm_postprocess(r, inference) if r.lm_score < self.HandTracker.lm_score_threshold: continue src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32) dst = np.array( [(x, y) for x, y in r.rect_points[1:]], dtype=np.float32 ) # region.rect_points[0] is left bottom point ! mat = cv2.getAffineTransform(src, dst) lm_xy = np.expand_dims(np.array([(l[0], l[1]) for l in r.landmarks]), axis=0) lm_xy = np.squeeze(cv2.transform(lm_xy, mat)).astype(np.int) bp = BodyPose(frame_id=context["frame_id"], pixel_space=True) for i in range(lm_xy.shape[0]): name = OAK_Handpose.kp_name[i] bp.add_keypoint(name, lm_xy[i][0], lm_xy[i][1]) bodyposes.append(bp) gesture = Gesture() gesture._gesture = r.gesture gestures.append(gesture) self._setOutput(bodyposes, "hand_pose_list") self._setOutput(gestures, 'gesture_list')
def run(self): device = dai.Device(self.create_pipeline()) device.startPipeline() # Define data queues if self.input_type == "internal": q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False) q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False) q_lm_in = device.getInputQueue(name="lm_in") else: q_pd_in = device.getInputQueue(name="pd_in") q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True) q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True) q_lm_in = device.getInputQueue(name="lm_in") self.fps = FPS(mean_nb_frames=20) seq_num = 0 nb_pd_inferences = 0 nb_lm_inferences = 0 glob_pd_rtrip_time = 0 glob_lm_rtrip_time = 0 while True: self.fps.update() if self.input_type == "internal": in_video = q_video.get() video_frame = in_video.getCvFrame() self.frame_size = video_frame.shape[ 0] # The image is square cropped on the device self.pad_w = self.pad_h = 0 else: if self.input_type == "image": vid_frame = self.img else: ok, vid_frame = self.cap.read() if not ok: break h, w = vid_frame.shape[:2] if self.crop: # Cropping the long side to get a square shape self.frame_size = min(h, w) dx = (w - self.frame_size) // 2 dy = (h - self.frame_size) // 2 video_frame = vid_frame[dy:dy + self.frame_size, dx:dx + self.frame_size] else: # Padding on the small side to get a square shape self.frame_size = max(h, w) self.pad_h = int((self.frame_size - h) / 2) self.pad_w = int((self.frame_size - w) / 2) video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT) frame_nn = dai.ImgFrame() frame_nn.setSequenceNum(seq_num) frame_nn.setWidth(self.pd_input_length) frame_nn.setHeight(self.pd_input_length) frame_nn.setData( to_planar(video_frame, (self.pd_input_length, self.pd_input_length))) pd_rtrip_time = now() q_pd_in.send(frame_nn) seq_num += 1 annotated_frame = video_frame.copy() # Get pose detection inference = q_pd_out.get() if self.input_type != "internal": pd_rtrip_time = now() - pd_rtrip_time glob_pd_rtrip_time += pd_rtrip_time self.pd_postprocess(inference) self.pd_render(annotated_frame) nb_pd_inferences += 1 # Landmarks self.nb_active_regions = 0 if self.show_3d: self.vis3d.clear_geometries() self.vis3d.add_geometry(self.grid_floor, reset_bounding_box=False) self.vis3d.add_geometry(self.grid_wall, reset_bounding_box=False) for i, r in enumerate(self.regions): frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length) nn_data = dai.NNData() nn_data.setLayer( "input_1", to_planar(frame_nn, (self.lm_input_length, self.lm_input_length))) if i == 0: lm_rtrip_time = now( ) # We measure only for the first region q_lm_in.send(nn_data) # Get landmarks inference = q_lm_out.get() if i == 0: lm_rtrip_time = now() - lm_rtrip_time glob_lm_rtrip_time += lm_rtrip_time nb_lm_inferences += 1 self.lm_postprocess(r, inference) self.lm_render(annotated_frame, r) if self.show_3d: self.vis3d.poll_events() self.vis3d.update_renderer() if self.smoothing and self.nb_active_regions == 0: self.filter.reset() if self.input_type != "internal" and not self.crop: annotated_frame = annotated_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w] if self.show_fps: self.fps.display(annotated_frame, orig=(50, 50), size=1, color=(240, 180, 100)) cv2.imshow("Blazepose", annotated_frame) if self.output: self.output.write(annotated_frame) key = cv2.waitKey(1) if key == ord('q') or key == 27: 
break elif key == 32: # Pause on space bar cv2.waitKey(0) elif key == ord('1'): self.show_pd_box = not self.show_pd_box elif key == ord('2'): self.show_pd_kps = not self.show_pd_kps elif key == ord('3'): self.show_rot_rect = not self.show_rot_rect elif key == ord('4'): self.show_landmarks = not self.show_landmarks elif key == ord('5'): self.show_scores = not self.show_scores elif key == ord('6'): self.show_gesture = not self.show_gesture elif key == ord('f'): self.show_fps = not self.show_fps # Print some stats print(f"# pose detection inferences : {nb_pd_inferences}") print(f"# landmark inferences : {nb_lm_inferences}") if self.input_type != "internal" and nb_pd_inferences != 0: print( f"Pose detection round trip : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms" ) if nb_lm_inferences != 0: print( f"Landmark round trip : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms" ) if self.output: self.output.release()
def run_nn(x_in, x_out, in_dict):
    nn_data = depthai.NNData()
    for key in in_dict:
        nn_data.setLayer(key, in_dict[key])
    x_in.send(nn_data)
    return x_out.tryGet()
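# Example call for the non-blocking run_nn() variant above.  The queue names
# and input size mirror the detection pipelines used elsewhere in this section
# but are illustrative here, not tied to a specific snippet:
result = run_nn(x_in=detection_in, x_out=detection_nn,
                in_dict={"input": to_planar(frame, (300, 300))})
if result is not None:
    bboxes = np.array(result.getFirstLayerFp16())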
        break

    if sum(body.scores > body.score_thresh) > 8:
        keypoints = np.clip(body.keypoints, [0, 0],
                            [frame.shape[1], frame.shape[0]])
        x, y, w, h = cv2.boundingRect(keypoints)
        I = np.zeros_like(frame, dtype=np.uint8)
        I = renderer.draw(I, body)
        I = cv2.cvtColor(I, cv2.COLOR_BGR2GRAY)
        I = np.clip(I, 0, 1) * 255
        I = pose.crop_and_resize(I, pose.crop_region)
        # I = I[y : y + h, x : x + w]
        I = cv2.resize(I, (128, 128))

        frame_ac = dai.NNData()
        frame_ac.setLayer("input", I.ravel())
        pose.q_ac_in.send(frame_ac)

        crown_proportion = w / h

        # Get result from device
        predect = pose.q_ac_out.get()
        predect = np.array(predect.getLayerFp16("output")).reshape(-1, 2)
        action_id = int(np.argmax(predect))
        possible_rate = 0.6 * predect[:, action_id] + 0.4 * (crown_proportion - 1)

        if possible_rate > 0.55:
            pose_action = "fall"
            print(predect)
            if possible_rate > 0.7:
                cv2.putText(
def run(self): device = dai.Device(self.create_pipeline()) device.startPipeline() q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) q_pd_in = device.getInputQueue(name="pd_in") q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True) q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True) q_lm_in = device.getInputQueue(name="lm_in") q_asl_out = device.getOutputQueue(name="asl_out", maxSize=4, blocking=True) q_asl_in = device.getInputQueue(name="asl_in") while True: in_video = q_video.get() video_frame = in_video.getCvFrame() h, w = video_frame.shape[:2] self.frame_size = max(h, w) self.pad_h = int((self.frame_size - h) / 2) self.pad_w = int((self.frame_size - w) / 2) video_frame = cv2.copyMakeBorder(video_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT) frame_nn = dai.ImgFrame() frame_nn.setWidth(self.pd_input_length) frame_nn.setHeight(self.pd_input_length) frame_nn.setData( to_planar(video_frame, (self.pd_input_length, self.pd_input_length))) q_pd_in.send(frame_nn) annotated_frame = video_frame.copy() # Get palm detection inference = q_pd_out.get() self.pd_postprocess(inference) # Send data for hand landmarks for i, r in enumerate(self.regions): img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length) nn_data = dai.NNData() nn_data.setLayer( "input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length))) q_lm_in.send(nn_data) # Retrieve hand landmarks for i, r in enumerate(self.regions): inference = q_lm_out.get() self.lm_postprocess(r, inference) hand_frame, handedness, hand_bbox = self.lm_render( video_frame, annotated_frame, r) # ASL recognition if hand_frame is not None and self.asl_recognition: hand_frame = cv2.resize( hand_frame, (self.asl_input_length, self.asl_input_length), interpolation=cv2.INTER_NEAREST) hand_frame = hand_frame.transpose(2, 0, 1) nn_data = dai.NNData() nn_data.setLayer("input", hand_frame) q_asl_in.send(nn_data) asl_result = np.array(q_asl_out.get().getFirstLayerFp16()) asl_idx = np.argmax(asl_result) # Recognized ASL character is associated with a probability asl_char = [ characters[asl_idx], round(asl_result[asl_idx] * 100, 1) ] selected_char = asl_char current_char_queue = None if handedness > 0.5: current_char_queue = self.right_char_queue else: current_char_queue = self.left_char_queue current_char_queue.append(selected_char) # Peform filtering of recognition resuls using the previous 5 results # If there aren't enough reults, take the first result as output if len(current_char_queue) < 5: selected_char = current_char_queue[0] else: char_candidate = {} for i in range(5): if current_char_queue[i][0] not in char_candidate: char_candidate[current_char_queue[i][0]] = [ 1, current_char_queue[i][1] ] else: char_candidate[current_char_queue[i] [0]][0] += 1 char_candidate[current_char_queue[i][0]][ 1] += current_char_queue[i][1] most_voted_char = "" max_votes = 0 most_voted_char_prob = 0 for key in char_candidate: if char_candidate[key][0] > max_votes: max_votes = char_candidate[key][0] most_voted_char = key most_voted_char_prob = round( char_candidate[key][1] / char_candidate[key][0], 1) selected_char = (most_voted_char, most_voted_char_prob) if self.show_asl: gesture_string = "Letter: " + selected_char[ 0] + ", " + str(selected_char[1]) + "%" textSize = self.ft.getTextSize(gesture_string, fontHeight=14, thickness=-1)[0] cv2.rectangle(video_frame, (hand_bbox[0] - 5, hand_bbox[1]), (hand_bbox[0] + textSize[0] + 5, 
hand_bbox[1] - 18), (36, 152, 0), -1) self.ft.putText(img=video_frame, text=gesture_string, org=(hand_bbox[0], hand_bbox[1] - 5), fontHeight=14, color=(255, 255, 255), thickness=-1, line_type=cv2.LINE_AA, bottomLeftOrigin=True) video_frame = video_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w] cv2.imshow("hand tracker", video_frame) key = cv2.waitKey(1) if key == ord('q') or key == 27: break elif key == 32: # Pause on space bar cv2.waitKey(0) elif key == ord('1'): self.show_hand_box = not self.show_hand_box elif key == ord('2'): self.show_landmarks = not self.show_landmarks elif key == ord('3'): self.show_asl = not self.show_asl
def process_image(transform, processing_model, img): global useOAKDCam, bboxes, results, results_path, reid_bbox_q, next_id, device, face_bbox_q, age_gender_in, age_gender_nn, cap, cam_out, detection_in, detection_nn, reid_in, reid_nn tracks = [] try: if useOAKDCam: # ret, frame = cap.read() frame = np.array(cam_out.get().getData()).reshape( (3, 320, 544)).transpose(1, 2, 0).astype(np.uint8) else: frame = img if transform == 'ssd': if frame is not None: if not useOAKDCam: nn_data = depthai.NNData() nn_data.setLayer("input", to_planar(frame, (300, 300))) detection_in.send(nn_data) in_nn = detection_nn.tryGet() if in_nn is not None: # one detection has 7 numbers, and the last detection is followed by -1 digit, which later is filled with 0 bboxes = np.array(in_nn.getFirstLayerFp16()) # take only the results before -1 digit bboxes = bboxes[:np.where(bboxes == -1)[0][0]] # transform the 1D array into Nx7 matrix bboxes = bboxes.reshape((bboxes.size // 7, 7)) # filter out the results which confidence less than a defined threshold bboxes = bboxes[bboxes[:, 2] > 0.5][:, 3:7] # if the frame is available, draw bounding boxes on it and show the frame for raw_bbox in bboxes: bbox = frame_norm2(frame, raw_bbox) cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 255, 255), 3) img = frame #pedestrian reidentification https://github.com/luxonis/depthai-experiments/tree/master/pedestrian-reidentification if transform == 'pre': if frame is not None: debug_frame = frame.copy() if not useOAKDCam: nn_data = depthai.NNData() nn_data.setLayer("input", to_planar(frame, (544, 320))) detection_in.send(nn_data) # else: # return tracks, img while detection_nn.has(): bboxes = np.array(detection_nn.get().getFirstLayerFp16()) bboxes = bboxes[:np.where(bboxes == -1)[0][0]] bboxes = bboxes.reshape((bboxes.size // 7, 7)) bboxes = bboxes[bboxes[:, 2] > 0.7][:, 3:7] for raw_bbox in bboxes: bbox = frame_norm_1(frame, raw_bbox) det_frame = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]] nn_data = depthai.NNData() nn_data.setLayer("data", to_planar(det_frame, (48, 96))) reid_in.send(nn_data) reid_bbox_q.put(bbox) while reid_nn.has(): reid_result = reid_nn.get().getFirstLayerFp16() bbox = reid_bbox_q.get() for person_id in results: dist = cos_dist(reid_result, results[person_id]) if dist > 0.7: result_id = person_id results[person_id] = reid_result break else: result_id = next_id results[result_id] = reid_result results_path[result_id] = [] next_id += 1 # if debug: cv2.rectangle(debug_frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (10, 245, 10), 2) x = (bbox[0] + bbox[2]) // 2 y = (bbox[1] + bbox[3]) // 2 results_path[result_id].append([x, y]) cv2.putText(debug_frame, str(result_id), (x, y), cv2.FONT_HERSHEY_TRIPLEX, 1.0, (255, 255, 0)) if len(results_path[result_id]) > 1: cv2.polylines( debug_frame, [np.array(results_path[result_id], dtype=np.int32)], False, (255, 0, 255), 2) # else: # print(f"Saw id: {result_id}") img = debug_frame # gaze estimation https://github.com/luxonis/depthai-experiments/tree/master/gaze-estimation elif transform == 'gaze': model = processing_model model.frame = frame tracks, img = model.parse() # age gender recognition https://github.com/luxonis/depthai-experiments/tree/master/gen2-age-gender elif transform == 'age-gen': if frame is not None: debug_frame = frame.copy() if not useOAKDCam: nn_data = depthai.NNData() nn_data.setLayer("input", to_planar(frame, (300, 300))) detection_in.send(nn_data) while detection_nn.has(): bboxes = np.array(detection_nn.get().getFirstLayerFp16()) bboxes = 
bboxes.reshape((bboxes.size // 7, 7)) bboxes = bboxes[bboxes[:, 2] > 0.7][:, 3:7] for raw_bbox in bboxes: bbox = frame_norm_1(frame, raw_bbox) det_frame = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]] nn_data = depthai.NNData() nn_data.setLayer("data", to_planar(det_frame, (48, 96))) age_gender_in.send(nn_data) face_bbox_q.put(bbox) while age_gender_nn.has(): det = age_gender_nn.get() age = int( float(np.squeeze(np.array(det.getLayerFp16('age_conv3')))) * 100) gender = np.squeeze(np.array(det.getLayerFp16('prob'))) gender_str = "female" if gender[0] > gender[1] else "male" bbox = face_bbox_q.get() while not len(results) < len(bboxes) and len(results) > 0: results.pop(0) results.append({ "bbox": bbox, "gender": gender_str, "age": age, "ts": time.time() }) results = list( filter(lambda result: time.time() - result["ts"] < 0.2, results)) if frame is not None: for result in results: bbox = result["bbox"] cv2.rectangle(debug_frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (10, 245, 10), 2) y = (bbox[1] + bbox[3]) // 2 cv2.putText(debug_frame, str(result["age"]), (bbox[0], y), cv2.FONT_HERSHEY_TRIPLEX, 1.0, (255, 255, 255)) cv2.putText(debug_frame, result["gender"], (bbox[0], y + 20), cv2.FONT_HERSHEY_TRIPLEX, 1.0, (255, 255, 255)) img = debug_frame except Exception as e: track = traceback.format_exc() print(track) print("OAK-D Exception", e) pass return tracks, img
def land_pose_thread(self):
    landmark_nn = self.device.getOutputQueue(name="landmark_nn",
                                             maxSize=1,
                                             blocking=False)
    pose_nn = self.device.getOutputQueue(name="pose_nn",
                                         maxSize=1,
                                         blocking=False)
    gaze_in = self.device.getInputQueue("gaze_in")

    while self.running:
        try:
            land_in = landmark_nn.get().getFirstLayerFp16()
        except RuntimeError as ex:
            continue

        try:
            face_bbox = self.face_box_q.get(block=True, timeout=100)
        except queue.Empty:
            continue
        self.face_box_q.task_done()

        left = face_bbox[0]
        top = face_bbox[1]
        face_frame = self.frame[face_bbox[1]:face_bbox[3],
                                face_bbox[0]:face_bbox[2]]
        land_data = frame_norm(face_frame, land_in)
        land_data[::2] += left
        land_data[1::2] += top

        left_bbox = padded_point(land_data[:2],
                                 padding=30,
                                 frame_shape=self.frame.shape)
        if left_bbox is None:
            print("Point for left eye is corrupted, skipping nn result...")
            continue
        self.left_bbox = left_bbox

        right_bbox = padded_point(land_data[2:4],
                                  padding=30,
                                  frame_shape=self.frame.shape)
        if right_bbox is None:
            print("Point for right eye is corrupted, skipping nn result...")
            continue
        self.right_bbox = right_bbox

        self.nose = land_data[4:6]

        left_img = self.frame[self.left_bbox[1]:self.left_bbox[3],
                              self.left_bbox[0]:self.left_bbox[2]]
        right_img = self.frame[self.right_bbox[1]:self.right_bbox[3],
                               self.right_bbox[0]:self.right_bbox[2]]

        try:
            self.pose = [
                val[0][0] for val in to_tensor_result(pose_nn.get()).values()
            ]
        except RuntimeError as ex:
            continue

        gaze_data = depthai.NNData()
        gaze_data.setLayer("left_eye_image", to_planar(left_img, (60, 60)))
        gaze_data.setLayer("right_eye_image", to_planar(right_img, (60, 60)))
        gaze_data.setLayer("head_pose_angles", self.pose)
        gaze_in.send(gaze_data)
def run(self): device = dai.Device(self.create_pipeline()) device.startPipeline() # Define data queues if self.camera: q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False) if self.use_lm: q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False) q_lm_in = device.getInputQueue(name="lm_in") else: q_pd_in = device.getInputQueue(name="pd_in") q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True) if self.use_lm: q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True) q_lm_in = device.getInputQueue(name="lm_in") self.fps = FPS(mean_nb_frames=20) seq_num = 0 nb_pd_inferences = 0 nb_lm_inferences = 0 glob_pd_rtrip_time = 0 glob_lm_rtrip_time = 0 while True: self.fps.update() if self.camera: in_video = q_video.get() video_frame = in_video.getCvFrame() else: if self.image_mode: vid_frame = self.img else: ok, vid_frame = self.cap.read() if not ok: break h, w = vid_frame.shape[:2] dx = (w - self.video_size) // 2 dy = (h - self.video_size) // 2 video_frame = vid_frame[dy:dy+self.video_size, dx:dx+self.video_size] frame_nn = dai.ImgFrame() frame_nn.setSequenceNum(seq_num) frame_nn.setWidth(self.pd_input_length) frame_nn.setHeight(self.pd_input_length) frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length))) q_pd_in.send(frame_nn) pd_rtrip_time = now() seq_num += 1 annotated_frame = video_frame.copy() # Get palm detection inference = q_pd_out.get() if not self.camera: glob_pd_rtrip_time += now() - pd_rtrip_time self.pd_postprocess(inference) self.pd_render(annotated_frame) nb_pd_inferences += 1 # Hand landmarks if self.use_lm: for i,r in enumerate(self.regions): img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length) nn_data = dai.NNData() nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length))) q_lm_in.send(nn_data) if i == 0: lm_rtrip_time = now() # We measure only for the first region # Retrieve hand landmarks for i,r in enumerate(self.regions): inference = q_lm_out.get() if i == 0: glob_lm_rtrip_time += now() - lm_rtrip_time self.lm_postprocess(r, inference) self.lm_render(annotated_frame, r) nb_lm_inferences += 1 self.fps.display(annotated_frame, orig=(50,50),color=(240,180,100)) cv2.imshow("video", annotated_frame) key = cv2.waitKey(1) if key == ord('q') or key == 27: break elif key == 32: # Pause on space bar cv2.waitKey(0) elif key == ord('1'): self.show_pd_box = not self.show_pd_box elif key == ord('2'): self.show_pd_kps = not self.show_pd_kps elif key == ord('3'): self.show_rot_rect = not self.show_rot_rect elif key == ord('4'): self.show_landmarks = not self.show_landmarks elif key == ord('5'): self.show_handedness = not self.show_handedness elif key == ord('6'): self.show_scores = not self.show_scores elif key == ord('7'): self.show_gesture = not self.show_gesture # Print some stats if not self.camera: print(f"# video files frames : {seq_num}") print(f"# palm detection inferences received : {nb_pd_inferences}") print(f"# hand landmark inferences received : {nb_lm_inferences}") print(f"Palm detection round trip : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms") print(f"Hand landmark round trip : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")
def run(self): self.threads = [ threading.Thread(target=self.face_thread), threading.Thread(target=self.land_pose_thread), threading.Thread(target=self.gaze_thread) ] for thread in self.threads: thread.start() while self.should_run(): try: read_correctly, new_frame = self.get_frame() except RuntimeError: continue if not read_correctly: break self.fps.update() self.frame = new_frame self.debug_frame = self.frame.copy() if not camera: nn_data = depthai.NNData() nn_data.setLayer("data", to_planar(self.frame, (300, 300))) self.face_in.send(nn_data) if debug: # face if self.gaze is not None and self.left_bbox is not None and self.right_bbox is not None: re_x = (self.right_bbox[0] + self.right_bbox[2]) // 2 re_y = (self.right_bbox[1] + self.right_bbox[3]) // 2 le_x = (self.left_bbox[0] + self.left_bbox[2]) // 2 le_y = (self.left_bbox[1] + self.left_bbox[3]) // 2 x, y = (self.gaze * 100).astype(int)[:2] if args.lazer: beam_img = np.zeros(self.debug_frame.shape, np.uint8) for t in range(10)[::-2]: cv2.line(beam_img, (re_x, re_y), ((re_x + x * 100), (re_y - y * 100)), (0, 0, 255 - t * 10), t * 2) cv2.line(beam_img, (le_x, le_y), ((le_x + x * 100), (le_y - y * 100)), (0, 0, 255 - t * 10), t * 2) self.debug_frame |= beam_img else: cv2.arrowedLine(self.debug_frame, (le_x, le_y), (le_x + x, le_y - y), (255, 0, 255), 3) cv2.arrowedLine(self.debug_frame, (re_x, re_y), (re_x + x, re_y - y), (255, 0, 255), 3) if not args.lazer: for raw_bbox in self.bboxes: bbox = frame_norm(self.frame, raw_bbox) cv2.rectangle(self.debug_frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (10, 245, 10), 2) if self.nose is not None: cv2.circle(self.debug_frame, (self.nose[0], self.nose[1]), 2, (0, 255, 0), thickness=5, lineType=8, shift=0) if self.left_bbox is not None: cv2.rectangle(self.debug_frame, (self.left_bbox[0], self.left_bbox[1]), (self.left_bbox[2], self.left_bbox[3]), (245, 10, 10), 2) if self.right_bbox is not None: cv2.rectangle(self.debug_frame, (self.right_bbox[0], self.right_bbox[1]), (self.right_bbox[2], self.right_bbox[3]), (245, 10, 10), 2) if self.pose is not None and self.nose is not None: draw_3d_axis(self.debug_frame, self.pose, self.nose) if camera: cv2.imshow("Camera view", self.debug_frame) else: aspect_ratio = self.frame.shape[1] / self.frame.shape[0] cv2.imshow( "Video view", cv2.resize(self.debug_frame, (int(900), int(900 / aspect_ratio)))) if cv2.waitKey(1) == ord('q'): cv2.destroyAllWindows() break self.fps.stop() print("FPS: {:.2f}".format(self.fps.fps())) if not camera: self.cap.release() cv2.destroyAllWindows() for i in range(1, 5): # https://stackoverflow.com/a/25794701/5494277 cv2.waitKey(1) self.running = False
def land_pose_thread(self):
    landmark_nn = self.device.getOutputQueue(name="landmark_nn",
                                             maxSize=1,
                                             blocking=False)
    pose_nn = self.device.getOutputQueue(name="pose_nn",
                                         maxSize=1,
                                         blocking=False)
    gaze_in = self.device.getInputQueue("gaze_in")

    while self.running:
        try:
            land_in = landmark_nn.get().getFirstLayerFp16()
        except RuntimeError as ex:
            continue

        try:
            face_bbox = self.face_box_q.get(block=True, timeout=100)
        except queue.Empty:
            continue
        self.face_box_q.task_done()

        left = face_bbox[0]
        top = face_bbox[1]
        face_frame = self.frame[face_bbox[1]:face_bbox[3],
                                face_bbox[0]:face_bbox[2]]
        land_data = frame_norm(face_frame, land_in)
        land_data[::2] += left
        land_data[1::2] += top

        left_bbox = padded_point(land_data[:2],
                                 padding=30,
                                 frame_shape=self.frame.shape)
        if left_bbox is None:
            print("Point for left eye is corrupted, skipping nn result...")
            continue
        self.left_bbox = left_bbox

        right_bbox = padded_point(land_data[2:4],
                                  padding=30,
                                  frame_shape=self.frame.shape)
        if right_bbox is None:
            print("Point for right eye is corrupted, skipping nn result...")
            continue
        self.right_bbox = right_bbox

        self.nose = land_data[4:6]

        left_img = self.frame[self.left_bbox[1]:self.left_bbox[3],
                              self.left_bbox[0]:self.left_bbox[2]]
        right_img = self.frame[self.right_bbox[1]:self.right_bbox[3],
                               self.right_bbox[0]:self.right_bbox[2]]

        try:
            # pose_nn outputs the head pose as yaw, pitch, roll (YPR), which is
            # the order the gaze model expects for its head_pose_angles input:
            # https://docs.openvinotoolkit.org/2020.1/_models_intel_head_pose_estimation_adas_0001_description_head_pose_estimation_adas_0001.html
            # https://docs.openvinotoolkit.org/latest/omz_models_model_gaze_estimation_adas_0002.html
            # ... three head pose angles – (yaw, pitch, and roll) ...
            values = to_tensor_result(pose_nn.get())
            self.pose = [
                values['angle_y_fc'][0][0],
                values['angle_p_fc'][0][0],
                values['angle_r_fc'][0][0]
            ]
        except RuntimeError as ex:
            continue

        gaze_data = depthai.NNData()
        gaze_data.setLayer("left_eye_image", to_planar(left_img, (60, 60)))
        gaze_data.setLayer("right_eye_image", to_planar(right_img, (60, 60)))
        gaze_data.setLayer("head_pose_angles", self.pose)
        gaze_in.send(gaze_data)
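# padded_point() used in the two threads above is not defined in this section.
# The sketch below is an assumption based on its call sites: build a square
# box of `padding` pixels around a landmark and reject it (return None) if it
# falls outside the frame, so eye crops keep a constant size for the gaze net.
import numpy as np


def padded_point(point, padding, frame_shape):
    x, y = int(point[0]), int(point[1])
    bbox = np.array([x - padding, y - padding, x + padding, y + padding])
    if (bbox[0] < 0 or bbox[1] < 0
            or bbox[2] > frame_shape[1] or bbox[3] > frame_shape[0]):
        return None
    return bbox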
# nn data (bounding box locations) are in <0..1> range - they need to be normalized with frame width/height
def frame_norm(frame, bbox):
    norm_vals = np.full(len(bbox), frame.shape[0])
    norm_vals[::2] = frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) * norm_vals).astype(int)


for nextfile in tqdm(glob.glob("unlabeld/*.jpg")):
    name = nextfile[9:-4]
    # print(name)

    # load image into frame
    frame = cv2.imread(nextfile, cv2.IMREAD_COLOR)
    original_frame = frame.copy()

    # resize frame to 300x300
    frame = cv2.resize(frame, (300, 300), interpolation=cv2.INTER_AREA)

    var_data = dai.NNData()
    var_data.setLayer("data", to_planar(frame, (300, 300)))
    q_img_in.send(var_data)

    in_nn = q_nn.get()
    detections = in_nn.detections

    annotation = ET.Element("annotation")
    folder = ET.SubElement(annotation, "folder").text = "allimages"
    filename = ET.SubElement(annotation, "filename").text = f"{name}.jpg"
    path = ET.SubElement(
        annotation, "path"
    ).text = f"D:\\Hobby\\tgmb\\to-bee-or-not-to-bee\\allimages\\{name}.jpg"
    source = ET.SubElement(annotation, "source")
    database = ET.SubElement(source, "database").text = "Unknown"
def process_image(self, img): annotated_frame = img if self.camera: in_video = self.q_video.get() # Convert NV12 to BGR yuv = in_video.getData().reshape( (in_video.getHeight() * 3 // 2, in_video.getWidth())) video_frame = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR_NV12) else: if self.image_mode is None: vid_frame = img height, width, _ = img.shape self.video_size = int(min(width, height)) elif self.image_mode: vid_frame = self.img else: ok, vid_frame = self.cap.read() if not ok: # print("not OK video frame") return [], img #break h, w = vid_frame.shape[:2] dx = (w - self.video_size) // 2 dy = (h - self.video_size) // 2 video_frame = vid_frame[dy:dy + self.video_size, dx:dx + self.video_size] frame_nn = dai.ImgFrame() frame_nn.setSequenceNum(self.seq_num) frame_nn.setWidth(self.pd_input_length) frame_nn.setHeight(self.pd_input_length) frame_nn.setData( to_planar(video_frame, (self.pd_input_length, self.pd_input_length))) self.q_pd_in.send(frame_nn) self.seq_num += 1 annotated_frame = video_frame.copy() inference = self.q_pd_out.get() self.pd_postprocess(inference) self.pd_render(annotated_frame) # Hand landmarks if self.use_lm: for i, r in enumerate(self.regions): img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length) nn_data = dai.NNData() nn_data.setLayer( "input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length))) self.q_lm_in.send(nn_data) # Retrieve hand landmarks for i, r in enumerate(self.regions): inference = self.q_lm_out.get() self.lm_postprocess(r, inference) self.lm_render(annotated_frame, r) return self.regions, annotated_frame
def inference_task(self): # Queues detection_passthrough = self.device.getOutputQueue( "detection_passthrough") detection_nn = self.device.getOutputQueue("detection_nn") bboxes = [] results = {} results_path = {} next_id = 0 # Match up frames and detections try: prev_passthrough = detection_passthrough.getAll()[0] prev_inference = detection_nn.getAll()[0] except RuntimeError: pass fps = 0 t_fps = time.time() while self.running: try: # Get current detection passthrough = detection_passthrough.getAll()[0] inference = detection_nn.getAll()[0] # Count NN fps fps = fps + 1 # Combine all frames to current inference frames = [] while True: frm = self.frame_queue.get() if camera and hq: # Convert NV12 to BGR yuv = frm.getData().reshape( (frm.getHeight() * 3 // 2, frm.getWidth())) cv_frame = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR_NV12) else: # get the frames corresponding to inference cv_frame = np.ascontiguousarray(frm.getData().reshape( 3, frm.getHeight(), frm.getWidth()).transpose(1, 2, 0)) frames.append(cv_frame) # Break out once all frames received for the current inference if frm.getSequenceNum( ) >= prev_passthrough.getSequenceNum() - 1: break infered_frame = frames[0] # Send bboxes to be infered upon for det in inference.detections: raw_bbox = [det.xmin, det.ymin, det.xmax, det.ymax] bbox = frame_norm(infered_frame, raw_bbox) det_frame = infered_frame[bbox[1]:bbox[3], bbox[0]:bbox[2]] nn_data = dai.NNData() nn_data.setLayer("data", to_planar(det_frame, (48, 96))) self.device.getInputQueue("reid_in").send(nn_data) # Retrieve infered bboxes for det in inference.detections: raw_bbox = [det.xmin, det.ymin, det.xmax, det.ymax] bbox = frame_norm(infered_frame, raw_bbox) reid_result = self.device.getOutputQueue( "reid_nn").get().getFirstLayerFp16() for person_id in results: dist = cos_dist(reid_result, results[person_id]) if dist > 0.7: result_id = person_id results[person_id] = reid_result break else: result_id = next_id results[result_id] = reid_result results_path[result_id] = [] next_id += 1 if debug: for frame in frames: cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (10, 245, 10), 2) x = (bbox[0] + bbox[2]) // 2 y = (bbox[1] + bbox[3]) // 2 results_path[result_id].append([x, y]) cv2.putText(frame, str(result_id), (x, y), cv2.FONT_HERSHEY_TRIPLEX, 1.0, (255, 255, 255)) if len(results_path[result_id]) > 1: cv2.polylines(frame, [ np.array(results_path[result_id], dtype=np.int32) ], False, (255, 0, 0), 2) else: print(f"Saw id: {result_id}") # Send of to visualization thread for frame in frames: # put nn_fps if debug: cv2.putText(frame, 'NN FPS: ' + str(self.nn_fps), (5, 40), cv2.FONT_HERSHEY_DUPLEX, 1.0, (255, 0, 0), 2) if self.visualization_queue.full(): self.visualization_queue.get_nowait() self.visualization_queue.put(frame) # Move current to prev prev_passthrough = passthrough prev_inference = inference if time.time() - t_fps >= 1.0: self.nn_fps = round(fps / (time.time() - t_fps), 2) fps = 0 t_fps = time.time() except RuntimeError: continue
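# cos_dist() used by the re-identification snippets is assumed to be the usual
# cosine similarity between two embedding vectors (higher means more similar,
# hence the `dist > 0.7` match test above); a minimal sketch:
import numpy as np


def cos_dist(a, b):
    a = np.asarray(a)
    b = np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))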
def run(self): self.threads = [ threading.Thread(target=self.run_face, daemon=True), threading.Thread(target=self.run_land68, daemon=True) ] for thread in self.threads: thread.start() while self.should_run(): read_correctly, new_frame = self.get_frame() if not read_correctly: break self.fps.update() self.frame = new_frame self.debug_frame = self.frame.copy() if not camera: nn_data = depthai.NNData() nn_data.setLayer("data", to_planar(self.frame, (300, 300))) self.face_in.send(nn_data) if debug: if self.results.qsize() > 0 and self.face_bboxs.qsize() > 0: try: for i in range(self.results.qsize()): face_bbox = self.face_bboxs.get() result = self.results.get() bbox = frame_norm(self.frame, self.bboxes[i]) self.draw_bbox(bbox, (0, 255, 0)) self.hand_points = [] # 17 Left eyebrow upper left corner/21 Left eyebrow right corner/22 Right eyebrow upper left corner/26 Right eyebrow upper right corner/36 Left eye upper left corner/39 Left eye upper right corner/42 Right eye upper left corner/ # 45 Upper right corner of the right eye/31 Upper left corner of the nose/35 Upper right corner of the nose/48 Upper left corner/54 Upper right corner of the mouth/57 Lower central corner of the mouth/8 Chin corner # The coordinates are two points, so you have to multiply by 2. self.hand_points.append( (result[34] + face_bbox[0], result[35] + face_bbox[1])) self.hand_points.append( (result[42] + face_bbox[0], result[43] + face_bbox[1])) self.hand_points.append( (result[44] + face_bbox[0], result[45] + face_bbox[1])) self.hand_points.append( (result[52] + face_bbox[0], result[53] + face_bbox[1])) self.hand_points.append( (result[72] + face_bbox[0], result[73] + face_bbox[1])) self.hand_points.append( (result[78] + face_bbox[0], result[79] + face_bbox[1])) self.hand_points.append( (result[84] + face_bbox[0], result[85] + face_bbox[1])) self.hand_points.append( (result[90] + face_bbox[0], result[91] + face_bbox[1])) self.hand_points.append( (result[62] + face_bbox[0], result[63] + face_bbox[1])) self.hand_points.append( (result[70] + face_bbox[0], result[71] + face_bbox[1])) self.hand_points.append( (result[96] + face_bbox[0], result[97] + face_bbox[1])) self.hand_points.append( (result[108] + face_bbox[0], result[109] + face_bbox[1])) self.hand_points.append( (result[114] + face_bbox[0], result[115] + face_bbox[1])) self.hand_points.append( (result[16] + face_bbox[0], result[17] + face_bbox[1])) for i in self.hand_points: cv2.circle(self.debug_frame, (i[0], i[1]), 2, (255, 0, 0), thickness=1, lineType=8, shift=0) reprojectdst, _, pitch, yaw, roll = get_head_pose( np.array(self.hand_points)) """ pitch > 0 Head down, < 0 look up yaw > 0 Turn right < 0 Turn left roll > 0 Tilt right, < 0 Tilt left """ cv2.putText( self.debug_frame, "pitch:{:.2f}, yaw:{:.2f}, roll:{:.2f}".format( pitch, yaw, roll), (face_bbox[0] - 30, face_bbox[1] - 30), cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 0, 0)) hand_attitude = np.array( [abs(pitch), abs(yaw), abs(roll)]) max_index = np.argmax(hand_attitude) if max_index == 0: if pitch > 0: cv2.putText( self.debug_frame, "Head down", (face_bbox[0], face_bbox[1] - 10), cv2.FONT_HERSHEY_COMPLEX, 0.5, (235, 10, 10)) else: cv2.putText( self.debug_frame, "look up", (face_bbox[0], face_bbox[1] - 10), cv2.FONT_HERSHEY_COMPLEX, 0.5, (235, 10, 10)) elif max_index == 1: if yaw > 0: cv2.putText( self.debug_frame, "Turn right", (face_bbox[0], face_bbox[1] - 10), cv2.FONT_HERSHEY_COMPLEX, 0.5, (235, 10, 10)) else: cv2.putText( self.debug_frame, "Turn left", (face_bbox[0], face_bbox[1] - 10), 
cv2.FONT_HERSHEY_COMPLEX, 0.5, (235, 10, 10)) elif max_index == 2: if roll > 0: cv2.putText( self.debug_frame, "Tilt right", (face_bbox[0], face_bbox[1] - 10), cv2.FONT_HERSHEY_COMPLEX, 0.5, (235, 10, 10)) else: cv2.putText( self.debug_frame, "Tilt left", (face_bbox[0], face_bbox[1] - 10), cv2.FONT_HERSHEY_COMPLEX, 0.5, (235, 10, 10)) # Draw a cube with 12 axes line_pairs = [[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7], [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]] for start, end in line_pairs: cv2.line(self.debug_frame, reprojectdst[start], reprojectdst[end], (0, 0, 255)) except: pass if camera: cv2.imshow("Camera view", self.debug_frame) else: aspect_ratio = self.frame.shape[1] / self.frame.shape[0] cv2.imshow( "Video view", cv2.resize(self.debug_frame, (int(900), int(900 / aspect_ratio)))) if cv2.waitKey(1) == ord('q'): cv2.destroyAllWindows() break self.fps.stop() print("FPS:{:.2f}".format(self.fps.fps())) if not camera: self.cap.release() cv2.destroyAllWindows() self.running = False for thread in self.threads: thread.join(2) if thread.is_alive(): break
def process(self, context): video_frame = context["frame"][OAK_Stage.COLOR] if video_frame is None: if self.videoframe is None: return else: video_frame = self.videoframe else: self.videoframe = video_frame self.blazepose.frame_size = video_frame.shape[0] pd_inference = context["output_queues"]["blazepose_pd"].tryGet() if pd_inference is not None: self.blazepose.pd_postprocess(pd_inference) else: return self.blazepose.nb_active_regions = 0 bodyposes = [] bodyposes_3d = [] for i, r in enumerate(self.blazepose.regions): frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.blazepose.lm_input_length, self.blazepose.lm_input_length) nn_data = dai.NNData() nn_data.setLayer( "input_1", to_planar(frame_nn, (self.blazepose.lm_input_length, self.blazepose.lm_input_length))) context["input_queues"]['blazepose_lm_in'].send(nn_data) lm_inference = context["output_queues"]['blazepose_lm'].get() self.blazepose.lm_postprocess(r, lm_inference) if r.lm_score < self.blazepose.lm_score_threshold: continue bp = BodyPose(frame_id=context["frame_id"], pixel_space=True) bp_3d = None points = r.landmarks_abs bp_3d = BodyPose(frame_id=context["frame_id"], pixel_space=False) for i, x_y in enumerate(r.landmarks_padded[:, :2]): name = OAK_Blazepose.kp_name[i] if name is None: continue bp.add_keypoint(name, x_y[0], x_y[1]) bp_3d.add_keypoint(name, points[i][0], points[i][1], points[i][2]) bodyposes.append(bp) bodyposes_3d.append(bp_3d) self._setOutput(bodyposes, "bodypose_list") self._setOutput(bodyposes_3d, "bodypose3d_list")
    return True, np.array(cam_out.get().getData()).reshape(
        (3, 300, 300)).transpose(1, 2, 0).astype(np.uint8)


try:
    while should_run():
        read_correctly, frame = get_frame()
        if not read_correctly:
            break

        if frame is not None:
            fps.update()
            debug_frame = frame.copy()

            if not args.camera:
                nn_data = depthai.NNData()
                nn_data.setLayer("input", to_planar(frame, (300, 300)))
                detection_in.send(nn_data)

            while detection_nn.has():
                bboxes = np.array(detection_nn.get().getFirstLayerFp16())
                bboxes = bboxes.reshape((bboxes.size // 7, 7))
                bboxes = bboxes[bboxes[:, 2] > 0.7][:, 3:7]

                for raw_bbox in bboxes:
                    bbox = frame_norm(frame, raw_bbox)
                    det_frame = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

                    nn_data = depthai.NNData()
                    nn_data.setLayer("data", to_planar(det_frame, (48, 96)))
                    age_gender_in.send(nn_data)
def process_image(transform, processing_model, img): global useOAKDCam, bboxes, results, pd_score_thresh, pd_nms_thresh, bboxes, anchors, device, q_rgb, q_nn, fps, q_in tracks = [] # (q_rgb,q_nn) = processing_model try: # if useOAKDCam: # # ret, frame = cap.read() # frame = np.array(cam_out.get().getData()).reshape((3, 300, 300)).transpose(1, 2, 0).astype(np.uint8) # shape = (3, frame.getHeight(), frame.getWidth()) # frame = in_rgb.getData().reshape(shape).transpose(1, 2, 0).astype(np.uint8) # frame = np.ascontiguousarray(frame) # else: frame = img #palm detection https://github.com/geaxgx/oakd_palm_detection if transform == 'oakd_palm': if device is None: # Start defining a pipeline pipeline = dai.Pipeline() if useOAKDCam: # Define a source - color camera cam_rgb = pipeline.createColorCamera() cam_rgb.setPreviewSize(128, 128) cam_rgb.setFps(90.0) cam_rgb.setInterleaved(False) # Define a neural network that will make predictions based on the source frames detection_nn = pipeline.createNeuralNetwork() detection_nn.setBlobPath( str( Path( "../oakd_palm_detection/models/palm_detection.blob" ).resolve().absolute())) if useOAKDCam: cam_rgb.preview.link(detection_nn.input) else: detection_in = pipeline.createXLinkIn() detection_in.setStreamName("detection_in") detection_in.out.link(detection_nn.input) # Create outputs if useOAKDCam: xout_rgb = pipeline.createXLinkOut() xout_rgb.setStreamName("rgb") cam_rgb.preview.link(xout_rgb.input) xout_nn = pipeline.createXLinkOut() xout_nn.setStreamName("nn") detection_nn.out.link(xout_nn.input) # Pipeline defined, now the device is assigned and pipeline is started device = dai.Device(pipeline) device.startPipeline() if useOAKDCam: # Output queues will be used to get the rgb frames and nn data from the outputs defined above q_rgb = device.getOutputQueue(name="rgb", maxSize=4, blocking=False) else: q_in = device.getInputQueue("detection_in") q_nn = device.getOutputQueue(name="nn", maxSize=4, blocking=False) # fps.update() # if frame is not None: if not useOAKDCam: nn_data = dai.NNData() nn_data.setLayer("input", to_planar(frame, (128, 128))) q_in.send(nn_data) # in_nn = q_nn.get() in_nn = q_nn.tryGet() # 2 output layers: # - classificators: # - regressors : # From: print(in_nn.getAllLayerNames()) if in_nn is not None: scores = np.array(in_nn.getLayerFp16("classificators")) bboxes = np.array( in_nn.getLayerFp16("regressors")).reshape((896, 18)) # Decode bboxes regions = decode_bboxes(pd_score_thresh, 128, 128, scores, bboxes, anchors) # Non maximum suppression regions = non_max_suppression(regions, pd_nms_thresh) tracks = regions for r in regions: raw_bbox = (np.array(r.pd_box) * 128).astype(int) # box = raw_bbox # print("raw_bbox",raw_bbox) # print("frame.shape",frame.shape) box = frame_norm3(frame, raw_bbox) # print("box3",box) # cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (255, 255, 255), 2) cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (255, 255, 0), 2) # if frame is not None: # img = frame if frame is not None: # cv2.putText(frame, "FPS: {:.2f}".format(fps.get()), (10,10), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0,0,255), 1) # cv2.imshow("rgb", frame) img = frame else: # in_rgb = q_rgb.tryGet() in_rgb = q_rgb.get() if in_rgb is not None: # if the data from the rgb camera is available, transform the 1D data into a HxWxC frame shape = (3, in_rgb.getHeight(), in_rgb.getWidth()) frame = in_rgb.getData().reshape(shape).transpose( 1, 2, 0).astype(np.uint8) frame = np.ascontiguousarray(frame) in_nn = q_nn.get() # 2 output 
layers: # - classificators: # - regressors : # From: print(in_nn.getAllLayerNames()) if in_nn is not None: scores = np.array(in_nn.getLayerFp16("classificators")) bboxes = np.array( in_nn.getLayerFp16("regressors")).reshape( (896, 18)) # Decode bboxes regions = decode_bboxes(pd_score_thresh, 128, 128, scores, bboxes, anchors) # Non maximum suppression regions = non_max_suppression(regions, pd_nms_thresh) tracks = regions for r in regions: box = (np.array(r.pd_box) * 128).astype(int) cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (255, 255, 0), 2) # if frame is not None: # img = frame if frame is not None: # cv2.putText(frame, "FPS: {:.2f}".format(fps.get()), (10,10), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0,0,255), 1) # cv2.imshow("rgb", frame) img = frame # if cv2.waitKey(1) == ord('q'): # pass except Exception as e: track = traceback.format_exc() print(track) print("OAK-D Exception", e) pass return tracks, img