def detect(self, rgb_image, depth_image=None): """ Detect frontal faces """ face_detections = [] faces = self.model(rgb_image, 0) for face in faces: if depth_image is not None: w = face.width() h = face.height() x = face.left() + w / 2.0 y = face.top() + h / 2.0 x = depth_image.shape[1] - 1 if x > depth_image.shape[1] else x y = depth_image.shape[0] - 1 if y > depth_image.shape[0] else y depth = depth_image[int(y)][int(x)] / 1000.0 if math.isnan(depth) or depth == 0.0: depth = None else: depth = None face_detections.append( Detection(face.left(), face.top(), face.right(), face.bottom(), "face", 1.0, depth=depth)) return face_detections
def predict(self, rgb_image, depth_image=None): """ """ bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) success, bbox = self.tracker.update(bgr_image) xmin, ymin, w, h = bbox xmin = 0 if xmin < 0 else xmin ymin = 0 if ymin < 0 else ymin x = int(xmin + w / 2.0) y = int(ymin + h / 2.0) if depth_image is not None: x = depth_image.shape[1] - 1 if x > depth_image.shape[1] else x y = depth_image.shape[0] - 1 if y > depth_image.shape[0] else y depth = depth_image[int(y)][int(x)] / 1000.0 if math.isnan(depth) or depth == 0.0: depth = None else: depth = None prediction = Detection(xmin, ymin, xmin + w, ymin + h, self.object_label, 1.0, depth=depth) return success, prediction
def predict(self, rgb_image): """ """ bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) success, bbox = self.medianflow_tracker.update(bgr_image) xmin, ymin, width, height = bbox xmin = 0 if xmin < 0 else xmin ymin = 0 if ymin < 0 else ymin prediction = Detection(xmin, ymin, xmin + width, ymin + height, self.object_label, 1.0) return success, prediction
def detect(self, rgb_image, depth_image=None): """ Detect an object based on the color """ hsv_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV) lower = np.array([136, 87, 111], np.uint8) upper = np.array([180, 255, 255], np.uint8) color_mask = cv2.inRange(hsv_image, self.lower, self.upper) color_mask = cv2.dilate(color_mask, self.kernel) if self.debug_topics is True: self.pub.publish(self.bridge.cv2_to_imgmsg(color_mask)) # find the bbox x, y, w, h = cv2.boundingRect(color_mask) xmin = x ymin = y xmax = xmin + w ymax = ymin + h if depth_image is not None: x = depth_image.shape[1] - 1 if x > depth_image.shape[1] else x y = depth_image.shape[0] - 1 if y > depth_image.shape[0] else y depth = depth_image[int(y)][int(x)] / 1000.0 if math.isnan(depth) or depth == 0.0: depth = None else: depth = None mask = color_mask[int(ymin):int(ymax), int(xmin):int(xmax)] object_detection = Detection(int(xmin), int(ymin), int(xmin + w), int(ymin + h), self.color + "_thing", 1.0, mask=mask, depth=depth) if object_detection.bbox.area() > 100: return [object_detection] else: return []
def get_camera_view(self, camera_pose, camera, target_position=None,
                    occlusion_threshold=0.01, rendering_ratio=1.0):
    visible_tracks = []
    rot = quaternion_matrix(camera_pose.quaternion())
    trans = translation_matrix(camera_pose.position().to_array().flatten())
    if target_position is None:
        target = translation_matrix([0.0, 0.0, 1000.0])
        target = translation_from_matrix(np.dot(np.dot(trans, rot), target))
    else:
        target = target_position.position().to_array()
    view_matrix = p.computeViewMatrix(camera_pose.position().to_array(),
                                      target, [0, 0, 1])
    width = camera.width
    height = camera.height
    projection_matrix = p.computeProjectionMatrixFOV(
        camera.fov, float(width) / height,
        camera.clipnear, camera.clipfar)
    rendered_width = int(width / rendering_ratio)
    rendered_height = int(height / rendering_ratio)
    width_ratio = width / rendered_width
    height_ratio = height / rendered_height
    if self.use_gui is True:
        camera_image = p.getCameraImage(rendered_width, rendered_height,
                                        viewMatrix=view_matrix,
                                        renderer=p.ER_BULLET_HARDWARE_OPENGL,
                                        projectionMatrix=projection_matrix)
    else:
        camera_image = p.getCameraImage(rendered_width, rendered_height,
                                        viewMatrix=view_matrix,
                                        renderer=p.ER_TINY_RENDERER,
                                        projectionMatrix=projection_matrix)
    rgb_image = cv2.resize(np.array(camera_image[2]), (width, height))[:, :, :3]
    depth_image = np.array(camera_image[3], np.float32).reshape(
        (rendered_height, rendered_width))
    far = camera.clipfar
    near = camera.clipnear
    real_depth_image = far * near / (far - (far - near) * depth_image)
    mask_image = camera_image[4]
    unique, counts = np.unique(np.array(mask_image).flatten(),
                               return_counts=True)
    for sim_id, count in zip(unique, counts):
        if sim_id > 0:
            cv_mask = np.array(mask_image.copy())
            cv_mask[cv_mask != sim_id] = 0
            cv_mask[cv_mask == sim_id] = 255
            xmin, ymin, w, h = cv2.boundingRect(cv_mask.astype(np.uint8))
            mask = cv_mask[ymin:ymin + h, xmin:xmin + w]
            visible_area = w * h + 1
            screen_area = rendered_width * rendered_height + 1
            if screen_area - visible_area == 0:
                confidence = 1.0
            else:
                confidence = visible_area / float(screen_area - visible_area)
            # TODO compute occlusion score as a ratio between visible 2d bbox and projected 2d bbox areas
            if confidence > occlusion_threshold:
                depth = real_depth_image[int(ymin + h / 2.0)][int(xmin + w / 2.0)]
                xmin = int(xmin * width_ratio)
                ymin = int(ymin * height_ratio)
                w = int(w * width_ratio)
                h = int(h * height_ratio)
                id = self.reverse_entity_id_map[sim_id]
                scene_node = self.get_entity(id)
                det = Detection(int(xmin), int(ymin), int(xmin + w), int(ymin + h),
                                id, confidence, depth=depth, mask=mask)
                track = SceneNode(detection=det)
                track.static = scene_node.static
                track.id = id
                track.mask = det.mask
                track.shapes = scene_node.shapes
                track.pose = scene_node.pose
                track.label = scene_node.label
                track.description = scene_node.description
                visible_tracks.append(track)
    return rgb_image, real_depth_image, visible_tracks
def detect(self, rgb_image, depth_image=None, table_mask=None):
    """ Detect unknown objects """
    filtered_bbox = []
    output_dets = []
    h, w, _ = rgb_image.shape
    static_foreground_mask_full = np.zeros((h, w), dtype=np.uint8)
    bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
    # only the lower half of the image is processed
    bgr_image_cropped = bgr_image[int(h / 2.0):h, 0:int(w)]
    cropped_height, cropped_width, _ = bgr_image_cropped.shape
    foreground_mask = self.long_term_detector.apply(
        bgr_image_cropped, learningRate=LONGTERM_LEARNING_RATE)
    foreground_mask[foreground_mask != 255] = 0  # shadows suppression
    foreground_mask = cv2.morphologyEx(foreground_mask, cv2.MORPH_CLOSE, self.kernel)
    foreground_mask = cv2.morphologyEx(foreground_mask, cv2.MORPH_OPEN, self.kernel)
    foreground_mask = cv2.dilate(foreground_mask, self.kernel)
    motion_mask = self.short_term_detector.apply(
        bgr_image_cropped, learningRate=SHORTTERM_LEARNING_RATE)
    motion_mask[motion_mask != 255] = 0  # shadows suppression
    motion_mask = cv2.morphologyEx(motion_mask, cv2.MORPH_CLOSE, self.kernel)
    motion_mask = cv2.morphologyEx(motion_mask, cv2.MORPH_OPEN, self.kernel)
    motion_mask = cv2.dilate(motion_mask, self.kernel)
    not_moving_mask = cv2.bitwise_not(motion_mask)
    static_foreground_mask = cv2.bitwise_and(foreground_mask, not_moving_mask)
    static_foreground_mask_full[int(h / 2.0):h, 0:int(w)] = static_foreground_mask
    self.motion_mask = motion_mask
    self.foreground_mask = foreground_mask
    self.static_foreground_mask = static_foreground_mask_full
    # find the contours
    contours, _ = cv2.findContours(static_foreground_mask_full,
                                   cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    for c in contours:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 10e-3 * peri, True)
        xmin, ymin, w, h = cv2.boundingRect(approx)
        if w > 20 and h > 20 and w < rgb_image.shape[1] and h < rgb_image.shape[0]:
            filtered_bbox.append(np.array([xmin, ymin, xmin + w, ymin + h]))
    filtered_bbox = self.non_max_suppression(np.array(filtered_bbox),
                                             self.max_overlap)
    for bbox in filtered_bbox:
        xmin, ymin, xmax, ymax = bbox
        w = int(xmax - xmin)
        h = int(ymax - ymin)
        x = int(xmin + w / 2.0)
        y = int(ymin + h / 2.0)
        if depth_image is not None:
            x = depth_image.shape[1] - 1 if x > depth_image.shape[1] - 1 else x
            y = depth_image.shape[0] - 1 if y > depth_image.shape[0] - 1 else y
            depth = depth_image[int(y)][int(x)] / 1000.0
            if math.isnan(depth) or depth == 0.0:
                depth = None
        else:
            depth = None
        mask = static_foreground_mask_full[int(ymin):int(ymax), int(xmin):int(xmax)]
        output_dets.append(
            Detection(int(xmin), int(ymin), int(xmax), int(ymax),
                      "thing", 1.0, mask=mask, depth=depth))
    return output_dets
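# Hedged sketch of the kind of greedy non-maximum suppression that
# self.non_max_suppression() is expected to perform (the real implementation
# lives elsewhere in the class). Boxes are rows of [xmin, ymin, xmax, ymax, ...];
# extra columns (e.g. the confidence column used by the DNN detector below)
# are carried through untouched.
import numpy as np


def non_max_suppression(boxes, max_overlap):
    """Keep boxes whose mutual overlap stays below max_overlap."""
    if len(boxes) == 0:
        return []
    coords = boxes[:, :4].astype("float")
    x1, y1, x2, y2 = coords[:, 0], coords[:, 1], coords[:, 2], coords[:, 3]
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(y2)  # process boxes bottom-up, as in the classic recipe
    keep = []
    while len(order) > 0:
        last = order[-1]
        keep.append(last)
        rest = order[:-1]
        # intersection of the kept box with all remaining boxes
        xx1 = np.maximum(x1[last], x1[rest])
        yy1 = np.maximum(y1[last], y1[rest])
        xx2 = np.minimum(x2[last], x2[rest])
        yy2 = np.minimum(y2[last], y2[rest])
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        overlap = (w * h) / area[rest]
        # drop boxes that overlap the kept one too much
        order = rest[np.where(overlap <= max_overlap)[0]]
    return boxes[keep]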
def detect(self, rgb_image, depth_image=None): """ """ frame_resized = cv2.resize(rgb_image, (self.input_size, self.input_size), interpolation=cv2.INTER_AREA) self.model.setInput( cv2.dnn.blobFromImage(frame_resized, swapRB=self.swapRB)) detections = self.model.forward() filtered_detections = [] detection_per_class = {} score_per_class = {} rows = frame_resized.shape[0] cols = frame_resized.shape[1] height_factor = rgb_image.shape[0] / float(self.input_size) width_factor = rgb_image.shape[1] / float(self.input_size) for i in range(detections.shape[2]): class_id = int(detections[0, 0, i, 1]) confidence = detections[0, 0, i, 2] if class_id in self.config: if self.config[class_id]["activated"] is True: if confidence > self.config[class_id][ "confidence_threshold"]: class_label = self.config[class_id]["label"] xmin = int(detections[0, 0, i, 3] * cols) ymin = int(detections[0, 0, i, 4] * rows) xmax = int(detections[0, 0, i, 5] * cols) ymax = int(detections[0, 0, i, 6] * rows) xmin = int(width_factor * xmin) ymin = int(height_factor * ymin) xmax = int(width_factor * xmax) ymax = int(height_factor * ymax) xmin = 0 if xmin < 0 else xmin ymin = 0 if ymin < 0 else ymin xmax = rgb_image.shape[ 1] - 1 if xmax > rgb_image.shape[1] - 1 else xmax ymax = rgb_image.shape[ 0] - 1 if ymax > rgb_image.shape[0] - 1 else ymax if (xmax - xmin) > 20 and (ymax - ymin) > 20: bbox = [xmin, ymin, xmax, ymax, confidence] if class_label not in detection_per_class: detection_per_class[class_label] = [] if class_label not in score_per_class: score_per_class[class_label] = [] detection_per_class[class_label].append(bbox) for class_label, dets in detection_per_class.items(): filtered_dets = self.non_max_suppression(np.array(dets), self.max_overlap_ratio) for d in filtered_dets: if depth_image is not None: w = d[2] - d[0] h = d[3] - d[1] x = d[0] + w / 2.0 y = d[1] + h / 2.0 x = depth_image.shape[1] - 1 if x > depth_image.shape[ 1] else x y = depth_image.shape[0] - 1 if y > depth_image.shape[ 0] else y depth = depth_image[int(y)][int(x)] / 1000.0 if math.isnan(depth) or depth == 0.0: depth = None else: depth = None filtered_detections.append( Detection(d[0], d[1], d[2], d[3], class_label, d[4], depth=depth)) return filtered_detections
def detect(self, rgb_image, depth_image=None): """ """ frame_resized = cv2.resize(rgb_image, self.input_size, interpolation=cv2.INTER_AREA) self.model.setInput( cv2.dnn.blobFromImage(frame_resized, swapRB=self.swapRB)) (detections, masks) = self.model.forward( ["detection_out_final", "detection_masks"]) filtered_detections = [] rows = frame_resized.shape[0] cols = frame_resized.shape[1] height_factor = rgb_image.shape[0] / float(self.input_size[0]) width_factor = rgb_image.shape[1] / float(self.input_size[1]) for i in range(0, detections.shape[2]): class_id = int(detections[0, 0, i, 1]) confidence = detections[0, 0, i, 2] if class_id in self.config: if self.config[class_id]["activated"] is True: if confidence > self.config[class_id][ "confidence_threshold"]: class_label = self.config[class_id]["label"] xmin = int(detections[0, 0, i, 3] * cols * width_factor) ymin = int(detections[0, 0, i, 4] * rows * height_factor) xmax = int(detections[0, 0, i, 5] * cols * width_factor) ymax = int(detections[0, 0, i, 6] * rows * height_factor) w = xmax - xmin h = ymax - ymin mask = masks[i, class_id] mask = cv2.resize(mask, (w, h), interpolation=cv2.INTER_NEAREST) mask = (mask > self.mask_treshold) if depth_image is not None: x = xmin + w / 2.0 y = ymin + h / 2.0 x = depth_image.shape[ 1] - 1 if x > depth_image.shape[1] else x y = depth_image.shape[ 0] - 1 if y > depth_image.shape[0] else y depth = depth_image[int(y)][int(x)] / 1000.0 if math.isnan(depth) or depth == 0.0: depth = None else: depth = None filtered_detections.append( Detection(xmin, ymin, xmax, ymax, class_label, confidence, mask=mask, depth=depth)) return filtered_detections
def perception_pipeline(self, view_matrix, rgb_image, depth_image=None, time=None):
    """ Run the perception pipeline: detection, tracking, pose/shape estimation and monitoring """
    ######################################################
    # Detection
    ######################################################
    detections = []
    image_height, image_width, _ = rgb_image.shape
    if depth_image is not None:
        if self.table_depth is None:
            x = int(image_width * (1 / 2.0))
            y = int(image_height * (3 / 4.0))
            if not math.isnan(depth_image[y, x]):
                self.table_depth = depth_image[y, x] / 1000.0
        table_depth = self.table_depth
    else:
        table_depth = None
    table_detection = Detection(0, int(image_height / 2.0), int(image_width),
                                image_height, "table", 1.0, table_depth)
    view_pose = Vector6D().from_transform(view_matrix)
    if depth_image is not None:
        if self.table_shape is None:
            fx = self.camera_matrix[0][0]
            fy = self.camera_matrix[1][1]
            cx = self.camera_matrix[0][2]
            cy = self.camera_matrix[1][2]
            x = int(image_width * (1 / 2.0))
            y = int(image_height * (3 / 4.0))
            if not math.isnan(depth_image[y, x]):
                z = depth_image[y, x] / 1000.0
                x = (x - cx) * z / fx
                y = (y - cy) * z / fy
                table_border = Vector6D(x=x, y=y, z=z)
                x = int(image_width * (1 / 2.0))
                y = int(image_height * (1 / 2.0))
                if not math.isnan(depth_image[y, x]):
                    z = depth_image[y, x] / 1000.0
                    x = (x - cx) * z / fx
                    y = (y - cy) * z / fy
                    table_center = Vector6D(x=x, y=y, z=z)
                    table_length = (image_width - cx) * z / fx
                    table_width = 2.0 * math.sqrt(
                        pow(table_border.pos.x - table_center.pos.x, 2) +
                        pow(table_border.pos.y - table_center.pos.y, 2) +
                        pow(table_border.pos.z - table_center.pos.z, 2))
                    center_in_world = view_pose + table_center
                    real_z = center_in_world.position().z
                    print center_in_world.position()
                    self.table_shape = Box(table_width, table_length, real_z)
                    self.table_shape.pose.pos.z = -real_z / 2.0
                    self.table_shape.color = self.shape_estimator.compute_dominant_color(
                        rgb_image, table_detection.bbox)
    if self.frame_count == 0:
        person_detections = self.person_detector.detect(rgb_image,
                                                        depth_image=depth_image)
        detections = person_detections
    elif self.frame_count == 1:
        object_detections = self.foreground_detector.detect(rgb_image,
                                                            depth_image=depth_image)
        detections = object_detections
    else:
        detections = []
    ####################################################################
    # Features estimation
    ####################################################################
    self.color_features_estimator.estimate(rgb_image, detections)
    ######################################################
    # Tracking
    ######################################################
    if self.frame_count == 0:
        object_tracks = self.object_tracker.update(rgb_image, [],
                                                   depth_image=depth_image)
        person_tracks = self.person_tracker.update(rgb_image, person_detections,
                                                   depth_image=depth_image)
    elif self.frame_count == 1:
        object_tracks = self.object_tracker.update(rgb_image, object_detections,
                                                   depth_image=depth_image)
        person_tracks = self.person_tracker.update(rgb_image, [],
                                                   depth_image=depth_image)
    else:
        object_tracks = self.object_tracker.update(rgb_image, [],
                                                   depth_image=depth_image)
        person_tracks = self.person_tracker.update(rgb_image, [],
                                                   depth_image=depth_image)
    if self.table_track is None:
        self.table_track = Track(table_detection, 1, 4, 20)
    else:
        self.table_track.update(table_detection)
    if self.table_shape is not None:
        self.table_track.shapes = [self.table_shape]
    support_tracks = [self.table_track]
    tracks = object_tracks + person_tracks + support_tracks
    ########################################################
    # Pose & Shape estimation
    ########################################################
    self.object_pose_estimator.estimate(
        object_tracks + person_tracks + support_tracks,
        view_matrix, self.camera_matrix, self.dist_coeffs)
    self.shape_estimator.estimate(rgb_image, tracks,
                                  self.camera_matrix, self.dist_coeffs)
    ########################################################
    # Monitoring
    ########################################################
    events = self.action_monitor.monitor(support_tracks, object_tracks,
                                         person_tracks, [])
    return tracks, events
def detect(self, rgb_image, depth_image=None, roi_points=[], prior_detections=[]):
    filtered_bbox = []
    output_dets = []
    h, w, _ = rgb_image.shape
    foreground_mask = np.zeros((h, w), dtype=np.uint8)
    bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
    foreground_mask[int(h / 2.0):h, 0:int(w)] = self.background_substraction.apply(
        bgr_image[int(h / 2.0):h, 0:int(w)], learningRate=10e-7)
    foreground_mask[foreground_mask != 255] = 0  # shadows suppression
    self.pub.publish(self.bridge.cv2_to_imgmsg(foreground_mask))
    # blank out the regions already covered by prior detections (with a 5px margin)
    for d in prior_detections:
        x = int(d.bbox.xmin)
        x = x - 5 if x > 5 else x
        y = int(d.bbox.ymin)
        y = y - 5 if y > 5 else y
        w = int(d.bbox.width())
        w = w + 5 if w + 5 < rgb_image.shape[1] else w
        h = int(d.bbox.height())
        h = h + 5 if h + 5 < rgb_image.shape[0] else h
        foreground_mask[y:y + h, x:x + w] = 0
    # remove the noise of the mask
    kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    kernel_big = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6))
    closing = cv2.morphologyEx(foreground_mask, cv2.MORPH_CLOSE, kernel_small)
    opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel_big)
    if len(self.roi_points) == 2:
        roi_mask = np.full(foreground_mask.shape, 255, dtype="uint8")
        roi_mask[self.roi_points[0][1]:self.roi_points[1][1],
                 self.roi_points[0][0]:self.roi_points[1][0]] = 0
        opening -= roi_mask
        opening[opening != 255] = 0
    # find the contours
    contours, hierarchy = cv2.findContours(opening, cv2.RETR_TREE,
                                           cv2.CHAIN_APPROX_SIMPLE)
    for c in contours:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 10e-3 * peri, True)
        xmin, ymin, w, h = cv2.boundingRect(approx)
        if w > 10 and h > 10 and w < rgb_image.shape[1] and h < rgb_image.shape[0]:
            filtered_bbox.append(np.array([xmin, ymin, xmin + w, ymin + h]))
    if self.interactive_mode is True:
        debug_image = cv2.cvtColor(opening.copy(), cv2.COLOR_GRAY2BGR)
        if len(self.roi_points) == 1:
            opening = cv2.rectangle(debug_image, self.roi_points[0],
                                    self.roi_points[0], (0, 255, 0), 3)
        elif len(self.roi_points) == 2:
            opening = cv2.rectangle(debug_image, self.roi_points[0],
                                    self.roi_points[1], (0, 255, 0), 3)
        cv2.rectangle(debug_image, (0, 0), (300, 40), (200, 200, 200), -1)
        cv2.putText(debug_image,
                    "Detector state : {}".format(DetectorState.state[self.state]),
                    (15, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
        cv2.putText(debug_image, "Select ROI & press 'r' to start",
                    (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
        cv2.imshow("select_roi", debug_image)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('r'):
            self.background_substraction = cv2.createBackgroundSubtractorMOG2(
                history=500, varThreshold=50, detectShadows=True)
            self.state = DetectorState.RUNNING
    if self.state == DetectorState.RUNNING:
        filtered_bbox = self.non_max_suppression(np.array(filtered_bbox), 0.5)
        for bbox in filtered_bbox:
            xmin, ymin, xmax, ymax = bbox
            w = int(xmax - xmin)
            h = int(ymax - ymin)
            x = int(xmin + w / 2.0)
            y = int(ymin + h / 2.0)
            if depth_image is not None:
                x = depth_image.shape[1] - 1 if x > depth_image.shape[1] - 1 else x
                y = depth_image.shape[0] - 1 if y > depth_image.shape[0] - 1 else y
                depth = depth_image[int(y)][int(x)] / 1000.0
                if math.isnan(depth) or depth == 0.0:
                    depth = None
            else:
                depth = None
            mask = opening[int(ymin):int(ymax), int(xmin):int(xmax)]
            output_dets.append(Detection(int(xmin), int(ymin), int(xmax), int(ymax),
                                         "thing", 0.4, mask=mask, depth=depth))
        return output_dets
    else:
        return []
def estimate(self, t, q, camera_info, target_position=None,
             occlusion_treshold=0.01, output_viz=True):
    perspective_timer = cv2.getTickCount()
    visible_detections = []
    rot = quaternion_matrix(q)
    trans = translation_matrix(t)
    if target_position is None:
        target = translation_matrix([0.0, 0.0, 1000.0])
        target_position = translation_from_matrix(np.dot(np.dot(trans, rot), target))
    view_matrix = p.computeViewMatrix(t, target_position, [0, 0, 1])
    width = HumanVisualModel.WIDTH
    height = HumanVisualModel.HEIGHT
    projection_matrix = p.computeProjectionMatrixFOV(
        HumanVisualModel.FOV, HumanVisualModel.ASPECT,
        HumanVisualModel.CLIPNEAR, HumanVisualModel.CLIPFAR)
    rendered_width = int(width / 3.0)
    rendered_height = int(height / 3.0)
    width_ratio = width / rendered_width
    height_ratio = height / rendered_height
    camera_image = p.getCameraImage(rendered_width, rendered_height,
                                    viewMatrix=view_matrix,
                                    renderer=p.ER_BULLET_HARDWARE_OPENGL,
                                    projectionMatrix=projection_matrix)
    rgb_image = cv2.resize(np.array(camera_image[2]), (width, height))
    depth_image = np.array(camera_image[3], np.float32).reshape(
        (rendered_height, rendered_width))
    far = HumanVisualModel.CLIPFAR
    near = HumanVisualModel.CLIPNEAR
    real_depth_image = far * near / (far - (far - near) * depth_image)
    if self.use_saliency is not False:
        saliency_map, saliency_heatmap = self.saliency_estimator.estimate(
            camera_image[2], real_depth_image)
        saliency_heatmap_resized = cv2.resize(saliency_heatmap, (width, height))
    mask_image = camera_image[4]
    unique, counts = np.unique(np.array(mask_image).flatten(),
                               return_counts=True)
    for sim_id, count in zip(unique, counts):
        if sim_id > 0:
            cv_mask = np.array(mask_image.copy())
            cv_mask[cv_mask != sim_id] = 0
            cv_mask[cv_mask == sim_id] = 255
            xmin, ymin, w, h = cv2.boundingRect(cv_mask.astype(np.uint8))
            visible_area = w * h + 1
            screen_area = rendered_width * rendered_height + 1
            if screen_area - visible_area == 0:
                confidence = 1.0
            else:
                confidence = visible_area / float(screen_area - visible_area)
            # TODO compute occlusion score as a ratio between visible 2d bbox and projected 2d bbox areas
            depth = real_depth_image[int(ymin + h / 2.0)][int(xmin + w / 2.0)]
            xmin = int(xmin * width_ratio)
            ymin = int(ymin * height_ratio)
            w = int(w * width_ratio)
            h = int(h * height_ratio)
            id = self.simulator.reverse_entity_id_map[sim_id]
            if confidence > occlusion_treshold:
                det = Detection(int(xmin), int(ymin), int(xmin + w), int(ymin + h),
                                id, confidence, depth=depth)
                visible_detections.append(det)
    # for subject_detection in visible_detections:
    #     for object_detection in visible_detections:
    #         if subject_detection != object_detection:
    #             pass  # TODO create inference batch for egocentric relation detection
    perspective_fps = cv2.getTickFrequency() / (cv2.getTickCount() -
                                                perspective_timer)
    if output_viz is True:
        viz_frame = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
        if self.use_saliency is not False:
            viz_frame = cv2.addWeighted(saliency_heatmap_resized, 0.4,
                                        viz_frame, 0.7, 0)
        cv2.rectangle(viz_frame, (0, 0), (250, 40), (200, 200, 200), -1)
        perspective_fps_str = "Perspective taking fps: {:0.1f}hz".format(perspective_fps)
        cv2.putText(viz_frame,
                    "Nb detections: {}".format(len(visible_detections)),
                    (5, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
        cv2.putText(viz_frame, perspective_fps_str, (5, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
        for detection in visible_detections:
            detection.draw(viz_frame, (0, 200, 0))
        return rgb_image, real_depth_image, visible_detections, viz_frame
    else:
        return rgb_image, real_depth_image, visible_detections
def observation_callback(self, bgr_image_msg): """ """ if self.nb_sample < self.max_samples: bgr_image = self.bridge.imgmsg_to_cv2(bgr_image_msg, "bgr8") rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB) detections = self.face_detector.detect(rgb_image) tracks = self.face_tracker.update(rgb_image, detections) self.facial_landmarks_estimator.estimate(rgb_image, tracks) self.head_pose_estimator.estimate(tracks) view_image = bgr_image.copy() biggest_face = None for track in tracks: if biggest_face is None: biggest_face = track else: if track.bbox.area() > biggest_face.bbox.area(): biggest_face = track if biggest_face is not None: biggest_face.bbox.draw(view_image, (0, 200, 0), 2) sample_uuid = str(uuid.uuid4()).replace("-", "") facial_landmarks = biggest_face.features[ "facial_landmarks"].to_array() l_eye_contours = biggest_face.features[ "facial_landmarks"].left_eye_contours() biggest_face.bbox.draw(view_image, (0, 200, 0), 2) xmin, ymin, w, h = cv2.boundingRect(l_eye_contours) l_eye_detected = h > MIN_EYE_PATCH_HEIGHT and w > MIN_EYE_PATCH_WIDTH if l_eye_detected: l_eye_mask = cv2.fillConvexPoly( np.zeros(rgb_image.shape[:2], dtype=np.uint8), l_eye_contours, 255)[ymin:ymin + h, xmin:xmin + w] l_eye_detection = Detection(xmin, ymin, xmin + w, ymin + h, "l_eye", 1.0, mask=l_eye_mask) w_margin = int((w * WIDTH_MARGIN / 2.0)) h_margin = int((h * HEIGHT_MARGIN / 2.0)) l_eye_patch = bgr_image[ymin - h_margin:ymin + h + h_margin, xmin - w_margin:xmin + w + w_margin] l_eye_detection.bbox.draw(view_image, (0, 200, 0), 2) r_eye_contours = biggest_face.features[ "facial_landmarks"].right_eye_contours() xmin, ymin, w, h = cv2.boundingRect(r_eye_contours) r_eye_detected = h > MIN_EYE_PATCH_HEIGHT and w > MIN_EYE_PATCH_WIDTH if r_eye_detected: r_eye_mask = cv2.fillConvexPoly( np.zeros(rgb_image.shape[:2], dtype=np.uint8), r_eye_contours, 255)[ymin:ymin + h, xmin:xmin + w] r_eye_detection = Detection(xmin, ymin, xmin + w, ymin + h, "r_eye", 1.0, mask=r_eye_mask) w_margin = int((w * WIDTH_MARGIN / 2.0)) h_margin = int((h * HEIGHT_MARGIN / 2.0)) r_eye_patch = bgr_image[ymin - h_margin:ymin + h + h_margin, xmin - w_margin:xmin + w + w_margin] r_eye_detection.bbox.draw(view_image, (0, 200, 0), 2) if l_eye_detected is True and r_eye_detected is True: l_eye_patch_resized = cv2.resize( l_eye_patch, (EYE_INPUT_WIDTH, EYE_INPUT_HEIGHT), interpolation=cv2.INTER_AREA) r_eye_patch_resized = cv2.resize( r_eye_patch, (EYE_INPUT_WIDTH, EYE_INPUT_HEIGHT), interpolation=cv2.INTER_AREA) view_image[0:EYE_INPUT_HEIGHT, 0:EYE_INPUT_WIDTH] = l_eye_patch_resized view_image[0:EYE_INPUT_HEIGHT, EYE_INPUT_WIDTH:EYE_INPUT_WIDTH * 2] = r_eye_patch_resized biggest_face.features["facial_landmarks"].draw( view_image, (0, 200, 0), 1) sample_uuid = str(uuid.uuid4()).replace("-", "") try: if self.record_headpose: if biggest_face.pose is not None: self.headpose_record.write( biggest_face.position().to_array(), biggest_face.rotation().to_array()) if self.record_faces: xmin = int(biggest_face.bbox.xmin) xmax = int(biggest_face.bbox.xmax) ymin = int(biggest_face.bbox.ymin) ymax = int(biggest_face.bbox.ymax) face_image = rgb_image[ymin:ymax, xmin:xmax] cv2.imwrite( self.data_path + "/" + sample_uuid + "-face.png", face_image, [int(cv2.IMWRITE_PNG_COMPRESSION), 9]) rospy.loginfo( "[gaze_recorder] sample: {} id: {}".format( self.nb_sample, sample_uuid)) if self.record_eyes: cv2.imwrite( self.data_path + "/" + sample_uuid + "-l_eye.png", l_eye_patch_resized, [int(cv2.IMWRITE_PNG_COMPRESSION), 9]) cv2.imwrite( self.data_path + "/" + 
sample_uuid + "-r_eye.png", r_eye_patch_resized, [int(cv2.IMWRITE_PNG_COMPRESSION), 9]) if self.record_landmarks: with open( self.data_path + "/" + sample_uuid + "-landmarks.txt", 'w') as file: facial_landmarks_str = str( facial_landmarks).replace("[", "") facial_landmarks_str = facial_landmarks_str.replace( "]", "") facial_landmarks_str = "-".join( facial_landmarks_str.split()).replace( "-", " ") file.write(facial_landmarks_str) self.nb_sample += 1 except Exception as e: rospy.logwarn("Exeption occured: {}".format(e)) msg = self.bridge.cv2_to_imgmsg(view_image, "bgr8") msg.header = bgr_image_msg.header self.view_publisher.publish(msg)
def observation_callback(self, bgr_image_msg, depth_image_msg=None):
    if self.camera_info is not None:
        perception_timer = cv2.getTickCount()
        bgr_image = self.bridge.imgmsg_to_cv2(bgr_image_msg, "bgr8")
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        if depth_image_msg is not None:
            depth_image = self.bridge.imgmsg_to_cv2(depth_image_msg)
        else:
            depth_image = None
        if self.publish_visualization_image is True:
            viz_frame = bgr_image
        image_height, image_width, _ = bgr_image.shape
        success, view_pose = self.get_pose_from_tf2(self.global_frame_id,
                                                    self.camera_frame_id)
        if success is not True:
            rospy.logwarn("[tabletop_perception] The camera sensor is not localized in world space (frame '{}'), please check if the sensor frame is published in /tf".format(self.global_frame_id))
        else:
            view_matrix = view_pose.transform()
            self.frame_count %= self.n_frame
            ######################################################
            # Detection
            ######################################################
            detections = []
            if depth_image is not None:
                if self.table_depth is None:
                    x = int(image_width * (1 / 2.0))
                    y = int(image_height * (3 / 4.0))
                    if not math.isnan(depth_image[y, x]):
                        self.table_depth = depth_image[y, x] / 1000.0
                table_depth = self.table_depth
            else:
                table_depth = None
            table_detection = Detection(0, int(image_height / 2.0),
                                        int(image_width), image_height,
                                        "table", 1.0, table_depth)
            if depth_image is not None:
                if self.table_shape is None:
                    fx = self.camera_matrix[0][0]
                    fy = self.camera_matrix[1][1]
                    cx = self.camera_matrix[0][2]
                    cy = self.camera_matrix[1][2]
                    x = int(image_width * (1 / 2.0))
                    y = int(image_height * (3 / 4.0))
                    if not math.isnan(depth_image[y, x]):
                        z = depth_image[y, x] / 1000.0
                        x = (x - cx) * z / fx
                        y = (y - cy) * z / fy
                        table_border = Vector6D(x=x, y=y, z=z)
                        x = int(image_width * (1 / 2.0))
                        y = int(image_height * (1 / 2.0))
                        if not math.isnan(depth_image[y, x]):
                            z = depth_image[y, x] / 1000.0
                            x = (x - cx) * z / fx
                            y = (y - cy) * z / fy
                            table_center = Vector6D(x=x, y=y, z=z)
                            table_length = (image_width - cx) * z / fx
                            table_width = 2.0 * math.sqrt(
                                pow(table_border.pos.x - table_center.pos.x, 2) +
                                pow(table_border.pos.y - table_center.pos.y, 2) +
                                pow(table_border.pos.z - table_center.pos.z, 2))
                            center_in_world = view_pose + table_center
                            real_z = center_in_world.position().z
                            print view_pose.position()
                            print center_in_world.position()
                            self.table_shape = Box(table_width, table_length, real_z)
                            self.table_shape.pose.pos.z = -real_z / 2.0
                            self.table_shape.color = self.shape_estimator.compute_dominant_color(
                                rgb_image, table_detection.bbox)
            if self.frame_count == 0:
                person_detections = self.detector.detect(rgb_image,
                                                         depth_image=depth_image)
                detections = person_detections
            elif self.frame_count == 1:
                object_detections = self.foreground_detector.detect(
                    rgb_image, depth_image=depth_image)
                detections = object_detections
            else:
                detections = []
            ####################################################################
            # Features estimation
            ####################################################################
            self.color_features_estimator.estimate(rgb_image, detections)
            ######################################################
            # Tracking
            ######################################################
            if self.frame_count == 0:
                object_tracks = self.object_tracker.update(
                    rgb_image, [], depth_image=depth_image)
                person_tracks = self.person_tracker.update(
                    rgb_image, person_detections, depth_image=depth_image)
            elif self.frame_count == 1:
                object_tracks = self.object_tracker.update(
                    rgb_image, object_detections, depth_image=depth_image)
                person_tracks = self.person_tracker.update(
                    rgb_image, [], depth_image=depth_image)
            else:
                object_tracks = self.object_tracker.update(
                    rgb_image, [], depth_image=depth_image)
                person_tracks = self.person_tracker.update(
                    rgb_image, [], depth_image=depth_image)
            if self.table_track is None:
                self.table_track = Track(table_detection, 1, 4, 20)
            else:
                self.table_track.update(table_detection)
            if self.table_shape is not None:
                self.table_track.shapes = [self.table_shape]
            table_track = [self.table_track]
            tracks = object_tracks + person_tracks + table_track
            ########################################################
            # Pose & Shape estimation
            ########################################################
            self.object_pose_estimator.estimate(
                object_tracks + person_tracks + table_track,
                view_matrix, self.camera_matrix, self.dist_coeffs)
            self.shape_estimator.estimate(rgb_image, tracks,
                                          self.camera_matrix, self.dist_coeffs)
            self.frame_count += 1
            ######################################################
            # Visualization of debug image
            ######################################################
            perception_fps = cv2.getTickFrequency() / (cv2.getTickCount() -
                                                       perception_timer)
            cv2.rectangle(viz_frame, (0, 0), (250, 40), (200, 200, 200), -1)
            perception_fps_str = "Perception fps : {:0.1f}hz".format(perception_fps)
            cv2.putText(viz_frame,
                        "Nb detections/tracks : {}/{}".format(len(detections),
                                                              len(tracks)),
                        (5, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
            cv2.putText(viz_frame, perception_fps_str, (5, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
            scene_changes = SceneChangesStamped()
            scene_changes.world = "tracks"
            header = bgr_image_msg.header
            header.frame_id = self.global_frame_id
            scene_changes.header = header
            for track in tracks:
                if self.publish_visualization_image is True:
                    track.draw(viz_frame, (230, 0, 120, 125), 1,
                               view_matrix, self.camera_matrix, self.dist_coeffs)
                if track.is_confirmed():
                    scene_node = track.to_msg(header)
                    if self.publish_tf is True:
                        if track.is_located() is True:
                            self.publish_tf_pose(scene_node.pose_stamped,
                                                 self.global_frame_id,
                                                 scene_node.id)
                    scene_changes.changes.nodes.append(scene_node)
            if self.publish_visualization_image is True:
                viz_img_msg = self.bridge.cv2_to_imgmsg(viz_frame)
                self.visualization_publisher.publish(viz_img_msg)
            self.tracks_publisher.publish(scene_changes)
def predict(self, rgb_image, depth_image=None): """ """ xmin = self.bbox.xmin ymin = self.bbox.ymin xmax = self.bbox.xmax ymax = self.bbox.ymax w = self.bbox.width() h = self.bbox.height() new_xmin = int(xmin - 2.0 * w) if int(xmin - 2.0 * w) > 0 else 0 new_ymin = int(ymin - 2.0 * h) if int(xmin - 2.0 * h) > 0 else 0 new_xmax = int(xmax + 2.0 * w) if int( xmax + 2.0 * w) < rgb_image.shape[1] else rgb_image.shape[1] new_ymax = int(ymax + 2.0 * h) if int( ymax + 2.0 * h) < rgb_image.shape[0] else rgb_image.shape[0] hsv_image = cv2.cvtColor( rgb_image[new_ymin:new_ymax, new_xmin:new_xmax], cv2.COLOR_RGB2HSV) dst = cv2.calcBackProject([hsv_image], [0], self.histogram, [0, 180], 1) track_window = xmin - new_xmin, ymin - new_ymin, w, h term_crit = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 20, 1) ret, track_window = cv2.CamShift(dst, track_window, term_crit) points = cv2.boxPoints(ret) points = np.int0(points) xmin, ymin, w, h = cv2.boundingRect(points) if h < 7 or w < 7: return False, None if xmin < 0 or ymin < 0: return False, None _, thresh_map = cv2.threshold(dst[ymin:ymin + h, xmin:xmin + w], 20, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) kernel_big = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6)) closing = cv2.morphologyEx(thresh_map, cv2.MORPH_CLOSE, kernel_big) opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel_small) mask = opening if depth_image is not None: x = depth_image.shape[1] - 1 if x > depth_image.shape[1] else x y = depth_image.shape[0] - 1 if y > depth_image.shape[0] else y depth = depth_image[int(y)][int(x)] / 1000.0 if math.isnan(depth) or depth == 0.0: depth = None else: depth = None pred = Detection(int(xmin), int(ymin), int(xmin + w), int(ymin + h), self.object_label, 0.6, depth=depth, mask=mask) # # cv2.normalize(dst, dst, 0, 255, cv2.NORM_MINMAX) # _, thresh_map = cv2.threshold(dst[ymin:ymin+h, xmin:xmin+w], 20, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) # kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) # kernel_big = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6)) # closing = cv2.morphologyEx(thresh_map, cv2.MORPH_CLOSE, kernel_big) # opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel_small) # refined_xmin, refined_ymin, w, h = cv2.boundingRect(opening) # # xmin += new_xmin + refined_xmin # ymin += new_ymin + refined_ymin # if w < 30 or h < 30: # return False, None # else: # x = int(xmin + w/2.0) # y = int(ymin + h/2.0) # if depth_image is not None: # x = depth_image.shape[1]-1 if x > depth_image.shape[1] else x # y = depth_image.shape[0]-1 if y > depth_image.shape[0] else y # depth = depth_image[int(y)][int(x)]/1000.0 # if math.isnan(depth) or depth == 0.0: # depth = None # else: # depth = None # if w < 30 or h < 30: # return False, None # mask = opening[int(ymin):int(ymin+h), int(xmin):int(xmin+w)] # pred = Detection(int(xmin), int(ymin), int(xmin+w), int(ymin+h), self.object_label, 0.6, depth=depth, mask=mask) # print pred.bbox return True, pred