Example #1
    def detect(self, rgb_image, depth_image=None):
        """ Detect frontal faces """
        face_detections = []

        faces = self.model(rgb_image, 0)

        for face in faces:
            if depth_image is not None:
                w = face.width()
                h = face.height()
                x = face.left() + w / 2.0
                y = face.top() + h / 2.0
                x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
                y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
                depth = depth_image[int(y)][int(x)] / 1000.0
                if math.isnan(depth) or depth == 0.0:
                    depth = None
            else:
                depth = None
            face_detections.append(
                Detection(face.left(),
                          face.top(),
                          face.right(),
                          face.bottom(),
                          "face",
                          1.0,
                          depth=depth))
        return face_detections
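
The bbox-center depth lookup above (clamp to the image border, convert millimeters to meters, discard NaN or zero readings) is repeated almost verbatim in the examples below. As a reference, here is a small standalone helper, not part of the original code, that captures the same logic assuming a millimeter-scaled depth image:

import math


def depth_at_bbox_center(depth_image, xmin, ymin, xmax, ymax):
    """ Return the depth (in meters) at the center of a bbox, or None if invalid """
    x = int(xmin + (xmax - xmin) / 2.0)
    y = int(ymin + (ymax - ymin) / 2.0)
    # clamp to the image border to avoid out-of-range indexing
    x = min(max(x, 0), depth_image.shape[1] - 1)
    y = min(max(y, 0), depth_image.shape[0] - 1)
    depth = depth_image[y, x] / 1000.0
    if math.isnan(depth) or depth == 0.0:
        return None
    return depth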
Example #2
 def predict(self, rgb_image, depth_image=None):
     """ """
     bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
     success, bbox = self.tracker.update(bgr_image)
     xmin, ymin, w, h = bbox
     xmin = 0 if xmin < 0 else xmin
     ymin = 0 if ymin < 0 else ymin
     x = int(xmin + w / 2.0)
     y = int(ymin + h / 2.0)
     if depth_image is not None:
         x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
         y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
         depth = depth_image[int(y)][int(x)] / 1000.0
         if math.isnan(depth) or depth == 0.0:
             depth = None
     else:
         depth = None
     prediction = Detection(xmin,
                            ymin,
                            xmin + w,
                            ymin + h,
                            self.object_label,
                            1.0,
                            depth=depth)
     return success, prediction
Example #3
 def predict(self, rgb_image):
     """ """
     bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
     success, bbox = self.medianflow_tracker.update(bgr_image)
     xmin, ymin, width, height = bbox
     xmin = 0 if xmin < 0 else xmin
     ymin = 0 if ymin < 0 else ymin
     prediction = Detection(xmin, ymin, xmin + width, ymin + height,
                            self.object_label, 1.0)
     return success, prediction
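
Both predict() methods above assume an OpenCV tracker that was initialized elsewhere with the object's starting bbox. A minimal sketch of that setup (assumed, not from the original code); depending on the OpenCV build, the MedianFlow factory lives either in the top-level cv2 namespace or under cv2.legacy:

import cv2


def create_medianflow_tracker(bgr_image, xmin, ymin, width, height):
    """ Create and initialize a MedianFlow tracker on the given bbox """
    try:
        tracker = cv2.TrackerMedianFlow_create()         # OpenCV 3.x / early 4.x contrib
    except AttributeError:
        tracker = cv2.legacy.TrackerMedianFlow_create()  # OpenCV >= 4.5 contrib
    tracker.init(bgr_image, (int(xmin), int(ymin), int(width), int(height)))
    return tracker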
Example #4
    def detect(self, rgb_image, depth_image=None):
        """ Detect an object based on the color """
        hsv_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV)

        # default HSV bounds (unused here: inRange relies on self.lower and
        # self.upper, which are assumed to be set in the constructor)
        lower = np.array([136, 87, 111], np.uint8)
        upper = np.array([180, 255, 255], np.uint8)

        color_mask = cv2.inRange(hsv_image, self.lower, self.upper)

        color_mask = cv2.dilate(color_mask, self.kernel)

        if self.debug_topics is True:
            self.pub.publish(self.bridge.cv2_to_imgmsg(color_mask))

        # find the bbox
        x, y, w, h = cv2.boundingRect(color_mask)
        xmin = x
        ymin = y
        xmax = xmin + w
        ymax = ymin + h

        if depth_image is not None:
            x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
            y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
            depth = depth_image[int(y)][int(x)] / 1000.0
            if math.isnan(depth) or depth == 0.0:
                depth = None
        else:
            depth = None
        mask = color_mask[int(ymin):int(ymax), int(xmin):int(xmax)]

        object_detection = Detection(int(xmin),
                                     int(ymin),
                                     int(xmin + w),
                                     int(ymin + h),
                                     self.color + "_thing",
                                     1.0,
                                     mask=mask,
                                     depth=depth)

        if object_detection.bbox.area() > 100:
            return [object_detection]
        else:
            return []
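
The color detector above relies on self.lower, self.upper, self.kernel and self.debug_topics being set beforehand. A plausible constructor (hypothetical names and values, reusing the HSV bounds that appear unused in the method body):

import numpy as np
import cv2


class ColorDetector(object):
    def __init__(self, color="red", debug_topics=False):
        self.color = color
        # HSV range of the target color (these values select reddish hues)
        self.lower = np.array([136, 87, 111], np.uint8)
        self.upper = np.array([180, 255, 255], np.uint8)
        # structuring element used to dilate the thresholded mask
        self.kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        self.debug_topics = debug_topics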
Example #5
    def get_camera_view(self,
                        camera_pose,
                        camera,
                        target_position=None,
                        occlusion_threshold=0.01,
                        rendering_ratio=1.0):
        visible_tracks = []
        rot = quaternion_matrix(camera_pose.quaternion())
        trans = translation_matrix(camera_pose.position().to_array().flatten())
        if target_position is None:
            target = translation_matrix([0.0, 0.0, 1000.0])
            target = translation_from_matrix(np.dot(np.dot(trans, rot),
                                                    target))
        else:
            target = target_position.position().to_array()
        view_matrix = p.computeViewMatrix(camera_pose.position().to_array(),
                                          target, [0, 0, 1])

        width = camera.width
        height = camera.height

        projection_matrix = p.computeProjectionMatrixFOV(
            camera.fov,
            float(width) / height, camera.clipnear, camera.clipfar)

        rendered_width = int(width / rendering_ratio)
        rendered_height = int(height / rendering_ratio)

        width_ratio = width / rendered_width
        height_ratio = height / rendered_height

        if self.use_gui is True:
            camera_image = p.getCameraImage(
                rendered_width,
                rendered_height,
                viewMatrix=view_matrix,
                renderer=p.ER_BULLET_HARDWARE_OPENGL,
                projectionMatrix=projection_matrix)
        else:
            camera_image = p.getCameraImage(rendered_width,
                                            rendered_height,
                                            viewMatrix=view_matrix,
                                            renderer=p.ER_TINY_RENDERER,
                                            projectionMatrix=projection_matrix)

        rgb_image = cv2.resize(np.array(camera_image[2]),
                               (width, height))[:, :, :3]
        depth_image = np.array(camera_image[3], np.float32).reshape(
            (rendered_height, rendered_width))

        far = camera.clipfar
        near = camera.clipnear
        real_depth_image = far * near / (far - (far - near) * depth_image)

        mask_image = camera_image[4]
        unique, counts = np.unique(np.array(mask_image).flatten(),
                                   return_counts=True)

        for sim_id, count in zip(unique, counts):
            if sim_id > 0:
                cv_mask = np.array(mask_image.copy())
                cv_mask[cv_mask != sim_id] = 0
                cv_mask[cv_mask == sim_id] = 255
                xmin, ymin, w, h = cv2.boundingRect(cv_mask.astype(np.uint8))
                mask = cv_mask[ymin:ymin + h, xmin:xmin + w]
                visible_area = w * h + 1
                screen_area = rendered_width * rendered_height + 1
                if screen_area - visible_area == 0:
                    confidence = 1.0
                else:
                    confidence = visible_area / float(screen_area -
                                                      visible_area)
                #TODO compute occlusion score as a ratio between visible 2d bbox and projected 2d bbox areas
                if confidence > occlusion_threshold:

                    depth = real_depth_image[int(ymin + h / 2.0)][int(xmin +
                                                                      w / 2.0)]
                    xmin = int(xmin * width_ratio)
                    ymin = int(ymin * height_ratio)
                    w = int(w * width_ratio)
                    h = int(h * height_ratio)

                    id = self.reverse_entity_id_map[sim_id]
                    scene_node = self.get_entity(id)

                    det = Detection(int(xmin),
                                    int(ymin),
                                    int(xmin + w),
                                    int(ymin + h),
                                    id,
                                    confidence,
                                    depth=depth,
                                    mask=mask)
                    track = SceneNode(detection=det)
                    track.static = scene_node.static
                    track.id = id
                    track.mask = det.mask
                    track.shapes = scene_node.shapes
                    track.pose = scene_node.pose
                    track.label = scene_node.label
                    track.description = scene_node.description
                    visible_tracks.append(track)
        return rgb_image, real_depth_image, visible_tracks
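
The depth buffer returned by p.getCameraImage() is non-linear; the snippet above linearizes it with far * near / (far - (far - near) * z_buffer). The same conversion as a standalone sketch (function name assumed):

import numpy as np


def linearize_depth_buffer(depth_buffer, near, far):
    """ Convert an OpenGL/pybullet depth buffer to metric depth values """
    depth_buffer = np.asarray(depth_buffer, dtype=np.float32)
    return far * near / (far - (far - near) * depth_buffer)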
Example #6
    def detect(self, rgb_image, depth_image=None, table_mask=None):
        """ Detect unknown objects
        """
        filtered_bbox = []
        output_dets = []

        h, w, _ = rgb_image.shape
        static_foreground_mask_full = np.zeros((h, w), dtype=np.uint8)

        bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

        bgr_image_cropped = bgr_image[int(h / 2.0):h, 0:int(w)]
        cropped_height, cropped_width, _ = bgr_image_cropped.shape

        foreground_mask = self.long_term_detector.apply(
            bgr_image_cropped, learningRate=LONGTERM_LEARNING_RATE)
        foreground_mask[foreground_mask != 255] = 0  # shadows suppression
        foreground_mask = cv2.morphologyEx(foreground_mask, cv2.MORPH_CLOSE,
                                           self.kernel)
        foreground_mask = cv2.morphologyEx(foreground_mask, cv2.MORPH_OPEN,
                                           self.kernel)
        foreground_mask = cv2.dilate(foreground_mask, self.kernel)

        motion_mask = self.short_term_detector.apply(
            bgr_image_cropped, learningRate=SHORTTERM_LEARNING_RATE)
        motion_mask[motion_mask != 255] = 0  # shadows suppression
        motion_mask = cv2.morphologyEx(motion_mask, cv2.MORPH_CLOSE,
                                       self.kernel)
        motion_mask = cv2.morphologyEx(motion_mask, cv2.MORPH_OPEN,
                                       self.kernel)
        motion_mask = cv2.dilate(motion_mask, self.kernel)

        not_moving_mask = cv2.bitwise_not(motion_mask)
        static_foreground_mask = cv2.bitwise_and(foreground_mask,
                                                 not_moving_mask)
        static_foreground_mask_full[int(h / 2.0):h,
                                    0:int(w)] = static_foreground_mask

        self.motion_mask = motion_mask
        self.foreground_mask = foreground_mask
        self.static_foreground_mask = static_foreground_mask_full

        # find the contours
        contours, _ = cv2.findContours(static_foreground_mask_full,
                                       cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        for c in contours:
            peri = cv2.arcLength(c, True)
            approx = cv2.approxPolyDP(c, 10e-3 * peri, True)
            xmin, ymin, w, h = cv2.boundingRect(approx)
            if w > 20 and h > 20 and w < rgb_image.shape[
                    1] and h < rgb_image.shape[0]:
                filtered_bbox.append(np.array([xmin, ymin, xmin + w,
                                               ymin + h]))

        filtered_bbox = self.non_max_suppression(np.array(filtered_bbox),
                                                 self.max_overlap)

        for bbox in filtered_bbox:
            xmin, ymin, xmax, ymax = bbox
            w = int(xmax - xmin)
            h = int(ymax - ymin)
            x = int(xmin + w / 2.0)
            y = int(ymin + h / 2.0)
            if depth_image is not None:
                x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
                y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
                depth = depth_image[int(y)][int(x)] / 1000.0
                if math.isnan(depth) or depth == 0.0:
                    depth = None
            else:
                depth = None
            mask = static_foreground_mask_full[int(ymin):int(ymax),
                                               int(xmin):int(xmax)]
            output_dets.append(
                Detection(int(xmin),
                          int(ymin),
                          int(xmax),
                          int(ymax),
                          "thing",
                          1.0,
                          mask=mask,
                          depth=depth))

        return output_dets
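
The detector above combines a slowly adapting long-term background model with a quickly adapting short-term one: pixels that belong to the long-term foreground but not to the motion mask are treated as newly placed, static objects. A hedged sketch of the assumed setup (learning rates, history lengths and kernel size are illustrative, not the original values):

import cv2

LONGTERM_LEARNING_RATE = 1e-6    # assumed: the long-term model adapts very slowly
SHORTTERM_LEARNING_RATE = 1e-2   # assumed: the short-term model adapts quickly

# detectShadows=True marks shadow pixels with a value != 255, which is why the
# detector above keeps only pixels equal to 255
long_term_detector = cv2.createBackgroundSubtractorMOG2(history=1000,
                                                        varThreshold=50,
                                                        detectShadows=True)
short_term_detector = cv2.createBackgroundSubtractorMOG2(history=50,
                                                         varThreshold=50,
                                                         detectShadows=True)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))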
Example #7
    def detect(self, rgb_image, depth_image=None):
        """
        """
        frame_resized = cv2.resize(rgb_image,
                                   (self.input_size, self.input_size),
                                   interpolation=cv2.INTER_AREA)

        self.model.setInput(
            cv2.dnn.blobFromImage(frame_resized, swapRB=self.swapRB))

        detections = self.model.forward()
        filtered_detections = []

        detection_per_class = {}
        score_per_class = {}

        rows = frame_resized.shape[0]
        cols = frame_resized.shape[1]

        height_factor = rgb_image.shape[0] / float(self.input_size)
        width_factor = rgb_image.shape[1] / float(self.input_size)

        for i in range(detections.shape[2]):
            class_id = int(detections[0, 0, i, 1])
            confidence = detections[0, 0, i, 2]
            if class_id in self.config:
                if self.config[class_id]["activated"] is True:
                    if confidence > self.config[class_id][
                            "confidence_threshold"]:

                        class_label = self.config[class_id]["label"]

                        xmin = int(detections[0, 0, i, 3] * cols)
                        ymin = int(detections[0, 0, i, 4] * rows)
                        xmax = int(detections[0, 0, i, 5] * cols)
                        ymax = int(detections[0, 0, i, 6] * rows)

                        xmin = int(width_factor * xmin)
                        ymin = int(height_factor * ymin)
                        xmax = int(width_factor * xmax)
                        ymax = int(height_factor * ymax)

                        xmin = 0 if xmin < 0 else xmin
                        ymin = 0 if ymin < 0 else ymin
                        xmax = rgb_image.shape[
                            1] - 1 if xmax > rgb_image.shape[1] - 1 else xmax
                        ymax = rgb_image.shape[
                            0] - 1 if ymax > rgb_image.shape[0] - 1 else ymax
                        if (xmax - xmin) > 20 and (ymax - ymin) > 20:
                            bbox = [xmin, ymin, xmax, ymax, confidence]
                            if class_label not in detection_per_class:
                                detection_per_class[class_label] = []
                            if class_label not in score_per_class:
                                score_per_class[class_label] = []
                            detection_per_class[class_label].append(bbox)

        for class_label, dets in detection_per_class.items():
            filtered_dets = self.non_max_suppression(np.array(dets),
                                                     self.max_overlap_ratio)
            for d in filtered_dets:
                if depth_image is not None:
                    w = d[2] - d[0]
                    h = d[3] - d[1]
                    x = d[0] + w / 2.0
                    y = d[1] + h / 2.0
                    x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
                    y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
                    depth = depth_image[int(y)][int(x)] / 1000.0
                    if math.isnan(depth) or depth == 0.0:
                        depth = None
                else:
                    depth = None
                filtered_detections.append(
                    Detection(d[0],
                              d[1],
                              d[2],
                              d[3],
                              class_label,
                              d[4],
                              depth=depth))

        return filtered_detections
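
Several examples call self.non_max_suppression(boxes, max_overlap) on arrays of [xmin, ymin, xmax, ymax] or [xmin, ymin, xmax, ymax, score] rows. The original implementation is not shown; a minimal standalone sketch of a greedy overlap suppression with the same interface:

import numpy as np


def non_max_suppression(boxes, max_overlap):
    """ Greedily drop boxes that overlap an already kept box too much """
    if len(boxes) == 0:
        return []
    boxes = np.asarray(boxes, dtype=np.float32)
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    scores = boxes[:, 4] if boxes.shape[1] > 4 else np.ones(len(boxes))
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(boxes[i])
        # overlap of the remaining boxes with the box just kept
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        overlap = inter / areas[order[1:]]
        order = order[np.where(overlap <= max_overlap)[0] + 1]
    return keep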
Example #8
    def detect(self, rgb_image, depth_image=None):
        """ """
        frame_resized = cv2.resize(rgb_image,
                                   self.input_size,
                                   interpolation=cv2.INTER_AREA)

        self.model.setInput(
            cv2.dnn.blobFromImage(frame_resized, swapRB=self.swapRB))

        (detections, masks) = self.model.forward(
            ["detection_out_final", "detection_masks"])
        filtered_detections = []

        rows = frame_resized.shape[0]
        cols = frame_resized.shape[1]

        # rgb_image.shape is (height, width) while input_size is the (width, height)
        # tuple passed to cv2.resize above, hence the index swap
        height_factor = rgb_image.shape[0] / float(self.input_size[1])
        width_factor = rgb_image.shape[1] / float(self.input_size[0])

        for i in range(0, detections.shape[2]):
            class_id = int(detections[0, 0, i, 1])
            confidence = detections[0, 0, i, 2]
            if class_id in self.config:
                if self.config[class_id]["activated"] is True:
                    if confidence > self.config[class_id][
                            "confidence_threshold"]:
                        class_label = self.config[class_id]["label"]

                        xmin = int(detections[0, 0, i, 3] * cols *
                                   width_factor)
                        ymin = int(detections[0, 0, i, 4] * rows *
                                   height_factor)
                        xmax = int(detections[0, 0, i, 5] * cols *
                                   width_factor)
                        ymax = int(detections[0, 0, i, 6] * rows *
                                   height_factor)

                        w = xmax - xmin
                        h = ymax - ymin

                        mask = masks[i, class_id]
                        mask = cv2.resize(mask, (w, h),
                                          interpolation=cv2.INTER_NEAREST)
                        mask = (mask > self.mask_treshold)

                        if depth_image is not None:
                            x = xmin + w / 2.0
                            y = ymin + h / 2.0
                            x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
                            y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
                            depth = depth_image[int(y)][int(x)] / 1000.0
                            if math.isnan(depth) or depth == 0.0:
                                depth = None
                        else:
                            depth = None

                        filtered_detections.append(
                            Detection(xmin,
                                      ymin,
                                      xmax,
                                      ymax,
                                      class_label,
                                      confidence,
                                      mask=mask,
                                      depth=depth))

        return filtered_detections
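
The detector in Example #8 requests the "detection_out_final" and "detection_masks" outputs, which correspond to OpenCV's DNN support for TensorFlow Mask R-CNN graphs. A hedged setup sketch (file names and values are placeholders, not the original configuration):

import cv2

# hypothetical paths to a TensorFlow Mask R-CNN frozen graph and the matching
# .pbtxt graph description expected by OpenCV
model = cv2.dnn.readNetFromTensorflow("frozen_inference_graph.pb",
                                      "mask_rcnn.pbtxt")
input_size = (800, 800)   # (width, height) tuple passed to cv2.resize
swapRB = True             # the network expects RGB-ordered input
mask_treshold = 0.3       # threshold applied to the per-pixel mask scores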
Example #9
    def perception_pipeline(self,
                            view_matrix,
                            rgb_image,
                            depth_image=None,
                            time=None):
        """ """
        ######################################################
        # Detection
        ######################################################

        detections = []

        image_height, image_width, _ = rgb_image.shape

        if depth_image is not None:
            if self.table_depth is None:
                x = int(image_width * (1 / 2.0))
                y = int(image_height * (3 / 4.0))
                if not math.isnan(depth_image[y, x]):
                    self.table_depth = depth_image[y, x] / 1000.0
            table_depth = self.table_depth
        else:
            table_depth = None
        table_detection = Detection(0, int(image_height / 2.0),
                                    int(image_width), image_height, "table",
                                    1.0, depth=table_depth)

        view_pose = Vector6D().from_transform(view_matrix)
        if depth_image is not None:
            if self.table_shape is None:
                fx = self.camera_matrix[0][0]
                fy = self.camera_matrix[1][1]
                cx = self.camera_matrix[0][2]
                cy = self.camera_matrix[1][2]
                x = int(image_width * (1 / 2.0))
                y = int(image_height * (3 / 4.0))
                if not math.isnan(depth_image[y, x]):
                    z = depth_image[y, x] / 1000.0
                    x = (x - cx) * z / fx
                    y = (y - cy) * z / fy
                    table_border = Vector6D(x=x, y=y, z=z)
                    x = int(image_width * (1 / 2.0))
                    y = int(image_height * (1 / 2.0))
                    if not math.isnan(depth_image[y, x]):
                        z = depth_image[y, x] / 1000.0
                        x = (x - cx) * z / fx
                        y = (y - cy) * z / fy
                        table_center = Vector6D(x=x, y=y, z=z)
                        table_length = (image_width - cx) * z / fx
                        table_width = 2.0 * math.sqrt(
                            pow(table_border.pos.x - table_center.pos.x, 2) +
                            pow(table_border.pos.y - table_center.pos.y, 2) +
                            pow(table_border.pos.z - table_center.pos.z, 2))
                        center_in_world = view_pose + table_center
                        real_z = center_in_world.position().z
                        print(center_in_world.position())
                        self.table_shape = Box(table_width, table_length,
                                               real_z)
                        self.table_shape.pose.pos.z = -real_z / 2.0
                        self.table_shape.color = self.shape_estimator.compute_dominant_color(
                            rgb_image, table_detection.bbox)

        if self.frame_count == 0:
            person_detections = self.person_detector.detect(
                rgb_image, depth_image=depth_image)
            detections = person_detections
        elif self.frame_count == 1:
            object_detections = self.foreground_detector.detect(
                rgb_image, depth_image=depth_image)
            detections = object_detections
        else:
            detections = []
        ####################################################################
        # Features estimation
        ####################################################################

        self.color_features_estimator.estimate(rgb_image, detections)

        ######################################################
        # Tracking
        ######################################################

        if self.frame_count == 0:
            object_tracks = self.object_tracker.update(rgb_image, [],
                                                       depth_image=depth_image)
            person_tracks = self.person_tracker.update(rgb_image,
                                                       person_detections,
                                                       depth_image=depth_image)
        elif self.frame_count == 1:
            object_tracks = self.object_tracker.update(rgb_image,
                                                       object_detections,
                                                       depth_image=depth_image)
            person_tracks = self.person_tracker.update(rgb_image, [],
                                                       depth_image=depth_image)
        else:
            object_tracks = self.object_tracker.update(rgb_image, [],
                                                       depth_image=depth_image)
            person_tracks = self.person_tracker.update(rgb_image, [],
                                                       depth_image=depth_image)

        if self.table_track is None:
            self.table_track = Track(table_detection, 1, 4, 20)
        else:
            self.table_track.update(table_detection)
        if self.table_shape is not None:
            self.table_track.shapes = [self.table_shape]
        support_tracks = [self.table_track]

        tracks = object_tracks + person_tracks + support_tracks

        ########################################################
        # Pose & Shape estimation
        ########################################################

        self.object_pose_estimator.estimate(
            object_tracks + person_tracks + support_tracks, view_matrix,
            self.camera_matrix, self.dist_coeffs)

        self.shape_estimator.estimate(rgb_image, tracks, self.camera_matrix,
                                      self.dist_coeffs)

        ########################################################
        # Monitoring
        ########################################################

        events = self.action_monitor.monitor(support_tracks, object_tracks,
                                             person_tracks, [])

        return tracks, events
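
The table-shape estimation above back-projects two pixels (a point on the lower table border and the image center) into camera space with the pinhole model x = (u - cx) * z / fx, y = (v - cy) * z / fy, and takes twice their distance as the table width. A standalone sketch of that back-projection (helper name assumed):

def backproject_pixel(u, v, z, camera_matrix):
    """ Back-project pixel (u, v) at metric depth z using the pinhole model """
    fx = camera_matrix[0][0]
    fy = camera_matrix[1][1]
    cx = camera_matrix[0][2]
    cy = camera_matrix[1][2]
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy
    return x, y, z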
Example #10
    def detect(self, rgb_image, depth_image=None, roi_points=[], prior_detections=[]):
        filtered_bbox = []
        output_dets = []

        h, w, _ = rgb_image.shape
        foreground_mask = np.zeros((h, w), dtype=np.uint8)

        bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
        foreground_mask[int(h/2.0):h, 0:int(w)] = self.background_substraction.apply(bgr_image[int(h/2.0):h, 0:int(w)], learningRate=10e-7)
        foreground_mask[foreground_mask != 255] = 0 # shadows suppression

        self.pub.publish(self.bridge.cv2_to_imgmsg(foreground_mask))

        for d in prior_detections:
            x = int(d.bbox.xmin)
            x = x - 5 if x > 5 else x
            y = int(d.bbox.ymin)
            y = y - 5 if y > 5 else y
            w = int(d.bbox.width())
            w = w + 5 if w + 5 < rgb_image.shape[1] else w
            h = int(d.bbox.height())
            h = h + 5 if h + 5 < rgb_image.shape[0] else h
            foreground_mask[y:y+h, x:x+w] = 0
        # remove the noise of the mask
        kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
        kernel_big = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6))
        closing = cv2.morphologyEx(foreground_mask, cv2.MORPH_CLOSE, kernel_small)
        opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel_big)

        if len(self.roi_points) == 2:
            roi_mask = np.full(foreground_mask.shape, 255, dtype="uint8")
            roi_mask[self.roi_points[0][1]:self.roi_points[1][1], self.roi_points[0][0]:self.roi_points[1][0]] = 0
            opening -= roi_mask

        opening[opening != 255] = 0
        # find the contours
        contours, hierarchy = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        for c in contours:
            peri = cv2.arcLength(c, True)
            approx = cv2.approxPolyDP(c, 10e-3 * peri, True)
            xmin, ymin, w, h = cv2.boundingRect(approx)
            if w > 10 and h > 10 and w < rgb_image.shape[1] and h < rgb_image.shape[0]:
                filtered_bbox.append(np.array([xmin, ymin, xmin+w, ymin+h]))

        if self.interactive_mode is True:
            debug_image = cv2.cvtColor(opening.copy(), cv2.COLOR_GRAY2BGR)
            if len(self.roi_points) == 1:
                cv2.rectangle(debug_image, self.roi_points[0],
                              self.roi_points[0], (0, 255, 0), 3)
            elif len(self.roi_points) == 2:
                cv2.rectangle(debug_image, self.roi_points[0],
                              self.roi_points[1], (0, 255, 0), 3)
            cv2.rectangle(debug_image, (0, 0), (300, 40), (200, 200, 200), -1)
            cv2.putText(debug_image, "Detector state : {}".format(DetectorState.state[self.state]), (15, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
            cv2.putText(debug_image, "Select ROI & press 'r' to start", (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
            cv2.imshow("select_roi", debug_image)
            key = cv2.waitKey(1) & 0xFF
            if key == ord('r'):
                self.background_substraction = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=50, detectShadows=True)
                self.state = DetectorState.RUNNING

        if self.state == DetectorState.RUNNING:
            filtered_bbox = self.non_max_suppression(np.array(filtered_bbox), 0.5)

            for bbox in filtered_bbox:
                xmin, ymin, xmax, ymax = bbox
                w = int(xmax - xmin)
                h = int(ymax - ymin)
                x = int(xmin + w/2.0)
                y = int(ymin + h/2.0)
                if depth_image is not None:
                    x = depth_image.shape[1]-1 if x >= depth_image.shape[1] else x
                    y = depth_image.shape[0]-1 if y >= depth_image.shape[0] else y
                    depth = depth_image[int(y)][int(x)]/1000.0
                    if math.isnan(depth) or depth == 0.0:
                        depth = None
                else:
                    depth = None
                mask = opening[int(ymin):int(ymax), int(xmin):int(xmax)]
                output_dets.append(Detection(int(xmin), int(ymin), int(xmax), int(ymax), "thing", 0.4, mask=mask, depth=depth))

            return output_dets
        else:
            return []
Example #11
    def estimate(self,
                 t,
                 q,
                 camera_info,
                 target_position=None,
                 occlusion_treshold=0.01,
                 output_viz=True):
        perspective_timer = cv2.getTickCount()
        visible_detections = []
        rot = quaternion_matrix(q)
        trans = translation_matrix(t)
        if target_position is None:
            target = translation_matrix([0.0, 0.0, 1000.0])
            target_position = translation_from_matrix(
                np.dot(np.dot(trans, rot), target))
        view_matrix = p.computeViewMatrix(t, target_position, [0, 0, 1])

        width = HumanVisualModel.WIDTH
        height = HumanVisualModel.HEIGHT

        projection_matrix = p.computeProjectionMatrixFOV(
            HumanVisualModel.FOV, HumanVisualModel.ASPECT,
            HumanVisualModel.CLIPNEAR, HumanVisualModel.CLIPFAR)

        rendered_width = int(width / 3.0)
        rendered_height = int(height / 3.0)

        width_ratio = width / rendered_width
        height_ratio = height / rendered_height

        camera_image = p.getCameraImage(rendered_width,
                                        rendered_height,
                                        viewMatrix=view_matrix,
                                        renderer=p.ER_BULLET_HARDWARE_OPENGL,
                                        projectionMatrix=projection_matrix)

        rgb_image = cv2.resize(np.array(camera_image[2]), (width, height))
        depth_image = np.array(camera_image[3], np.float32).reshape(
            (rendered_height, rendered_width))

        far = HumanVisualModel.CLIPFAR
        near = HumanVisualModel.CLIPNEAR
        real_depth_image = far * near / (far - (far - near) * depth_image)

        if self.use_saliency is not False:
            saliency_map, saliency_heatmap = self.saliency_estimator.estimate(
                camera_image[2], real_depth_image)
            saliency_heatmap_resized = cv2.resize(saliency_heatmap,
                                                  (width, height))

        mask_image = camera_image[4]
        unique, counts = np.unique(np.array(mask_image).flatten(),
                                   return_counts=True)

        for sim_id, count in zip(unique, counts):
            if sim_id > 0:
                cv_mask = np.array(mask_image.copy())
                cv_mask[cv_mask != sim_id] = 0
                cv_mask[cv_mask == sim_id] = 255
                xmin, ymin, w, h = cv2.boundingRect(cv_mask.astype(np.uint8))

                visible_area = w * h + 1
                screen_area = rendered_width * rendered_height + 1
                if screen_area - visible_area == 0:
                    confidence = 1.0
                else:
                    confidence = visible_area / float(screen_area -
                                                      visible_area)
                #TODO compute occlusion score as a ratio between visible 2d bbox and projected 2d bbox areas
                depth = real_depth_image[int(ymin + h / 2.0)][int(xmin +
                                                                  w / 2.0)]

                xmin = int(xmin * width_ratio)
                ymin = int(ymin * height_ratio)
                w = int(w * width_ratio)
                h = int(h * height_ratio)

                id = self.simulator.reverse_entity_id_map[sim_id]
                if confidence > occlusion_treshold:
                    det = Detection(int(xmin),
                                    int(ymin),
                                    int(xmin + w),
                                    int(ymin + h),
                                    id,
                                    confidence,
                                    depth=depth)
                    visible_detections.append(det)

        # for subject_detection in visible_detections:
        #     for object_detection in visible_detections:
        #         if subject_detection != object_detection:
        #             pass #TODO create inference batch for egocentric relation detection

        perspective_fps = cv2.getTickFrequency() / (cv2.getTickCount() -
                                                    perspective_timer)

        if output_viz is True:
            viz_frame = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
            if self.use_saliency is not False:
                viz_frame = cv2.addWeighted(saliency_heatmap_resized, 0.4,
                                            viz_frame, 0.7, 0)
            cv2.rectangle(viz_frame, (0, 0), (250, 40), (200, 200, 200), -1)
            perspective_fps_str = "Perspective taking fps: {:0.1f}hz".format(
                perspective_fps)
            cv2.putText(viz_frame,
                        "Nb detections: {}".format(len(visible_detections)),
                        (5, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
            cv2.putText(viz_frame, perspective_fps_str, (5, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
            for detection in visible_detections:
                detection.draw(viz_frame, (0, 200, 0))

            return rgb_image, real_depth_image, visible_detections, viz_frame
        else:
            return rgb_image, real_depth_image, visible_detections
Example #12
    def observation_callback(self, bgr_image_msg):
        """
        """
        if self.nb_sample < self.max_samples:
            bgr_image = self.bridge.imgmsg_to_cv2(bgr_image_msg, "bgr8")
            rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

            detections = self.face_detector.detect(rgb_image)

            tracks = self.face_tracker.update(rgb_image, detections)

            self.facial_landmarks_estimator.estimate(rgb_image, tracks)

            self.head_pose_estimator.estimate(tracks)

            view_image = bgr_image.copy()

            biggest_face = None
            for track in tracks:
                if biggest_face is None:
                    biggest_face = track
                else:
                    if track.bbox.area() > biggest_face.bbox.area():
                        biggest_face = track

            if biggest_face is not None:
                biggest_face.bbox.draw(view_image, (0, 200, 0), 2)
                sample_uuid = str(uuid.uuid4()).replace("-", "")

                facial_landmarks = biggest_face.features[
                    "facial_landmarks"].to_array()

                l_eye_contours = biggest_face.features[
                    "facial_landmarks"].left_eye_contours()
                biggest_face.bbox.draw(view_image, (0, 200, 0), 2)
                xmin, ymin, w, h = cv2.boundingRect(l_eye_contours)
                l_eye_detected = h > MIN_EYE_PATCH_HEIGHT and w > MIN_EYE_PATCH_WIDTH
                if l_eye_detected:
                    l_eye_mask = cv2.fillConvexPoly(
                        np.zeros(rgb_image.shape[:2], dtype=np.uint8),
                        l_eye_contours, 255)[ymin:ymin + h, xmin:xmin + w]
                    l_eye_detection = Detection(xmin,
                                                ymin,
                                                xmin + w,
                                                ymin + h,
                                                "l_eye",
                                                1.0,
                                                mask=l_eye_mask)
                    w_margin = int((w * WIDTH_MARGIN / 2.0))
                    h_margin = int((h * HEIGHT_MARGIN / 2.0))
                    l_eye_patch = bgr_image[ymin - h_margin:ymin + h +
                                            h_margin, xmin - w_margin:xmin +
                                            w + w_margin]
                    l_eye_detection.bbox.draw(view_image, (0, 200, 0), 2)

                r_eye_contours = biggest_face.features[
                    "facial_landmarks"].right_eye_contours()
                xmin, ymin, w, h = cv2.boundingRect(r_eye_contours)
                r_eye_detected = h > MIN_EYE_PATCH_HEIGHT and w > MIN_EYE_PATCH_WIDTH
                if r_eye_detected:
                    r_eye_mask = cv2.fillConvexPoly(
                        np.zeros(rgb_image.shape[:2], dtype=np.uint8),
                        r_eye_contours, 255)[ymin:ymin + h, xmin:xmin + w]
                    r_eye_detection = Detection(xmin,
                                                ymin,
                                                xmin + w,
                                                ymin + h,
                                                "r_eye",
                                                1.0,
                                                mask=r_eye_mask)
                    w_margin = int((w * WIDTH_MARGIN / 2.0))
                    h_margin = int((h * HEIGHT_MARGIN / 2.0))
                    r_eye_patch = bgr_image[ymin - h_margin:ymin + h +
                                            h_margin, xmin - w_margin:xmin +
                                            w + w_margin]
                    r_eye_detection.bbox.draw(view_image, (0, 200, 0), 2)

                if l_eye_detected is True and r_eye_detected is True:
                    l_eye_patch_resized = cv2.resize(
                        l_eye_patch, (EYE_INPUT_WIDTH, EYE_INPUT_HEIGHT),
                        interpolation=cv2.INTER_AREA)
                    r_eye_patch_resized = cv2.resize(
                        r_eye_patch, (EYE_INPUT_WIDTH, EYE_INPUT_HEIGHT),
                        interpolation=cv2.INTER_AREA)
                    view_image[0:EYE_INPUT_HEIGHT,
                               0:EYE_INPUT_WIDTH] = l_eye_patch_resized
                    view_image[0:EYE_INPUT_HEIGHT,
                               EYE_INPUT_WIDTH:EYE_INPUT_WIDTH *
                               2] = r_eye_patch_resized

                    biggest_face.features["facial_landmarks"].draw(
                        view_image, (0, 200, 0), 1)

                    sample_uuid = str(uuid.uuid4()).replace("-", "")

                    try:
                        if self.record_headpose:
                            if biggest_face.pose is not None:
                                self.headpose_record.write(
                                    biggest_face.position().to_array(),
                                    biggest_face.rotation().to_array())
                        if self.record_faces:
                            xmin = int(biggest_face.bbox.xmin)
                            xmax = int(biggest_face.bbox.xmax)
                            ymin = int(biggest_face.bbox.ymin)
                            ymax = int(biggest_face.bbox.ymax)
                            face_image = rgb_image[ymin:ymax, xmin:xmax]
                            cv2.imwrite(
                                self.data_path + "/" + sample_uuid +
                                "-face.png", face_image,
                                [int(cv2.IMWRITE_PNG_COMPRESSION), 9])
                        rospy.loginfo(
                            "[gaze_recorder] sample: {} id: {}".format(
                                self.nb_sample, sample_uuid))
                        if self.record_eyes:
                            cv2.imwrite(
                                self.data_path + "/" + sample_uuid +
                                "-l_eye.png", l_eye_patch_resized,
                                [int(cv2.IMWRITE_PNG_COMPRESSION), 9])
                            cv2.imwrite(
                                self.data_path + "/" + sample_uuid +
                                "-r_eye.png", r_eye_patch_resized,
                                [int(cv2.IMWRITE_PNG_COMPRESSION), 9])
                        if self.record_landmarks:
                            with open(
                                    self.data_path + "/" + sample_uuid +
                                    "-landmarks.txt", 'w') as file:
                                facial_landmarks_str = str(
                                    facial_landmarks).replace("[", "")
                                facial_landmarks_str = facial_landmarks_str.replace(
                                    "]", "")
                                facial_landmarks_str = "-".join(
                                    facial_landmarks_str.split()).replace(
                                        "-", " ")
                                file.write(facial_landmarks_str)
                        self.nb_sample += 1
                    except Exception as e:
                        rospy.logwarn("Exception occurred: {}".format(e))
            msg = self.bridge.cv2_to_imgmsg(view_image, "bgr8")
            msg.header = bgr_image_msg.header
            self.view_publisher.publish(msg)
Example #13
    def observation_callback(self, bgr_image_msg, depth_image_msg=None):
        if self.camera_info is not None:
            perception_timer = cv2.getTickCount()
            bgr_image = self.bridge.imgmsg_to_cv2(bgr_image_msg, "bgr8")
            rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
            if depth_image_msg is not None:
                depth_image = self.bridge.imgmsg_to_cv2(depth_image_msg)
            else:
                depth_image = None
            # the debug overlay below is drawn unconditionally, so keep a
            # reference to the BGR frame even when publishing is disabled
            viz_frame = bgr_image

            image_height, image_width, _ = bgr_image.shape

            success, view_pose = self.get_pose_from_tf2(
                self.global_frame_id, self.camera_frame_id)

            if success is not True:
                rospy.logwarn(
                    "[tabletop_perception] The camera sensor is not localized in world space (frame '{}'), please check if the sensor frame is published in /tf"
                    .format(self.global_frame_id))
            else:
                view_matrix = view_pose.transform()

                self.frame_count %= self.n_frame
                ######################################################
                # Detection
                ######################################################

                detections = []

                if depth_image is not None:
                    if self.table_depth is None:
                        x = int(image_width * (1 / 2.0))
                        y = int(image_height * (3 / 4.0))
                        if not math.isnan(depth_image[y, x]):
                            self.table_depth = depth_image[y, x] / 1000.0
                    table_depth = self.table_depth
                else:
                    table_depth = None
                table_detection = Detection(0, int(image_height / 2.0),
                                            int(image_width), image_height,
                                            "table", 1.0, depth=table_depth)

                if depth_image is not None:
                    if self.table_shape is None:
                        fx = self.camera_matrix[0][0]
                        fy = self.camera_matrix[1][1]
                        cx = self.camera_matrix[0][2]
                        cy = self.camera_matrix[1][2]
                        x = int(image_width * (1 / 2.0))
                        y = int(image_height * (3 / 4.0))
                        if not math.isnan(depth_image[y, x]):
                            z = depth_image[y, x] / 1000.0
                            x = (x - cx) * z / fx
                            y = (y - cy) * z / fy
                            table_border = Vector6D(x=x, y=y, z=z)
                            x = int(image_width * (1 / 2.0))
                            y = int(image_height * (1 / 2.0))
                            if not math.isnan(depth_image[y, x]):
                                z = depth_image[y, x] / 1000.0
                                x = (x - cx) * z / fx
                                y = (y - cy) * z / fy
                                table_center = Vector6D(x=x, y=y, z=z)
                                table_length = (image_width - cx) * z / fx
                                table_width = 2.0 * math.sqrt(
                                    pow(
                                        table_border.pos.x -
                                        table_center.pos.x, 2) + pow(
                                            table_border.pos.y -
                                            table_center.pos.y, 2) + pow(
                                                table_border.pos.z -
                                                table_center.pos.z, 2))
                                center_in_world = view_pose + table_center
                                real_z = center_in_world.position().z
                                print(view_pose.position())
                                print(center_in_world.position())
                                self.table_shape = Box(table_width,
                                                       table_length, real_z)
                                self.table_shape.pose.pos.z = -real_z / 2.0
                                self.table_shape.color = self.shape_estimator.compute_dominant_color(
                                    rgb_image, table_detection.bbox)

                if self.frame_count == 0:
                    person_detections = self.detector.detect(
                        rgb_image, depth_image=depth_image)
                    detections = person_detections
                elif self.frame_count == 1:
                    object_detections = self.foreground_detector.detect(
                        rgb_image, depth_image=depth_image)
                    detections = object_detections
                else:
                    detections = []
                ####################################################################
                # Features estimation
                ####################################################################

                self.color_features_estimator.estimate(rgb_image, detections)

                ######################################################
                # Tracking
                ######################################################

                if self.frame_count == 0:
                    object_tracks = self.object_tracker.update(
                        rgb_image, [], depth_image=depth_image)
                    person_tracks = self.person_tracker.update(
                        rgb_image, person_detections, depth_image=depth_image)
                elif self.frame_count == 1:
                    object_tracks = self.object_tracker.update(
                        rgb_image, object_detections, depth_image=depth_image)
                    person_tracks = self.person_tracker.update(
                        rgb_image, [], depth_image=depth_image)
                else:
                    object_tracks = self.object_tracker.update(
                        rgb_image, [], depth_image=depth_image)
                    person_tracks = self.person_tracker.update(
                        rgb_image, [], depth_image=depth_image)

                if self.table_track is None:
                    self.table_track = Track(table_detection, 1, 4, 20)
                else:
                    self.table_track.update(table_detection)
                if self.table_shape is not None:
                    self.table_track.shapes = [self.table_shape]
                table_track = [self.table_track]

                tracks = object_tracks + person_tracks + table_track

                ########################################################
                # Pose & Shape estimation
                ########################################################

                self.object_pose_estimator.estimate(
                    object_tracks + person_tracks + table_track, view_matrix,
                    self.camera_matrix, self.dist_coeffs)

                self.shape_estimator.estimate(rgb_image, tracks,
                                              self.camera_matrix,
                                              self.dist_coeffs)

                self.frame_count += 1
                ######################################################
                # Visualization of debug image
                ######################################################

                perception_fps = cv2.getTickFrequency() / (cv2.getTickCount() -
                                                           perception_timer)

                cv2.rectangle(viz_frame, (0, 0), (250, 40), (200, 200, 200),
                              -1)
                perception_fps_str = "Perception fps : {:0.1f}hz".format(
                    perception_fps)
                cv2.putText(
                    viz_frame, "Nb detections/tracks : {}/{}".format(
                        len(detections), len(tracks)), (5, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
                cv2.putText(viz_frame, perception_fps_str, (5, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

                scene_changes = SceneChangesStamped()
                scene_changes.world = "tracks"

                header = bgr_image_msg.header
                header.frame_id = self.global_frame_id
                scene_changes.header = header

                for track in tracks:
                    if self.publish_visualization_image is True:
                        track.draw(viz_frame, (230, 0, 120, 125), 1,
                                   view_matrix, self.camera_matrix,
                                   self.dist_coeffs)
                    if track.is_confirmed():
                        scene_node = track.to_msg(header)
                        if self.publish_tf is True:
                            if track.is_located() is True:
                                self.publish_tf_pose(scene_node.pose_stamped,
                                                     self.global_frame_id,
                                                     scene_node.id)
                        scene_changes.changes.nodes.append(scene_node)
                if self.publish_visualization_image is True:
                    viz_img_msg = self.bridge.cv2_to_imgmsg(viz_frame)
                    self.visualization_publisher.publish(viz_img_msg)

                self.tracks_publisher.publish(scene_changes)
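
The callback above expects an RGB image message and, optionally, a time-aligned depth image. A hypothetical skeleton (topic names and parameters assumed, not taken from the original node) showing how it could be wired up with message_filters:

import rospy
import message_filters
from sensor_msgs.msg import Image


class TabletopPerceptionNode(object):
    def __init__(self):
        # topic names are placeholders; the real node presumably reads them
        # from ROS parameters
        rgb_sub = message_filters.Subscriber("rgb_image", Image)
        depth_sub = message_filters.Subscriber("depth_image", Image)
        # deliver approximately time-aligned rgb/depth pairs to the callback
        sync = message_filters.ApproximateTimeSynchronizer(
            [rgb_sub, depth_sub], queue_size=10, slop=0.1)
        sync.registerCallback(self.observation_callback)

    def observation_callback(self, bgr_image_msg, depth_image_msg=None):
        pass  # see Example #13 above


if __name__ == "__main__":
    rospy.init_node("tabletop_perception")
    TabletopPerceptionNode()
    rospy.spin()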
Example #14
    def predict(self, rgb_image, depth_image=None):
        """ """
        xmin = self.bbox.xmin
        ymin = self.bbox.ymin
        xmax = self.bbox.xmax
        ymax = self.bbox.ymax
        w = self.bbox.width()
        h = self.bbox.height()

        new_xmin = int(xmin - 2.0 * w) if int(xmin - 2.0 * w) > 0 else 0
        new_ymin = int(ymin - 2.0 * h) if int(ymin - 2.0 * h) > 0 else 0
        new_xmax = int(xmax + 2.0 * w) if int(
            xmax + 2.0 * w) < rgb_image.shape[1] else rgb_image.shape[1]
        new_ymax = int(ymax + 2.0 * h) if int(
            ymax + 2.0 * h) < rgb_image.shape[0] else rgb_image.shape[0]

        hsv_image = cv2.cvtColor(
            rgb_image[new_ymin:new_ymax, new_xmin:new_xmax], cv2.COLOR_RGB2HSV)
        dst = cv2.calcBackProject([hsv_image], [0], self.histogram, [0, 180],
                                  1)

        track_window = xmin - new_xmin, ymin - new_ymin, w, h
        term_crit = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 20, 1)
        ret, track_window = cv2.CamShift(dst, track_window, term_crit)
        points = cv2.boxPoints(ret)
        points = np.int0(points)

        xmin, ymin, w, h = cv2.boundingRect(points)
        if h < 7 or w < 7:
            return False, None
        if xmin < 0 or ymin < 0:
            return False, None
        _, thresh_map = cv2.threshold(dst[ymin:ymin + h, xmin:xmin + w], 20,
                                      255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
        kernel_big = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6))
        closing = cv2.morphologyEx(thresh_map, cv2.MORPH_CLOSE, kernel_big)
        opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel_small)

        mask = opening

        # map the refined bbox back to full-image coordinates
        xmin += new_xmin
        ymin += new_ymin

        if depth_image is not None:
            x = int(xmin + w / 2.0)
            y = int(ymin + h / 2.0)
            x = depth_image.shape[1] - 1 if x >= depth_image.shape[1] else x
            y = depth_image.shape[0] - 1 if y >= depth_image.shape[0] else y
            depth = depth_image[int(y)][int(x)] / 1000.0
            if math.isnan(depth) or depth == 0.0:
                depth = None
        else:
            depth = None

        pred = Detection(int(xmin),
                         int(ymin),
                         int(xmin + w),
                         int(ymin + h),
                         self.object_label,
                         0.6,
                         depth=depth,
                         mask=mask)
        #
        # cv2.normalize(dst, dst, 0, 255, cv2.NORM_MINMAX)
        # _, thresh_map = cv2.threshold(dst[ymin:ymin+h, xmin:xmin+w], 20, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
        # kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
        # kernel_big = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6))
        # closing = cv2.morphologyEx(thresh_map, cv2.MORPH_CLOSE, kernel_big)
        # opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel_small)
        # refined_xmin, refined_ymin, w, h = cv2.boundingRect(opening)
        #
        # xmin += new_xmin + refined_xmin
        # ymin += new_ymin + refined_ymin
        # if w < 30 or h < 30:
        #     return False, None
        # else:
        #     x = int(xmin + w/2.0)
        #     y = int(ymin + h/2.0)
        #     if depth_image is not None:
        #         x = depth_image.shape[1]-1 if x > depth_image.shape[1] else x
        #         y = depth_image.shape[0]-1 if y > depth_image.shape[0] else y
        #         depth = depth_image[int(y)][int(x)]/1000.0
        #         if math.isnan(depth) or depth == 0.0:
        #             depth = None
        #     else:
        #         depth = None
        #     if w < 30 or h < 30:
        #         return False, None
        #     mask = opening[int(ymin):int(ymin+h), int(xmin):int(xmin+w)]
        #     pred = Detection(int(xmin), int(ymin), int(xmin+w), int(ymin+h), self.object_label, 0.6, depth=depth, mask=mask)
        #     print pred.bbox
        return True, pred
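
predict() above assumes that self.histogram was built once from the object's initial appearance. A plausible initialization (hypothetical helper, following the standard OpenCV CamShift recipe): a hue histogram of the initial bbox, normalized to [0, 255] so that calcBackProject yields a usable probability map.

import cv2
import numpy as np


def build_hue_histogram(rgb_image, xmin, ymin, xmax, ymax):
    """ Build the normalized hue histogram used by calcBackProject/CamShift """
    roi = rgb_image[int(ymin):int(ymax), int(xmin):int(xmax)]
    hsv_roi = cv2.cvtColor(roi, cv2.COLOR_RGB2HSV)
    # ignore dark or unsaturated pixels, which carry little hue information
    mask = cv2.inRange(hsv_roi, np.array((0., 60., 32.)),
                       np.array((180., 255., 255.)))
    histogram = cv2.calcHist([hsv_roi], [0], mask, [180], [0, 180])
    cv2.normalize(histogram, histogram, 0, 255, cv2.NORM_MINMAX)
    return histogram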