Example #1
    def visualize_train_input(self, in_img, p, q, theta, z, roll, pitch):
        """Visualize the training input."""
        points = []
        colors = []
        height = in_img[:, :, 3]  # channel 3 holds the heightmap; channels 0-2 are RGB

        for i in range(in_img.shape[0]):
            for j in range(in_img.shape[1]):
                pixel = (i, j)
                position = utils.pixel_to_position(pixel, height, self.bounds,
                                                   self.pixel_size)
                points.append(position)
                colors.append(in_img[i, j, :3])

        points = np.array(points).T  # shape (3, N)
        colors = np.array(colors).T / 255.0  # shape (3, N)

        self.vis["pointclouds/scene"].set_object(
            g.PointCloud(position=points, color=colors))

        pick_position = utils.pixel_to_position(p, height, self.bounds,
                                                self.pixel_size)
        label = "pick"
        utils.make_frame(self.vis, label, h=0.05, radius=0.0012, o=0.1)

        pick_transform = np.eye(4)
        pick_transform[0:3, 3] = pick_position
        self.vis[label].set_transform(pick_transform)

        place_position = utils.pixel_to_position(q, height, self.bounds,
                                                 self.pixel_size)
        label = "place"
        utils.make_frame(self.vis, label, h=0.05, radius=0.0012, o=0.1)

        place_transform = np.eye(4)
        place_transform[0:3, 3] = place_position
        place_transform[2, 3] = z

        rotation = utils.get_pybullet_quaternion_from_rot(
            (roll, pitch, -theta))
        # PyBullet quaternions are ordered (x, y, z, w); the transformations
        # module expects (w, x, y, z), so reorder before building the matrix.
        quaternion_wxyz = np.asarray(
            [rotation[3], rotation[0], rotation[1], rotation[2]])

        place_transform[0:3, 0:3] = mtf.quaternion_matrix(quaternion_wxyz)[0:3, 0:3]
        self.vis[label].set_transform(place_transform)

        _, ax = plt.subplots(2, 1)
        ax[0].imshow(in_img.transpose(1, 0, 2)[:, :, :3] / 255.0)
        ax[0].scatter(p[0], p[1])
        ax[0].scatter(q[0], q[1])
        ax[1].imshow(in_img.transpose(1, 0, 2)[:, :, 3])
        ax[1].scatter(p[0], p[1])
        ax[1].scatter(q[0], q[1])
        plt.show()
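
All of these examples lean on utils.pixel_to_position to map an image pixel to a 3D workspace position. The snippet below is only a minimal sketch of what such a helper could look like, assuming `bounds` is a (3, 2) array of (min, max) workspace limits and `pixel_size` is meters per pixel; the actual utils implementation may differ.

import numpy as np

def pixel_to_position(pixel, height, bounds, pixel_size):
    """Map an image pixel (row, col) to a 3D workspace position.

    Assumes `bounds` is a (3, 2) array of (min, max) limits for x, y, z,
    `pixel_size` is meters per pixel, and `height` is the heightmap.
    """
    u, v = int(pixel[0]), int(pixel[1])
    x = bounds[0, 0] + u * pixel_size
    y = bounds[1, 0] + v * pixel_size
    z = bounds[2, 0] + height[u, v]
    return (x, y, z)

bounds = np.array([[0.25, 0.75], [-0.5, 0.5], [0.0, 0.3]])  # example workspace
heightmap = np.zeros((320, 160), dtype=np.float32)
print(pixel_to_position((160, 80), heightmap, bounds, pixel_size=0.003125))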
Example #2
    def random_pose_6dof(self, env, object_size):
        """Get random collision-free pose in workspace bounds for object."""
        plane_id = 1
        max_size = np.linalg.norm(object_size[0:2])
        erode_size = int(np.round(max_size / self.pixel_size))
        _, heightmap, object_mask = self.get_object_masks(env)

        # Sample freespace regions in workspace.
        mask = np.uint8(object_mask == plane_id)
        mask[0, :], mask[:, 0], mask[-1, :], mask[:, -1] = 0, 0, 0, 0
        mask = cv2.erode(mask, np.ones((erode_size, erode_size), np.uint8))
        if np.sum(mask) == 0:
            return  # No collision-free space left in the workspace.
        pixel = utils.sample_distribution(np.float32(mask))
        position = utils.pixel_to_position(pixel, heightmap, self.bounds,
                                           self.pixel_size)

        # Sample a height 3-13 cm above the table surface.
        z_above_table = (np.random.rand(1)[0] / 10) + 0.03

        position = (position[0], position[1],
                    object_size[2] / 2 + z_above_table)

        # Roll and pitch in [-pi/4, pi/4), yaw in [0, 2*pi).
        roll = (np.random.rand() - 0.5) * 0.5 * np.pi
        pitch = (np.random.rand() - 0.5) * 0.5 * np.pi
        yaw = np.random.rand() * 2 * np.pi
        rotation = utils.get_pybullet_quaternion_from_rot((roll, pitch, yaw))

        print(position, rotation)

        return position, rotation
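
The free-space sampling above relies on utils.sample_distribution to draw a pixel with probability proportional to the mask value. A minimal sketch under that assumption (not the actual utils code):

import numpy as np

def sample_distribution(prob):
    """Draw a pixel index with probability proportional to `prob`."""
    flat = prob.flatten()
    flat = flat / flat.sum()
    idx = np.random.choice(len(flat), p=flat)
    return np.array(np.unravel_index(idx, prob.shape))

mask = np.zeros((320, 160), np.float32)
mask[100:200, 40:120] = 1.0  # stand-in for the eroded free-space mask
pixel = sample_distribution(mask)
assert mask[pixel[0], pixel[1]] == 1.0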
Example #3
    def act(self, obs, info, compute_error=False, gt_act=None):
        """Run inference and return best action given visual observations."""

        act = {'camera_config': self.camera_config, 'primitive': None}
        if not obs:
            return act

        # Get heightmap from RGB-D images.
        colormap, heightmap = self.get_heightmap(obs, self.camera_config)

        # Concatenate color with depth; the heightmap is repeated three times
        # so that the depth channels match the three color channels.
        input_image = np.concatenate(
            (colormap, heightmap[Ellipsis, None], heightmap[Ellipsis, None],
             heightmap[Ellipsis, None]),
            axis=2)

        # Attention model forward pass.
        attention = self.attention_model.forward(input_image)
        argmax = np.argmax(attention)
        argmax = np.unravel_index(argmax, shape=attention.shape)
        p0_pixel = argmax[:2]
        p0_theta = argmax[2] * (2 * np.pi / attention.shape[2])

        # Transport model forward pass.
        transport = self.transport_model.forward(input_image, p0_pixel)
        _, z, roll, pitch = self.rpz_model.forward(input_image, p0_pixel)

        argmax = np.argmax(transport)
        argmax = np.unravel_index(argmax, shape=transport.shape)

        # Index into 3D discrete tensor, grab z, roll, pitch activations
        z_best = z[:, argmax[0], argmax[1], argmax[2]][Ellipsis, None]
        roll_best = roll[:, argmax[0], argmax[1], argmax[2]][Ellipsis, None]
        pitch_best = pitch[:, argmax[0], argmax[1], argmax[2]][Ellipsis, None]

        # Send through regressors for each of z, roll, pitch
        z_best = self.rpz_model.z_regressor(z_best)[0, 0]
        roll_best = self.rpz_model.roll_regressor(roll_best)[0, 0]
        pitch_best = self.rpz_model.pitch_regressor(pitch_best)[0, 0]

        p1_pixel = argmax[:2]
        p1_theta = argmax[2] * (2 * np.pi / transport.shape[2])

        # Pixels to end effector poses.
        p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds,
                                              self.pixel_size)
        p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds,
                                              self.pixel_size)

        p1_position = (p1_position[0], p1_position[1], z_best)

        p0_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p0_theta))
        p1_rotation = utils.get_pybullet_quaternion_from_rot(
            (roll_best, pitch_best, -p1_theta))

        if compute_error:
            gt_p0_position, gt_p0_rotation = gt_act['params']['pose0']
            gt_p1_position, gt_p1_rotation = gt_act['params']['pose1']

            gt_p0_pixel = np.array(
                utils.position_to_pixel(gt_p0_position, self.bounds,
                                        self.pixel_size))
            gt_p1_pixel = np.array(
                utils.position_to_pixel(gt_p1_position, self.bounds,
                                        self.pixel_size))

            self.p0_pixel_error(
                np.linalg.norm(gt_p0_pixel - np.array(p0_pixel)))
            self.p1_pixel_error(
                np.linalg.norm(gt_p1_pixel - np.array(p1_pixel)))

            gt_p0_theta = -np.float32(
                utils.get_rot_from_pybullet_quaternion(gt_p0_rotation)[2])
            gt_p1_theta = -np.float32(
                utils.get_rot_from_pybullet_quaternion(gt_p1_rotation)[2])

            self.p0_theta_error(
                abs((np.rad2deg(gt_p0_theta - p0_theta) + 180) % 360 - 180))
            self.p1_theta_error(
                abs((np.rad2deg(gt_p1_theta - p1_theta) + 180) % 360 - 180))

            return None

        return self.p0_p1_position_rotations_to_act(act, p0_position,
                                                    p0_rotation, p1_position,
                                                    p1_rotation)
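
The rotation-error metrics above wrap the angular difference into [-180, 180) degrees with the `(x + 180) % 360 - 180` idiom before taking the absolute value. A small self-contained check of that wrapping, with hypothetical angles:

import numpy as np

def wrapped_angle_error_deg(gt_theta, pred_theta):
    """Absolute angular error in degrees, wrapped into [0, 180]."""
    diff = np.rad2deg(gt_theta - pred_theta)
    return abs((diff + 180) % 360 - 180)

# A prediction that is off by exactly one full turn has zero error,
# and errors never exceed 180 degrees.
assert np.isclose(wrapped_angle_error_deg(0.1, 0.1 + 2 * np.pi), 0.0)
assert np.isclose(wrapped_angle_error_deg(0.0, np.pi), 180.0)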
Example #4
    def act(self, obs, info, goal=None):
        """Run inference and return best action given visual observations."""
        del info

        act = {'camera_config': self.camera_config, 'primitive': None}
        if not obs:
            return act

        # Get heightmap from RGB-D images.
        colormap, heightmap = self.get_heightmap(obs, self.camera_config)
        if goal is not None:
            colormap_g, heightmap_g = self.get_heightmap(
                goal, self.camera_config)

        # Concatenate color with depth images.
        input_image = self.concatenate_c_h(colormap, heightmap)

        # Make a goal image if needed, and for consistency stack with input.
        if self.use_goal_image:
            goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
            input_image = np.concatenate((input_image, goal_image), axis=2)
            assert input_image.shape[2] == 12, input_image.shape

        # Attention model forward pass.
        if self.use_goal_image:
            half = int(input_image.shape[2] / 2)
            input_only = input_image[:, :, :half]  # ignore goal portion
            attention = self.attention_model.forward(input_only)
        else:
            attention = self.attention_model.forward(input_image)
        argmax = np.argmax(attention)
        argmax = np.unravel_index(argmax, shape=attention.shape)
        p0_pixel = argmax[:2]
        p0_theta = argmax[2] * (2 * np.pi / attention.shape[2])

        # Transport model forward pass.
        if isinstance(self.transport_model, TransportGoal):
            half = int(input_image.shape[2] / 2)
            img_curr = input_image[:, :, :half]
            img_goal = input_image[:, :, half:]
            transport = self.transport_model.forward(img_curr, img_goal,
                                                     p0_pixel)
        else:
            transport = self.transport_model.forward(input_image, p0_pixel)

        argmax = np.argmax(transport)
        argmax = np.unravel_index(argmax, shape=transport.shape)

        p1_pixel = argmax[:2]
        p1_theta = argmax[2] * (2 * np.pi / transport.shape[2])

        # Pixels to end effector poses.
        p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds,
                                              self.pixel_size)
        p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds,
                                              self.pixel_size)

        p0_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p0_theta))
        p1_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p1_theta))

        return self.p0_p1_position_rotations_to_act(act, p0_position,
                                                    p0_rotation, p1_position,
                                                    p1_rotation)
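
As in the previous example, the pick pixel and rotation come from a single argmax over an (H, W, num_rotations) activation tensor. A toy illustration of that indexing with a random tensor (the shape and number of rotations are assumptions for illustration only):

import numpy as np

num_rotations = 36
attention = np.random.rand(320, 160, num_rotations)  # dummy activation map

flat_idx = np.argmax(attention)
u, v, rot_bin = np.unravel_index(flat_idx, attention.shape)
p0_pixel = (u, v)                                  # pick pixel (row, col)
p0_theta = rot_bin * (2 * np.pi / num_rotations)   # rotation bin -> radians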
Example #5
    def act(self, obs, info):
        """Run inference and return best action given visual observations."""
        del info

        act = {'camera_config': self.camera_config, 'primitive': None}
        if not obs:
            return act

        # Get heightmap from RGB-D images.
        colormap, heightmap = self.get_heightmap(obs, self.camera_config)

        # Concatenate color with depth; the heightmap is repeated three times
        # so that the depth channels match the three color channels.
        input_image = np.concatenate(
            (colormap, heightmap[Ellipsis, None], heightmap[Ellipsis, None],
             heightmap[Ellipsis, None]),
            axis=2)

        # Get top-k pixels from pick and place heatmaps.
        k = 100
        pick_heatmap = self.pick_model.forward(input_image,
                                               apply_softmax=True).squeeze()
        place_heatmap = self.place_model.forward(input_image,
                                                 apply_softmax=True).squeeze()
        descriptors = np.float32(self.match_model.forward(input_image))

        # V4 (current approach; earlier V1-V3 attempts are kept below, commented out).
        pick_heatmap = cv2.GaussianBlur(pick_heatmap, (49, 49), 0)
        place_heatmap = cv2.GaussianBlur(place_heatmap, (49, 49), 0)
        pick_topk = np.int32(
            np.unravel_index(
                np.argsort(pick_heatmap.reshape(-1))[-k:],
                pick_heatmap.shape)).T
        pick_pixel = pick_topk[-1, :]
        from skimage.feature import peak_local_max  # pylint: disable=g-import-not-at-top
        place_peaks = peak_local_max(place_heatmap, num_peaks=1)
        distances = np.ones((place_peaks.shape[0], self.num_rotations)) * 10
        pick_descriptor = descriptors[0, pick_pixel[0],
                                      pick_pixel[1], :].reshape(1, -1)
        for i in range(place_peaks.shape[0]):
            peak = place_peaks[i, :]
            place_descriptors = descriptors[:, peak[0], peak[1], :]
            distances[i, :] = np.linalg.norm(place_descriptors -
                                             pick_descriptor,
                                             axis=1)
        ibest = np.unravel_index(np.argmin(distances), shape=distances.shape)
        p0_pixel = pick_pixel
        p0_theta = 0
        p1_pixel = place_peaks[ibest[0], :]
        p1_theta = ibest[1] * (2 * np.pi / self.num_rotations)

        # # V3
        # pick_heatmap = cv2.GaussianBlur(pick_heatmap, (49, 49), 0)
        # place_heatmap = cv2.GaussianBlur(place_heatmap, (49, 49), 0)
        # pick_topk = np.int32(
        #     np.unravel_index(
        #         np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
        # place_topk = np.int32(
        #     np.unravel_index(
        #         np.argsort(place_heatmap.reshape(-1))[-k:],
        #         place_heatmap.shape)).T
        # pick_pixel = pick_topk[-1, :]
        # place_pixel = place_topk[-1, :]
        # pick_descriptor = descriptors[0, pick_pixel[0],
        #                               pick_pixel[1], :].reshape(1, -1)
        # place_descriptor = descriptors[:, place_pixel[0], place_pixel[1], :]
        # distances = np.linalg.norm(place_descriptor - pick_descriptor, axis=1)
        # irotation = np.argmin(distances)
        # p0_pixel = pick_pixel
        # p0_theta = 0
        # p1_pixel = place_pixel
        # p1_theta = irotation * (2 * np.pi / self.num_rotations)

        # # V2
        # pick_topk = np.int32(
        #     np.unravel_index(
        #         np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
        # place_topk = np.int32(
        #     np.unravel_index(
        #         np.argsort(place_heatmap.reshape(-1))[-k:],
        #         place_heatmap.shape)).T
        # pick_pixel = pick_topk[-1, :]
        # pick_descriptor = descriptors[0, pick_pixel[0],
        #                               pick_pixel[1], :].reshape(1, 1, 1, -1)
        # distances = np.linalg.norm(descriptors - pick_descriptor, axis=3)
        # distances = np.transpose(distances, [1, 2, 0])
        # max_distance = int(np.round(np.max(distances)))
        # for i in range(self.num_rotations):
        #   distances[:, :, i] = cv2.circle(distances[:, :, i],
        #                                   (pick_pixel[1], pick_pixel[0]), 50,
        #                                   max_distance, -1)
        # ibest = np.unravel_index(np.argmin(distances), shape=distances.shape)
        # p0_pixel = pick_pixel
        # p0_theta = 0
        # p1_pixel = ibest[:2]
        # p1_theta = ibest[2] * (2 * np.pi / self.num_rotations)

        # # V1
        # pick_topk = np.int32(
        #     np.unravel_index(
        #         np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
        # place_topk = np.int32(
        #     np.unravel_index(
        #         np.argsort(place_heatmap.reshape(-1))[-k:],
        #         place_heatmap.shape)).T
        # distances = np.zeros((k, k, self.num_rotations))
        # for ipick in range(k):
        #   pick_descriptor = descriptors[0, pick_topk[ipick, 0],
        #                                 pick_topk[ipick, 1], :].reshape(1, -1)
        #   for iplace in range(k):
        #     place_descriptors = descriptors[:, place_topk[iplace, 0],
        #                                     place_topk[iplace, 1], :]
        #     distances[ipick, iplace, :] = np.linalg.norm(
        #         place_descriptors - pick_descriptor, axis=1)
        # ibest = np.unravel_index(np.argmin(distances), shape=distances.shape)
        # p0_pixel = pick_topk[ibest[0], :]
        # p0_theta = 0
        # p1_pixel = place_topk[ibest[1], :]
        # p1_theta = ibest[2] * (2 * np.pi / self.num_rotations)

        # Pixels to end effector poses.
        p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds,
                                              self.pixel_size)
        p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds,
                                              self.pixel_size)
        p0_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p0_theta))
        p1_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p1_theta))

        act['primitive'] = 'pick_place'
        if self.task == 'sweeping':
            act['primitive'] = 'sweep'
        elif self.task == 'pushing':
            act['primitive'] = 'push'
        params = {
            'pose0': (p0_position, p0_rotation),
            'pose1': (p1_position, p1_rotation)
        }
        act['params'] = params
        return act
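
The V4 branch above picks the place rotation by comparing a pick descriptor against the place descriptors at every discrete rotation. A toy version of that matching step with random descriptor volumes (all shapes are assumptions for illustration):

import numpy as np

# Toy descriptor volumes: one (H, W, D) slice per discrete rotation.
num_rotations, h, w, d = 24, 320, 160, 16
descriptors = np.random.rand(num_rotations, h, w, d).astype(np.float32)

pick_pixel = (100, 80)
place_pixel = (200, 60)

pick_descriptor = descriptors[0, pick_pixel[0], pick_pixel[1], :].reshape(1, -1)
place_descriptors = descriptors[:, place_pixel[0], place_pixel[1], :]

# Distance between the pick descriptor and the place descriptor at each
# rotation; the best place rotation minimizes this distance.
distances = np.linalg.norm(place_descriptors - pick_descriptor, axis=1)
p1_theta = np.argmin(distances) * (2 * np.pi / num_rotations)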
Example #6

    def act(self, obs, info):
        """Run inference and return best action given visual observations."""
        self.pick_regression_model.set_batch_size(1)
        self.place_regression_model.set_batch_size(1)
        act = {'camera_config': self.camera_config, 'primitive': None}
        if not obs:
            return act

        # Get heightmap from RGB-D images.
        colormap, heightmap = self.get_heightmap(obs, self.camera_config)

        # Concatenate color with depth images.
        # input_image = np.concatenate((colormap,
        #                               heightmap[..., None],
        #                               heightmap[..., None],
        #                               heightmap[..., None]), axis=2)

        input_image = colormap[None, ...]

        # Regression pick model
        p0_yxtheta = self.pick_regression_model.forward(input_image)[0]  # unbatch
        p0_pixel = [int(p0_yxtheta[0]), int(p0_yxtheta[1])]
        p0_theta = p0_yxtheta[2]

        # Regression place model
        p1_yxtheta = self.place_regression_model.forward(input_image)[0]  # unbatch
        p1_pixel = [int(p1_yxtheta[0]), int(p1_yxtheta[1])]
        p1_theta = p1_yxtheta[2]

        # Clamp the predicted place pixel to the image bounds.
        if p1_pixel[0] < 0:
            p1_pixel[0] = 0
        if p1_pixel[0] > 319:
            p1_pixel[0] = 319

        if p1_pixel[1] < 0:
            p1_pixel[1] = 0
        if p1_pixel[1] > 159:
            p1_pixel[1] = 159

        # Pixels to end effector poses.
        p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds,
                                              self.pixel_size)
        p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds,
                                              self.pixel_size)

        p0_rotation = p.getQuaternionFromEuler((0, 0, -p0_theta))
        p1_rotation = p.getQuaternionFromEuler((0, 0, -p1_theta))

        act['primitive'] = 'pick_place'
        if self.task == 'sweeping':
            act['primitive'] = 'sweep'
        elif self.task == 'pushing':
            act['primitive'] = 'push'
        params = {
            'pose0': (p0_position, p0_rotation),
            'pose1': (p1_position, p1_rotation)
        }
        act['params'] = params
        self.pick_regression_model.set_batch_size(self.batch_size)
        self.place_regression_model.set_batch_size(self.batch_size)
        return act
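
The explicit if-statements above clamp the regressed place pixel to the image bounds. An equivalent, more compact sketch using np.clip, with a hypothetical raw regression output (the 320x160 limits are taken from the code above):

import numpy as np

p1_yxtheta = np.array([350.7, -3.2, 1.1])  # hypothetical raw regression output
p1_pixel = [int(np.clip(p1_yxtheta[0], 0, 319)),
            int(np.clip(p1_yxtheta[1], 0, 159))]
assert p1_pixel == [319, 0]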
Example #7

    def act(self, obs, info, debug_imgs=False, goal=None):
        """Run inference and return best action given visual observations.

        If goal-conditioned, provide `goal`. Both `obs` and `goal` have
        'color' and 'depth' keys, but `obs['color']` and `goal['color']` are
        of type list and np.array, respectively. This differs from training
        above, where both `obs` and `goal` are sampled from the dataset
        class, which loads both as np.arrays. Here, the `goal` still comes
        from the dataset, but `obs` comes from environment stepping, which
        returns lists. Wrap in np.array(...) to get these shapes:

        np.array(obs['color']) and goal['color']: (3, 480, 640, 3)
        np.array(obs['depth']) and goal['depth']: (3, 480, 640)
        """
        act = {'camera_config': self.camera_config, 'primitive': None}
        if not obs:
            return act

        # Get heightmap from RGB-D images.
        colormap, heightmap = self.get_heightmap(obs, self.camera_config)
        if goal is not None:
            colormap_g, heightmap_g = self.get_heightmap(goal, self.camera_config)

        # Concatenate color with depth images.
        input_image = self.concatenate_c_h(colormap, heightmap)

        # Make a goal image if needed, and for consistency stack with input.
        if self.use_goal_image:
            goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
            input_image = np.concatenate((input_image, goal_image), axis=2)
            assert input_image.shape[2] == 12, input_image.shape

        # Attention model forward pass.
        if self.attn_no_targ and self.use_goal_image:
            maxdim = int(input_image.shape[2] / 2)
            input_only = input_image[:, :, :maxdim]
            attention = self.attention_model.forward(input_only)
        else:
            attention = self.attention_model.forward(input_image)
        argmax = np.argmax(attention)
        argmax = np.unravel_index(argmax, shape=attention.shape)
        p0_pixel = argmax[:2]
        p0_theta = argmax[2] * (2 * np.pi / attention.shape[2])

        # Transport model forward pass.
        if isinstance(self.transport_model, TransportGoal):
            half = int(input_image.shape[2] / 2)
            img_curr = input_image[:, :, :half]
            img_goal = input_image[:, :, half:]
            transport = self.transport_model.forward(img_curr, img_goal, p0_pixel)
        else:
            transport = self.transport_model.forward(input_image, p0_pixel)
        argmax = np.argmax(transport)
        argmax = np.unravel_index(argmax, shape=transport.shape)
        p1_pixel = argmax[:2]
        p1_theta = argmax[2] * (2 * np.pi / transport.shape[2])

        # Pixels to end effector poses.
        p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds, self.pixel_size)
        p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds, self.pixel_size)
        p0_rotation = p.getQuaternionFromEuler((0, 0, -p0_theta))
        p1_rotation = p.getQuaternionFromEuler((0, 0, -p1_theta))

        act['primitive'] = 'pick_place'
        if self.task == 'sweeping':
            act['primitive'] = 'sweep'
        elif self.task == 'pushing':
            act['primitive'] = 'push'
        params = {'pose0': (p0_position, p0_rotation),
                  'pose1': (p1_position, p1_rotation)}
        act['params'] = params

        # Daniel: determine the task stage if applicable. (AND if loading only)
        if self.task in ['bag-items-easy', 'bag-items-hard', 'bag-color-goal']:
            self._determine_task_stage(p0_pixel, p1_pixel)

        # Daniel: only change is potentially returning more info.
        if debug_imgs:
            # FWIW, attention (320,160,1), and already has softmax applied.
            # Then the attention heat map will return a (160,320,3) image.
            # The transport also has softmax, and is shape (320,160,num_rot).
            # Thus, t_heat is actually a LIST of (160,320,3) shaped images.
            # (For forward passes, we apply the softmax to the attention and
            # transport tensors; for training we don't because the TensorFlow
            # cross entropy loss assumes it's applied before the softmax.)
            a_heat = self.attention_model.get_attention_heatmap(attention)
            t_heat = self.transport_model.get_transport_heatmap(transport)
            extras = {
                'input_c': cv2.cvtColor(colormap, cv2.COLOR_RGB2BGR),
                'attn_heat_bgr': a_heat,  # already converted to BGR
                'tran_heat_bgr': t_heat,  # already converted to BGR
                'tran_rot_argmax': argmax[2],
                'tran_p1_theta': p1_theta,
            }
            # Images by default should be vertically oriented. Can make
            # horizontal if we use .transpose(1,0,2).
            return act, extras
        else:
            return act
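
Every act variant ultimately returns a dict with 'camera_config', 'primitive', and 'params' keys. The sketch below shows one way a caller might consume it; `env`, `agent`, and the reset()/step(act) interface are assumptions made for illustration and not part of the code above.

def run_episode(agent, env, max_steps=10):
    """Hypothetical driver; assumes env follows a reset()/step(act) interface."""
    obs, info = env.reset(), None
    for _ in range(max_steps):
        act = agent.act(obs, info)
        if act['primitive'] is None:  # empty observation -> no usable action
            break
        obs, reward, done, info = env.step(act)
        if done:
            break
    return info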