Example #1
    def train(self, dataset, num_iter, writer, validation_dataset=None):
        """Train on dataset for a specific number of iterations."""
        del validation_dataset

        for i in range(num_iter):
            obs, act, _ = dataset.random_sample()

            # Get heightmap from RGB-D images.
            configs = act['camera_config']
            colormap, heightmap = self.get_heightmap(obs, configs)

            # Get training labels from data sample.
            pose0, pose1 = act['params']['pose0'], act['params']['pose1']
            p0_position, p0_rotation = pose0[0], pose0[1]
            p0 = utils.position_to_pixel(p0_position, self.bounds,
                                         self.pixel_size)
            p0_theta = -np.float32(
                utils.get_rot_from_pybullet_quaternion(p0_rotation)[2])
            p1_position, p1_rotation = pose1[0], pose1[1]
            p1 = utils.position_to_pixel(p1_position, self.bounds,
                                         self.pixel_size)
            p1_theta = -np.float32(
                utils.get_rot_from_pybullet_quaternion(p1_rotation)[2])
            p1_theta = p1_theta - p0_theta
            p0_theta = 0

            # Concatenate color with depth images.
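            # The single-channel heightmap is repeated three times, so the
            # network input has 6 channels: RGB plus three copies of depth.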
            input_image = np.concatenate(
                (colormap, heightmap[Ellipsis, None],
                 heightmap[Ellipsis, None], heightmap[Ellipsis, None]),
                axis=2)

            # Do data augmentation (perturb rotation and translation).
            input_image, _, rounded_pixels, _ = utils.perturb(
                input_image, [p0, p1])
            p0, p1 = rounded_pixels

            # Compute training loss.
            loss0 = self.pick_model.train(input_image, p0, theta=0)
            loss1 = self.place_model.train(input_image, p1, theta=0)
            loss2 = self.match_model.train(input_image, p0, p1, theta=p1_theta)
            with writer.as_default():
                tf.summary.scalar('pick_loss',
                                  self.pick_model.metric.result(),
                                  step=self.total_iter + i)
                tf.summary.scalar('place_loss',
                                  self.place_model.metric.result(),
                                  step=self.total_iter + i)
                tf.summary.scalar('match_loss',
                                  self.match_model.metric.result(),
                                  step=self.total_iter + i)
            print(
                f'Train Iter: {self.total_iter + i} Loss: {loss0:.4f} {loss1:.4f} {loss2:.4f}'
            )

        self.total_iter += num_iter
        self.save()
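
The training labels above make the place rotation relative to the pick rotation (p1_theta -= p0_theta, then p0_theta = 0). A minimal, self-contained sketch of that convention, using made-up angles rather than values from a dataset:

import numpy as np

# Hypothetical illustration of the relative-rotation labels used above.
p0_theta = np.deg2rad(30.0)   # absolute pick yaw from a demonstration
p1_theta = np.deg2rad(80.0)   # absolute place yaw from a demonstration

p1_theta = p1_theta - p0_theta  # place rotation expressed relative to the pick
p0_theta = 0                    # pick rotation treated as the reference frame
print(np.rad2deg(p1_theta))     # -> 50.0 (up to floating point)
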
Example #2
    def _determine_task_stage(self, p0_position, p1_position):
        """Determines task stage for the bag-items tasks, for gt_state and gt_state_2_step.

        See agents/transporter.py for details. The ONLY difference here is that we have
        the positions and need to do a position-to-pixel conversion, which is trivial with
        our utility file. Otherwise, we follow the same procedure as in the transporter
        class, with the same set of drawbacks/caveats.
        """
        p0_pixel = utils.position_to_pixel(p0_position,
                                           bounds=self.bounds,
                                           pixel_size=self.pixel_size)
        p1_pixel = utils.position_to_pixel(p1_position,
                                           bounds=self.bounds,
                                           pixel_size=self.pixel_size)

        # Daniel: hack to clamp the pick pixel into the 320 x 160 heightmap bounds.
        p0_x = min(max(p0_pixel[0], 0), 319)
        p0_y = min(max(p0_pixel[1], 0), 159)
        p0_pixel = (int(p0_x), int(p0_y))

        real_task = self.real_task  # assume we assigned this.
        colormap, heightmap, object_mask = real_task.get_object_masks(
            real_task.env)

        # Debug visualization (disabled): dump the object mask with pick/place pixels drawn.
        if False:
            nb = len([x for x in os.listdir('.') if '.png' in x])
            mask = np.array(object_mask / np.max(object_mask) * 255).astype(
                np.uint8)
            mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)  # debugging
            p0 = (p0_pixel[1], p0_pixel[0])
            p1 = (p1_pixel[1], p1_pixel[0])
            cv2.circle(mask, p0, radius=3, color=(255, 0, 255), thickness=-1)
            cv2.circle(mask, p1, radius=3, color=(255, 255, 0), thickness=-1)
            cv2.imwrite(f'mask_{nb}.png', mask)

        # Copied from ravens/agents/transporter.py.
        if self.task in ['bag-items-easy', 'bag-items-hard']:
            if object_mask[p0_pixel] in [38, 39]:
                real_task.task_stage = 2
            elif real_task.task_stage == 2:
                real_task.task_stage = 3
        elif self.task in ['bag-color-goal']:
            if object_mask[p0_pixel] == real_task.single_block_ID:
                real_task.task_stage = 2
            else:
                real_task.task_stage = 1
        else:
            raise NotImplementedError(self.task)
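
The in-bounds "hack" above clamps only the pick pixel, with the 320 x 160 heightmap size hard-coded. A small sketch of the same clamping written as a reusable helper (hypothetical; not part of the example above):

import numpy as np

def clamp_pixel(pixel, height=320, width=160):
    """Clamp a (row, col) pixel into a height x width heightmap."""
    r = int(np.clip(pixel[0], 0, height - 1))
    c = int(np.clip(pixel[1], 0, width - 1))
    return (r, c)

print(clamp_pixel((350, -4)))  # -> (319, 0)
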
Example #3
    def act(self, obs, info, compute_error=False, gt_act=None):
        """Run inference and return best action given visual observations."""

        act = {'camera_config': self.camera_config, 'primitive': None}
        if not obs:
            return act

        # Get heightmap from RGB-D images.
        colormap, heightmap = self.get_heightmap(obs, self.camera_config)

        # Concatenate color with depth images.
        input_image = np.concatenate(
            (colormap, heightmap[Ellipsis, None], heightmap[Ellipsis, None],
             heightmap[Ellipsis, None]),
            axis=2)

        # Attention model forward pass.
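        # The attention output has shape (H, W, n_rotations); the flattened
        # argmax below is unraveled into a pick pixel (first two indices) and
        # a discrete rotation bin (last index).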
        attention = self.attention_model.forward(input_image)
        argmax = np.argmax(attention)
        argmax = np.unravel_index(argmax, shape=attention.shape)
        p0_pixel = argmax[:2]
        p0_theta = argmax[2] * (2 * np.pi / attention.shape[2])

        # Transport model forward pass.
        transport = self.transport_model.forward(input_image, p0_pixel)
        _, z, roll, pitch = self.rpz_model.forward(input_image, p0_pixel)

        argmax = np.argmax(transport)
        argmax = np.unravel_index(argmax, shape=transport.shape)

        # Index into 3D discrete tensor, grab z, roll, pitch activations
        z_best = z[:, argmax[0], argmax[1], argmax[2]][Ellipsis, None]
        roll_best = roll[:, argmax[0], argmax[1], argmax[2]][Ellipsis, None]
        pitch_best = pitch[:, argmax[0], argmax[1], argmax[2]][Ellipsis, None]

        # Send through regressors for each of z, roll, pitch
        z_best = self.rpz_model.z_regressor(z_best)[0, 0]
        roll_best = self.rpz_model.roll_regressor(roll_best)[0, 0]
        pitch_best = self.rpz_model.pitch_regressor(pitch_best)[0, 0]

        p1_pixel = argmax[:2]
        p1_theta = argmax[2] * (2 * np.pi / transport.shape[2])

        # Pixels to end effector poses.
        p0_position = utils.pixel_to_position(p0_pixel, heightmap, self.bounds,
                                              self.pixel_size)
        p1_position = utils.pixel_to_position(p1_pixel, heightmap, self.bounds,
                                              self.pixel_size)

        p1_position = (p1_position[0], p1_position[1], z_best)

        p0_rotation = utils.get_pybullet_quaternion_from_rot((0, 0, -p0_theta))
        p1_rotation = utils.get_pybullet_quaternion_from_rot(
            (roll_best, pitch_best, -p1_theta))

        if compute_error:
            gt_p0_position, gt_p0_rotation = gt_act['params']['pose0']
            gt_p1_position, gt_p1_rotation = gt_act['params']['pose1']

            gt_p0_pixel = np.array(
                utils.position_to_pixel(gt_p0_position, self.bounds,
                                        self.pixel_size))
            gt_p1_pixel = np.array(
                utils.position_to_pixel(gt_p1_position, self.bounds,
                                        self.pixel_size))

            self.p0_pixel_error(
                np.linalg.norm(gt_p0_pixel - np.array(p0_pixel)))
            self.p1_pixel_error(
                np.linalg.norm(gt_p1_pixel - np.array(p1_pixel)))

            gt_p0_theta = -np.float32(
                utils.get_rot_from_pybullet_quaternion(gt_p0_rotation)[2])
            gt_p1_theta = -np.float32(
                utils.get_rot_from_pybullet_quaternion(gt_p1_rotation)[2])

            self.p0_theta_error(
                abs((np.rad2deg(gt_p0_theta - p0_theta) + 180) % 360 - 180))
            self.p1_theta_error(
                abs((np.rad2deg(gt_p1_theta - p1_theta) + 180) % 360 - 180))

            return None

        return self.p0_p1_position_rotations_to_act(act, p0_position,
                                                    p0_rotation, p1_position,
                                                    p1_rotation)
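
The theta errors above use the expression abs((np.rad2deg(diff) + 180) % 360 - 180), which wraps an angular difference into [-180, 180] degrees before taking the absolute value. A small standalone check of that formula (the helper name is illustrative, not part of the agent above):

import numpy as np

def angle_error_deg(gt_theta, pred_theta):
    """Absolute angular error in degrees, wrapped into [0, 180]."""
    return abs((np.rad2deg(gt_theta - pred_theta) + 180) % 360 - 180)

# 350 degrees vs. 10 degrees differ by 20 degrees, not 340.
print(angle_error_deg(np.deg2rad(350), np.deg2rad(10)))  # -> 20.0 (up to floating point)
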
Example #4
    def get_data_batch(self, dataset, augment=True):
        """Use dataset to extract and preprocess data.

        Supports adding a goal image, in which case the current and goal
        images are stacked together channel-wise (first 6 channels for the
        current image, last 6 for the goal) before doing data augmentation,
        to ensure consistency.

        Args:
          dataset: a ravens.Dataset (train or validation).
          augment: if True, perform data augmentation.

        Returns:
          A tuple of data for training:
            (input_image, p0, p0_theta, p1, p1_theta)
          The tuple additionally includes (z, roll, pitch) if self.six_dof.
          If self.use_goal_image, the goal image is stacked with the current
          image in `input_image`; splitting current and goal images, if
          desired, should be done outside this method.
        """
        if self.use_goal_image:
            obs, act, _, goal = dataset.random_sample(goal_images=True)
        else:
            obs, act, _ = dataset.random_sample()

        # Get heightmap from RGB-D images, including goal images if specified.
        configs = act['camera_config']
        colormap, heightmap = self.get_heightmap(obs, configs)
        if self.use_goal_image:
            colormap_g, heightmap_g = self.get_heightmap(goal, configs)

        # Get training labels from data sample.
        pose0, pose1 = act['params']['pose0'], act['params']['pose1']
        p0_position, p0_rotation = pose0[0], pose0[1]
        p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
        p0_theta = -np.float32(
            utils.get_rot_from_pybullet_quaternion(p0_rotation)[2])
        p1_position, p1_rotation = pose1[0], pose1[1]
        p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
        p1_theta = -np.float32(
            utils.get_rot_from_pybullet_quaternion(p1_rotation)[2])

        # Concatenate color with depth images.
        input_image = self.concatenate_c_h(colormap, heightmap)

        # If using goal image, stack _with_ input_image before data augmentation.
        if self.use_goal_image:
            goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
            input_image = np.concatenate((input_image, goal_image), axis=2)
            assert input_image.shape[2] == 12, input_image.shape

        # Do data augmentation (perturb rotation and translation).
        if augment:
            input_image, _, rounded_pixels, transform_params = utils.perturb(
                input_image, [p0, p1])
            p0, p1 = rounded_pixels

        if self.six_dof:
            if not augment:
                transform_params = None
            p0_theta, p1_theta, z, roll, pitch = self.get_six_dof(
                transform_params, heightmap, pose0, pose1, augment=augment)
            return input_image, p0, p0_theta, p1, p1_theta, z, roll, pitch
        else:
            # If using a goal image, it is stacked with `input_image` and split later.
            p1_theta = p1_theta - p0_theta
            p0_theta = 0
            return input_image, p0, p0_theta, p1, p1_theta
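
The docstring above notes that, when self.use_goal_image is set, splitting the stacked current and goal images should be done outside this method. A minimal sketch of that split, assuming 6 channels each (hypothetical helper, not part of the class above):

import numpy as np

def split_current_goal(input_image):
    """Split a channel-stacked image into current and goal halves."""
    half = input_image.shape[2] // 2
    return input_image[:, :, :half], input_image[:, :, half:]

img_curr, img_goal = split_current_goal(np.zeros((320, 160, 12), np.float32))
print(img_curr.shape, img_goal.shape)  # -> (320, 160, 6) (320, 160, 6)
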
    def train(self, dataset, num_iter, writer):
        """Train on dataset for a specific number of iterations."""
        @tf.function
        def pick_train_step(model, optim, in_tensor, yxtheta, loss_criterion):
            with tf.GradientTape() as tape:
                output = model(in_tensor)
                loss = loss_criterion(yxtheta, output)
            grad = tape.gradient(loss, model.trainable_variables)
            optim.apply_gradients(zip(grad, model.trainable_variables))
            return loss

        @tf.function
        def place_train_step(model, optim, in_tensor, yxtheta, loss_criterion):
            with tf.GradientTape() as tape:
                output = model(in_tensor)
                loss = loss_criterion(yxtheta, output)
            grad = tape.gradient(loss, model.trainable_variables)
            optim.apply_gradients(zip(grad, model.trainable_variables))
            return loss

        for i in range(num_iter):
            start = time.time()

            input_images, p0s, p0_thetas = [], [], []
            p1s, p1_thetas = [], []
            for _ in range(self.batch_size):
                obs, act, info = dataset.random_sample()

                # Get heightmap from RGB-D images.
                configs = act['camera_config']
                colormap, heightmap = self.get_heightmap(obs, configs)
                #self.show_images(colormap, heightmap)

                # Get training labels from data sample.

                # (spatially distributed on object) get actions from oracle distribution
                #pose0, pose1 = act['params']['pose0'], act['params']['pose1']

                # (identical object location) get actions from object poses
                l_object = info[4]
                pose0 = l_object[0], l_object[1]
                l_target = info[5]
                pose1 = l_target[0], l_target[1]

                p0_position, p0_rotation = pose0[0], pose0[1]
                p0 = utils.position_to_pixel(p0_position, self.bounds,
                                             self.pixel_size)
                p0_theta = -np.float32(
                    p.getEulerFromQuaternion(p0_rotation)[2])
                p1_position, p1_rotation = pose1[0], pose1[1]
                p1 = utils.position_to_pixel(p1_position, self.bounds,
                                             self.pixel_size)
                p1_theta = -np.float32(
                    p.getEulerFromQuaternion(p1_rotation)[2])

                # to make it relative
                # p1_theta = p1_theta - p0_theta
                # p0_theta = 0

                p1_xytheta = np.array(
                    [p1_position[0], p1_position[1], p1_theta])

                # Concatenate color with depth images.
                # input_image = np.concatenate((colormap,
                #                               heightmap[..., None],
                #                               heightmap[..., None],
                #                               heightmap[..., None]), axis=2)

                input_image = colormap

                input_images.append(input_image)
                p0s.append(p0)
                p0_thetas.append(p0_theta)
                p1s.append(p1)
                p1_thetas.append(p1_theta)

            input_image = np.array(input_images)
            p0 = np.array(p0s)
            p0_theta = np.array(p0_thetas)
            p1 = np.array(p1s)
            p1_theta = np.array(p1_thetas)

            # Compute train loss - regression pick
            loss0 = self.pick_regression_model.train_pick(
                input_image, p0, p0_theta, pick_train_step)
            with writer.as_default():
                tf.summary.scalar('pick_loss',
                                  self.pick_regression_model.metric.result(),
                                  step=self.total_iter + i)

            # Compute train loss - regression place
            loss1 = self.place_regression_model.train_pick(
                input_image, p1, p1_theta, place_train_step)
            with writer.as_default():
                tf.summary.scalar('place_loss',
                                  self.place_regression_model.metric.result(),
                                  step=self.total_iter + i)

            #loss1 = 0.0
            print(
                f'Train Iter: {self.total_iter + i} Loss: {loss0:.4f} {loss1:.4f} Iter time:',
                time.time() - start)

        self.total_iter += num_iter
        self.save()
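
The @tf.function train steps defined at the top of this method follow the standard TF2 custom-training-loop pattern: a forward pass under a GradientTape, then apply_gradients. A self-contained sketch of the same pattern with a toy Keras model (all names below are illustrative and not part of the agent above):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(3)])
optim = tf.keras.optimizers.Adam(1e-4)
loss_fn = tf.keras.losses.MeanSquaredError()

@tf.function
def train_step(in_tensor, target):
    with tf.GradientTape() as tape:
        output = model(in_tensor)
        loss = loss_fn(target, output)
    grads = tape.gradient(loss, model.trainable_variables)
    optim.apply_gradients(zip(grads, model.trainable_variables))
    return loss

x = tf.random.normal((8, 5))
y = tf.random.normal((8, 3))
print(float(train_step(x, y)))  # one gradient step; prints the batch loss
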
    def train(self, dataset, num_iter, writer):
        """Train on dataset for a specific number of iterations.

        Daniel: notice how little training data we use! One 'iteration' is
        simply one image and an associated action, drawn by (a) sampling a
        demo, then (b) sampling a time step within it. We do heavy data
        augmentation, but it's still just one real image.

        If using a goal image, we use a different random_sample method that
        also picks the LAST image of that episode, which is assigned as the
        goal image. This would likely not work for super long-horizon tasks,
        but it might; Agarwal et al. (NeurIPS 2016) actually got something
        like this 'greedy-style' planning to work in the PokeBot paper.
        Otherwise we might have to do something like Nair et al. (ICRA 2017)
        in the follow-up work, where we would feed in a target image for each
        time step, i.e., the *next* saved image.

        For data augmentation with this goal image, I believe we should stack
        the current and goal images together, and THEN do augmentation. The
        perturb method will make sure placing pixels are preserved -- which
        for short-horizon environments usually means the goal image will
        contain most of the relevant information. When augmenting data, for
        both normal and goal-conditioned Transporters, the p1_theta
        (rotation) is the same, but pick points are correctly 'converted' to
        those appropriate for the augmented images.
        """
        for i in range(num_iter):
            if self.use_goal_image:
                obs, act, info, goal = dataset.random_sample(goal_images=True)
            else:
                obs, act, info = dataset.random_sample()

            # Get heightmap from RGB-D images.
            configs = act['camera_config']
            colormap, heightmap = self.get_heightmap(obs, configs)
            if self.use_goal_image:
                colormap_g, heightmap_g = self.get_heightmap(goal, configs)

            # Get training labels from data sample.
            pose0, pose1 = act['params']['pose0'], act['params']['pose1']
            p0_position, p0_rotation = pose0[0], pose0[1]
            p0 = utils.position_to_pixel(p0_position, self.bounds, self.pixel_size)
            p0_theta = -np.float32(p.getEulerFromQuaternion(p0_rotation)[2])
            p1_position, p1_rotation = pose1[0], pose1[1]
            p1 = utils.position_to_pixel(p1_position, self.bounds, self.pixel_size)
            p1_theta = -np.float32(p.getEulerFromQuaternion(p1_rotation)[2])
            p1_theta = p1_theta - p0_theta
            p0_theta = 0

            # Concatenate color with depth images.
            input_image = self.concatenate_c_h(colormap, heightmap)

            # If using goal image, stack _with_ input_image for data augmentation.
            if self.use_goal_image:
                goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
                input_image = np.concatenate((input_image, goal_image), axis=2)
                assert input_image.shape[2] == 12, input_image.shape

            # Do data augmentation (perturb rotation and translation).
            original_pixels = (p0, p1)
            input_image, pixels = utils.perturb(input_image, [p0, p1])
            p0, p1 = pixels

            # Optionally visualize images _after_ data augmentation.
            if False:  # Debug visualization; flip to True to inspect augmented samples.
                self.visualize_images(p0, p0_theta, p1, p1_theta, original_pixels,
                        colormap=colormap, heightmap=heightmap,
                        colormap_g=colormap_g, heightmap_g=heightmap_g,
                        input_image=input_image, before_aug=False)

            # Compute Attention training loss.
            if self.attn_no_targ and self.use_goal_image:
                maxdim = int(input_image.shape[2] / 2)
                input_only = input_image[:, :, :maxdim]
                loss0 = self.attention_model.train(input_only, p0, p0_theta)
            else:
                loss0 = self.attention_model.train(input_image, p0, p0_theta)
            with writer.as_default():
                tf.summary.scalar('attention_loss',
                                  self.attention_model.metric.result(),
                                  step=self.total_iter + i)

            # Compute Transport training loss.
            if isinstance(self.transport_model, Attention):
                loss1 = self.transport_model.train(input_image, p1, p1_theta)
            elif isinstance(self.transport_model, TransportGoal):
                half = int(input_image.shape[2] / 2)
                img_curr = input_image[:, :, :half]
                img_goal = input_image[:, :, half:]
                loss1 = self.transport_model.train(img_curr, img_goal, p0, p1, p1_theta)
            else:
                loss1 = self.transport_model.train(input_image, p0, p1, p1_theta)
            with writer.as_default():
                tf.summary.scalar('transport_loss',
                                  self.transport_model.metric.result(),
                                  step=self.total_iter + i)

            print(f'Train Iter: {self.total_iter + i} Loss: {loss0:.4f} {loss1:.4f}')

        self.total_iter += num_iter
        self.save()
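
Each of the train() methods above expects a TensorBoard summary writer for logging the scalar losses. A minimal usage sketch (the agent and dataset objects are assumed to exist; the names are illustrative):

import tensorflow as tf

writer = tf.summary.create_file_writer('logs/train')
# agent = TransporterAgent(...)      # hypothetical constructor
# dataset = Dataset('data/train')    # hypothetical ravens dataset
# agent.train(dataset, num_iter=1000, writer=writer)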