Esempio n. 1
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """

    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=True,
            render_size=84,
            reward_type="dense",
            target_radius=0.5,
            boundary_dist=4,
            ball_radius=0.25,
            walls=None,
            fixed_goal=None,
            randomize_position_on_reset=True,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            action_limit=1,
            initial_position=(0, 0),
            **kwargs
    ):
        if walls is None:
            walls = []
        if len(kwargs) > 0:
            import logging
            LOGGER = logging.getLogger(__name__)
            LOGGER.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)
        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.action_limit = action_limit

        self._max_episode_steps = 50
        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = self.action_limit * np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])
        self.observation_space.shape = (2,)   # hack
        self._step = 0
        self.initial_position = initial_position

        self.drawer = None

    def step(self, velocities):
        velocities = np.clip(velocities, a_min=-self.action_limit, a_max=self.action_limit)
        new_position = self._position + velocities
        for wall in self.walls:
            new_position = wall.handle_collision(
                self._position, new_position
            )
        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(
            self._position - self._target_position
        )
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        self._step += 1
        done = False
        return ob, reward, done, info

    def _sample_goal(self):
        return np.random.uniform(
            size=2, low=-self.max_target_distance, high=self.max_target_distance
        )

    def reset(self):
        self._step = 0
        if self.fixed_goal:
            self._target_position = np.array([0, 0])
        else:
            self._target_position = np.random.uniform(
                size=2, low=-self.max_target_distance, high=self.max_target_distance
            )
        if self.randomize_position_on_reset:
            self._position = np.random.uniform(
                size=2, low=-self.boundary_dist, high=self.boundary_dist
            )
        else:
            self._position = np.array(self.initial_position)
        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _get_obs(self):
        return OrderedDict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_observation']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        if self.reward_type == "dense":
            return -d

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
            'distance_to_target',
            'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(create_stats_ordered_dict(
                '%s%s' % (prefix, stat_name),
                stat,
                always_show_all_stats=True,
                ))
            statistics.update(create_stats_ordered_dict(
                'Final %s%s' % (prefix, stat_name),
                [s[-1] for s in stat],
                always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def _sample_position(self, low, high, realistic=True):
        pos = np.random.uniform(low, high)
        if realistic:
            while self._position_inside_wall(pos) is True:
                pos = np.random.uniform(low, high)
        return pos

    def sample_goals(self, batch_size):
        if not self.fixed_goal is None:
            goals = np.repeat(
                self.fixed_goal.copy()[None],
                batch_size,
                0
            )
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                goals[b, :] = self._sample_position(self.obs_range.low,
                                         self.obs_range.high,)
                                         # realistic=self.sample_realistic_goals)
            # goals = np.random.uniform(
            #     self.obs_range.low,
            #     self.obs_range.high,
            #     size=(batch_size, self.obs_range.low.size),
            # )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def render(self, mode='human', **kwargs):
        width, height = 256, 256
        """Returns a black and white image"""
        if width is not None:
            if width != height:
                raise NotImplementedError()
            if width != self.render_size:
                self.drawer = PygameViewer(
                    screen_width=width,
                    screen_height=height,
                    x_bounds=(-self.boundary_dist, self.boundary_dist),
                    y_bounds=(-self.boundary_dist, self.boundary_dist),
                    render_onscreen=self.render_onscreen,
                )
                self.render_size = width
        self._render()
        img = self.drawer.get_image()
        return np.rot90(img, k=1, axes=(0, 1))
        # if self.images_are_rgb:
        #     return img.transpose().flatten()
        # else:
        #     r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
        #     img = (-r + b).transpose().flatten()
        #     return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def _render(self, close=False):

        if close:
            self.drawer = None
            return

        if self.drawer is None or self.drawer.terminated:
            self.drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=self.render_onscreen,
            )

        self.drawer.fill(Color('white'))

        self.drawer.draw_solid_circle(
            np.array([10, 10]),
            1,
            Color('red')
        )

        if self.show_goal:
            self.drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        self.drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color('blue'),
        )

        for wall in self.walls:
            self.drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            self.drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            self.drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            self.drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )

        self.drawer.render()
        self.drawer.tick(self.render_dt_msec)

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
            'distance_to_target',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(create_stats_ordered_dict(
                '%s%s' % (prefix, stat_name),
                stat,
                always_show_all_stats=True,
                ))
            statistics.update(create_stats_ordered_dict(
                'Final %s%s' % (prefix, stat_name),
                [s[-1] for s in stat],
                always_show_all_stats=True,
                ))
        return statistics

    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(state[0], -state[1],
                    marker='o', color=color, markersize=10,
                    )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1],
                  scale_units='xy', angles='xy', scale=1, width=0.005)
        ax.quiver(x[:-1], y[:-1], actions_x, actions_y, scale_units='xy',
                  angles='xy', scale=1, color='r',
                  width=0.0035, )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Esempio n. 2
0
class Multiobj2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=84,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.60,
            boundary_dist=4,
            ball_radius=0.50,
            include_colors_in_obs=False,
            walls=None,
            num_colors=8,
            change_colors=True,
            fixed_colors=False,
            fixed_goal=None,
            randomize_position_on_reset=True,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            use_env_labels=False,
            num_objects=1,
            include_white=False,
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if len(kwargs) > 0:
            LOGGER = logging.getLogger(__name__)
            LOGGER.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)
        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.num_objects = num_objects
        self.max_target_distance = self.boundary_dist - self.target_radius
        self.change_colors = change_colors
        self.fixed_colors = fixed_colors
        self.num_colors = num_colors
        self._target_position = None
        self._position = np.zeros((2))
        self.include_white = include_white
        self.initial_pass = False
        self.white_passed = False

        if change_colors:
            self.randomize_colors()
        else:
            self.object_colors = [Color('blue')]

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)
        self.include_colors_in_obs = include_colors_in_obs
        o = self.boundary_dist * np.ones(2)
        ohe_range = spaces.Box(np.zeros(self.num_colors),
                               np.ones(self.num_colors),
                               dtype='float32')
        if include_colors_in_obs and self.fixed_colors:
            high = np.concatenate((o, np.ones(self.num_colors)))
            low = np.concatenate((-o, np.zeros(self.num_colors)))
        else:
            high = o
            low = -o

        self.obs_range = spaces.Box(low, high, dtype='float32')
        self.use_env_labels = use_env_labels
        if self.use_env_labels:

            self.observation_space = spaces.Dict([
                ('label', ohe_range),
                ('observation', self.obs_range),
                ('desired_goal', self.obs_range),
                ('achieved_goal', self.obs_range),
                ('state_observation', self.obs_range),
                ('state_desired_goal', self.obs_range),
                ('state_achieved_goal', self.obs_range),
            ])

        else:
            self.observation_space = spaces.Dict([
                ('observation', self.obs_range),
                ('desired_goal', self.obs_range),
                ('achieved_goal', self.obs_range),
                ('state_observation', self.obs_range),
                ('state_desired_goal', self.obs_range),
                ('state_achieved_goal', self.obs_range),
            ])

        self.drawer = None
        self.render_drawer = None

        self.color_index = 0
        self.colors = [
            Color('green'),
            Color('red'),
            Color('blue'),
            Color('black'),
            Color('purple'),
            Color('brown'),
            Color('pink'),
            Color('orange'),
            Color('grey'),
            Color('yellow')
        ]

    def randomize_colors(self):
        self.object_colors = []
        rgbs = np.random.randint(0, 256, (self.num_objects, 3))

        for i in range(self.num_objects):
            if self.fixed_colors:
                self.object_colors.append(self.colors[self.color_index])
                self.color_index = (self.color_index + 1) % 10
            else:
                rgb = map(int, rgbs[i, :])
                self.object_colors.append(Color(*rgb, 255))

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        done = False
        return ob, reward, done, info

    def reset(self):
        if self.white_passed:
            self.include_white = False
        if self.change_colors:
            self.randomize_colors()
        self._target_position = self.sample_goal()['state_desired_goal']
        if self.randomize_position_on_reset:
            self._position = self._sample_position(
                self.obs_range.low,
                self.obs_range.high,
            )
        if self.initial_pass:
            self.white_passed = True
        self.initial_pass = True
        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        obs = self._position.copy()
        if self.use_env_labels:
            return dict(
                observation=obs,
                label=ohe,
                desired_goal=self._target_position.copy(),
                achieved_goal=self._position.copy(),
                state_observation=self._position.copy(),
                state_desired_goal=self._target_position.copy(),
                state_achieved_goal=self._position.copy(),
            )
        else:
            return dict(
                observation=obs,
                desired_goal=self._target_position.copy(),
                achieved_goal=self._position.copy(),
                state_observation=self._position.copy(),
                state_desired_goal=self._target_position.copy(),
                state_achieved_goal=self._position.copy(),
            )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        if not self.fixed_goal is None:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                if batch_size > 1:
                    logging.warning("This is very slow!")
                goals[b, :] = self._sample_position(
                    self.obs_range.low,
                    self.obs_range.high,
                )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer, False)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer, tick):
        # if self.drawer is not None:
        #     self.drawer.fill(Color('white'))
        # if self.render_drawer is not None:
        #     self.render_drawer.fill(Color('white'))
        drawer.fill(Color('white'))
        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        if not self.include_white:
            drawer.draw_solid_circle(
                self._position,
                self.ball_radius,
                self.object_colors[0],
            )

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )

        drawer.render()
        if tick:
            drawer.tick(self.render_dt_msec)

    def render(self, close=False):
        if close:
            self.render_drawer = None
            return

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self.draw(self.render_drawer, True)

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Esempio n. 3
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=256,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.5,
            boundary_dist=4,
            ball_radius=0.15,
            walls=None,
            init_pos_range=None,
            target_pos_range=None,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            n_bins=10,
            use_count_reward=False,
            show_discrete_grid=False,
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        # if fixed_goal is not None:
        #     fixed_goal = np.array(fixed_goal)

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.n_bins = n_bins
        self.use_count_reward = use_count_reward
        self.show_discrete_grid = show_discrete_grid
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal

        self.x_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.y_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')

        if not init_pos_range:
            self.init_pos_range = self.obs_range
        else:
            assert np.all(np.abs(init_pos_range) < boundary_dist), (
                f"Init position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )
            low, high = init_pos_range
            self.init_pos_range = spaces.Box(np.array(low),
                                             np.array(high),
                                             dtype='float32')

        if not target_pos_range:
            self.target_pos_range = self.obs_range
        else:
            assert np.all(np.abs(target_pos_range) < boundary_dist), (
                f"Goal position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )

            low, high = target_pos_range
            self.target_pos_range = spaces.Box(np.array(low),
                                               np.array(high),
                                               dtype='float32')

        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('onehot_observation',
             spaces.Box(0,
                        1,
                        shape=(2 * (self.n_bins + 1), ),
                        dtype=np.float32)),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.render_drawer = None

        self.reset()

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        x_d, y_d = ob['discrete_observation']
        self.bin_counts[x_d, y_d] += 1

        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            # TODO (kevintli): Make this work for general mazes
            # 'manhattan_dist_to_target': self._medium_maze_manhattan_distance(ob['state_achieved_goal'], ob['state_desired_goal']),
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
            'ext_reward': self._ext_reward,
        }
        if self.use_count_reward:
            info['count_bonus'] = self._count_bonus

        done = False
        return ob, reward, done, info

    def reset(self):
        # TODO: Make this more general
        self._target_position = self.sample_goal()['state_desired_goal']
        # if self.randomize_position_on_reset:
        self._position = self._sample_position(
            # self.obs_range.low,
            # self.obs_range.high,
            self.init_pos_range.low,
            self.init_pos_range.high,
        )
        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        if np.all(low == high):
            return low
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def clear_bin_counts(self):
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

    def _discretize_observation(self, obs):
        x, y = obs['state_observation'].copy()
        x_d, y_d = np.digitize(x, self.x_bins), np.digitize(y, self.y_bins)
        return np.array([x_d, y_d])

    def _get_obs(self):
        obs = collections.OrderedDict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

        # Update with discretized state
        pos_discrete = self._discretize_observation(obs)
        pos_onehot = np.eye(self.n_bins + 1)[pos_discrete]
        obs['discrete_observation'] = pos_discrete
        obs['onehot_observation'] = pos_onehot

        return obs

    def _medium_maze_manhattan_distance(self, s1, s2):
        # Maze wall positions
        left_wall_x = -self.boundary_dist / 3
        left_wall_bottom = self.inner_wall_max_dist
        right_wall_x = self.boundary_dist / 3
        right_wall_top = -self.inner_wall_max_dist

        s1 = s1.copy()
        s2 = s2.copy()

        if len(s1.shape) == 1:
            s1, s2 = s1[None], s2[None]

        # Since maze distances are symmetric, redefine s1,s2 for convenience
        # so that points in s1 are to the left of those in s2
        combined = np.hstack((s1[:, None], s2[:, None]))
        indices = np.argmin((s1[:, 0], s2[:, 0]), axis=0)
        s1 = np.take_along_axis(combined, indices[:, None, None],
                                axis=1).squeeze(axis=1)
        s2 = np.take_along_axis(combined, 1 - indices[:, None, None],
                                axis=1).squeeze(axis=1)

        x1 = s1[:, 0]
        x2 = s2[:, 0]

        # Horizontal movement
        x_dist = np.abs(x2 - x1)

        # Vertical movement
        boundary_ys = [left_wall_bottom, right_wall_top, 0]
        boundary_xs = [left_wall_x, right_wall_x, self.boundary_dist, -4.0001]
        y_directions = [
            1, -1, 0
        ]  # +1 means need to get to bottom, -1 means need to get to top
        curr_y, goal_y = s1[:, 1], s2[:, 1]
        y_dist = np.zeros(len(s1))

        for i in range(3):
            # Get all points where s1 and s2 respectively are in the current vertical section of the maze
            curr_in_section = x1 <= boundary_xs[i]
            goal_in_section = np.logical_and(boundary_xs[i - 1] < x2,
                                             x2 <= boundary_xs[i])
            goal_after_section = x2 > boundary_xs[i]

            # Both in current section: move directly to goal
            mask = np.logical_and(curr_in_section, goal_in_section)
            y_dist += mask * np.abs(curr_y - goal_y)
            curr_y[mask] = goal_y[mask]

            # s2 is further in maze: move to next corner
            mask = np.logical_and(
                curr_in_section,
                np.logical_and(goal_after_section, y_directions[i] *
                               (boundary_ys[i] - curr_y) > 0))
            y_dist += mask * np.clip(
                y_directions[i] * (boundary_ys[i] - curr_y), 0, None)
            curr_y[mask] = boundary_ys[i]

        return x_dist + y_dist

    def compute_rewards(self, actions, obs, reward_type=None):
        reward_type = reward_type or self.reward_type

        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if reward_type == "sparse":
            r = -(d > self.target_radius).astype(np.float32)
        elif reward_type == "dense":
            r = -d
        elif reward_type == "vectorized_dense":
            r = -np.abs(achieved_goals - desired_goals)
        elif reward_type == "medium_maze_ground_truth_manhattan":
            """
            # Use maze distance from current position to goal as the negative reward.
            # This is specifically for the medium Maze-v0 env, which has two vertical walls.
            """
            r = -self._medium_maze_manhattan_distance(achieved_goals,
                                                      desired_goals)
        elif reward_type == "laplace_smoothing":
            # Laplace smoothing: 1 within the goal region, 1/(n+2) at all other states
            # (where n is the number of visitations)
            r = np.zeros(d.shape)
            pos_d = obs['discrete_observation']
            r = 1 / (self.bin_counts[pos_d[:, 0], pos_d[:, 1]] + 2)
            r[d <= self.target_radius] = 1
        elif reward_type == "none":
            r = np.zeros(d.shape)
        else:
            raise NotImplementedError(
                f"Unrecognized reward type: {reward_type}")

        if self.use_count_reward:
            # TODO: Add different count based strategies
            pos_d = obs['discrete_observation']
            self._ext_reward = r.copy()
            self._count_bonus = 1 / np.sqrt(self.bin_counts[pos_d[:, 0],
                                                            pos_d[:, 1]])
            r += self._count_bonus
        else:
            self._ext_reward = r.copy()

        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
                'ext_reward',
                'count_bonus',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        # if self.fixed_goal:
        #     goals = np.repeat(
        #         self.fixed_goal.copy()[None],
        #         batch_size,
        #         0)
        # else:
        goals = np.zeros((batch_size, self.obs_range.low.size))
        for b in range(batch_size):
            if batch_size > 1:
                logging.warning("This is very slow!")
            goals[b, :] = self._sample_position(
                # self.obs_range.low,
                # self.obs_range.high,
                self.target_pos_range.low,
                self.target_pos_range.high,
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                # TODO(justinvyu): Action scale = 1 breaks rendering, why?
                # x_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                # y_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer):
        drawer.fill(Color('white'))

        if self.show_discrete_grid:
            for x in self.x_bins:
                drawer.draw_segment((x, -self.boundary_dist),
                                    (x, self.boundary_dist),
                                    Color(220, 220, 220, 25),
                                    aa=False)
            for y in self.y_bins:
                drawer.draw_segment((-self.boundary_dist, y),
                                    (self.boundary_dist, y),
                                    Color(220, 220, 220, 25),
                                    aa=False)

        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        try:
            drawer.draw_solid_circle(
                self._position,
                self.ball_radius,
                Color('blue'),
            )
        except ValueError as e:
            print('\n\n RENDER ERROR \n\n')

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self, mode='human', width=None, height=None, close=False):
        if close:
            self.render_drawer = None
            return
        if mode == 'rgb_array':
            return self.get_image(width=width, height=height)

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                # x_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                # y_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    # def get_diagnostics(self, paths, prefix=''):
    #     statistics = OrderedDict()
    #     for stat_name in ('distance_to_target', ):
    #         stat_name = stat_name
    #         stat = get_stat_in_paths(paths, 'env_infos', stat_name)
    #         statistics.update(create_stats_ordered_dict(
    #             '%s%s' % (prefix, stat_name),
    #             stat,
    #             always_show_all_stats=True,
    #             ))
    #         statistics.update(create_stats_ordered_dict(
    #             'Final %s%s' % (prefix, stat_name),
    #             [s[-1] for s in stat],
    #             always_show_all_stats=True,
    #             ))
    #     return statistics
    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Esempio n. 4
0
class PickAndPlaceEnv(MultitaskEnv, Serializable):
    """
    A simple env where a 'cursor' robot can push objects around.

    TODO: refactor to have base class shared with point2d.py code
    TODO: rather than recreating a drawer, just save the previous drawers
    """
    def __init__(
        self,
        num_objects=2,
        # Environment dynamics
        action_scale=1.0,
        ball_radius=.75,
        boundary_dist=4,
        object_radius=0.50,
        min_grab_distance=0.5,
        walls=None,
        # Rewards
        action_l2norm_penalty=0,
        reward_type="dense",
        success_threshold=0.60,
        # Reset settings
        fixed_goal=None,
        fixed_init_position=None,
        init_position_strategy='random',
        start_near_object=False,
        # Visualization settings
        images_are_rgb=True,
        render_dt_msec=0,
        render_onscreen=False,
        render_size=84,
        get_image_base_render_size=None,
        show_goal=True,
        # Goal sampling
        goal_samplers=None,
        goal_sampling_mode='random',
        num_presampled_goals=10000,
        object_reward_only=False,
    ):
        """
        :param get_image_base_render_size: If set, then the drawer will always
        generate images according to this size, and (smoothly) downsampled
        images will be returned by `get_image`
        """
        walls = walls or []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if action_scale <= 0:
            raise ValueError("Invalid action scale: {}".format(action_scale))
        if init_position_strategy not in {
                'random', 'on_random_object', 'fixed'
        }:
            raise ValueError('Invalid init position strategy: {}'.format(
                init_position_strategy))

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.success_threshold = success_threshold
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.init_position_strategy = init_position_strategy
        self.images_are_rgb = images_are_rgb
        self._show_goal = show_goal

        self._all_objects = [
            Object(
                position=np.zeros((2, )),
                color=Object.IDX_TO_COLOR[i],
                radius=ball_radius if i == 0 else object_radius,
                min_pos=-self.boundary_dist,
                max_pos=self.boundary_dist,
            ) for i in range(num_objects + 1)
        ]
        self.min_grab_distance = min_grab_distance

        u = np.ones(3)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2 * (num_objects + 1))
        self.num_objects = num_objects
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        if get_image_base_render_size:
            base_width, base_height = get_image_base_render_size
            self._drawer = PygameViewer(
                screen_width=base_width,
                screen_height=base_height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
            self._fixed_get_image_render_size = True
        else:
            self._drawer = None
            self._fixed_get_image_render_size = False
        self._render_drawer = None
        if fixed_init_position is None:
            fixed_init_position = np.zeros_like(self.obs_range.low)
        self._fixed_init_position = fixed_init_position

        self._presampled_goals = None
        goal_samplers = goal_samplers or {}
        goal_samplers['fixed'] = PickAndPlaceEnv._sample_fixed_goal
        goal_samplers['presampled'] = PickAndPlaceEnv._sample_presampled_goals
        goal_samplers['random'] = PickAndPlaceEnv._sample_random_feasible_goals
        self._custom_goal_sampler = goal_samplers
        self._num_presampled_goals = num_presampled_goals
        if goal_sampling_mode is None:
            if fixed_goal:
                goal_sampling_mode = 'fixed'
            else:
                goal_sampling_mode = 'presampled'
        self.goal_sampling_mode = goal_sampling_mode
        self.object_reward_only = object_reward_only

    @property
    def cursor(self):
        return self._all_objects[0]

    @property
    def objects(self):
        return self._all_objects[1:]

    def step(self, raw_action):
        velocities = raw_action[:2]
        grab = raw_action[2] > 0

        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        old_position = self.cursor.position.copy()
        new_position = old_position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(old_position, new_position)

        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(old_position,
                                                     new_position)

        if grab:
            grabbed_obj = self._grab_object()
            if grabbed_obj:
                grabbed_obj.move(velocities)

        self.cursor.move(new_position - old_position)

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = self._get_info()
        done = False
        return ob, reward, done, info

    def _get_info(self):
        distance_to_target = self.cursor.distance_to_target()
        is_success = distance_to_target < self.success_threshold
        info = {
            'distance_to_target_cursor': distance_to_target,
            'success_cursor': is_success,
        }
        for i, obj in enumerate(self.objects):
            obj_distance = obj.distance_to_target()
            success = obj_distance < self.success_threshold
            info['distance_to_target_obj_{}'.format(i)] = obj_distance
            info['success_obj_{}'.format(i)] = success
        return info

    def reset(self):
        goal = self.sample_goal()['state_desired_goal']
        self._set_target_positions(goal)

        if self.init_position_strategy == 'random':
            init_pos = (
                self.observation_space.spaces['state_observation'].sample())
        elif self.init_position_strategy == 'fixed':
            init_pos = self._fixed_init_position.copy()
        elif self.init_position_strategy == 'on_random_object':
            init_pos = (
                self.observation_space.spaces['state_observation'].sample())
            start_i = 2 + 2 * random.randint(0, len(self._all_objects) - 2)
            end_i = start_i + 2
            init_pos[:2] = init_pos[start_i:end_i].copy()
        else:
            raise ValueError(self.init_position_strategy)
        self._set_positions(init_pos)

        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        positions = self._get_positions()
        target_positions = self._get_target_positions()
        return dict(
            observation=positions.copy(),
            desired_goal=target_positions.copy(),
            achieved_goal=positions.copy(),
            state_observation=positions.copy(),
            state_desired_goal=target_positions.copy(),
            state_achieved_goal=positions.copy(),
        )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        if self.object_reward_only:
            achieved_goals = achieved_goals[:, 2:]
            desired_goals = desired_goals[:, 2:]
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.success_threshold *
                     len(self._all_objects)).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == "dense_l1":
            return -np.linalg.norm(
                achieved_goals - desired_goals, axis=-1, ord=1)
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target',
                'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        for i, obj in enumerate(self.objects):
            stat_name = 'distance_to_target_obj_{}'.format(i)
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._get_target_positions(),
            'state_desired_goal': self._get_target_positions(),
        }

    def set_goal(self, goal):
        self._set_target_positions(goal['state_desired_goal'])

    def sample_goals(self, batch_size):
        goal_sampler = self._custom_goal_sampler[self.goal_sampling_mode]
        return goal_sampler(self, batch_size)

    def _sample_random_feasible_goals(self, batch_size):
        if len(self.walls) > 0:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                goals[b, :] = self._sample_position(
                    self.obs_range.low,
                    self.obs_range.high,
                )
        else:
            goals = np.random.uniform(
                self.obs_range.low,
                self.obs_range.high,
                size=(batch_size, self.obs_range.low.size),
            )
        state_goals = goals
        return {
            'desired_goal': goals,
            'state_desired_goal': state_goals,
        }

    def _sample_fixed_goal(self, batch_size):
        goals = state_goals = np.repeat(self.fixed_goal.copy()[None],
                                        batch_size, 0)
        return {
            'desired_goal': goals,
            'state_desired_goal': state_goals,
        }

    def _sample_presampled_goals(self, batch_size):
        if self._presampled_goals is None:
            self._presampled_goals = self._sample_random_feasible_goals(
                self._num_presampled_goals)
        random_idxs = np.random.choice(len(
            list(self._presampled_goals.values())[0]),
                                       size=batch_size)
        goals = self._presampled_goals['desired_goal'][random_idxs]
        state_goals = self._presampled_goals['state_desired_goal'][random_idxs]
        return {
            'desired_goal': goals,
            'state_desired_goal': state_goals,
        }

    def _set_positions(self, positions):
        for i, obj in enumerate(self._all_objects):
            start_i = i * 2
            end_i = i * 2 + 2
            obj.position = positions[start_i:end_i]

    def _set_target_positions(self, target_positions):
        for i, obj in enumerate(self._all_objects):
            start_i = i * 2
            end_i = i * 2 + 2
            obj.target_position = target_positions[start_i:end_i]

    def _get_positions(self):
        return np.concatenate([obj.position for obj in self._all_objects])

    def _get_target_positions(self):
        return np.concatenate(
            [obj.target_position for obj in self._all_objects])

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self._drawer is None or (not self._fixed_get_image_render_size and
                                    (self._drawer.width != width
                                     or self._drawer.height != height)):
            if width != height:
                raise NotImplementedError()
            self._drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self._draw(self._drawer)
        if width and height:
            wh_size = (width, height)
        else:
            wh_size = None
        img = self._drawer.get_image(wh_size)
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (r + g + b).transpose() / 3
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._set_positions(goal)

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        self._set_positions(state["state_observation"])
        self._set_target_positions(state["state_desired_goal"])

    def render(self, mode='human', close=False):
        if close:
            self._render_drawer = None
            return

        if self._render_drawer is None or self._render_drawer.terminated:
            self._render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self._draw(self._render_drawer)
        self._render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self._render_drawer.check_for_exit()

    def _draw(self, drawer):
        drawer.fill(Color('black'))
        for obj in self._all_objects:
            obj.draw(drawer, draw_target_position=self._show_goal)

        for wall in self.walls:
            draw_wall(drawer, wall)
        drawer.render()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target_obj_{}'.format(i)
                for i in range(len(self.objects))
        ] + ['success_obj_{}'.format(i) for i in range(len(self.objects))] + [
                'distance_to_target_cursor',
                'success_cursor',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def _grab_object(self):
        closest_object = None
        min_dis = None
        for obj in self.objects:
            distance = obj.distance(self.cursor.position)
            if ((distance <= self.min_grab_distance)
                    and (closest_object is None or distance < min_dis)):
                min_dis = distance
                closest_object = obj
        return closest_object

    def goal_conditioned_diagnostics(self, paths, goals):
        statistics = OrderedDict()
        stat_to_lists = defaultdict(list)
        for path, goal in zip(paths, goals):
            difference = path['observations'] - goal
            for i in range(len(self._all_objects)):
                distance = np.linalg.norm(difference[:, 2 * i:2 * i + 2],
                                          axis=-1)
                distance_key = 'distance_to_target_obj_{}'.format(i)
                stat_to_lists[distance_key].append(distance)
                success_key = 'success_obj_{}'.format(i)
                stat_to_lists[success_key].append(
                    distance < self.success_threshold)
        for stat_name, stat_list in stat_to_lists.items():
            statistics.update(
                create_stats_ordered_dict(
                    stat_name,
                    stat_list,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    '{}/final'.format(stat_name),
                    [s[-1:] for s in stat_list],
                    always_show_all_stats=True,
                    exclude_max_min=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    '{}/initial'.format(stat_name),
                    [s[:1] for s in stat_list],
                    always_show_all_stats=True,
                    exclude_max_min=True,
                ))
        return statistics
Esempio n. 5
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=200,
            render_target=True,
            reward_type="dense",
            norm_order=2,
            action_scale=1.0,
            target_radius=0.50,
            boundary_dist=4,
            ball_radius=0.50,
            walls=[],
            fixed_goal=None,
            goal_low=None,
            goal_high=None,
            ball_low=None,
            ball_high=None,
            randomize_position_on_reset=True,
            sample_realistic_goals=False,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            v_func_heatmap_bounds=None,
            **kwargs):
        if walls is None:
            walls = []
        if len(kwargs) > 0:
            import logging
            LOGGER = logging.getLogger(__name__)
            LOGGER.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)
        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.render_target = render_target
        self.reward_type = reward_type
        self.norm_order = norm_order
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.sample_realistic_goals = sample_realistic_goals
        if self.fixed_goal is not None:
            self.fixed_goal = np.array(self.fixed_goal)
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')

        if goal_low is None:
            goal_low = -o
        if goal_high is None:
            goal_high = o
        self.goal_range = spaces.Box(np.array(goal_low),
                                     np.array(goal_high),
                                     dtype='float32')

        if ball_low is None:
            ball_low = -o
        if ball_high is None:
            ball_high = o
        self.ball_range = spaces.Box(np.array(ball_low),
                                     np.array(ball_high),
                                     dtype='float32')

        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.goal_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.goal_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.subgoals = None

        self.v_func_heatmap_bounds = v_func_heatmap_bounds

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        below_wall = self._position[1] >= 2.0
        is_success = distance_to_target < 1.0
        is_success_2 = distance_to_target < 1.5
        is_success_3 = distance_to_target < 1.75

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
            'is_success_2': is_success_2,
            'is_success_3': is_success_3,
            'below_wall': below_wall,
        }
        done = False
        return ob, reward, done, info

    def initialize_camera(self, camera):
        pass

    def reset(self):
        self._target_position = self.sample_goal_for_rollout(
        )['state_desired_goal']
        if self.randomize_position_on_reset:
            self._position = self._sample_position(self.ball_range.low,
                                                   self.ball_range.high,
                                                   realistic=True)

        self.subgoals = None
        return self._get_obs()

    def _position_outside_arena(self, pos):
        return not self.obs_range.contains(pos)

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def realistic_state_np(self, state):
        return not self._position_inside_wall(state)

    def realistic_goals(self, g, use_double=False):
        collision = None
        for wall in self.walls:
            wall_collision = wall.contains_points_pytorch(g)
            wall_collision_result = wall_collision[:, 0]
            for i in range(wall_collision.shape[1]):
                wall_collision_result = wall_collision_result * wall_collision[:,
                                                                               i]

            if collision is None:
                collision = wall_collision_result
            else:
                collision = collision | wall_collision_result

        result = collision ^ 1

        # g_np = ptu.get_numpy(g)
        # for i in range(g_np.shape[0]):
        #     print(not self._position_inside_wall(g_np[i]), ptu.get_numpy(result[i]))

        return result.float()

    def _sample_position(self, low, high, realistic=True):
        pos = np.random.uniform(low, high)
        if realistic:
            while self._position_inside_wall(pos) is True:
                pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        return dict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

    def compute_rewards(self, actions, obs, prev_obs=None):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals,
                           ord=self.norm_order,
                           axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target',
                'below_wall',
                'is_success',
                'is_success_2',
                'is_success_3',
                'velocity',
                'speed',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def set_goal(self, goal):
        self._target_position = goal['state_desired_goal']

    def sample_goal_for_rollout(self):
        if not self.fixed_goal is None:
            goal = np.copy(self.fixed_goal)
        else:
            goal = self._sample_position(self.goal_range.low,
                                         self.goal_range.high,
                                         realistic=self.sample_realistic_goals)
        return {
            'desired_goal': goal,
            'state_desired_goal': goal,
        }

    def sample_goals(self, batch_size):
        # goals = np.random.uniform(
        #     self.obs_range.low,
        #     self.obs_range.high,
        #     size=(batch_size, self.obs_range.low.size),
        # )
        goals = np.array([
            self._sample_position(self.obs_range.low,
                                  self.obs_range.high,
                                  realistic=self.sample_realistic_goals)
            for _ in range(batch_size)
        ])
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if width is not None:
            if width != height:
                raise NotImplementedError()
            if width != self.render_size:
                self.drawer = PygameViewer(
                    screen_width=width,
                    screen_height=height,
                    x_bounds=(-self.boundary_dist - self.ball_radius,
                              self.boundary_dist + self.ball_radius),
                    y_bounds=(-self.boundary_dist - self.ball_radius,
                              self.boundary_dist + self.ball_radius),
                    render_onscreen=self.render_onscreen,
                )
                self.render_size = width
        self.render()
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b)
            return img

    def update_subgoals(self,
                        subgoals,
                        subgoals_reproj=None,
                        subgoal_v_vals=None):
        self.subgoals = subgoals

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def get_states_sweep(self, nx, ny):
        x = np.linspace(-4, 4, nx)
        y = np.linspace(-4, 4, ny)
        xv, yv = np.meshgrid(x, y)
        states = np.stack((xv, yv), axis=2).reshape((-1, 2))
        return states

    def render(self, close=False):
        if close:
            self.drawer = None
            return

        if self.drawer is None or self.drawer.terminated:
            self.drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.drawer.fill(Color('white'))
        if self.render_target:
            self.drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        self.drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color('blue'),
        )

        if self.subgoals is not None:
            for goal in self.subgoals:
                self.drawer.draw_solid_circle(
                    goal,
                    self.ball_radius + 0.1,
                    Color('red'),
                )
            for p1, p2 in zip(
                    np.concatenate(([self._position], self.subgoals[:-1]),
                                   axis=0), self.subgoals):
                self.drawer.draw_segment(p1, p2, Color(100, 0, 0, 10))

        for wall in self.walls:
            self.drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            self.drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            self.drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            self.drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )

        self.drawer.render()
        self.drawer.tick(self.render_dt_msec)

    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Esempio n. 6
0
class Point2DEnv(MultitaskEnv, Serializable):
    """A little 2D point whose life goal is to reach a target.

    Attributes
    ----------
    render_dt_msec : float
        seconds before the next frame in the image is rendered
    action_l2norm_penalty : float
        penalty scale for the actions by the agent
    render_onscreen : bool
        whether to include the rendering visually (instead of simply using the
        image for the observation)
    render_size : int
        width/length number of pixels in the rendered image
    reward_type : str
        the reward type. Must be one of: "sparse", "dense", or
        "vectorized_dense"
    action_scale : float
        the multiple from action to velocity
    target_radius : float
        the radius of the targeted position when being rendered
    boundary_dist : float
        the distance from the center to the boundary
    ball_radius : float
        the radius of the agent when being rendered
    walls : list of object
        a list of wall objects describing the position of the walls and how to
        handle collisions with said walls
    fixed_goal : [float, float] or None
        the goal to use. If set to None, it is picked randomly.
    randomize_position_on_reset : bool
        whether to initialize the position of the agent randomly
    images_are_rgb : bool
        specifies whether the image is RGB. Otherwise, it's black and white
    show_goal : bool
        whether to render the goal(s)
    images_in_obs : bool
        whether to use the image in the observation
    image_size : int
        number of elements in the image. Set to 0 if images are not being used.
    action_space : gym.spaces.*
        the action space of the environment
    obs_range : gym.spaces.*
        the range of the initial position of the agent
    context_space  : gym.spaces.*
        the context space of the environment
    observation_space : gym.spaces.*
        the observation space of the environment
    drawer : multiworld.envs.pygame.pygame_viewer.PygameViewer or None
        The drawer for the images of the environment. Set to None if images are
        not being used.
    render_drawer : multiworld.envs.pygame.pygame_viewer.PygameViewer or None
        The drawer for the images of the environment if the environment is
        being rendered. Set to None if images are not being used.
    horizon : int
        environment time horizon
    t : int
        number of steps since the start of the most recent episode
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=32,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.60,
            boundary_dist=4,
            ball_radius=0.50,
            walls=None,
            fixed_goal=None,
            randomize_position_on_reset=True,
            images_are_rgb=True,  # else black and white
            show_goal=True,
            images_in_obs=True,
            **kwargs):
        """Instantiate the environment.

        Parameters
        ----------
        render_dt_msec : float
            seconds before the next frame in the image is rendered
        action_l2norm_penalty : float
            penalty scale for the actions by the agent
        render_onscreen : bool
            whether to include the rendering visually (instead of simply using
            the image for the observation)
        render_size : int
            width/length number of pixels in the rendered image
        reward_type : str
            the reward type. Must be one of: "sparse", "dense", or
            "vectorized_dense"
        action_scale : float
            the multiple from action to velocity
        target_radius : float
            the radius of the targeted position when being rendered
        boundary_dist : float
            the distance from the center to the boundary
        ball_radius : float
            the radius of the agent when being rendered
        walls : list of object
            a list of wall objects describing the position of the walls and how
            to handle collisions with said walls
        fixed_goal : [float, float] or None
            the goal to use. If set to None, it is picked randomly.
        randomize_position_on_reset : bool
            whether to initialize the position of the agent randomly
        images_are_rgb : bool
            specifies whether the image is RGB. Otherwise, it's black and white
        show_goal : bool
            whether to render the goal(s)
        images_in_obs : bool
            whether to use the image in the obsevation
        kwargs : dict
            additional arguments. Unused here.
        """
        if walls is None:
            walls = []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if len(kwargs) > 0:
            logger = logging.getLogger(__name__)
            logger.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.images_in_obs = images_in_obs
        self.image_size = 1024 * (3 if self.images_are_rgb else 1)
        if not self.images_in_obs:
            self.image_size = 0

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros(2)

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.context_space = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Box(
            np.concatenate([np.zeros([self.image_size]), -o], 0),
            np.concatenate([np.ones([self.image_size]), o], 0),
            dtype='float32')

        self.drawer = None
        self.render_drawer = None
        self.horizon = 200
        self.t = 0

    @property
    def current_context(self):
        """Return the current goal by the environment."""
        return self._target_position

    def step(self, velocities):
        """Advance the simulation by one step.

        Parameters
        ----------
        velocities : array_like
            the action by the agent, defined as its velocities in the x and y
            directions

        Returns
        -------
        array_like
            agent's observation of the current environment
        float
            amount of reward associated with the previous state/action pair
        bool
            indicates whether the episode has ended
        dict
            contains other diagnostic information from the previous action
        """
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            # Hack: sometimes you get caught on two walls at a time. If you
            # process the input in the other direction, you might only get
            # caught on one wall instead.
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self.t += 1

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, {'ob': ob})
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        done = self.t >= self.horizon
        return ob, reward, done, info

    def reset(self):
        """Reset the environment."""
        self.t = 0
        self._target_position = self.sample_goal()['goals']
        if self.randomize_position_on_reset:
            self._position = self._sample_position(
                self.obs_range.low,
                self.obs_range.high,
            )

        return self._get_obs()

    def _position_inside_wall(self, pos):
        """Return True if the agent is in a wall."""
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        """Sample a starting position for the agent."""
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        """Return the observation of the agent.

        See States in the description of the environment for more.
        """
        if self.images_in_obs:
            img = self.get_image(32, 32).reshape([-1]).astype(
                np.float32) / 255.0
            return np.concatenate([img, self._position.copy()], 0)
        else:
            return self._position.copy()

    def compute_rewards(self, actions, obs):
        """See parent class.

        The rewards are described in the Rewards section of the description of
        the environment.
        """
        achieved_goals = obs['ob'][:, self.image_size:]
        desired_goals = self._target_position[np.newaxis, :]
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_goal(self):
        """See parent class."""
        return self._target_position.copy()

    def sample_goals(self, batch_size):
        """See parent class.

        The goal is the desired x,y coordinates.
        """
        if self.fixed_goal is not None:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                if batch_size > 1:
                    logging.warning("This is very slow!")
                goals[b, :] = self._sample_position(
                    self.obs_range.low,
                    self.obs_range.high,
                )
        return {'goals': goals}

    def set_position(self, pos):
        """Set the position of the agent."""
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    # ======================================================================= #
    #                     Functions for ImageEnv wrapper                      #
    # ======================================================================= #

    def get_image(self, width=None, height=None):
        """Return a black and white image."""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, b = img[:, :, 0], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def draw(self, drawer):
        """Create the image corresponding to the current state."""
        drawer.fill(Color('white'))
        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color('blue'),
        )

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self, mode='human', close=False):
        """Render the environment state."""
        if mode == 'rgb_array':
            return self.get_image(self.render_size, self.render_size)

        if close:
            self.render_drawer = None
            return

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    # ======================================================================= #
    #                      Visualization/utility methods                      #
    # ======================================================================= #

    def true_model(self, state, action):
        """Return the next position by the agent.

        Parameters
        ----------
        state : array_like
            the state by the agent
        action : array_like
            the action by the agent

        Returns
        -------
        array_like
            the next position.
        """
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )

    def true_states(self, state, actions):
        """Return the next states given a set of states and actions.

        Parameters
        ----------
        state : array_like
            the states by the agent
        actions : array_like
            the actions by the agent

        Returns
        -------
        list of array_like
            the next states
        """
        real_states = [state]
        for action in actions:
            next_state = self.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    def plot_trajectory(self, ax, states, actions, goal=None):
        """Plot the trajectory of an agent.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            the axis object to plot the figure on
        states : array_like
            the states by the agent
        actions : array_like
            the actions by the agent
        goal : [int, int]
            the x,y coordinates of the goal
        """
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -self.boundary_dist,
                -self.boundary_dist,
            ],
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            [
                self.boundary_dist,
                self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                self.boundary_dist,
                self.boundary_dist,
            ],
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                self.boundary_dist,
                -self.boundary_dist,
            ],
            [
                -self.boundary_dist,
                -self.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -self.boundary_dist - 1,
            self.boundary_dist + 1,
        )
        ax.set_xlim(
            -self.boundary_dist - 1,
            self.boundary_dist + 1,
        )
Esempio n. 7
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=84,
            get_image_base_render_size=None,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.60,
            boundary_dist=4,
            ball_radius=0.50,
            walls=None,
            fixed_goal=None,
            fixed_init_position=None,
            randomize_position_on_reset=True,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            pointmass_color="blue",
            bg_color="black",
            wall_color="white",
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if fixed_init_position is not None:
            fixed_init_position = np.array(fixed_init_position)
        if len(kwargs) > 0:
            LOGGER = logging.getLogger(__name__)
            LOGGER.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)
        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self._fixed_init_position = fixed_init_position
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.pointmass_color = pointmass_color
        self.bg_color = bg_color
        self._wall_color = wall_color
        self.render_drawer = None

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = np.zeros(2)
        self._position = np.zeros(2)

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        if get_image_base_render_size:
            base_width, base_height = get_image_base_render_size
            self._drawer = PygameViewer(
                screen_width=base_width,
                screen_height=base_height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
            self._fixed_get_image_render_size = True
        else:
            self._drawer = None
            self._fixed_get_image_render_size = False

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        done = False
        return ob, reward, done, info

    def reset(self):
        self._target_position = self.sample_goal()['state_desired_goal']
        if self.randomize_position_on_reset:
            self._position = self._sample_position(
                self.obs_range.low,
                self.obs_range.high,
            )
        else:
            self._position = self._fixed_init_position

        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        return dict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_contextual_diagnostics(self, paths, contexts):
        diagnostics = OrderedDict()
        state_key = "state_observation"
        goal_key = "state_desired_goal"
        values = []
        for i in range(len(paths)):
            state = paths[i]["observations"][-1][state_key]
            goal = contexts[i][goal_key]
            distance = np.linalg.norm(state - goal)
            values.append(distance)
        diagnostics_key = goal_key + "/final/distance"
        diagnostics.update(create_stats_ordered_dict(
            diagnostics_key,
            values,
        ))

        values = []
        for i in range(len(paths)):
            for j in range(len(paths[i]["observations"])):
                state = paths[i]["observations"][j][state_key]
                goal = contexts[i][goal_key]
                distance = np.linalg.norm(state - goal)
                values.append(distance)
        diagnostics_key = goal_key + "/distance"
        diagnostics.update(create_stats_ordered_dict(
            diagnostics_key,
            values,
        ))
        return diagnostics

    def goal_conditioned_diagnostics(self, paths, goals):
        statistics = OrderedDict()
        distance_to_target_list = []
        is_success_list = []
        for path, goal in zip(paths, goals):
            distance_to_target = np.linalg.norm(path['observations'] - goal,
                                                axis=1)
            is_success = distance_to_target < self.target_radius
            distance_to_target_list.append(distance_to_target)
            is_success_list.append(is_success)
        for stat_name, stat_list in [
            ('distance_to_target', distance_to_target_list),
            ('is_success', is_success_list),
        ]:
            statistics.update(
                create_stats_ordered_dict(
                    stat_name,
                    stat_list,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    '{}/final'.format(stat_name),
                    [s[-1:] for s in stat_list],
                    always_show_all_stats=True,
                    exclude_max_min=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    '{}/initial'.format(stat_name),
                    [s[:1] for s in stat_list],
                    always_show_all_stats=True,
                    exclude_max_min=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        if not self.fixed_goal is None:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            if len(self.walls) > 0:
                if batch_size > 1:
                    logging.warning("This is very slow!")
                for b in range(batch_size):
                    goals[b, :] = self._sample_position(
                        self.obs_range.low,
                        self.obs_range.high,
                    )
            else:
                goals = np.random.uniform(
                    self.obs_range.low,
                    self.obs_range.high,
                    size=(batch_size, self.obs_range.low.size),
                )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self._drawer is None or (not self._fixed_get_image_render_size and
                                    (self._drawer.width != width
                                     or self._drawer.height != height)):
            if width != height:
                raise NotImplementedError()
            self._drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self._drawer)
        if width and height:
            wh_size = (width, height)
        else:
            wh_size = None
        img = self._drawer.get_image(wh_size)
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer):
        drawer.fill(Color(self.bg_color))
        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color(self.pointmass_color),
        )

        for wall in self.walls:
            drawer.draw_rect(
                wall.endpoint4,
                wall.endpoint1[0] - wall.endpoint4[0],
                -wall.endpoint1[1] + wall.endpoint2[1],
                Color(self._wall_color),
                thickness=0,
            )
        drawer.render()

    def render(self, mode='human', close=False):
        if close:
            self.render_drawer = None
            return

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius,
                          self.boundary_dist + self.ball_radius),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'distance_to_target',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Esempio n. 8
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """

    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=84,
            reward_type="dense",
            action_scale=0.01,
            target_radius=0.60,
            boundary_dist=0.15,
            ball_radius=0.50,
            walls=None,
            fixed_goal=None,
            randomize_position_on_reset=True,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            record_interval=1,
            **kwargs
    ):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        if fixed_goal is not None:
            fixed_goal = np.array(fixed_goal)
        if len(kwargs) > 0:
            LOGGER = logging.getLogger(__name__)
            LOGGER.log(logging.WARNING, "WARNING, ignoring kwargs:", kwargs)
        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.walls = walls
        self.fixed_goal = fixed_goal
        self.randomize_position_on_reset = randomize_position_on_reset
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.record_interval = record_interval

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))
        self._center = np.array([0.0, -0.05])

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')
        self.observation_space = spaces.Dict([
            ('observations', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.render_drawer = None
        self._meta_time = -1
        self._dtheta = 0.2
        self._index = -1
        self._ac_hist = []
        self._pos_hist = []

        self._step_time = 0

    def step(self, velocities):
        if self._step_time > 0 and (self._step_time + 1) % 50 == 0:
            self.reset(True)
        else:
            assert self.action_scale <= 1.0
            velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale

            self._ac_hist.append(velocities)

            new_position = self._position + velocities
            for wall in self.walls:
                new_position = wall.handle_collision(
                    self._position, new_position
                )
            self._position = new_position
            self._position = np.clip(
                self._position,
                a_min=-self.boundary_dist,
                a_max=self.boundary_dist,
            )

        self._pos_hist.append(self._position.copy())

        distance_to_target = np.linalg.norm(
            self._position - self._target_position
        )
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
        }
        done = False

        self._step_time += 1
        return ob, reward, done, info


    def reset(self, called_by_myself=False):
        # if self._pos_hist and self._meta_time % self.record_interval == 0:
        #     pos_hist = np.array(self._pos_hist)
        #     assert pos_hist.shape[0] == 50

        #     plt.figure()
        #     ax = plt.gca()
        #     ax.set_aspect('equal')

        #     circle = plt.Circle(self._center, 0.1, color='dimgray', fill=False)
        #     ax.add_artist(circle)
        #     plt.scatter(pos_hist[:, 0], pos_hist[:, 1], c=[(i+1)/pos_hist.shape[0] for i in range(pos_hist.shape[0])], cmap='winter', s=20)
        #     plt.scatter(self._target_position[0], self._target_position[1], s=30, c='dimgray')
            
        #     plt.xlim(self._center[0] - 0.12, self._center[0] + 0.12)
        #     plt.ylim(self._center[1] - 0.12, self._center[1] + 0.12)
        #     plt.savefig('/scr/annie/softlearning-ctrl/out1/{}.png'.format(self._meta_time))
        #     plt.close()

        if self._pos_hist:
            tmp1 = 0.1 * np.array([np.cos(self._dtheta * (self._index + 1)), np.sin(self._dtheta * (self._index + 1))]) + self._center
            tmp2 = 0.1 * np.array([np.cos(self._dtheta * (self._index - 1)), np.sin(self._dtheta * (self._index - 1))]) + self._center
            if np.linalg.norm(self._pos_hist[-1] - tmp1) > np.linalg.norm(self._pos_hist[-1] - tmp2):
                self._index += 1
            else:
                self._index -= 1

        self._ac_hist = []
        self._pos_hist = []

        if not called_by_myself:
            self._step_time = 0
            # self._index = 0
            # self._pos_hist = []

        self._target_position = self.sample_goal()['state_desired_goal']
        self._position = np.zeros(2)

        if called_by_myself:
            self._meta_time += 1
        # self._meta_time += 1

        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def _get_obs(self):
        return dict(
            observations=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=self._position.copy(),
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if self.reward_type == "sparse":
            return -(d > self.target_radius).astype(np.float32)
        elif self.reward_type == "dense":
            return -d
        elif self.reward_type == 'vectorized_dense':
            return -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError()

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
            'radius',
            'target_position',
            'distance_to_target',
            'velocity',
            'speed',
            'is_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(create_stats_ordered_dict(
                '%s%s' % (prefix, stat_name),
                stat,
                always_show_all_stats=True,
                ))
            statistics.update(create_stats_ordered_dict(
                'Final %s%s' % (prefix, stat_name),
                [s[-1] for s in stat],
                always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        if not self.fixed_goal is None:
            goals = np.repeat(
                self.fixed_goal.copy()[None],
                batch_size,
                0
            )
        else:
            goals = np.zeros((batch_size, self.obs_range.low.size))
            for b in range(batch_size):
                if batch_size > 1:
                    logging.warning("This is very slow!")

                # DISCRETE
                # if self._meta_time % 4 == 0:
                #     goals[b, 0] = 0.1
                #     goals[b, 1] = 0.0
                # elif self._meta_time % 4 == 1:
                #     goals[b, 0] = 0.0
                #     goals[b, 1] = 0.1
                # elif self._meta_time % 4 == 2:
                #     goals[b, 0] = -0.1
                #     goals[b, 1] = 0.0
                # else:
                #     goals[b, 0] = 0.0
                #     goals[b, 1] = -0.1

                # CIRCLE
                # goals[b, 0] = 0.1 * np.cos(self._dtheta * self._meta_time)
                # goals[b, 1] = 0.1 * np.sin(self._dtheta * self._meta_time)
                goals[b, 0] = 0.1 * np.cos(self._dtheta * self._index) + self._center[0]
                goals[b, 1] = 0.1 * np.sin(self._dtheta * self._index) + self._center[1]

                # LINE
                # goals[b, 0] = 0.1 * self._meta_time / 2000
                # goals[b, 1] = 0.1 - 0.1 * self._meta_time / 2000
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                x_bounds=(-self.boundary_dist - self.ball_radius, self.boundary_dist + self.ball_radius),
                y_bounds=(-self.boundary_dist - self.ball_radius, self.boundary_dist + self.ball_radius),
                render_onscreen=self.render_onscreen,
            )
        self.draw(self.drawer, False)
        img = self.drawer.get_image()
        if self.images_are_rgb:
            return img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            img = (-r + b).transpose().flatten()
            return img

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer, tick):
        # if self.drawer is not None:
        #     self.drawer.fill(Color('white'))
        # if self.render_drawer is not None:
        #     self.render_drawer.fill(Color('white'))
        drawer.fill(Color('white'))
        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        drawer.draw_solid_circle(
            self._position,
            self.ball_radius,
            Color('blue'),
        )

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )

        drawer.render()
        if tick:
            drawer.tick(self.render_dt_msec)

    def render(self, close=False):
        if close:
            self.render_drawer = None
            return

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                x_bounds=(-self.boundary_dist-self.ball_radius, self.boundary_dist+self.ball_radius),
                y_bounds=(-self.boundary_dist-self.ball_radius, self.boundary_dist+self.ball_radius),
                render_onscreen=True,
            )
        self.draw(self.render_drawer, True)

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
            'distance_to_target',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(create_stats_ordered_dict(
                '%s%s' % (prefix, stat_name),
                stat,
                always_show_all_stats=True,
                ))
            statistics.update(create_stats_ordered_dict(
                'Final %s%s' % (prefix, stat_name),
                [s[-1] for s in stat],
                always_show_all_stats=True,
                ))
        return statistics

    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(state[0], -state[1],
                    marker='o', color=color, markersize=10,
                    )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1],
                  scale_units='xy', angles='xy', scale=1, width=0.005)
        ax.quiver(x[:-1], y[:-1], actions_x, actions_y, scale_units='xy',
                  angles='xy', scale=1, color='r',
                  width=0.0035, )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k', linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass
Esempio n. 9
0
class Point2DEnv(MultitaskEnv, Serializable):
    """
    A little 2D point whose life goal is to reach a target.
    """
    def __init__(
            self,
            render_dt_msec=0,
            action_l2norm_penalty=0,  # disabled for now
            render_onscreen=False,
            render_size=256,
            reward_type="dense",
            action_scale=1.0,
            target_radius=0.5,
            boundary_dist=4,
            ball_radius=0.15,
            ball_pixel_radius=0,
            walls=None,
            init_pos_range=None,
            target_pos_range=None,
            sparse_goals=None,
            shuffle_states=False,
            images_are_rgb=False,  # else black and white
            show_goal=True,
            n_bins=10,
            use_count_reward=False,
            show_discrete_grid=False,
            convert_obs_to_image=False,
            **kwargs):
        if walls is None:
            walls = []
        if walls is None:
            walls = []
        # if fixed_goal is not None:
        #     fixed_goal = np.array(fixed_goal)

        self.quick_init(locals())
        self.render_dt_msec = render_dt_msec
        self.action_l2norm_penalty = action_l2norm_penalty
        self.render_onscreen = render_onscreen
        self.render_size = render_size
        self.reward_type = reward_type
        self.action_scale = action_scale
        self.target_radius = target_radius
        self.boundary_dist = boundary_dist
        self.ball_radius = ball_radius
        self.ball_pixel_radius = ball_pixel_radius
        self.walls = walls
        self.n_bins = n_bins
        self.use_count_reward = use_count_reward
        self.show_discrete_grid = show_discrete_grid
        self.images_are_rgb = images_are_rgb
        self.show_goal = show_goal
        self.sparse_goals = sparse_goals
        self.shuffle_states = shuffle_states
        self.convert_obs_to_image = convert_obs_to_image

        self.x_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.y_bins = np.linspace(-self.boundary_dist, self.boundary_dist,
                                  self.n_bins)
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

        self._random_mapping_x = np.random.permutation(range(N_SHUFFLE_BINS))
        self._random_mapping_y = np.random.permutation(range(N_SHUFFLE_BINS))

        self.max_target_distance = self.boundary_dist - self.target_radius

        self._target_position = None
        self._position = np.zeros((2))

        u = np.ones(2)
        self.action_space = spaces.Box(-u, u, dtype=np.float32)

        o = self.boundary_dist * np.ones(2)
        self.obs_range = spaces.Box(-o, o, dtype='float32')

        print(
            f"[Point2D] Using boundary dist: {self.boundary_dist}, obs space: ({self.obs_range.low}, {self.obs_range.high})"
        )

        if not init_pos_range:
            self.init_pos_range = self.obs_range
        else:
            assert np.all(np.abs(init_pos_range) < boundary_dist), (
                f"Init position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )
            low, high = init_pos_range
            self.init_pos_range = spaces.Box(np.array(low),
                                             np.array(high),
                                             dtype='float32')
            print(
                f"[Point2D] Using initial pos range of low={low}, high={high}")

        if not target_pos_range:
            self.target_pos_range = self.obs_range
        else:
            assert np.all(np.abs(target_pos_range) < boundary_dist), (
                f"Goal position must be"
                "within the boundaries of the environment: ({-boundary_dist}, {boundary_dist})"
            )

            low, high = target_pos_range
            self.target_pos_range = spaces.Box(np.array(low),
                                               np.array(high),
                                               dtype='float32')

        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('onehot_observation',
             spaces.Box(0,
                        1,
                        shape=(2 * (self.n_bins + 1), ),
                        dtype=np.float32)),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

        self.drawer = None
        self.render_drawer = None

        self.reset()

    def step(self, velocities):
        assert self.action_scale <= 1.0
        velocities = np.clip(velocities, a_min=-1, a_max=1) * self.action_scale
        new_position = self._position + velocities
        orig_new_pos = new_position.copy()
        for wall in self.walls:
            new_position = wall.handle_collision(self._position, new_position)
        if sum(new_position != orig_new_pos) > 1:
            """
            Hack: sometimes you get caught on two walls at a time. If you
            process the input in the other direction, you might only get
            caught on one wall instead.
            """
            new_position = orig_new_pos.copy()
            for wall in self.walls[::-1]:
                new_position = wall.handle_collision(self._position,
                                                     new_position)

        self._position = new_position
        self._position = np.clip(
            self._position,
            a_min=-self.boundary_dist,
            a_max=self.boundary_dist,
        )
        distance_to_target = np.linalg.norm(self._position -
                                            self._target_position)
        is_success = distance_to_target < self.target_radius

        ob = self._get_obs()
        x_d, y_d = ob['discrete_observation']
        self.bin_counts[x_d, y_d] += 1

        reward = self.compute_reward(velocities, ob)
        info = {
            'radius': self.target_radius,
            'target_position': self._target_position,
            'distance_to_target': distance_to_target,
            'velocity': velocities,
            'speed': np.linalg.norm(velocities),
            'is_success': is_success,
            'ext_reward': self._ext_reward,
        }

        if self.sparse_goals is not None:
            # Set reward for anything within threshold of goal(s) to 1
            # (ignore any classifier reward)
            goals, threshold = self.sparse_goals
            min_dist = self.compute_min_dist(goals)
            info['is_sparse_success'] = min_dist < threshold

        if hasattr(self, 'wall_shape'):
            if self.wall_shape == 'medium-maze':
                info[
                    'manhattan_dist_to_target'] = self._medium_maze_manhattan_distance(
                        ob['state_achieved_goal'], ob['state_desired_goal'])
            elif self.wall_shape == 'hard-maze':
                info[
                    'manhattan_dist_to_target'] = self._hard_maze_goal_distance(
                        ob['state_achieved_goal'], ob['state_desired_goal'])
            elif self.wall_shape == 'double-medium-maze':
                goals, threshold = self.sparse_goals
                info['manhattan_dist_to_sparse_goal'] = self.compute_min_dist(
                    goals, self._double_maze_manhattan_distance)
                info['manhattan_dist_to_target'] = self.compute_min_dist(
                    ob['state_desired_goal'],
                    self._double_maze_manhattan_distance)

        if self.use_count_reward:
            info['count_bonus'] = self._count_bonus

        done = False
        return ob, reward, done, info

    def compute_min_dist(
        self,
        goals,
        distance_fn=lambda s1, s2: np.linalg.norm(s1 - s2, axis=-1)):
        dists = []
        if len(goals.shape) == 1:
            goals = goals[None]
        for goal in goals:
            dist = distance_fn(self._position, goal)
            dists.append(dist)
        min_dist = np.array(dists).min(axis=0)
        return min_dist

    def reset(self):
        # TODO: Make this more general
        self._target_position = self.sample_goal()['state_desired_goal']
        # if self.randomize_position_on_reset:
        self._position = self._sample_position(
            # self.obs_range.low,
            # self.obs_range.high,
            self.init_pos_range.low,
            self.init_pos_range.high,
        )
        return self._get_obs()

    def _position_inside_wall(self, pos):
        for wall in self.walls:
            if wall.contains_point(pos):
                return True
        return False

    def _sample_position(self, low, high):
        if np.all(low == high):
            return low
        pos = np.random.uniform(low, high)
        while self._position_inside_wall(pos) is True:
            pos = np.random.uniform(low, high)
        return pos

    def clear_bin_counts(self):
        self.bin_counts = np.ones((self.n_bins + 1, self.n_bins + 1))

    def _discretize_observation(self, obs):
        x, y = obs['state_observation'].copy()
        x_d, y_d = np.digitize(x, self.x_bins), np.digitize(y, self.y_bins)
        return np.array([x_d, y_d])

    def _discretized_states(self, states, bins=16, low=-4, high=4):
        """
        Converts continuous to discrete states.
        
        Params
        - states: A shape (n, 2) batch of continuous observations
        - bins: Number of bins for both x and y coordinates
        - low: Lowest value (inclusive) for continuous x and y
        - high: Highest value (inclusive) for continuous x and y
        """
        bin_size = (high - low) / bins
        shifted_states = states - low
        return np.clip(shifted_states // bin_size, 0,
                       bins - 1).astype(np.int32)

    def _get_shuffled_states(self, states):
        if len(states.shape) == 1:
            states = states[None]
        states = self._discretized_states(states,
                                          bins=N_SHUFFLE_BINS,
                                          low=MAZE_LOW,
                                          high=MAZE_HIGH)
        results = (np.hstack([
            self._random_mapping_x[states[:, 0]][:, None],
            self._random_mapping_y[states[:, 1]][:, None]
        ]) - 8) / 2
        return results

    def _state_to_pixel_coords(self,
                               states,
                               state_min=-4,
                               state_max=4,
                               img_width=28):
        coords = (states - state_min) * (img_width / (state_max - state_min))

        # Need to subtract 1 because pygame rendering shifts everything to the left/up by 1 for some reason
        # (e.g. [-3, -3] is at 5, 5 instead of 6, 6). However this does not apply to the top/left edges.
        return np.clip(coords, 0, None).astype(np.int32)

    def _get_obs(self):
        state_obs = self._get_shuffled_states(self._position.copy()).squeeze() \
            if self.shuffle_states else self._position.copy()

        obs = collections.OrderedDict(
            observation=self._position.copy(),
            desired_goal=self._target_position.copy(),
            achieved_goal=self._position.copy(),
            state_observation=state_obs,
            state_desired_goal=self._target_position.copy(),
            state_achieved_goal=self._position.copy(),
        )

        # Update with discretized state
        pos_discrete = self._discretize_observation(obs)
        pos_onehot = np.eye(self.n_bins + 1)[pos_discrete]
        obs['discrete_observation'] = pos_discrete
        obs['onehot_observation'] = pos_onehot

        return obs

    def _hard_maze_goal_distance(self, s1, goal=None):
        s1 = s1.copy()
        if len(s1.shape) == 1:
            s1 = s1[None]

        x1, y1 = s1[:, 0], s1[:, 1]
        dist = np.zeros(len(s1))

        top_section = y1 <= -2
        dist[top_section] += np.abs(
            2 - x1[top_section])  # Move horizontally to corner
        dist[top_section] += -2 - y1[top_section]  # Move vertically to corner
        x1[top_section], y1[top_section] = 2, -2

        right_section = x1 >= 2
        dist[right_section] += x1[right_section] - 2  # Move horizontally
        dist[right_section] += np.abs(
            2 - y1[right_section])  # Move vertically to corner
        x1[right_section], y1[right_section] = 2, 2

        bottom_section = y1 >= 2
        dist[bottom_section] += np.abs(-2 -
                                       x1[bottom_section])  # Move horizontally
        dist[bottom_section] += y1[
            bottom_section] - 2  # Move vertically to corner
        x1[bottom_section], y1[bottom_section] = -2, 2

        left_section = x1 <= -2
        dist[left_section] += (-2 - x1[left_section])  # Move horizontally
        dist[left_section] += np.abs(
            y1[left_section])  # Move vertically to corner
        x1[left_section], y1[left_section] = -2, 0

        mid_section = np.logical_and(y1 <= 0, y1 >= -2)
        dist[mid_section] += np.abs(x1[mid_section])  # Move horizontally
        dist[mid_section] += np.abs(
            y1[mid_section])  # Move vertically to corner

        # Move to goal!!!
        dist += np.abs(x1 - (-0.5))
        dist += np.abs(y1 - 1.25)

        return dist

    def _medium_maze_manhattan_distance(self, s1, s2):
        # Maze wall positions
        left_wall_x = -self.boundary_dist / 3
        left_wall_bottom = self.inner_wall_max_dist
        right_wall_x = self.boundary_dist / 3
        right_wall_top = -self.inner_wall_max_dist

        s1 = s1.copy()
        s2 = s2.copy()

        if len(s1.shape) == 1:
            s1, s2 = s1[None], s2[None]

        # Since maze distances are symmetric, redefine s1,s2 for convenience
        # so that points in s1 are to the left of those in s2
        combined = np.hstack((s1[:, None], s2[:, None]))
        indices = np.argmin((s1[:, 0], s2[:, 0]), axis=0)
        s1 = np.take_along_axis(combined, indices[:, None, None],
                                axis=1).squeeze(axis=1)
        s2 = np.take_along_axis(combined, 1 - indices[:, None, None],
                                axis=1).squeeze(axis=1)

        x1 = s1[:, 0]
        x2 = s2[:, 0]

        # Horizontal movement
        x_dist = np.abs(x2 - x1)

        # Vertical movement
        boundary_ys = [left_wall_bottom, right_wall_top, 0]
        boundary_xs = [left_wall_x, right_wall_x, self.boundary_dist, -4.0001]
        y_directions = [
            1, -1, 0
        ]  # +1 means need to get to bottom, -1 means need to get to top
        curr_y, goal_y = s1[:, 1], s2[:, 1]
        y_dist = np.zeros(len(s1))

        for i in range(3):
            # Get all points where s1 and s2 respectively are in the current vertical section of the maze
            curr_in_section = x1 <= boundary_xs[i]
            goal_in_section = np.logical_and(boundary_xs[i - 1] < x2,
                                             x2 <= boundary_xs[i])
            goal_after_section = x2 > boundary_xs[i]

            # Both in current section: move directly to goal
            mask = np.logical_and(curr_in_section, goal_in_section)
            y_dist += mask * np.abs(curr_y - goal_y)
            curr_y[mask] = goal_y[mask]

            # s2 is further in maze: move to next corner
            mask = np.logical_and(
                curr_in_section,
                np.logical_and(goal_after_section, y_directions[i] *
                               (boundary_ys[i] - curr_y) > 0))
            y_dist += mask * np.clip(
                y_directions[i] * (boundary_ys[i] - curr_y), 0, None)
            curr_y[mask] = boundary_ys[i]

        return x_dist + y_dist

    def _double_maze_manhattan_distance(self, s1, s2):
        # Maze wall positions
        left_wall_bottom = self.inner_wall_max_dist + 1
        right_wall_top = -self.inner_wall_max_dist - 1

        s1 = s1.copy()
        s2 = s2.copy()

        if len(s1.shape) == 1:
            s1 = s1[None]
        if len(s2.shape) == 1:
            s2 = s2[None]

        # Since maze distances are symmetric, redefine s1,s2 for convenience
        # so that points in s1 are to the left of those in s2
        combined = np.hstack((s1[:, None], s2[:, None]))
        indices = np.argmin((s1[:, 0], s2[:, 0]), axis=0)
        s1 = np.take_along_axis(combined, indices[:, None, None],
                                axis=1).squeeze(axis=1)
        s2 = np.take_along_axis(combined, 1 - indices[:, None, None],
                                axis=1).squeeze(axis=1)

        x1 = s1[:, 0]
        x2 = s2[:, 0]

        # Horizontal movement
        x_dist = np.abs(x2 - x1)

        # Vertical movement
        boundary_ys = [
            right_wall_top, left_wall_bottom, right_wall_top, left_wall_bottom,
            0
        ]
        boundary_xs = [
            -3 / 5 * self.boundary_dist, -1 / 5 * self.boundary_dist,
            1 / 5 * self.boundary_dist, 3 / 5 * self.boundary_dist,
            self.boundary_dist, -4.0001
        ]
        y_directions = [
            -1, +1, -1, +1, 0
        ]  # +1 means need to get to bottom, -1 means need to get to top
        curr_y, goal_y = s1[:, 1], s2[:, 1]
        y_dist = np.zeros(len(s1))

        for i in range(5):
            # Get all points where s1 and s2 respectively are in the current vertical section of the maze
            curr_in_section = x1 <= boundary_xs[i]
            #         print(x1)
            #         print(boundary_xs[i])
            goal_in_section = np.logical_and(boundary_xs[i - 1] < x2,
                                             x2 <= boundary_xs[i])
            goal_after_section = x2 > boundary_xs[i]

            #         print("x1:", x1)
            #         print("x2:", x2)
            #         print(boundary_xs[i])
            #         print(boundary_xs[i-1])
            #         print(curr_in_section, goal_in_section)

            # Both in current section: move directly to goal
            mask = np.logical_and(curr_in_section, goal_in_section)
            y_dist += mask * np.abs(curr_y - goal_y)
            curr_y[mask] = goal_y[mask]

            # s2 is further in maze: move to next corner
            mask = np.logical_and(
                curr_in_section,
                np.logical_and(goal_after_section, y_directions[i] *
                               (boundary_ys[i] - curr_y) > 0))
            y_dist += mask * np.clip(
                y_directions[i] * (boundary_ys[i] - curr_y), 0, None)
            curr_y[mask] = boundary_ys[i]

    #     print(x_dist, y_dist)
        return x_dist + y_dist

    def compute_rewards(self, actions, obs, reward_type=None):
        reward_type = reward_type or self.reward_type

        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        d = np.linalg.norm(achieved_goals - desired_goals, axis=-1)
        if reward_type == "sparse":
            r = -(d > self.target_radius).astype(np.float32)
        elif reward_type == "multi-sparse":
            goals, threshold = self.sparse_goals
            min_dist = np.array([self.compute_min_dist(goals)])
            r = -(min_dist > self.target_radius).astype(np.float32)
        elif reward_type == "dense":
            r = -d
        elif reward_type == "vectorized_dense":
            r = -np.abs(achieved_goals - desired_goals)
        elif reward_type == "medium_maze_ground_truth_manhattan":
            """
            # Use maze distance from current position to goal as the negative reward.
            # This is specifically for the medium Maze-v0 env, which has two vertical walls.
            """
            r = -self._medium_maze_manhattan_distance(achieved_goals,
                                                      desired_goals)
        elif reward_type == "laplace_smoothing":
            # Laplace smoothing: 1 within the goal region, 1/(n+2) at all other states
            # (where n is the number of visitations)
            r = np.zeros(d.shape)
            pos_d = obs['discrete_observation']
            r = 1 / (self.bin_counts[pos_d[:, 0], pos_d[:, 1]] + 2)
            r[d <= self.target_radius] = 1
        elif reward_type == "none":
            r = np.zeros(d.shape)
        else:
            raise NotImplementedError(
                f"Unrecognized reward type: {reward_type}")

        if self.use_count_reward:
            # TODO: Add different count based strategies
            pos_d = obs['discrete_observation']
            self._ext_reward = r.copy()
            self._count_bonus = 1 / np.sqrt(self.bin_counts[pos_d[:, 0],
                                                            pos_d[:, 1]])
            r += self._count_bonus
        else:
            self._ext_reward = r.copy()

        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'radius',
                'target_position',
                'distance_to_target',
                'velocity',
                'speed',
                'is_success',
                'ext_reward',
                'count_bonus',
        ] + (['is_sparse_success'] if self.sparse_goals is not None else []):
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_goal(self):
        return {
            'desired_goal': self._target_position.copy(),
            'state_desired_goal': self._target_position.copy(),
        }

    def sample_goals(self, batch_size):
        # if self.fixed_goal:
        #     goals = np.repeat(
        #         self.fixed_goal.copy()[None],
        #         batch_size,
        #         0)
        # else:
        goals = np.zeros((batch_size, self.obs_range.low.size))
        for b in range(batch_size):
            if batch_size > 1:
                logging.warning("This is very slow!")
            goals[b, :] = self._sample_position(
                # self.obs_range.low,
                # self.obs_range.high,
                self.target_pos_range.low,
                self.target_pos_range.high,
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def set_position(self, pos):
        self._position[0] = pos[0]
        self._position[1] = pos[1]

    """Functions for ImageEnv wrapper"""

    def get_image(self, width=None, height=None, invert_colors=False):
        """Returns a black and white image"""
        if self.drawer is None:
            if width != height:
                raise NotImplementedError()
            self.drawer = PygameViewer(
                screen_width=width,
                screen_height=height,
                # TODO: Action scale = 1 breaks rendering, why?
                # x_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                # y_bounds=(-self.boundary_dist - self.ball_radius,
                #           self.boundary_dist + self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=self.render_onscreen,
            )

        old_position = self._position
        self._position = np.array([-4, -4])

        self.draw(self.drawer)
        img = self.drawer.get_image()
        # img = skimage.transform.resize(img, (width, height), anti_aliasing=True, preserve_range=True)
        self._position = old_position

        if self.images_are_rgb:
            im = img.transpose((1, 0, 2))
        else:
            r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
            im = (-r + b).transpose().flatten()

        im[0:2, 0:2] = np.array([255, 255, 255])
        im = 255 - im  # Invert image - make background black, walls white
        im = im[None]

        # Set pixel at ball position to blue for each individual observation
        pixel_coords = self._state_to_pixel_coords(self._position,
                                                   img_width=width)[None]
        for i in range(-self.ball_pixel_radius, self.ball_pixel_radius + 1):
            for j in range(-self.ball_pixel_radius,
                           self.ball_pixel_radius + 1):
                im[range(len(pixel_coords)),
                   np.clip(pixel_coords[:, 1] + i, 0, 27),
                   np.clip(pixel_coords[:, 0] + j, 0, 27)] = np.array(
                       [0, 0, 255])

        self._position = old_position
        return im[0] / 255

    def set_to_goal(self, goal_dict):
        goal = goal_dict["state_desired_goal"]
        self._position = goal
        self._target_position = goal

    def get_env_state(self):
        return self._get_obs()

    def set_env_state(self, state):
        position = state["state_observation"]
        goal = state["state_desired_goal"]
        self._position = position
        self._target_position = goal

    def draw(self, drawer):
        drawer.fill(Color('white'))

        if self.show_discrete_grid:
            for x in self.x_bins:
                drawer.draw_segment((x, -self.boundary_dist),
                                    (x, self.boundary_dist),
                                    Color(220, 220, 220, 25),
                                    aa=False)
            for y in self.y_bins:
                drawer.draw_segment((-self.boundary_dist, y),
                                    (self.boundary_dist, y),
                                    Color(220, 220, 220, 25),
                                    aa=False)

        if self.show_goal:
            drawer.draw_solid_circle(
                self._target_position,
                self.target_radius,
                Color('green'),
            )
        try:
            drawer.draw_solid_circle(
                self._position,
                self.ball_radius,
                Color('blue'),
            )
        except ValueError as e:
            print('\n\n RENDER ERROR \n\n')

        for wall in self.walls:
            drawer.draw_segment(
                wall.endpoint1,
                wall.endpoint2,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint2,
                wall.endpoint3,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint3,
                wall.endpoint4,
                Color('black'),
            )
            drawer.draw_segment(
                wall.endpoint4,
                wall.endpoint1,
                Color('black'),
            )
        drawer.render()

    def render(self,
               mode='human',
               width=None,
               height=None,
               invert_colors=False,
               close=False):
        if close:
            self.render_drawer = None
            return
        if mode == 'rgb_array':
            return self.get_image(width=width,
                                  height=height,
                                  invert_colors=invert_colors)

        if self.render_drawer is None or self.render_drawer.terminated:
            self.render_drawer = PygameViewer(
                self.render_size,
                self.render_size,
                # x_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                # y_bounds=(-self.boundary_dist-self.ball_radius,
                #           self.boundary_dist+self.ball_radius),
                x_bounds=(-self.boundary_dist, self.boundary_dist),
                y_bounds=(-self.boundary_dist, self.boundary_dist),
                render_onscreen=True,
            )
        self.draw(self.render_drawer)
        self.render_drawer.tick(self.render_dt_msec)
        if mode != 'interactive':
            self.render_drawer.check_for_exit()

    # def get_diagnostics(self, paths, prefix=''):
    #     statistics = OrderedDict()
    #     for stat_name in ('distance_to_target', ):
    #         stat_name = stat_name
    #         stat = get_stat_in_paths(paths, 'env_infos', stat_name)
    #         statistics.update(create_stats_ordered_dict(
    #             '%s%s' % (prefix, stat_name),
    #             stat,
    #             always_show_all_stats=True,
    #             ))
    #         statistics.update(create_stats_ordered_dict(
    #             'Final %s%s' % (prefix, stat_name),
    #             [s[-1] for s in stat],
    #             always_show_all_stats=True,
    #             ))
    #     return statistics
    """Static visualization/utility methods"""

    @staticmethod
    def true_model(state, action):
        velocities = np.clip(action, a_min=-1, a_max=1)
        position = state
        new_position = position + velocities
        return np.clip(
            new_position,
            a_min=-Point2DEnv.boundary_dist,
            a_max=Point2DEnv.boundary_dist,
        )

    @staticmethod
    def true_states(state, actions):
        real_states = [state]
        for action in actions:
            next_state = Point2DEnv.true_model(state, action)
            real_states.append(next_state)
            state = next_state
        return real_states

    @staticmethod
    def plot_trajectory(ax, states, actions, goal=None):
        assert len(states) == len(actions) + 1
        x = states[:, 0]
        y = -states[:, 1]
        num_states = len(states)
        plasma_cm = plt.get_cmap('plasma')
        for i, state in enumerate(states):
            color = plasma_cm(float(i) / num_states)
            ax.plot(
                state[0],
                -state[1],
                marker='o',
                color=color,
                markersize=10,
            )

        actions_x = actions[:, 0]
        actions_y = -actions[:, 1]

        ax.quiver(x[:-1],
                  y[:-1],
                  x[1:] - x[:-1],
                  y[1:] - y[:-1],
                  scale_units='xy',
                  angles='xy',
                  scale=1,
                  width=0.005)
        ax.quiver(
            x[:-1],
            y[:-1],
            actions_x,
            actions_y,
            scale_units='xy',
            angles='xy',
            scale=1,
            color='r',
            width=0.0035,
        )
        ax.plot(
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                Point2DEnv.boundary_dist,
            ],
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )
        ax.plot(
            [
                Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            [
                -Point2DEnv.boundary_dist,
                -Point2DEnv.boundary_dist,
            ],
            color='k',
            linestyle='-',
        )

        if goal is not None:
            ax.plot(goal[0], -goal[1], marker='*', color='g', markersize=15)
        ax.set_ylim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )
        ax.set_xlim(
            -Point2DEnv.boundary_dist - 1,
            Point2DEnv.boundary_dist + 1,
        )

    def initialize_camera(self, init_fctn):
        pass