Example 1

    def __init__(self,
                 vec_env,
                 vec_episodic_memory,
                 observation_embedding_fn,
                 target_image_shape,
                 exploration_reward='episodic_curiosity',
                 scale_task_reward=1.0,
                 scale_surrogate_reward=0.0,
                 append_ec_reward_as_channel=False,
                 bonus_reward_additive_term=0,
                 exploration_reward_min_step=0,
                 similarity_threshold=0.5):
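        """Wraps a vectorized env so an episodic-curiosity bonus can be added.

        Note: `vec_episodic_memory` must hold exactly one episodic memory per
        environment, and `append_ec_reward_as_channel` (which grows the
        observation space by one channel) is only valid for image-like
        observations.
        """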
        if exploration_reward == 'episodic_curiosity':
            if len(vec_episodic_memory) != vec_env.num_envs:
                raise ValueError(
                    'Each env must have a unique episodic memory.')

        # Note: post-processing of the observation might change the [0, 255]
        # range of the observation...
        if self._should_postprocess_observation(
                vec_env.observation_space.shape):
            observation_space_shape = target_image_shape[:]
            if append_ec_reward_as_channel:
                observation_space_shape[-1] += 1
            observation_space = gym.spaces.Box(low=0,
                                               high=255,
                                               shape=observation_space_shape,
                                               dtype=np.float32)
        else:
            observation_space = vec_env.observation_space
            assert not append_ec_reward_as_channel, (
                'append_ec_reward_as_channel not compatible with non-image-like obs.'
            )

        VecEnvWrapper.__init__(self,
                               vec_env,
                               observation_space=observation_space)

        self._bonus_reward_additive_term = bonus_reward_additive_term
        self._vec_episodic_memory = vec_episodic_memory
        self._observation_embedding_fn = observation_embedding_fn
        self._target_image_shape = target_image_shape
        self._append_ec_reward_as_channel = append_ec_reward_as_channel

        self._exploration_reward = exploration_reward
        self._scale_task_reward = scale_task_reward
        self._scale_surrogate_reward = scale_surrogate_reward
        self._exploration_reward_min_step = exploration_reward_min_step

        # Oracle reward.
        self._oracles = [
            oracle.OracleExplorationReward() for _ in range(self.venv.num_envs)
        ]

        # Cumulative task reward over an episode.
        self._episode_task_reward = [0.0] * self.venv.num_envs
        self._episode_bonus_reward = [0.0] * self.venv.num_envs

        # Stats on the task and exploration reward.
        self._stats_task_reward = MovingAverage(capacity=100)
        self._stats_bonus_reward = MovingAverage(capacity=100)

        # Total number of steps so far per environment (the vectorized envs
        # step in lockstep, so a single counter suffices).
        self._step_count = 0

        self._similarity_threshold = similarity_threshold

        # Observers are notified each time a new time step is generated by the
        # environment.
        # Observers implement a function "on_new_observation".
        self._observers = []
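
For orientation, here is a minimal construction sketch. `CuriosityEnvWrapper` is an assumed name for the class this `__init__` belongs to, and `make_vec_env`, `EpisodicMemory`, and `embed_observation` are illustrative placeholders, not helpers from the snippet:

num_envs = 8
vec_env = make_vec_env(num_envs)  # placeholder vectorized-env factory
wrapped = CuriosityEnvWrapper(
    vec_env,
    # The constructor requires exactly one episodic memory per env.
    vec_episodic_memory=[EpisodicMemory() for _ in range(num_envs)],
    observation_embedding_fn=embed_observation,  # observation -> embedding
    target_image_shape=[84, 84, 4],
    scale_task_reward=1.0,
    scale_surrogate_reward=0.03,  # weight of the curiosity bonus
)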
Example 2
def __init__(self, env):
    """Creates a new oracle to compute the exploration reward."""
    gym.Wrapper.__init__(self, env)
    self._oracle_exploration_reward = oracle.OracleExplorationReward()
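
Only the constructor is shown above. A `step` override that feeds the oracle might look like the sketch below; the `update_position` method and the `info['position']` key are assumptions about `OracleExplorationReward` and the wrapped env, not shown in the source:

def step(self, action):
    """Sketch only: adds an assumed oracle bonus to the task reward."""
    observation, reward, done, info = self.env.step(action)
    position = info.get('position')  # assumed to be reported by the env
    if position is not None:
        # Assumed API: returns a bonus for newly visited positions.
        reward += self._oracle_exploration_reward.update_position(position)
    return observation, reward, done, info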

Example 3

    def __init__(
        self,
        vec_env,
        vec_episodic_memory,
        observation_embedding_fn,
        intrinsic_reward_fn,
        rlb_image_shape,
        target_image_shape,
        exploration_reward='rlb',
        scale_task_reward=1.0,
        scale_surrogate_reward=None,
        exploration_reward_min_step=0,
        ir_normalize_type=0,
        ir_clip_low=None,
        name='',
    ):
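        """Wraps a vectorized env to add an RLB intrinsic reward.

        `vec_episodic_memory` must hold one episodic memory per environment;
        `ir_normalize_type` selects how the intrinsic reward is normalized
        (0: none, 1: forward-filtered running std, 2 and 3: running
        mean/std variants).
        """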
        logger.info('RLBEnvWrapper args: {}'.format(locals()))
        if exploration_reward == 'rlb':
            if len(vec_episodic_memory) != vec_env.num_envs:
                raise ValueError(
                    'Each env must have a unique episodic memory.')

        if target_image_shape is None:
            target_image_shape = rlb_image_shape

        if self._should_process_observation(vec_env.observation_space.shape):
            observation_space_shape = target_image_shape[:]
            observation_space = gym.spaces.Box(low=0,
                                               high=255,
                                               shape=observation_space_shape,
                                               dtype=np.float32)
        else:
            observation_space = vec_env.observation_space

        VecEnvWrapper.__init__(self,
                               vec_env,
                               observation_space=observation_space)

        self._vec_episodic_memory = vec_episodic_memory
        self._observation_embedding_fn = observation_embedding_fn
        self._intrinsic_reward_fn = intrinsic_reward_fn
        self._rlb_image_shape = rlb_image_shape
        self._target_image_shape = target_image_shape

        self._exploration_reward = exploration_reward
        self._scale_task_reward = scale_task_reward
        self._scale_surrogate_reward = scale_surrogate_reward
        self._exploration_reward_min_step = exploration_reward_min_step

        # Oracle reward.
        self._oracles = [
            oracle.OracleExplorationReward() for _ in range(self.venv.num_envs)
        ]

        self._ir_normalize_type = ir_normalize_type
        if self._ir_normalize_type == 0:
            pass
        elif self._ir_normalize_type == 1:
            ir_normalize_gamma = 0.99
            self._irff = RewardForwardFilter(ir_normalize_gamma)
            self._irff_rms = RunningMeanStd()
        elif self._ir_normalize_type == 2:
            self._ir_rms = RunningMeanStd()
        elif self._ir_normalize_type == 3:
            self._ir_rms = SimpleWeightedMovingScalarMeanStd(alpha=0.0001)
        else:
            raise ValueError(
                'Unknown ir_normalize_type: {}'.format(ir_normalize_type))

        self._ir_clip_low = ir_clip_low

        self._name = name

        # Cumulative task reward over an episode.
        self._episode_task_reward = [0.0] * self.venv.num_envs
        self._episode_bonus_reward = [0.0] * self.venv.num_envs

        # Stats on the task and exploration reward.
        self._stats_task_reward = MovingAverage(capacity=100)
        self._stats_bonus_reward = MovingAverage(capacity=100)

        # Total number of steps so far per environment (the vectorized envs
        # step in lockstep, so a single counter suffices).
        self._step_count = 0

        # Observers are notified each time a new time step is generated by the
        # environment.
        self._observers = []

        self._bonus_reward_raw_history = [[]
                                          for _ in range(self.venv.num_envs)]
        self._bonus_reward_history = [[] for _ in range(self.venv.num_envs)]
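
The `ir_normalize_type == 1` branch pairs a discounted forward filter with a running standard deviation, in the style of RND reward normalization. Below is a self-contained sketch with minimal stand-ins for `RewardForwardFilter` and `RunningMeanStd`; these approximate, rather than reproduce, the classes the snippet imports:

import numpy as np

class RewardForwardFilter:
    """Keeps a discounted running sum of past intrinsic rewards per env."""
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews.copy()
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

class RunningMeanStd:
    """Tracks a running mean and variance via batch (Chan et al.) updates."""
    def __init__(self, epsilon=1e-4):
        self.mean, self.var, self.count = 0.0, 1.0, epsilon

    def update(self, x):
        batch_mean, batch_var, batch_count = x.mean(), x.var(), x.size
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean += delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b
                    + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

# Normalize one batch of raw intrinsic rewards (one value per env):
irff = RewardForwardFilter(gamma=0.99)
irff_rms = RunningMeanStd()
raw_bonus = np.array([0.2, 0.5, 0.1, 0.9])
irff_rms.update(irff.update(raw_bonus))
normalized_bonus = raw_bonus / np.sqrt(irff_rms.var)
print(normalized_bonus)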