Example #1
def setUp(self):
    self.replay_buffer = HierReplayBuffer(buffer_size=2,
                                          batch_size=1,
                                          meta_period=1,
                                          meta_obs_dim=2,
                                          meta_ac_dim=3,
                                          worker_obs_dim=4,
                                          worker_ac_dim=5)
Example #2
def setUp(self):
    self.replay_buffer = HierReplayBuffer(
        buffer_size=2,
        batch_size=1,
        meta_period=3,
        obs_dim=1,
        ac_dim=1,
        co_dim=1,
        goal_dim=1,
        num_levels=3,
    )
Example #3
import random
import unittest

import numpy as np

from hbaselines.goal_conditioned.replay_buffer import HierReplayBuffer


class TestHierReplayBuffer(unittest.TestCase):
    """Tests for the HierReplayBuffer object."""
    def setUp(self):
        self.replay_buffer = HierReplayBuffer(
            buffer_size=2,
            batch_size=1,
            meta_period=3,
            obs_dim=1,
            ac_dim=1,
            co_dim=1,
            goal_dim=1,
            num_levels=3,
        )

    def tearDown(self):
        del self.replay_buffer

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods the replay buffer."""
        # Set the random seed.
        random.seed(0)

        obs_t = [np.array([i]) for i in range(10)]
        action_t = [
            [np.array([i]) for i in range(4)],
            [np.array([i]) for i in range(10)],
            [np.array([i]) for i in range(10)],
        ]
        context_t = [np.array([0]), np.array([1])]
        reward_t = [[0], [0, 1, 2], [0, 1, 2, 3, 4, 5, 6, 7, 8]]
        done_t = [False] * 9

        # Add an element.
        self.replay_buffer.add(
            obs_t=obs_t,
            action_t=action_t,
            context_t=context_t,
            reward_t=reward_t,
            done_t=done_t,
        )

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add an element.
        self.replay_buffer.add(
            obs_t=obs_t,
            action_t=action_t,
            context_t=context_t,
            reward_t=reward_t,
            done_t=done_t,
        )

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        obs0, obs1, act, rew, done, _ = self.replay_buffer.sample(False)
        np.testing.assert_array_almost_equal(obs0[0], [[0, 0]])
        np.testing.assert_array_almost_equal(obs0[1], [[6, 2]])
        np.testing.assert_array_almost_equal(obs0[2], [[6, 6]])

        np.testing.assert_array_almost_equal(obs1[0], [[9, 1]])
        np.testing.assert_array_almost_equal(obs1[1], [[9, 3]])
        np.testing.assert_array_almost_equal(obs1[2], [[7, 7]])

        np.testing.assert_array_almost_equal(act[0], [[0]])
        np.testing.assert_array_almost_equal(act[1], [[6]])
        np.testing.assert_array_almost_equal(act[2], [[6]])

        np.testing.assert_array_almost_equal(rew[0], [0])
        np.testing.assert_array_almost_equal(rew[1], [2])
        np.testing.assert_array_almost_equal(rew[2], [6])

        np.testing.assert_array_almost_equal(done[0], [0])
        np.testing.assert_array_almost_equal(done[1], [0])
        np.testing.assert_array_almost_equal(done[2], [0])
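
The per-level structure returned by `sample` in this multi-level configuration can be summarized with a short sketch. It is a minimal sketch, assuming the same constructor arguments as `setUp` above; the shape assertions simply restate what `test_add_sample` checks and are not additional library guarantees.

from hbaselines.goal_conditioned.replay_buffer import HierReplayBuffer

# A minimal sketch, assuming the multi-level configuration used in setUp above.
replay_buffer = HierReplayBuffer(
    buffer_size=2, batch_size=1, meta_period=3,
    obs_dim=1, ac_dim=1, co_dim=1, goal_dim=1, num_levels=3)

# ... fill the buffer with two `add` calls, as in test_add_sample above ...

obs0, obs1, act, rew, done, _ = replay_buffer.sample(False)
assert len(obs0) == 3            # one list entry per level of the hierarchy
assert obs0[0].shape == (1, 2)   # (batch_size, obs_dim + context/goal dim)
assert act[0].shape == (1, 1)    # (batch_size, ac_dim) at each level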
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 meta_period,
                 worker_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 connected_gradients,
                 cg_weights,
                 use_fingerprints,
                 fingerprint_range,
                 centralized_value_functions,
                 env_name="",
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the size of the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        meta_period : int
            manager action period
        worker_reward_scale : float
            the value the intrinsic (Worker) reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the Manager is meant to be a
            relative or absolute goal, i.e. specific state or change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        connected_gradients : bool
            whether to connect the graph between the manager and worker
        cg_weights : float
            weights for the gradients of the loss of the worker with respect to
            the parameters of the manager. Only used if `connected_gradients`
            is set to True.
        use_fingerprints : bool
            specifies whether to add a time-dependent fingerprint to the
            observations
        fingerprint_range : (list of float, list of float)
            the low and high values for each fingerprint element, if they are
            being used
        centralized_value_functions : bool
            specifies whether to use centralized value functions for the
            Manager and Worker critic functions
        meta_policy : type [ hbaselines.fcnet.base.ActorCriticPolicy ]
            the policy model to use for the Manager
        worker_policy : type [ hbaselines.fcnet.base.ActorCriticPolicy ]
            the policy model to use for the Worker
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(sess=sess,
                                                    ob_space=ob_space,
                                                    ac_space=ac_space,
                                                    co_space=co_space,
                                                    buffer_size=buffer_size,
                                                    batch_size=batch_size,
                                                    actor_lr=actor_lr,
                                                    critic_lr=critic_lr,
                                                    verbose=verbose,
                                                    tau=tau,
                                                    gamma=gamma,
                                                    layer_norm=layer_norm,
                                                    layers=layers,
                                                    act_fun=act_fun,
                                                    use_huber=use_huber)

        self.meta_period = meta_period
        self.worker_reward_scale = worker_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.connected_gradients = connected_gradients
        self.cg_weights = cg_weights
        self.use_fingerprints = use_fingerprints
        self.fingerprint_range = fingerprint_range
        self.fingerprint_dim = (len(self.fingerprint_range[0]), )
        self.centralized_value_functions = centralized_value_functions

        # Get the Manager's action space.
        manager_ac_space = get_manager_ac_space(ob_space, relative_goals,
                                                env_name, use_fingerprints,
                                                self.fingerprint_dim)

        # Manager observation size
        meta_ob_dim = self._get_ob_dim(ob_space, co_space)

        # Create the replay buffer.
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            meta_obs_dim=meta_ob_dim[0],
            meta_ac_dim=manager_ac_space.shape[0],
            worker_obs_dim=ob_space.shape[0] + manager_ac_space.shape[0],
            worker_ac_dim=ac_space.shape[0],
        )

        # Collect the state indices for the worker rewards.
        self.goal_indices = get_state_indices(ob_space, env_name,
                                              use_fingerprints,
                                              self.fingerprint_dim)

        # Utility method for indexing the goal out of an observation variable.
        self.crop_to_goal = lambda g: tf.gather(
            g,
            tf.tile(tf.expand_dims(np.array(self.goal_indices), 0),
                    [self.batch_size, 1]),
            batch_dims=1,
            axis=1)
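        # (For example: with `g` of shape (batch_size, ob_dim) and
        # goal_indices = [0, 2], the result has shape (batch_size, 2), i.e.
        # the batched equivalent of g[:, self.goal_indices].)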

        # =================================================================== #
        # Part 1. Setup the Manager                                           #
        # =================================================================== #

        # Create the Manager policy.
        with tf.compat.v1.variable_scope("Manager"):
            self.manager = meta_policy(
                sess=sess,
                ob_space=ob_space,
                ac_space=manager_ac_space,
                co_space=co_space,
                buffer_size=buffer_size,
                batch_size=batch_size,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                verbose=verbose,
                tau=tau,
                gamma=gamma,
                layer_norm=layer_norm,
                layers=layers,
                act_fun=act_fun,
                use_huber=use_huber,
                scope="Manager",
                zero_fingerprint=False,
                fingerprint_dim=self.fingerprint_dim[0],
                **(additional_params or {}),
            )

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn

        # previous observation by the Manager
        self.prev_meta_obs = None

        # current action by the Manager
        self.meta_action = None

        # current meta reward, counting as the cumulative environment reward
        # during the meta period
        self.meta_reward = None

        # The following is redundant but necessary if the changes to the update
        # function are to be in the GoalConditionedPolicy policy and not
        # FeedForwardPolicy.
        self.batch_size = batch_size

        # Use this to store a list of observations that stretch as long as the
        # dilated horizon chosen for the Manager. These observations correspond
        # to the s(t) in the HIRO paper.
        self._observations = []

        # Use this to store the list of environmental actions that the worker
        # takes. These actions correspond to the a(t) in the HIRO paper.
        self._worker_actions = []

        # rewards provided by the policy to the worker
        self._worker_rewards = []

        # done masks at every time step for the worker
        self._dones = []

        # actions performed by the manager during a given meta period. Used by
        # the replay buffer.
        self._meta_actions = []

        # =================================================================== #
        # Part 2. Setup the Worker                                            #
        # =================================================================== #

        # Create the Worker policy.
        with tf.compat.v1.variable_scope("Worker"):
            self.worker = worker_policy(
                sess,
                ob_space=ob_space,
                ac_space=ac_space,
                co_space=manager_ac_space,
                buffer_size=buffer_size,
                batch_size=batch_size,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                verbose=verbose,
                tau=tau,
                gamma=gamma,
                layer_norm=layer_norm,
                layers=layers,
                act_fun=act_fun,
                use_huber=use_huber,
                scope="Worker",
                zero_fingerprint=self.use_fingerprints,
                fingerprint_dim=self.fingerprint_dim[0],
                **(additional_params or {}),
            )

        # reward function for the worker
        def worker_reward_fn(states, goals, next_states):
            return negative_distance(states=states,
                                     state_indices=self.goal_indices,
                                     goals=goals,
                                     next_states=next_states,
                                     relative_context=relative_goals,
                                     offset=0.0)

        self.worker_reward_fn = worker_reward_fn

        if self.connected_gradients:
            self._setup_connected_gradients()
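
To make the goal transition and intrinsic reward above concrete, here is a small numpy sketch. It is not the library's `negative_distance` helper; it only mirrors the formula r_L(s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2 from the class docstring and the `goal_transition_fn` defined for `relative_goals=True`, with made-up state and goal values.

import numpy as np

def goal_transition_fn(obs0, goal, obs1):
    # Re-express the goal so that the absolute target state stays fixed as
    # the agent moves from obs0 to obs1 (relative_goals=True case above).
    return obs0 + goal - obs1

def intrinsic_reward(obs0, goal, obs1):
    # Negative L2 distance between the desired change in state and the
    # achieved change in state.
    return -np.linalg.norm(obs0 + goal - obs1)

s_t = np.array([0.0, 0.0])
g_t = np.array([2.0, 1.0])       # relative goal: "move +2 in x, +1 in y"
s_t1 = np.array([0.5, 0.5])      # state after one Worker step

# The absolute target s_t + g_t = [2, 1] is preserved by the transition.
g_t1 = goal_transition_fn(s_t, g_t, s_t1)   # -> [1.5, 0.5]
assert np.allclose(s_t1 + g_t1, s_t + g_t)

# The Worker is rewarded for closing the remaining gap.
print(intrinsic_reward(s_t, g_t, s_t1))     # ≈ -1.58

The assertion highlights why the transition function returns `obs0 + goal - obs1`: between Manager decisions the absolute target s_t + g_t stays fixed while the remaining relative goal shrinks as the Worker makes progress.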
Example #5
class GoalConditionedPolicy(ActorCriticPolicy):
    r"""Goal-conditioned hierarchical reinforcement learning model.

    This policy is an implementation of the two-level hierarchy presented
    in [1], which itself is similar to the feudal networks formulation [2, 3].
    This network consists of a high-level, or Manager, pi_{\theta_H} that
    computes and outputs goals g_t ~ pi_{\theta_H}(s_t, h) every `meta_period`
    time steps, and a low-level policy pi_{\theta_L} that takes as inputs the
    current state and the assigned goals and attempts to perform an action
    a_t ~ pi_{\theta_L}(s_t,g_t) that satisfies these goals.

    The Manager is rewarded based on the original environment reward function:
    r_H = r(s,a;h).

    The Target term, h, parameterizes the reward assigned to the Manager in
    order to allow the policy to generalize to several goals within a task, a
    technique that was first proposed by [4].

    Finally, the Worker is motivated to follow the goals set by the Manager via
    an intrinsic reward based on the distance between the current observation
    and the goal observation:
    r_L (s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2

    Bibliography:

    [1] Nachum, Ofir, et al. "Data-efficient hierarchical reinforcement
        learning." Advances in Neural Information Processing Systems. 2018.
    [2] Dayan, Peter, and Geoffrey E. Hinton. "Feudal reinforcement learning."
        Advances in neural information processing systems. 1993.
    [3] Vezhnevets, Alexander Sasha, et al. "Feudal networks for hierarchical
        reinforcement learning." Proceedings of the 34th International
        Conference on Machine Learning-Volume 70. JMLR. org, 2017.
    [4] Schaul, Tom, et al. "Universal value function approximators."
        International Conference on Machine Learning. 2015.

    Attributes
    ----------
    manager : hbaselines.fcnet.base.ActorCriticPolicy
        the manager policy
    meta_period : int
        manager action period
    worker_reward_scale : float
        the value the intrinsic (Worker) reward should be scaled by
    relative_goals : bool
        specifies whether the goal issued by the Manager is meant to be a
        relative or absolute goal, i.e. specific state or change in state
    off_policy_corrections : bool
        whether to use off-policy corrections during the update procedure. See:
        https://arxiv.org/abs/1805.08296.
    hindsight : bool
        whether to use hindsight action and goal transitions, as well as
        subgoal testing. See: https://arxiv.org/abs/1712.00948
    subgoal_testing_rate : float
        rate at which the original (non-hindsight) sample is stored in the
        replay buffer as well. Used only if `hindsight` is set to True.
    connected_gradients : bool
        whether to connect the graph between the manager and worker
    cg_weights : float
        weights for the gradients of the loss of the worker with respect to the
        parameters of the manager. Only used if `connected_gradients` is set to
        True.
    use_fingerprints : bool
        specifies whether to add a time-dependent fingerprint to the
        observations
    fingerprint_range : (list of float, list of float)
        the low and high values for each fingerprint element, if they are being
        used
    fingerprint_dim : tuple of int
        the shape of the fingerprint elements, if they are being used
    centralized_value_functions : bool
        specifies whether to use centralized value functions for the Manager
        and Worker critic functions
    prev_meta_obs : array_like
        previous observation by the Manager
    meta_action : array_like
        current action by the Manager
    meta_reward : float
        current meta reward, counting as the cumulative environment reward
        during the meta period
    batch_size : int
        SGD batch size
    worker : hbaselines.fcnet.base.ActorCriticPolicy
        the worker policy
    worker_reward_fn : function
        reward function for the worker
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 meta_period,
                 worker_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 connected_gradients,
                 cg_weights,
                 use_fingerprints,
                 fingerprint_range,
                 centralized_value_functions,
                 env_name="",
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the size of the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        meta_period : int
            manager action period
        worker_reward_scale : float
            the value the intrinsic (Worker) reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the Manager is meant to be a
            relative or absolute goal, i.e. specific state or change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        connected_gradients : bool
            whether to connect the graph between the manager and worker
        cg_weights : float
            weights for the gradients of the loss of the worker with respect to
            the parameters of the manager. Only used if `connected_gradients`
            is set to True.
        use_fingerprints : bool
            specifies whether to add a time-dependent fingerprint to the
            observations
        fingerprint_range : (list of float, list of float)
            the low and high values for each fingerprint element, if they are
            being used
        centralized_value_functions : bool
            specifies whether to use centralized value functions for the
            Manager and Worker critic functions
        meta_policy : type [ hbaselines.fcnet.base.ActorCriticPolicy ]
            the policy model to use for the Manager
        worker_policy : type [ hbaselines.fcnet.base.ActorCriticPolicy ]
            the policy model to use for the Worker
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(sess=sess,
                                                    ob_space=ob_space,
                                                    ac_space=ac_space,
                                                    co_space=co_space,
                                                    buffer_size=buffer_size,
                                                    batch_size=batch_size,
                                                    actor_lr=actor_lr,
                                                    critic_lr=critic_lr,
                                                    verbose=verbose,
                                                    tau=tau,
                                                    gamma=gamma,
                                                    layer_norm=layer_norm,
                                                    layers=layers,
                                                    act_fun=act_fun,
                                                    use_huber=use_huber)

        self.meta_period = meta_period
        self.worker_reward_scale = worker_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.connected_gradients = connected_gradients
        self.cg_weights = cg_weights
        self.use_fingerprints = use_fingerprints
        self.fingerprint_range = fingerprint_range
        self.fingerprint_dim = (len(self.fingerprint_range[0]), )
        self.centralized_value_functions = centralized_value_functions

        # Get the Manager's action space.
        manager_ac_space = get_manager_ac_space(ob_space, relative_goals,
                                                env_name, use_fingerprints,
                                                self.fingerprint_dim)

        # Manager observation size
        meta_ob_dim = self._get_ob_dim(ob_space, co_space)

        # Create the replay buffer.
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            meta_obs_dim=meta_ob_dim[0],
            meta_ac_dim=manager_ac_space.shape[0],
            worker_obs_dim=ob_space.shape[0] + manager_ac_space.shape[0],
            worker_ac_dim=ac_space.shape[0],
        )

        # Collect the state indices for the worker rewards.
        self.goal_indices = get_state_indices(ob_space, env_name,
                                              use_fingerprints,
                                              self.fingerprint_dim)

        # Utility method for indexing the goal out of an observation variable.
        self.crop_to_goal = lambda g: tf.gather(
            g,
            tf.tile(tf.expand_dims(np.array(self.goal_indices), 0),
                    [self.batch_size, 1]),
            batch_dims=1,
            axis=1)

        # =================================================================== #
        # Part 1. Setup the Manager                                           #
        # =================================================================== #

        # Create the Manager policy.
        with tf.compat.v1.variable_scope("Manager"):
            self.manager = meta_policy(
                sess=sess,
                ob_space=ob_space,
                ac_space=manager_ac_space,
                co_space=co_space,
                buffer_size=buffer_size,
                batch_size=batch_size,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                verbose=verbose,
                tau=tau,
                gamma=gamma,
                layer_norm=layer_norm,
                layers=layers,
                act_fun=act_fun,
                use_huber=use_huber,
                scope="Manager",
                zero_fingerprint=False,
                fingerprint_dim=self.fingerprint_dim[0],
                **(additional_params or {}),
            )

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn

        # previous observation by the Manager
        self.prev_meta_obs = None

        # current action by the Manager
        self.meta_action = None

        # current meta reward, counting as the cumulative environment reward
        # during the meta period
        self.meta_reward = None

        # The following is redundant but necessary if the changes to the update
        # function are to be in the GoalConditionedPolicy policy and not
        # FeedForwardPolicy.
        self.batch_size = batch_size

        # Use this to store a list of observations that stretch as long as the
        # dilated horizon chosen for the Manager. These observations correspond
        # to the s(t) in the HIRO paper.
        self._observations = []

        # Use this to store the list of environmental actions that the worker
        # takes. These actions correspond to the a(t) in the HIRO paper.
        self._worker_actions = []

        # rewards provided by the policy to the worker
        self._worker_rewards = []

        # done masks at every time step for the worker
        self._dones = []

        # actions performed by the manager during a given meta period. Used by
        # the replay buffer.
        self._meta_actions = []

        # =================================================================== #
        # Part 2. Setup the Worker                                            #
        # =================================================================== #

        # Create the Worker policy.
        with tf.compat.v1.variable_scope("Worker"):
            self.worker = worker_policy(
                sess,
                ob_space=ob_space,
                ac_space=ac_space,
                co_space=manager_ac_space,
                buffer_size=buffer_size,
                batch_size=batch_size,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                verbose=verbose,
                tau=tau,
                gamma=gamma,
                layer_norm=layer_norm,
                layers=layers,
                act_fun=act_fun,
                use_huber=use_huber,
                scope="Worker",
                zero_fingerprint=self.use_fingerprints,
                fingerprint_dim=self.fingerprint_dim[0],
                **(additional_params or {}),
            )

        # reward function for the worker
        def worker_reward_fn(states, goals, next_states):
            return negative_distance(states=states,
                                     state_indices=self.goal_indices,
                                     goals=goals,
                                     next_states=next_states,
                                     relative_context=relative_goals,
                                     offset=0.0)

        self.worker_reward_fn = worker_reward_fn

        if self.connected_gradients:
            self._setup_connected_gradients()

    def initialize(self):
        """See parent class.

        This method calls the initialization methods of the manager and worker.
        """
        self.manager.initialize()
        self.worker.initialize()
        self.meta_reward = 0

    def update(self, update_actor=True, **kwargs):
        """Perform a gradient update step.

        This is done both at the level of the Manager and Worker policies.

        The kwargs argument for this method contains two additional terms:

        * update_meta (bool): specifies whether to perform a gradient update
          step for the meta-policy (i.e. Manager)
        * update_meta_actor (bool): similar to the `update_policy` term, but
          for the meta-policy. Note that, if `update_meta` is set to False,
          this term is void.

        **Note**: The soft target updates for both the manager and the worker
        policies occur at the same frequency as their respective actor update
        frequencies.

        Parameters
        ----------
        update_actor : bool
            specifies whether to update the actor policy. The critic policy is
            still updated if this value is set to False.

        Returns
        -------
         ([float, float], [float, float])
            manager critic loss, worker critic loss
        (float, float)
            manager actor loss, worker actor loss
        """
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return ([0, 0], [0, 0]), (0, 0)

        # Specifies whether to remove additional data from the replay buffer
        # sampling procedure. Since only a subset of algorithms use additional
        # data, removing it can speed up the other algorithms.
        with_additional = self.off_policy_corrections

        # Get a batch.
        meta_obs0, meta_obs1, meta_act, meta_rew, meta_done, worker_obs0, \
            worker_obs1, worker_act, worker_rew, worker_done, additional = \
            self.replay_buffer.sample(with_additional=with_additional)

        # Update the Manager policy.
        if kwargs['update_meta']:
            # Replace the goals with the most likely goals.
            if self.off_policy_corrections:
                meta_act = self._sample_best_meta_action(
                    meta_obs0=meta_obs0,
                    meta_obs1=meta_obs1,
                    meta_action=meta_act,
                    worker_obses=additional["worker_obses"],
                    worker_actions=additional["worker_actions"],
                    k=8)

            if self.connected_gradients:
                # Perform the connected gradients update procedure.
                m_critic_loss, m_actor_loss = self._connected_gradients_update(
                    obs0=meta_obs0,
                    actions=meta_act,
                    rewards=meta_rew,
                    obs1=meta_obs1,
                    terminals1=meta_done,
                    update_actor=kwargs['update_meta_actor'],
                    worker_obs0=worker_obs0,
                    worker_obs1=worker_obs1,
                    worker_actions=worker_act,
                )
            else:
                # Perform the regular manager update procedure.
                m_critic_loss, m_actor_loss = self.manager.update_from_batch(
                    obs0=meta_obs0,
                    actions=meta_act,
                    rewards=meta_rew,
                    obs1=meta_obs1,
                    terminals1=meta_done,
                    update_actor=kwargs['update_meta_actor'],
                )
        else:
            m_critic_loss, m_actor_loss = [0, 0], 0

        # Update the Worker policy.
        w_critic_loss, w_actor_loss = self.worker.update_from_batch(
            obs0=worker_obs0,
            actions=worker_act,
            rewards=worker_rew,
            obs1=worker_obs1,
            terminals1=worker_done,
            update_actor=update_actor,
        )

        return (m_critic_loss, w_critic_loss), (m_actor_loss, w_actor_loss)

    def get_action(self, obs, context, apply_noise, random_actions):
        """See parent class."""
        if self._update_meta:
            # Update the meta action based on the output from the policy if the
            # time period requires it.
            self.meta_action = self.manager.get_action(obs, context,
                                                       apply_noise,
                                                       random_actions)
        else:
            # Update the meta-action in accordance with the fixed transition
            # function.
            self.meta_action = self.goal_transition_fn(
                obs0=np.asarray([self._observations[-1][self.goal_indices]]),
                goal=self.meta_action,
                obs1=obs[:, self.goal_indices])

        # Return the worker action.
        worker_action = self.worker.get_action(obs, self.meta_action,
                                               apply_noise, random_actions)

        return worker_action

    def value(self, obs, context, action):
        """See parent class."""
        return 0, 0  # FIXME

    def store_transition(self,
                         obs0,
                         context0,
                         action,
                         reward,
                         obs1,
                         context1,
                         done,
                         is_final_step,
                         evaluate=False):
        """See parent class."""
        # Compute the worker reward and append it to the list of rewards.
        self._worker_rewards.append(
            self.worker_reward_scale *
            self.worker_reward_fn(obs0, self.meta_action.flatten(), obs1))

        # Add the environmental observations and done masks, and the manager
        # and worker actions to their respective lists.
        self._worker_actions.append(action)
        self._meta_actions.append(self.meta_action.flatten())
        self._observations.append(self._get_obs(obs0, self.meta_action, 0))

        # Modify the done mask in accordance with the TD3 algorithm. Done
        # masks that correspond to the final step are set to False.
        self._dones.append(done and not is_final_step)

        # Increment the meta reward with the most recent reward.
        self.meta_reward += reward

        # Modify the previous meta observation whenever the action has changed.
        if len(self._observations) == 1:
            self.prev_meta_obs = self._get_obs(obs0, context0, 0)

        # Add a sample to the replay buffer.
        if len(self._observations) == self.meta_period or done:
            # Add the last observation.
            self._observations.append(self._get_obs(obs1, self.meta_action, 0))

            # Add the contextual observation to the most recent environmental
            # observation, if applicable.
            meta_obs1 = self._get_obs(obs1, context1, 0)

            # Avoid storing samples when performing evaluations.
            if not evaluate:
                if not self.hindsight \
                        or random.random() < self.subgoal_testing_rate:
                    # Store a sample in the replay buffer.
                    self.replay_buffer.add(
                        obs_t=self._observations,
                        goal_t=self._meta_actions[0],
                        action_t=self._worker_actions,
                        reward_t=self._worker_rewards,
                        done=self._dones,
                        meta_obs_t=(self.prev_meta_obs, meta_obs1),
                        meta_reward_t=self.meta_reward,
                    )

                if self.hindsight:
                    # Implement hindsight action and goal transitions.
                    goal, obs, rewards = self._hindsight_actions_goals(
                        meta_action=self.meta_action,
                        initial_observations=self._observations,
                        initial_rewards=self._worker_rewards)

                    # Store the hindsight sample in the replay buffer.
                    self.replay_buffer.add(
                        obs_t=obs,
                        goal_t=goal,
                        action_t=self._worker_actions,
                        reward_t=rewards,
                        done=self._dones,
                        meta_obs_t=(self.prev_meta_obs, meta_obs1),
                        meta_reward_t=self.meta_reward,
                    )

            # Clear the worker rewards and actions, and the environmental
            # observation and reward.
            self.clear_memory()

    @property
    def _update_meta(self):
        """Return True if the meta-action should be updated by the policy.

        This is done by checking the length of the observation lists that are
        passed to the replay buffer, which are cleared whenever the meta-period
        has been met or the environment has been reset.
        """
        return len(self._observations) == 0

    def clear_memory(self):
        """Clear internal memory that is used by the replay buffer.

        By clearing memory, the Manager policy is then informed during the
        `get_action` procedure to update the meta-action.
        """
        self.meta_reward = 0
        self._observations = []
        self._worker_actions = []
        self._worker_rewards = []
        self._dones = []
        self._meta_actions = []

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        meta_obs0, meta_obs1, meta_act, meta_rew, meta_done, worker_obs0, \
            worker_obs1, worker_act, worker_rew, worker_done, _ = \
            self.replay_buffer.sample()

        td_map = {}
        td_map.update(
            self.manager.get_td_map_from_batch(meta_obs0, meta_act, meta_rew,
                                               meta_obs1, meta_done))
        td_map.update(
            self.worker.get_td_map_from_batch(worker_obs0, worker_act,
                                              worker_rew, worker_obs1,
                                              worker_done))

        return td_map

    # ======================================================================= #
    #                       Auxiliary methods for HIRO                        #
    # ======================================================================= #

    def _sample_best_meta_action(self,
                                 meta_obs0,
                                 meta_obs1,
                                 meta_action,
                                 worker_obses,
                                 worker_actions,
                                 k=10):
        """Return meta-actions that approximately maximize low-level log-probs.

        Parameters
        ----------
        meta_obs0 : array_like
            (batch_size, m_obs_dim) matrix of Manager observations
        meta_obs1 : array_like
            (batch_size, m_obs_dim) matrix of next time step Manager
            observations
        meta_action : array_like
            (batch_size, m_ac_dim) matrix of Manager actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period+1) matrix of current Worker
            state observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) matrix of current Worker
            environmental actions
        k : int, optional
            number of goals returned, excluding the initial goal and the mean
            value

        Returns
        -------
        array_like
            (batch_size, m_ac_dim) matrix of most likely Manager actions
        """
        batch_size, goal_dim = meta_action.shape

        # Collect several samples of potentially optimal goals.
        sampled_actions = self._sample(meta_obs0, meta_obs1, meta_action, k)
        assert sampled_actions.shape == (batch_size, goal_dim, k)

        # Compute the fitness of each candidate goal. The fitness is the sum of
        # the log-probabilities of each action for the given goal.
        fitness = self._log_probs(sampled_actions, worker_obses,
                                  worker_actions)
        assert fitness.shape == (batch_size, k)

        # For each sample, choose the meta action that maximizes the fitness.
        indx = np.argmax(fitness, 1)
        best_goals = np.asarray(
            [sampled_actions[i, :, indx[i]] for i in range(batch_size)])

        return best_goals

    def _sample(self, meta_obs0, meta_obs1, meta_action, num_samples, sc=0.5):
        """Sample different goals.

        The goals are sampled as follows:

        * The first num_samples-2 goals are acquired from a random Gaussian
          distribution centered at s_{t+c} - s_t.
        * The second to last goal is s_{t+c} - s_t.
        * The last goal is the originally sampled goal g_t.

        Parameters
        ----------
        meta_obs0 : array_like
            (batch_size, m_obs_dim) matrix of Manager observations
        meta_obs1 : array_like
            (batch_size, m_obs_dim) matrix of next time step Manager
            observations
        meta_action : array_like
            (batch_size, m_ac_dim) matrix of Manager actions
        num_samples : int
            number of samples
        sc : float
            scaling factor for the normal distribution.

        Returns
        -------
        array_like
            (batch_size, goal_dim, num_samples) matrix of sampled goals

        Helps
        -----
        * _sample_best_meta_action(self)
        """
        batch_size, goal_dim = meta_action.shape
        goal_space = self.manager.ac_space
        spec_range = goal_space.high - goal_space.low
        random_samples = num_samples - 2

        # Compute the mean and std for the Gaussian distribution to sample
        # from, as well as the maxima and minima.
        loc = meta_obs1[:, self.goal_indices] - meta_obs0[:, self.goal_indices]
        scale = [sc * spec_range / 2]
        minimum, maximum = [goal_space.low], [goal_space.high]

        new_loc = np.zeros((batch_size, goal_dim, random_samples))
        new_scale = np.zeros((batch_size, goal_dim, random_samples))
        for i in range(random_samples):
            new_loc[:, :, i] = loc
            new_scale[:, :, i] = scale

        new_minimum = np.zeros((batch_size, goal_dim, num_samples))
        new_maximum = np.zeros((batch_size, goal_dim, num_samples))
        for i in range(num_samples):
            new_minimum[:, :, i] = minimum
            new_maximum[:, :, i] = maximum

        # Generate random samples for the above distribution.
        normal_samples = np.random.normal(size=(random_samples * batch_size *
                                                goal_dim))
        normal_samples = normal_samples.reshape(
            (batch_size, goal_dim, random_samples))

        samples = np.zeros((batch_size, goal_dim, num_samples))
        samples[:, :, :-2] = new_loc + normal_samples * new_scale
        samples[:, :, -2] = loc
        samples[:, :, -1] = meta_action

        # Clip the values based on the Manager action space range.
        samples = np.minimum(np.maximum(samples, new_minimum), new_maximum)

        return samples

    def _log_probs(self, meta_actions, worker_obses, worker_actions):
        """Calculate the log probability of the next goal by the Manager.

        Parameters
        ----------
        meta_actions : array_like
            (batch_size, m_ac_dim, num_samples) matrix of candidate Manager
            actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period + 1) matrix of Worker
            observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) list of Worker actions

        Returns
        -------
        array_like
            (batch_size, num_samples) fitness associated with every state /
            action / goal pair

        Helps
        -----
        * _sample_best_meta_action(self):
        """
        raise NotImplementedError

    # ======================================================================= #
    #                       Auxiliary methods for HAC                         #
    # ======================================================================= #

    def _hindsight_actions_goals(self, meta_action, initial_observations,
                                 initial_rewards):
        """Calculate hindsight goal and action transitions.

        These are then stored in the replay buffer along with the original
        (non-hindsight) sample.

        See the README at the front page of this repository for an in-depth
        description of this procedure.

        Parameters
        ----------
        meta_action : array_like
            the original Manager actions (goal)
        initial_observations : array_like
            the original worker observations with the non-hindsight goals
            appended to them
        initial_rewards : array_like
            the original worker rewards

        Returns
        -------
        array_like
            the Manager action (goal) in hindsight
        array_like
            the modified Worker observations with the hindsight goals appended
            to them
        array_like
            the modified Worker rewards taking into account the hindsight goals

        Helps
        -----
        * store_transition(self):
        """
        goal_dim = meta_action.shape[0]
        observations = deepcopy(initial_observations)
        rewards = deepcopy(initial_rewards)
        hindsight_goal = 0 if self.relative_goals \
            else observations[-1][self.goal_indices]
        obs_tp1 = observations[-1]

        for i in range(1, len(observations) + 1):
            obs_t = observations[-i]

            # Calculate the hindsight goal when using relative goals.
            # Otherwise, the hindsight goal is simply a subset of the
            # final state observation.
            if self.relative_goals:
                hindsight_goal += \
                    obs_tp1[self.goal_indices] - obs_t[self.goal_indices]

            # Modify the Worker intrinsic rewards based on the new
            # hindsight goal.
            if i > 1:
                rewards[-(i - 1)] = self.worker_reward_scale \
                    * self.worker_reward_fn(obs_t, hindsight_goal, obs_tp1)

            obs_tp1 = deepcopy(obs_t)

            # Replace the goal with the goal that the worker
            # actually achieved.
            observations[-i][-goal_dim:] = hindsight_goal

        return hindsight_goal, observations, rewards

    # ======================================================================= #
    #                      Auxiliary methods for HRL-CG                       #
    # ======================================================================= #

    def _setup_connected_gradients(self):
        """Create the updated manager optimization with connected gradients."""
        raise NotImplementedError

    def _connected_gradients_update(self,
                                    obs0,
                                    actions,
                                    rewards,
                                    obs1,
                                    terminals1,
                                    worker_obs0,
                                    worker_obs1,
                                    worker_actions,
                                    update_actor=True):
        """Perform the gradient update procedure for the HRL-CG algorithm.

        This procedure is similar to self.manager.update_from_batch, except it
        runs the self.cg_optimizer operation instead of self.manager.optimizer,
        and utilizes some information from the worker samples as well.

        Parameters
        ----------
        obs0 : np.ndarray
            batch of manager observations
        actions : numpy float
            batch of manager actions executed given obs_batch
        rewards : numpy float
            manager rewards received as results of executing act_batch
        obs1 : np.ndarray
            set of next manager observations seen after executing act_batch
        terminals1 : numpy bool
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        worker_obs0 : array_like
            batch of worker observations
        worker_obs1 : array_like
            batch of next worker observations
        worker_actions : array_like
            batch of worker actions
        update_actor : bool
            specifies whether to update the actor policy of the manager. The
            critic policy is still updated if this value is set to False.

        Returns
        -------
        [float, float]
            manager critic loss
        float
            manager actor loss
        """
        raise NotImplementedError
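
The hindsight relabelling performed by `_hindsight_actions_goals` can be traced on a toy one-dimensional trajectory. The sketch below mirrors the loop above for `relative_goals=True` with a single goal index; the trajectory, the appended goals, and the simplified scalar reward function are illustrative assumptions rather than values produced by the library.

import numpy as np
from copy import deepcopy

gi, goal_dim = 0, 1                       # single goal index, 1-D goal

def intrinsic_reward(obs_t, goal, obs_tp1):
    # simplified negative-distance reward with relative goals:
    # r = -|s_t + g - s_{t+1}| over the goal index
    return -abs(obs_t[gi] + goal - obs_tp1[gi])

# Worker observations over one meta-period (states 0 -> 1 -> 3), with the
# original relative goals (5, then 4, then 2) appended as the last entry.
observations = [np.array([0., 5.]), np.array([1., 4.]), np.array([3., 2.])]
rewards = [-4.0, -2.0]                    # intrinsic rewards under the original goals

hindsight_goal = 0.0                      # relative goals accumulate from zero
obs_tp1 = observations[-1]
for i in range(1, len(observations) + 1):
    obs_t = observations[-i]
    # Accumulate the displacement the worker actually achieved.
    hindsight_goal += obs_tp1[gi] - obs_t[gi]
    if i > 1:
        # Recompute the intrinsic reward under the hindsight goal.
        rewards[-(i - 1)] = intrinsic_reward(obs_t, hindsight_goal, obs_tp1)
    obs_tp1 = deepcopy(obs_t)
    # Replace the stored goal with the goal that was actually achieved.
    observations[-i][-goal_dim:] = hindsight_goal

assert hindsight_goal == 3.0   # total displacement over the meta-period
assert rewards[-1] == 0.0      # the final transition attains the hindsight goal

After the loop, the stored goals become [3, 2, 0], i.e. the displacement still remaining to the final state at each step, and the returned hindsight goal (3) is treated as the Manager action that was actually achieved over the meta-period.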
Example #6
class GoalConditionedPolicy(Policy):
    r"""Goal-conditioned hierarchical reinforcement learning model.

    TODO
    This policy is an implementation of the two-level hierarchy presented
    in [1], which itself is similar to the feudal networks formulation [2, 3].
    This network consists of a high-level, or Manager, pi_{\theta_H} that
    computes and outputs goals g_t ~ pi_{\theta_H}(s_t, h) every `meta_period`
    time steps, and a low-level policy pi_{\theta_L} that takes as inputs the
    current state and the assigned goals and attempts to perform an action
    a_t ~ pi_{\theta_L}(s_t,g_t) that satisfies these goals.

    The highest level policy is rewarded based on the original environment
    reward function: r_H = r(s,a;h).

    The Target term, h, parametrizes the reward assigned to the highest level
    policy in order to allow the policy to generalize to several goals within a
    task, a technique that was first proposed by [4].

    Finally, the Worker is motivated to follow the goals set by the Manager via
    an intrinsic reward based on the distance between the current observation
    and the goal observation:
    r_L (s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2

    Bibliography:

    [1] Nachum, Ofir, et al. "Data-efficient hierarchical reinforcement
        learning." Advances in Neural Information Processing Systems. 2018.
    [2] Dayan, Peter, and Geoffrey E. Hinton. "Feudal reinforcement learning."
        Advances in neural information processing systems. 1993.
    [3] Vezhnevets, Alexander Sasha, et al. "Feudal networks for hierarchical
        reinforcement learning." Proceedings of the 34th International
        Conference on Machine Learning-Volume 70. JMLR. org, 2017.
    [4] Schaul, Tom, et al. "Universal value function approximators."
        International Conference on Machine Learning. 2015.

    Attributes
    ----------
    num_levels : int
        number of levels within the hierarchy. Must be greater than 1. Two
        levels correspond to a Manager/Worker paradigm.
    meta_period : int
        meta-policy action period
    intrinsic_reward_type : str
        the reward function to be used by the worker. Must be one of:

        * "negative_distance": the negative two norm between the states and
          desired absolute or relative goals.
        * "scaled_negative_distance": similar to the negative distance reward
          where the states, goals, and next states are scaled by the inverse of
          the action space of the manager policy
        * "non_negative_distance": the negative two norm between the states and
          desired absolute or relative goals offset by the maximum goal space
          (to ensure non-negativity)
        * "scaled_non_negative_distance": similar to the non-negative distance
          reward where the states, goals, and next states are scaled by the
          inverse of the action space of the manager policy
        * "exp_negative_distance": equal to exp(-negative_distance^2). The
          result is a reward between 0 and 1. This is useful for policies that
          terminate early.
        * "scaled_exp_negative_distance": similar to the previous worker reward
          type but with states, actions, and next states that are scaled.
    intrinsic_reward_scale : float
        the value that the intrinsic reward should be scaled by
    relative_goals : bool
        specifies whether the goal issued by the higher-level policies is meant
        to be a relative or absolute goal, i.e. specific state or change in
        state
    off_policy_corrections : bool
        whether to use off-policy corrections during the update procedure. See:
        https://arxiv.org/abs/1805.08296.
    hindsight : bool
        whether to use hindsight action and goal transitions, as well as
        subgoal testing. See: https://arxiv.org/abs/1712.00948
    subgoal_testing_rate : float
        rate at which the original (non-hindsight) sample is stored in the
        replay buffer as well. Used only if `hindsight` is set to True.
    cooperative_gradients : bool
        whether to use the cooperative gradient update procedure for the
        higher-level policy. See: https://arxiv.org/abs/1912.02368v1
    cg_weights : float
        weights for the gradients of the loss of the lower-level policies with
        respect to the parameters of the higher-level policies. Only used if
        `cooperative_gradients` is set to True.
    cg_delta : float
        the desired lower-level expected returns. If set to None, a fixed
        Lagrangian specified by cg_weights is used instead. Only used if
        `cooperative_gradients` is set to True.
    pretrain_worker : bool
        specifies whether you are pre-training the lower-level policies.
        Actions by the high-level policy are randomly sampled from its action
        space.
    pretrain_path : str or None
        path to the pre-trained worker policy checkpoints
    pretrain_ckpt : int or None
        checkpoint number to use within the worker policy path. If set to None,
        the most recent checkpoint is used.
    total_steps : int
        Total number of timesteps used during training. Used by a subset of
        algorithms.
    policy : list of hbaselines.base_policies.Policy
        a list of policy objects, one for each level in the hierarchy, ordered
        from the highest to the lowest level policy
    replay_buffer : hbaselines.goal_conditioned.replay_buffer.HierReplayBuffer
        the replay buffer object
    goal_indices : list of int
        the state indices for the intrinsic rewards
    intrinsic_reward_fn : function
        reward function for the lower-level policies
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 num_levels,
                 meta_period,
                 intrinsic_reward_type,
                 intrinsic_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 cooperative_gradients,
                 cg_weights,
                 cg_delta,
                 pretrain_worker,
                 pretrain_path,
                 pretrain_ckpt,
                 total_steps,
                 scope=None,
                 env_name="",
                 num_envs=1,
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        num_levels : int
            number of levels within the hierarchy. Must be greater than 1. Two
            levels correspond to a Manager/Worker paradigm.
        meta_period : int
            meta-policy action period
        intrinsic_reward_type : str
            the reward function to be used by the worker. Must be one of:

            * "negative_distance": the negative two norm between the states and
              desired absolute or relative goals.
            * "scaled_negative_distance": similar to the negative distance
              reward where the states, goals, and next states are scaled by the
              inverse of the action space of the manager policy
            * "non_negative_distance": the negative two norm between the states
              and desired absolute or relative goals offset by the maximum goal
              space (to ensure non-negativity)
            * "scaled_non_negative_distance": similar to the non-negative
              distance reward where the states, goals, and next states are
              scaled by the inverse of the action space of the manager policy
            * "exp_negative_distance": equal to exp(-negative_distance^2). The
              result is a reward between 0 and 1. This is useful for policies
              that terminate early.
            * "scaled_exp_negative_distance": similar to the previous worker
              reward type but with states, actions, and next states that are
              scaled.
        intrinsic_reward_scale : float
            the value that the intrinsic reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the higher-level policies is
            meant to be a relative or absolute goal, i.e. specific state or
            change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        cooperative_gradients : bool
            whether to use the cooperative gradient update procedure for the
            higher-level policy. See: https://arxiv.org/abs/1912.02368v1
        cg_weights : float
            weights for the gradients of the loss of the lower-level policies
            with respect to the parameters of the higher-level policies. Only
            used if `cooperative_gradients` is set to True.
        cg_delta : float
            the desired lower-level expected returns. If set to None, a fixed
            Lagrangian specified by cg_weights is used instead. Only used if
            `cooperative_gradients` is set to True.
        pretrain_worker : bool
            specifies whether you are pre-training the lower-level policies.
            Actions by the high-level policy are randomly sampled from the
            action space.
        pretrain_path : str or None
            path to the pre-trained worker policy checkpoints
        pretrain_ckpt : int or None
            checkpoint number to use within the worker policy path. If set to
            None, the most recent checkpoint is used.
        total_steps : int
            Total number of timesteps used during training. Used by a subset of
            algorithms.
        meta_policy : type [ hbaselines.base_policies.Policy ]
            the policy model to use for the meta policies
        worker_policy : type [ hbaselines.base_policies.Policy ]
            the policy model to use for the worker policy
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            verbose=verbose,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        assert num_levels >= 2, "num_levels must be greater than or equal to 2"

        self.num_levels = num_levels
        self.meta_period = meta_period
        self.intrinsic_reward_type = intrinsic_reward_type
        self.intrinsic_reward_scale = intrinsic_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.cooperative_gradients = cooperative_gradients
        self.cg_weights = cg_weights
        self.cg_delta = cg_delta
        self.pretrain_worker = pretrain_worker
        self.pretrain_path = pretrain_path
        self.pretrain_ckpt = pretrain_ckpt
        self.total_steps = total_steps

        # Get the observation and action space of the higher level policies.
        meta_ac_space = get_meta_ac_space(
            ob_space=ob_space,
            relative_goals=relative_goals,
            env_name=env_name,
        )

        # =================================================================== #
        # Step 1: Create the policies for the individual levels.              #
        # =================================================================== #

        self.policy = []

        # The policies are ordered from the highest level to lowest level
        # policies in the hierarchy.
        for i in range(num_levels):
            # Determine the appropriate parameters to use for the policy in the
            # current level.
            policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
            ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
            co_space_i = co_space if i == 0 else meta_ac_space
            ob_space_i = ob_space

            # Create the TensorFlow variable scope for the policy at the
            # current level.
            with tf.compat.v1.variable_scope("level_{}".format(i)):
                # Compute the scope name based on any outer scope term.
                scope_i = "level_{}".format(i)
                if scope is not None:
                    scope_i = "{}/{}".format(scope, scope_i)

                # Only the highest level policy applies the ignore_image and
                # ignore_flat_channels settings from model_params; lower-level
                # policies keep all flat channels and always ignore image
                # observations.
                model_params_i = model_params.copy()
                model_params_i.update({
                    "ignore_flat_channels":
                    model_params["ignore_flat_channels"] if i < 1 else [],
                    "ignore_image":
                    model_params["ignore_image"] if i < 1 else True,
                })

                # Create the next policy.
                self.policy.append(
                    policy_fn(
                        sess=sess,
                        ob_space=ob_space_i,
                        ac_space=ac_space_i,
                        co_space=co_space_i,
                        buffer_size=buffer_size,
                        batch_size=batch_size,
                        actor_lr=actor_lr,
                        critic_lr=critic_lr,
                        verbose=verbose,
                        tau=tau,
                        gamma=gamma,
                        use_huber=use_huber,
                        l2_penalty=l2_penalty,
                        model_params=model_params_i,
                        scope=scope_i,
                        **(additional_params or {}),
                    ))

        # =================================================================== #
        # Step 2: Create attributes for the replay buffer.                    #
        # =================================================================== #

        # Create the replay buffer.
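        # Each stored sample spans multiple environment steps, so the number
        # of stored samples is scaled down by `meta_period` relative to the
        # flat `buffer_size`.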
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            obs_dim=ob_space.shape[0],
            ac_dim=ac_space.shape[0],
            co_dim=None if co_space is None else co_space.shape[0],
            goal_dim=meta_ac_space.shape[0],
            num_levels=num_levels)

        # current action by the meta-level policies
        self.meta_action = [[None for _ in range(num_levels - 1)]
                            for _ in range(num_envs)]

        # a list of all the actions performed by each level in the hierarchy,
        # ordered from highest to lowest level policy. A separate element is
        # used for each environment.
        self._actions = [[[] for _ in range(self.num_levels)]
                         for _ in range(num_envs)]

        # a list of the rewards (intrinsic or other) experienced by every level
        # in the hierarchy, ordered from highest to lowest level policy. A
        # separate element is used for each environment.
        self._rewards = [[[0]] + [[] for _ in range(self.num_levels - 1)]
                         for _ in range(num_envs)]

        # a list of observations that stretch as long as the dilated horizon
        # chosen for the highest level policy. A separate element is used for
        # each environment.
        self._observations = [[] for _ in range(num_envs)]

        # the first and last contextual term. A separate element is used for
        # each environment.
        self._contexts = [[] for _ in range(num_envs)]

        # a list of done masks at every time step. A separate element is used
        # for each environment.
        self._dones = [[] for _ in range(num_envs)]

        # Collect the state indices for the intrinsic rewards.
        self.goal_indices = get_state_indices(ob_space, env_name)

        # Define the intrinsic reward function.
        if intrinsic_reward_type in [
                "negative_distance", "scaled_negative_distance",
                "non_negative_distance", "scaled_non_negative_distance",
                "exp_negative_distance", "scaled_exp_negative_distance"
        ]:
            # Offset the distance measure by the maximum possible distance to
            # ensure non-negativity.
            if "non_negative" in intrinsic_reward_type:
                offset = np.sqrt(
                    np.sum(np.square(meta_ac_space.high - meta_ac_space.low),
                           -1))
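                # i.e. the length of the goal-space diagonal, used as the
                # maximum-distance offset described in the docstring.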
            else:
                offset = 0

            # Scale the outputs from the state by the meta-action space if you
            # wish to scale the worker reward.
            if intrinsic_reward_type.startswith("scaled"):
                scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
            else:
                scale = 1

            def intrinsic_reward_fn(states, goals, next_states):
                return negative_distance(
                    states=states[self.goal_indices] / scale,
                    goals=goals / scale,
                    next_states=next_states[self.goal_indices] / scale,
                    relative_context=relative_goals,
                    offset=0.0,
                ) + offset
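            # Worked example (a sketch assuming relative goals and scale = 1):
            # with s_t = [0.], g_t = [2.] and s_{t+1} = [1.5], the reward is
            # -||0 + 2 - 1.5||_2 + offset = -0.5 + offset, i.e. the worker is
            # penalized by how far it remains from the goal state.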

            # Perform the exponential and squashing operations to keep the
            # intrinsic reward between 0 and 1.
            if "exp" in intrinsic_reward_type:

                def exp_intrinsic_reward_fn(states, goals, next_states):
                    # Squash the distance-based reward into (0, 1] using a
                    # Gaussian kernel whose width is derived from the span of
                    # the meta action space.
                    span = sum(
                        np.square(self.policy[0].ac_space.high -
                                  self.policy[0].ac_space.low))
                    rew = intrinsic_reward_fn(states, goals, next_states)
                    return np.exp(-(rew / (span / 40))**2)

                self.intrinsic_reward_fn = exp_intrinsic_reward_fn
            else:
                self.intrinsic_reward_fn = intrinsic_reward_fn
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                intrinsic_reward_type))

        # =================================================================== #
        # Step 3: Create algorithm-specific features.                         #
        # =================================================================== #

        # the number of get_action calls that have been performed. This is used
        # when pretraining the worker to incrementally train different levels
        # of the policy.
        self._steps = 0

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn
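        # With relative goals this keeps the absolute goal state fixed between
        # meta-actions (s_t + g_t = s_{t+1} + g_{t+1}); with absolute goals
        # the goal is simply carried over unchanged.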

        if self.cooperative_gradients:
            if scope is None:
                self._setup_cooperative_gradients()
            else:
                with tf.compat.v1.variable_scope(scope):
                    self._setup_cooperative_gradients()

    def initialize(self):
        """See parent class.

        This method performs the following operations:

        - It calls the initialization methods of the policies at every level of
          the hierarchy to match the target value function parameters with the
          current policy parameters.
        - It also imports the lower-level policies from a pretrained checkpoint
          if a path to one is specified.
        """
        # Initialize the separate policies in the hierarchy.
        for i in range(self.num_levels):
            self.policy[i].initialize()

        if self.pretrain_path is not None:
            ckpt_path = os.path.join(self.pretrain_path, "checkpoints")

            # Get the checkpoint number.
            if self.pretrain_ckpt is None:
                filenames = os.listdir(ckpt_path)
                metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
                metanum = [int(f.split("-")[-1]) for f in metafiles]
                ckpt_num = max(metanum)
            else:
                ckpt_num = self.pretrain_ckpt

            # Extract the checkpoint path.
            ckpt_path = os.path.join(ckpt_path, "itr-{}".format(ckpt_num))
            var_list = tf.train.list_variables(ckpt_path)
            ckpt_reader = tf.train.load_checkpoint(ckpt_path)

            # Check that the number of levels matches.
            assert var_list[-1][0].startswith(
                "level_{}".format(self.num_levels-1)), \
                "Number of levels between the checkpoint and current policy " \
                "do not match. Policy={}, Checkpoint={}".format(
                    self.num_levels,
                    int(var_list[-1][0].split("/")[0][6:]) + 1)

            # Check that the names and shapes of the lower-level policy
            # parameters match the current policy.
            current_vars = {
                v.name: v.shape.as_list()
                for v in get_trainable_vars()
            }
            for var in var_list:
                var_name, var_shape = var
                var_name = "{}:0".format(var_name)
                # We only check the lower-level policies.
                if any(
                        var_name.startswith("level_{}".format(level))
                        for level in range(1, self.num_levels)):
                    assert var_name in current_vars.keys(), \
                        "{} not available in current policy.".format(var_name)
                    current_shape = current_vars[var_name]
                    assert current_shape == var_shape, \
                        "Shape mismatch for {}, {} != {}".format(
                            var_name, var_shape, current_shape)

            # Import the lower-level policy parameters.
            current_vars = {v.name: v for v in get_trainable_vars()}
            for var in var_list:
                var_name, var_shape = var
                if any(
                        var_name.startswith("level_{}".format(level))
                        for level in range(1, self.num_levels)):
                    value = ckpt_reader.get_tensor(var_name)
                    var_name = "{}:0".format(var_name)
                    self.sess.run(
                        tf.compat.v1.assign(current_vars[var_name], value))

    def update(self, update_actor=True, **kwargs):
        """Perform a gradient update step.

        This is done at every level of the hierarchy.

        The kwargs argument for this method contains two additional terms:

        * update_meta (bool): specifies whether to perform a gradient update
          step for the meta-policies
        * update_meta_actor (bool): similar to the `update_actor` term, but
          for the meta-policies. Note that, if `update_meta` is set to False,
          this term is void.

        **Note**: The target soft updates for all policies occur at the same
        frequency as their respective actor update frequencies.

        Parameters
        ----------
        update_actor : bool
            specifies whether to update the actor policy. The critic policy is
            still updated if this value is set to False.
        """
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return

        # Specifies whether to remove additional data from the replay buffer
        # sampling procedure. Since only a subset of algorithms use additional
        # data, removing it can speed up the other algorithms.
        with_additional = self.off_policy_corrections

        # Specifies the levels to collect data from, corresponding to the
        # levels that will be trained. This also helps speed up the operation.
        collect_levels = [
            i for i in range(self.num_levels - 1) if kwargs["update_meta"][i]
        ] + [self.num_levels - 1]

        # Get a batch.
        obs0, obs1, act, rew, done, additional = self.replay_buffer.sample(
            with_additional, collect_levels)

        # Do not use done masks for lower-level policies with negative
        # intrinsic rewards (these encourage the policies to terminate early).
        if self._negative_reward_fn():
            for i in range(self.num_levels - 1):
                done[i + 1] = np.array([False] * done[i + 1].shape[0])

        # Loop through all meta-policies.
        for i in range(self.num_levels - 1):
            if kwargs['update_meta'][i] and not self._pretrain_level(i):
                # Replace the goals with the most likely goals.
                if self.off_policy_corrections and i == 0:  # FIXME
                    meta_act = self._sample_best_meta_action(
                        meta_obs0=obs0[i],
                        meta_obs1=obs1[i],
                        meta_action=act[i],
                        worker_obses=additional["worker_obses"],
                        worker_actions=additional["worker_actions"],
                        k=8)
                    act[i] = meta_act

                if self.cooperative_gradients:
                    # Perform the cooperative gradients update procedure.
                    self._cooperative_gradients_update(
                        obs0=obs0,
                        actions=act,
                        rewards=rew,
                        obs1=obs1,
                        terminals1=done,
                        level_num=i,
                        update_actor=kwargs['update_meta_actor'],
                    )
                else:
                    # Perform the regular meta update procedure.
                    self.policy[i].update_from_batch(
                        obs0=obs0[i],
                        actions=act[i],
                        rewards=rew[i],
                        obs1=obs1[i],
                        terminals1=done[i],
                        update_actor=kwargs['update_meta_actor'],
                    )

        # Update the lowest level policy.
        self.policy[-1].update_from_batch(
            obs0=obs0[-1],
            actions=act[-1],
            rewards=rew[-1],
            obs1=obs1[-1],
            terminals1=done[-1],
            update_actor=update_actor,
        )

    def get_action(self, obs, context, apply_noise, random_actions, env_num=0):
        """See parent class."""
        # Increment the internal number of get_action calls.
        self._steps += 1

        # Loop through the policies in the hierarchy.
        for i in range(self.num_levels - 1):
            if self._update_meta(i, env_num):
                if self._pretrain_level(i):
                    # Sample goals randomly when performing pre-training.
                    self.meta_action[env_num][i] = np.array(
                        [self.policy[i].ac_space.sample()])
                else:
                    context_i = context if i == 0 \
                        else self.meta_action[env_num][i - 1]

                    # Update the meta action based on the output from the
                    # policy if the time period requires it.
                    self.meta_action[env_num][i] = self.policy[i].get_action(
                        obs, context_i, apply_noise, random_actions)
            else:
                # Update the meta-action in accordance with a fixed transition
                # function.
                self.meta_action[env_num][i] = self.goal_transition_fn(
                    obs0=np.array(
                        [self._observations[env_num][-1][self.goal_indices]]),
                    goal=self.meta_action[env_num][i],
                    obs1=obs[:, self.goal_indices])

        # Return the action to be performed within the environment (i.e. the
        # action by the lowest level policy).
        action = self.policy[-1].get_action(
            obs=obs,
            context=self.meta_action[env_num][-1],
            apply_noise=apply_noise,
            random_actions=random_actions and self.pretrain_path is None)

        return action

    def store_transition(self,
                         obs0,
                         context0,
                         action,
                         reward,
                         obs1,
                         context1,
                         done,
                         is_final_step,
                         env_num=0,
                         evaluate=False):
        """See parent class."""
        # the number of step samples collected since the current sample began
        t_start = len(self._observations[env_num])

        # Flatten the observations.
        obs0 = obs0.flatten()
        obs1 = obs1.flatten()

        for i in range(1, self.num_levels):
            # Actions and intrinsic rewards for the high-level policies are
            # only updated when the corresponding meta-policy recomputes its
            # action.
            if t_start % self.meta_period**(i - 1) == 0:
                self._rewards[env_num][-i].append(0)
                self._actions[env_num][-i - 1].append(
                    self.meta_action[env_num][-i].flatten())

            # Compute the intrinsic rewards and append them to the list of
            # rewards.
            self._rewards[env_num][-i][-1] += \
                self.intrinsic_reward_scale / self.meta_period ** (i-1) * \
                self.intrinsic_reward_fn(
                    states=obs0,
                    goals=self.meta_action[env_num][-i].flatten(),
                    next_states=obs1
                )
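            # The 1 / meta_period ** (i - 1) factor averages (rather than
            # sums) the per-step intrinsic reward over that level's goal
            # period.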

        # The highest level policy receives the sum of environmental rewards.
        self._rewards[env_num][0][0] += reward

        # The lowest level policy's actions are received from the algorithm.
        self._actions[env_num][-1].append(action)

        # Add the environmental observations and contextual terms to their
        # respective lists.
        self._observations[env_num].append(obs0)
        if t_start == 0:
            self._contexts[env_num].append(context0)

        # Modify the done mask in accordance with the TD3 algorithm. Done masks
        # that correspond to the final step are set to False.
        self._dones[env_num].append(done and not is_final_step)

        # Add a sample to the replay buffer.
        if len(self._observations[env_num]) == \
                self.meta_period ** (self.num_levels - 1) or done:
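            # A complete sample spans meta_period ** (num_levels - 1)
            # environment steps, i.e. the dilated horizon of the highest level
            # policy.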
            # Add the last observation and context.
            self._observations[env_num].append(obs1)
            self._contexts[env_num].append(context1)

            # Compute the goal corresponding to the final observation and
            # append it to each meta-policy's list of actions.
            for i in range(self.num_levels - 1):
                self._actions[env_num][i].append(
                    self.goal_transition_fn(
                        obs0=obs0[self.goal_indices],
                        goal=self.meta_action[env_num][i],
                        obs1=obs1[self.goal_indices]).flatten())

            # Avoid storing samples when performing evaluations.
            if not evaluate:
                if not self.hindsight \
                        or random.random() < self.subgoal_testing_rate:
                    # Store a sample in the replay buffer.
                    self.replay_buffer.add(
                        obs_t=self._observations[env_num],
                        context_t=self._contexts[env_num],
                        action_t=self._actions[env_num],
                        reward_t=self._rewards[env_num],
                        done_t=self._dones[env_num],
                    )

                if self.hindsight:
                    # Some temporary attributes.
                    worker_obses = [
                        self._get_obs(self._observations[env_num][i],
                                      self._actions[env_num][0][i], 0)
                        for i in range(len(self._observations[env_num]))
                    ]
                    intrinsic_rewards = self._rewards[env_num][-1]

                    # Implement hindsight action and goal transitions.
                    goal, rewards = self._hindsight_actions_goals(
                        initial_observations=worker_obses,
                        initial_rewards=intrinsic_rewards)
                    new_actions = deepcopy(self._actions[env_num])
                    new_actions[0] = goal
                    new_rewards = deepcopy(self._rewards[env_num])
                    new_rewards[-1] = rewards

                    # Store the hindsight sample in the replay buffer.
                    self.replay_buffer.add(
                        obs_t=self._observations[env_num],
                        context_t=self._contexts[env_num],
                        action_t=new_actions,
                        reward_t=new_rewards,
                        done_t=self._dones[env_num],
                    )

            # Clear the memory that has been stored in the replay buffer.
            self.clear_memory(env_num)

    def _update_meta(self, level, env_num):
        """Determine whether a meta-policy should update its action.

        This is done by checking the length of the observation lists that are
        passed to the replay buffer, which are cleared whenever the highest
        level meta-period has been met or the environment has been reset.

        Parameters
        ----------
        level : int
            the level of the policy
        env_num : int
            the environment number. Used to handle situations when multiple
            parallel environments are being used.

        Returns
        -------
        bool
            True if the action should be updated by the meta-policy at the
            given level
        """
        return len(self._observations[env_num]) % \
            (self.meta_period ** (self.num_levels - level - 1)) == 0

    def clear_memory(self, env_num):
        """Clear internal memory that is used by the replay buffer."""
        self._actions[env_num] = [[] for _ in range(self.num_levels)]
        self._rewards[env_num] = \
            [[0]] + [[] for _ in range(self.num_levels - 1)]
        self._observations[env_num] = []
        self._contexts[env_num] = []
        self._dones[env_num] = []

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, obs1, act, rew, done, _ = self.replay_buffer.sample(False)

        td_map = {}
        for i in range(self.num_levels):
            td_map.update(self.policy[i].get_td_map_from_batch(
                obs0=obs0[i],
                actions=act[i],
                rewards=rew[i],
                obs1=obs1[i],
                terminals1=done[i]))

        return td_map

    def _negative_reward_fn(self):
        """Return True if the intrinsic reward returns negative values.

        Intrinsic reward functions with negative rewards incentivize early
        terminations, which we attempt to mitigate in the training operation by
        preventing early terminations from returning an expected return of 0.
        """
        return "exp" not in self.intrinsic_reward_type \
            and "non" not in self.intrinsic_reward_type

    def _pretrain_level(self, level):
        """Check whether the current level should be training.

        When using `pretrain_worker` the lowest level policy is trained every
        step, and higher level policies are incrementally unfrozen for a
        fraction of the training steps. The highest level policy is not trained
        in this case, but the checkpoints can later be used to continue
        training the entire hierarchy.

        Parameters
        ----------
        level : int
            the level of the policy

        Returns
        -------
        bool
            True if the level should not be trained and should perform random
            actions, False otherwise
        """
        # number of steps to perform pretraining for a given level, assuming
        # pretrain_worker is set to True.
        pretrain_steps = self.total_steps * \
            (self.num_levels - level - 1) / (self.num_levels - 1)
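        # e.g. with num_levels=3, the lowest level is trained from the start,
        # level 1 acts randomly for the first total_steps / 2 steps, and
        # level 0 acts randomly for the entire pre-training run.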

        if level == 0:
            # bug fix for the final step
            return self.pretrain_worker
        else:
            return self.pretrain_worker and (self._steps < pretrain_steps)

    # ======================================================================= #
    #                       Auxiliary methods for HIRO                        #
    # ======================================================================= #

    def _sample_best_meta_action(self,
                                 meta_obs0,
                                 meta_obs1,
                                 meta_action,
                                 worker_obses,
                                 worker_actions,
                                 k=10):
        """Return meta-actions that approximately maximize low-level log-probs.

        Parameters
        ----------
        meta_obs0 : array_like
            (batch_size, m_obs_dim) matrix of meta observations
        meta_obs1 : array_like
            (batch_size, m_obs_dim) matrix of next time step meta observations
        meta_action : array_like
            (batch_size, m_ac_dim) matrix of meta actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period+1) matrix of current Worker
            state observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) matrix of current Worker
            environmental actions
        k : int, optional
            number of goals returned, excluding the initial goal and the mean
            value

        Returns
        -------
        array_like
            (batch_size, m_ac_dim) matrix of most likely meta actions
        """
        batch_size, goal_dim = meta_action.shape

        # Collect several samples of potentially optimal goals.
        sampled_actions = self._sample(meta_obs0, meta_obs1, meta_action, k)
        assert sampled_actions.shape == (batch_size, goal_dim, k)

        # Compute the fitness of each candidate goal. The fitness is the sum of
        # the log-probabilities of each action for the given goal.
        fitness = self._log_probs(sampled_actions, worker_obses,
                                  worker_actions)
        assert fitness.shape == (batch_size, k)

        # For each sample, choose the meta action that maximizes the fitness.
        indx = np.argmax(fitness, 1)
        best_goals = np.asarray(
            [sampled_actions[i, :, indx[i]] for i in range(batch_size)])

        return best_goals

    def _sample(self, meta_obs0, meta_obs1, meta_action, num_samples, sc=0.5):
        """Sample different goals.

        The goals are sampled as follows:

        * The first num_samples-2 goals are acquired from a random Gaussian
          distribution centered at s_{t+c} - s_t.
        * The second to last goal is s_{t+c} - s_t.
        * The last goal is the originally sampled goal g_t.

        Parameters
        ----------
        meta_obs0 : array_like
            (batch_size, m_obs_dim) matrix of meta observations
        meta_obs1 : array_like
            (batch_size, m_obs_dim) matrix of next time step meta observations
        meta_action : array_like
            (batch_size, m_ac_dim) matrix of meta actions
        num_samples : int
            number of samples
        sc : float
            scaling factor for the normal distribution.

        Returns
        -------
        array_like
            (batch_size, goal_dim, num_samples) matrix of sampled goals

        Helps
        -----
        * _sample_best_meta_action(self)
        """
        batch_size, goal_dim = meta_action.shape
        goal_space = self.policy[0].ac_space
        spec_range = goal_space.high - goal_space.low
        random_samples = num_samples - 2
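        # e.g. with num_samples=10, eight goals are drawn from the Gaussian
        # centered at s_{t+c} - s_t, one is that state change itself, and one
        # is the originally sampled goal.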

        # Compute the mean and std for the Gaussian distribution to sample
        # from, as well as the maxima and minima.
        loc = meta_obs1[:, self.goal_indices] - meta_obs0[:, self.goal_indices]
        scale = [sc * spec_range / 2]
        minimum, maximum = [goal_space.low], [goal_space.high]

        new_loc = np.zeros((batch_size, goal_dim, random_samples))
        new_scale = np.zeros((batch_size, goal_dim, random_samples))
        for i in range(random_samples):
            new_loc[:, :, i] = loc
            new_scale[:, :, i] = scale

        new_minimum = np.zeros((batch_size, goal_dim, num_samples))
        new_maximum = np.zeros((batch_size, goal_dim, num_samples))
        for i in range(num_samples):
            new_minimum[:, :, i] = minimum
            new_maximum[:, :, i] = maximum

        # Generate random samples for the above distribution.
        normal_samples = np.random.normal(size=(random_samples * batch_size *
                                                goal_dim))
        normal_samples = normal_samples.reshape(
            (batch_size, goal_dim, random_samples))

        samples = np.zeros((batch_size, goal_dim, num_samples))
        samples[:, :, :-2] = new_loc + normal_samples * new_scale
        samples[:, :, -2] = loc
        samples[:, :, -1] = meta_action

        # Clip the values based on the meta action space range.
        samples = np.minimum(np.maximum(samples, new_minimum), new_maximum)

        return samples

    def _log_probs(self, meta_actions, worker_obses, worker_actions):
        """Calculate the log probability of the next goal by the meta-policies.

        Parameters
        ----------
        meta_actions : array_like
            (batch_size, m_ac_dim, num_samples) matrix of candidate higher-
            level policy actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period + 1) matrix of lower-level
            policy observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) list of lower-level policy
            actions

        Returns
        -------
        array_like
            (batch_size, num_samples) fitness associated with every state /
            action / goal pair

        Helps
        -----
        * _sample_best_meta_action(self):
        """
        raise NotImplementedError

    # ======================================================================= #
    #                       Auxiliary methods for HAC                         #
    # ======================================================================= #

    def _hindsight_actions_goals(self, initial_observations, initial_rewards):
        """Calculate hindsight goal and action transitions.

        These are then stored in the replay buffer along with the original
        (non-hindsight) sample.

        See the README at the front page of this repository for an in-depth
        description of this procedure.

        Parameters
        ----------
        initial_observations : array_like
            the original worker observations with the non-hindsight goals
            appended to them
        initial_rewards : array_like
            the original intrinsic rewards

        Returns
        -------
        array_like
            the goal at every step in hindsight
        array_like
            the modified intrinsic rewards taking into account the hindsight
            goals

        Helps
        -----
        * store_transition(self):
        """
        new_goals = []
        observations = deepcopy(initial_observations)
        rewards = deepcopy(initial_rewards)
        hindsight_goal = 0 if self.relative_goals \
            else observations[-1][self.goal_indices]
        obs_tp1 = observations[-1]
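        # Iterate backward from the final observation. In the relative-goals
        # case the hindsight goal accumulates the per-step state differences,
        # so the final state acts as the goal achieved in hindsight.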

        for i in range(1, len(observations) + 1):
            obs_t = observations[-i]

            # Calculate the hindsight goal when using relative goals.
            # Otherwise, the hindsight goal is simply a subset of the
            # final state observation.
            if self.relative_goals:
                hindsight_goal += \
                    obs_tp1[self.goal_indices] - obs_t[self.goal_indices]

            # Modify the Worker intrinsic rewards based on the new
            # hindsight goal.
            if i > 1:
                rewards[-(i - 1)] = self.intrinsic_reward_scale \
                    * self.intrinsic_reward_fn(obs_t, hindsight_goal, obs_tp1)

            obs_tp1 = deepcopy(obs_t)
            new_goals = [deepcopy(hindsight_goal)] + new_goals

        return new_goals, rewards

    # ======================================================================= #
    #                       Auxiliary methods for CHER                        #
    # ======================================================================= #

    def _setup_cooperative_gradients(self):
        """Create the cooperative gradients meta-policy optimizer."""
        raise NotImplementedError

    def _cooperative_gradients_update(self,
                                      obs0,
                                      actions,
                                      rewards,
                                      obs1,
                                      terminals1,
                                      level_num,
                                      update_actor=True):
        """Perform the gradient update procedure for the CHER algorithm.

        This procedure is similar to update_from_batch, except it runs the
        self.cg_optimizer operation instead of the policy object's optimizer,
        and utilizes some information from the worker samples as well.

        Parameters
        ----------
        obs0 : list of array_like
            (batch_size, obs_dim) matrix of observations for every level in the
            hierarchy
        actions : list of array_like
            (batch_size, ac_dim) matrix of actions for every level in the
            hierarchy
        obs1 : list of array_like
            (batch_size, obs_dim) matrix of next step observations for every
            level in the hierarchy
        rewards : list of array_like
            (batch_size,) vector of rewards for every level in the hierarchy
        terminals1 : list of numpy bool
            (batch_size,) vector of done masks for every level in the hierarchy
        level_num : int
            the hierarchy level number of the policy to optimize
        update_actor : bool
            specifies whether to update the actor policy of the meta policy.
            The critic policy is still updated if this value is set to False.

        Returns
        -------
        [float, float]
            meta-policy critic loss
        float
            meta-policy actor loss
        """
        raise NotImplementedError
Example #7
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 num_levels,
                 meta_period,
                 intrinsic_reward_type,
                 intrinsic_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 cooperative_gradients,
                 cg_weights,
                 cg_delta,
                 pretrain_worker,
                 pretrain_path,
                 pretrain_ckpt,
                 total_steps,
                 scope=None,
                 env_name="",
                 num_envs=1,
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        num_levels : int
            number of levels within the hierarchy. Must be greater than 1. Two
            levels correspond to a Manager/Worker paradigm.
        meta_period : int
            meta-policy action period
        intrinsic_reward_type : str
            the reward function to be used by the worker. Must be one of:

            * "negative_distance": the negative two norm between the states and
              desired absolute or relative goals.
            * "scaled_negative_distance": similar to the negative distance
              reward where the states, goals, and next states are scaled by the
              inverse of the action space of the manager policy
            * "non_negative_distance": the negative two norm between the states
              and desired absolute or relative goals offset by the maximum goal
              space (to ensure non-negativity)
            * "scaled_non_negative_distance": similar to the non-negative
              distance reward where the states, goals, and next states are
              scaled by the inverse of the action space of the manager policy
            * "exp_negative_distance": equal to exp(-negative_distance^2). The
              result is a reward between 0 and 1. This is useful for policies
              that terminate early.
            * "scaled_exp_negative_distance": similar to the previous worker
              reward type but with states, actions, and next states that are
              scaled.
        intrinsic_reward_scale : float
            the value that the intrinsic reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the higher-level policies is
            meant to be a relative or absolute goal, i.e. specific state or
            change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        cooperative_gradients : bool
            whether to use the cooperative gradient update procedure for the
            higher-level policy. See: https://arxiv.org/abs/1912.02368v1
        cg_weights : float
            weights for the gradients of the loss of the lower-level policies
            with respect to the parameters of the higher-level policies. Only
            used if `cooperative_gradients` is set to True.
        cg_delta : float
            the desired lower-level expected returns. If set to None, a fixed
            Lagrangian specified by cg_weights is used instead. Only used if
            `cooperative_gradients` is set to True.
        pretrain_worker : bool
            specifies whether you are pre-training the lower-level policies.
            Actions by the high-level policy are randomly sampled from the
            action space.
        pretrain_path : str or None
            path to the pre-trained worker policy checkpoints
        pretrain_ckpt : int or None
            checkpoint number to use within the worker policy path. If set to
            None, the most recent checkpoint is used.
        total_steps : int
            Total number of timesteps used during training. Used by a subset of
            algorithms.
        meta_policy : type [ hbaselines.base_policies.Policy ]
            the policy model to use for the meta policies
        worker_policy : type [ hbaselines.base_policies.Policy ]
            the policy model to use for the worker policy
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            verbose=verbose,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        assert num_levels >= 2, "num_levels must be greater than or equal to 2"

        self.num_levels = num_levels
        self.meta_period = meta_period
        self.intrinsic_reward_type = intrinsic_reward_type
        self.intrinsic_reward_scale = intrinsic_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.cooperative_gradients = cooperative_gradients
        self.cg_weights = cg_weights
        self.cg_delta = cg_delta
        self.pretrain_worker = pretrain_worker
        self.pretrain_path = pretrain_path
        self.pretrain_ckpt = pretrain_ckpt
        self.total_steps = total_steps

        # Get the observation and action space of the higher level policies.
        meta_ac_space = get_meta_ac_space(
            ob_space=ob_space,
            relative_goals=relative_goals,
            env_name=env_name,
        )

        # =================================================================== #
        # Step 1: Create the policies for the individual levels.              #
        # =================================================================== #

        self.policy = []

        # The policies are ordered from the highest level to lowest level
        # policies in the hierarchy.
        for i in range(num_levels):
            # Determine the appropriate parameters to use for the policy in the
            # current level.
            policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
            ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
            co_space_i = co_space if i == 0 else meta_ac_space
            ob_space_i = ob_space

            # Create the TensorFlow variable scope for the policy at the
            # current level.
            with tf.compat.v1.variable_scope("level_{}".format(i)):
                # Compute the scope name based on any outer scope term.
                scope_i = "level_{}".format(i)
                if scope is not None:
                    scope_i = "{}/{}".format(scope, scope_i)

                # Only the highest level policy applies the ignore_image and
                # ignore_flat_channels settings from model_params; lower-level
                # policies keep all flat channels and always ignore image
                # observations.
                model_params_i = model_params.copy()
                model_params_i.update({
                    "ignore_flat_channels":
                    model_params["ignore_flat_channels"] if i < 1 else [],
                    "ignore_image":
                    model_params["ignore_image"] if i < 1 else True,
                })

                # Create the next policy.
                self.policy.append(
                    policy_fn(
                        sess=sess,
                        ob_space=ob_space_i,
                        ac_space=ac_space_i,
                        co_space=co_space_i,
                        buffer_size=buffer_size,
                        batch_size=batch_size,
                        actor_lr=actor_lr,
                        critic_lr=critic_lr,
                        verbose=verbose,
                        tau=tau,
                        gamma=gamma,
                        use_huber=use_huber,
                        l2_penalty=l2_penalty,
                        model_params=model_params_i,
                        scope=scope_i,
                        **(additional_params or {}),
                    ))

        # =================================================================== #
        # Step 2: Create attributes for the replay buffer.                    #
        # =================================================================== #

        # Create the replay buffer.
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            obs_dim=ob_space.shape[0],
            ac_dim=ac_space.shape[0],
            co_dim=None if co_space is None else co_space.shape[0],
            goal_dim=meta_ac_space.shape[0],
            num_levels=num_levels)

        # current action by the meta-level policies
        self.meta_action = [[None for _ in range(num_levels - 1)]
                            for _ in range(num_envs)]

        # a list of all the actions performed by each level in the hierarchy,
        # ordered from highest to lowest level policy. A separate element is
        # used for each environment.
        self._actions = [[[] for _ in range(self.num_levels)]
                         for _ in range(num_envs)]

        # a list of the rewards (intrinsic or other) experienced by every level
        # in the hierarchy, ordered from highest to lowest level policy. A
        # separate element is used for each environment.
        self._rewards = [[[0]] + [[] for _ in range(self.num_levels - 1)]
                         for _ in range(num_envs)]

        # a list of observations that stretch as long as the dilated horizon
        # chosen for the highest level policy. A separate element is used for
        # each environment.
        self._observations = [[] for _ in range(num_envs)]

        # the first and last contextual term. A separate element is used for
        # each environment.
        self._contexts = [[] for _ in range(num_envs)]

        # a list of done masks at every time step. A separate element is used
        # for each environment.
        self._dones = [[] for _ in range(num_envs)]

        # Collect the state indices for the intrinsic rewards.
        self.goal_indices = get_state_indices(ob_space, env_name)

        # Define the intrinsic reward function.
        if intrinsic_reward_type in [
                "negative_distance", "scaled_negative_distance",
                "non_negative_distance", "scaled_non_negative_distance",
                "exp_negative_distance", "scaled_exp_negative_distance"
        ]:
            # Offset the distance measure by the maximum possible distance to
            # ensure non-negativity.
            if "non_negative" in intrinsic_reward_type:
                offset = np.sqrt(
                    np.sum(np.square(meta_ac_space.high - meta_ac_space.low),
                           -1))
            else:
                offset = 0

            # Scale the states and goals by the meta-action space when a
            # scaled variant of the worker reward is requested.
            if intrinsic_reward_type.startswith("scaled"):
                scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
            else:
                scale = 1

            def intrinsic_reward_fn(states, goals, next_states):
                return negative_distance(
                    states=states[self.goal_indices] / scale,
                    goals=goals / scale,
                    next_states=next_states[self.goal_indices] / scale,
                    relative_context=relative_goals,
                    offset=0.0,
                ) + offset

            # Perform the exponential and squashing operations to keep the
            # intrinsic reward between 0 and 1.
            if "exp" in intrinsic_reward_type:

                def exp_intrinsic_reward_fn(states, goals, next_states):
                    # Note: the span-based normalization below is a temporary
                    # heuristic that keeps the squashed reward well-scaled.
                    span = sum(
                        np.square(self.policy[0].ac_space.high -
                                  self.policy[0].ac_space.low))
                    rew = intrinsic_reward_fn(states, goals, next_states)
                    return np.exp(-(rew / (span / 40))**2)
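                # Illustrative example (values assumed, not from the source):
                # for a 2-D goal space bounded in [-1, 1] per dimension,
                # span = 2**2 + 2**2 = 8 and span / 40 = 0.2, so a distance
                # reward of -0.1 is squashed to exp(-(-0.1 / 0.2)**2) ~= 0.78.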

                self.intrinsic_reward_fn = exp_intrinsic_reward_fn
            else:
                self.intrinsic_reward_fn = intrinsic_reward_fn
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                intrinsic_reward_type))

        # =================================================================== #
        # Step 3: Create algorithm-specific features.                         #
        # =================================================================== #

        # the number of get_action calls that have been performed. This is used
        # when pretraining the worker to incrementally train different levels
        # of the policy.
        self._steps = 0

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn

        if self.cooperative_gradients:
            if scope is None:
                self._setup_cooperative_gradients()
            else:
                with tf.compat.v1.variable_scope(scope):
                    self._setup_cooperative_gradients()
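
A minimal sketch (not part of the example above; the array values are made up
for illustration) of how the relative-goal transition function defined above
keeps the absolute goal position fixed between meta-actions:

import numpy as np

def goal_transition_fn(obs0, goal, obs1):
    # Relative-goal case: re-express the goal so that obs1 + new_goal points
    # at the same absolute state as obs0 + goal did.
    return obs0 + goal - obs1

obs0 = np.array([0.0, 0.0])   # state when the goal was issued
goal = np.array([2.0, 1.0])   # desired change in state
obs1 = np.array([0.5, 0.2])   # state one step later

new_goal = goal_transition_fn(obs0, goal, obs1)
print(new_goal)               # [1.5 0.8]; obs1 + new_goal == obs0 + goal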
Example #8
0
class GoalConditionedPolicy(ActorCriticPolicy):
    r"""Goal-conditioned hierarchical reinforcement learning model.

    This policy is an implementation of the two-level hierarchy presented
    in [1], which itself is similar to the feudal networks formulation [2, 3].
    The network consists of a high-level, or Manager, policy pi_{\theta_H}
    that computes and outputs goals g_t ~ pi_{\theta_H}(s_t, h) every
    `meta_period` time steps, and a low-level, or Worker, policy pi_{\theta_L}
    that takes as inputs the current state and the assigned goals and attempts
    to perform an action a_t ~ pi_{\theta_L}(s_t, g_t) that satisfies these
    goals. When `num_levels` is greater than 2, the same structure is applied
    recursively, with each meta-level policy assigning goals to the level
    directly below it.

    The highest level policy is rewarded based on the original environment
    reward function: r_H = r(s,a;h).

    The Target term, h, parametrizes the reward assigned to the highest level
    policy in order to allow the policy to generalize to several goals within a
    task, a technique that was first proposed by [4].

    Finally, the Worker is motivated to follow the goals set by the Manager via
    an intrinsic reward based on the distance between the current observation
    and the goal observation:
    r_L (s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2

    Bibliography:

    [1] Nachum, Ofir, et al. "Data-efficient hierarchical reinforcement
        learning." Advances in Neural Information Processing Systems. 2018.
    [2] Dayan, Peter, and Geoffrey E. Hinton. "Feudal reinforcement learning."
        Advances in neural information processing systems. 1993.
    [3] Vezhnevets, Alexander Sasha, et al. "Feudal networks for hierarchical
        reinforcement learning." Proceedings of the 34th International
        Conference on Machine Learning-Volume 70. JMLR. org, 2017.
    [4] Schaul, Tom, et al. "Universal value function approximators."
        International Conference on Machine Learning. 2015.

    Attributes
    ----------
    meta_period : int
        meta-policy action period
    intrinsic_reward_type : str
        the reward function to be used by the worker. Must be one of:

        * "negative_distance": the negative two norm between the states and
          desired absolute or relative goals.
        * "scaled_negative_distance": similar to the negative distance reward
          where the states, goals, and next states are scaled by the inverse of
          the action space of the manager policy
        * "non_negative_distance": the negative two norm between the states and
          desired absolute or relative goals offset by the maximum goal space
          (to ensure non-negativity)
        * "scaled_non_negative_distance": similar to the non-negative distance
          reward where the states, goals, and next states are scaled by the
          inverse of the action space of the manager policy
        * "exp_negative_distance": equal to exp(-negative_distance^2). The
          result is a reward between 0 and 1. This is useful for policies that
          terminate early.
        * "scaled_exp_negative_distance": similar to the previous worker reward
          type but with states, actions, and next states that are scaled.
    intrinsic_reward_scale : float
        the value that the intrinsic reward should be scaled by
    relative_goals : bool
        specifies whether the goal issued by the higher-level policies is meant
        to be a relative or absolute goal, i.e. specific state or change in
        state
    off_policy_corrections : bool
        whether to use off-policy corrections during the update procedure. See:
        https://arxiv.org/abs/1805.08296.
    hindsight : bool
        whether to use hindsight action and goal transitions, as well as
        subgoal testing. See: https://arxiv.org/abs/1712.00948
    subgoal_testing_rate : float
        rate at which the original (non-hindsight) sample is stored in the
        replay buffer as well. Used only if `hindsight` is set to True.
    connected_gradients : bool
        whether to use the connected gradient actor update procedure for the
        higher-level policies. See: https://arxiv.org/abs/1912.02368v1
    cg_weights : float
        weights for the gradients of the loss of the lower-level policies with
        respect to the parameters of the higher-level policies. Only used if
        `connected_gradients` is set to True.
    use_fingerprints : bool
        specifies whether to add a time-dependent fingerprint to the
        observations
    fingerprint_range : (list of float, list of float)
        the low and high values for each fingerprint element, if they are being
        used
    fingerprint_dim : tuple of int
        the shape of the fingerprint elements, if they are being used
    centralized_value_functions : bool
        specifies whether to use centralized value functions
    policy : list of hbaselines.base_policies.ActorCriticPolicy
        a list of policy objects for each level in the hierarchy, ordered from
        highest to lowest level policy
    replay_buffer : hbaselines.goal_conditioned.replay_buffer.HierReplayBuffer
        the replay buffer object
    goal_indices : list of int
        the state indices for the intrinsic rewards
    intrinsic_reward_fn : function
        reward function for the lower-level policies
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 num_levels,
                 meta_period,
                 intrinsic_reward_type,
                 intrinsic_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 connected_gradients,
                 cg_weights,
                 use_fingerprints,
                 fingerprint_range,
                 centralized_value_functions,
                 env_name="",
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the size of the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        num_levels : int
            number of levels within the hierarchy. Must be greater than 1. Two
            levels correspond to a Manager/Worker paradigm.
        meta_period : int
            meta-policy action period
        intrinsic_reward_type : str
            the reward function to be used by the worker. Must be one of:

            * "negative_distance": the negative two norm between the states and
              desired absolute or relative goals.
            * "scaled_negative_distance": similar to the negative distance
              reward where the states, goals, and next states are scaled by the
              inverse of the action space of the manager policy
            * "non_negative_distance": the negative two norm between the states
              and desired absolute or relative goals offset by the maximum goal
              space (to ensure non-negativity)
            * "scaled_non_negative_distance": similar to the non-negative
              distance reward where the states, goals, and next states are
              scaled by the inverse of the action space of the manager policy
            * "exp_negative_distance": equal to exp(-negative_distance^2). The
              result is a reward between 0 and 1. This is useful for policies
              that terminate early.
            * "scaled_exp_negative_distance": similar to the previous worker
              reward type but with states, actions, and next states that are
              scaled.
        intrinsic_reward_scale : float
            the value that the intrinsic reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the higher-level policies is
            meant to be a relative or absolute goal, i.e. specific state or
            change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        connected_gradients : bool
            whether to use the connected gradient actor update procedure for
            the higher-level policies. See: https://arxiv.org/abs/1912.02368v1
        cg_weights : float
            weights for the gradients of the loss of the lower-level policies
            with respect to the parameters of the higher-level policies. Only
            used if `connected_gradients` is set to True.
        use_fingerprints : bool
            specifies whether to add a time-dependent fingerprint to the
            observations
        fingerprint_range : (list of float, list of float)
            the low and high values for each fingerprint element, if they are
            being used
        centralized_value_functions : bool
            specifies whether to use centralized value functions
        meta_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use for the meta policies
        worker_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use for the worker policy
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(sess=sess,
                                                    ob_space=ob_space,
                                                    ac_space=ac_space,
                                                    co_space=co_space,
                                                    buffer_size=buffer_size,
                                                    batch_size=batch_size,
                                                    actor_lr=actor_lr,
                                                    critic_lr=critic_lr,
                                                    verbose=verbose,
                                                    tau=tau,
                                                    gamma=gamma,
                                                    layer_norm=layer_norm,
                                                    layers=layers,
                                                    act_fun=act_fun,
                                                    use_huber=use_huber)

        assert num_levels >= 2, "num_levels must be greater than or equal to 2"

        self.num_levels = num_levels
        self.meta_period = meta_period
        self.intrinsic_reward_type = intrinsic_reward_type
        self.intrinsic_reward_scale = intrinsic_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.connected_gradients = connected_gradients
        self.cg_weights = cg_weights
        self.use_fingerprints = use_fingerprints
        self.fingerprint_range = fingerprint_range
        self.fingerprint_dim = (len(self.fingerprint_range[0]), )
        self.centralized_value_functions = centralized_value_functions

        # Get the observation and action space of the higher level policies.
        meta_ac_space = get_meta_ac_space(ob_space=ob_space,
                                          relative_goals=relative_goals,
                                          env_name=env_name,
                                          use_fingerprints=use_fingerprints,
                                          fingerprint_dim=self.fingerprint_dim)

        # =================================================================== #
        # Step 1: Create the policies for the individual levels.              #
        # =================================================================== #

        self.policy = []

        # The policies are ordered from the highest level to lowest level
        # policies in the hierarchy.
        for i in range(num_levels):
            # Determine the appropriate parameters to use for the policy in the
            # current level.
            policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
            ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
            co_space_i = co_space if i == 0 else meta_ac_space
            ob_space_i = ob_space
            zero_fingerprint_i = i == (num_levels - 1)

            # Create the policy for the current level within its own variable
            # scope.
            with tf.compat.v1.variable_scope("level_{}".format(i)):
                self.policy.append(
                    policy_fn(
                        sess=sess,
                        ob_space=ob_space_i,
                        ac_space=ac_space_i,
                        co_space=co_space_i,
                        buffer_size=buffer_size,
                        batch_size=batch_size,
                        actor_lr=actor_lr,
                        critic_lr=critic_lr,
                        verbose=verbose,
                        tau=tau,
                        gamma=gamma,
                        layer_norm=layer_norm,
                        layers=layers,
                        act_fun=act_fun,
                        use_huber=use_huber,
                        scope="level_{}".format(i),
                        zero_fingerprint=zero_fingerprint_i,
                        fingerprint_dim=self.fingerprint_dim[0],
                        **(additional_params or {}),
                    ))

        # =================================================================== #
        # Step 2: Create attributes for the replay buffer.                    #
        # =================================================================== #

        # Create the replay buffer.
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            obs_dim=ob_space.shape[0],
            ac_dim=ac_space.shape[0],
            co_dim=None if co_space is None else co_space.shape[0],
            goal_dim=meta_ac_space.shape[0],
            num_levels=num_levels)

        # current action by the meta-level policies
        self._meta_action = [None for _ in range(num_levels - 1)]

        # a list of all the actions performed by each level in the hierarchy,
        # ordered from highest to lowest level policy
        self._actions = None

        # a list of the rewards (intrinsic or other) experienced by every level
        # in the hierarchy, ordered from highest to lowest level policy
        self._rewards = None

        # a list of observations that stretch as long as the dilated horizon
        # chosen for the highest level policy
        self._observations = None

        # the first and last contextual term
        self._contexts = None

        # a list of done masks at every time step
        self._dones = None

        # Collect the state indices for the intrinsic rewards.
        self.goal_indices = get_state_indices(
            ob_space=ob_space,
            env_name=env_name,
            use_fingerprints=use_fingerprints,
            fingerprint_dim=self.fingerprint_dim)

        # Define the intrinsic reward function.
        if intrinsic_reward_type in [
                "negative_distance", "scaled_negative_distance",
                "non_negative_distance", "scaled_non_negative_distance",
                "exp_negative_distance", "scaled_exp_negative_distance"
        ]:
            # Offset the distance measure by the maximum possible distance to
            # ensure non-negativity.
            if "non_negative" in intrinsic_reward_type:
                offset = np.sqrt(
                    np.sum(np.square(meta_ac_space.high - meta_ac_space.low),
                           -1))
            else:
                offset = 0

            # Scale the states and goals by the meta-action space when a
            # scaled variant of the worker reward is requested.
            if intrinsic_reward_type.startswith("scaled"):
                scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
            else:
                scale = 1

            def intrinsic_reward_fn(states, goals, next_states):
                return negative_distance(
                    states=states[self.goal_indices] / scale,
                    goals=goals / scale,
                    next_states=next_states[self.goal_indices] / scale,
                    relative_context=relative_goals,
                    offset=0.0) + offset

            # Perform the exponential and squashing operations to keep the
            # intrinsic reward between 0 and 1.
            if "exp" in intrinsic_reward_type:

                def exp_intrinsic_reward_fn(states, goals, next_states):
                    return np.exp(
                        -1 *
                        intrinsic_reward_fn(states, goals, next_states)**2)

                self.intrinsic_reward_fn = exp_intrinsic_reward_fn
            else:
                self.intrinsic_reward_fn = intrinsic_reward_fn
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                intrinsic_reward_type))

        # =================================================================== #
        # Step 3: Create algorithm-specific features.                         #
        # =================================================================== #

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn

        # Utility method for indexing the goal out of an observation variable.
        self.crop_to_goal = lambda g: tf.gather(
            g,
            tf.tile(tf.expand_dims(np.array(self.goal_indices), 0),
                    [self.batch_size, 1]),
            batch_dims=1,
            axis=1)
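        # For example (illustrative only): if goal_indices == [0, 2] and g is
        # a (batch_size, obs_dim) tensor, crop_to_goal(g) returns the
        # (batch_size, 2) tensor whose rows equal g[:, [0, 2]].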

        if self.connected_gradients:
            self._setup_connected_gradients()

    def initialize(self):
        """See parent class.

        This method calls the initialization methods of the policies at every
        level of the hierarchy.
        """
        for i in range(self.num_levels):
            self.policy[i].initialize()
        self.clear_memory()

    def update(self, update_actor=True, **kwargs):
        """Perform a gradient update step.

        This is done at every level of the hierarchy.

        The kwargs argument for this method contains two additional terms:

        * update_meta (bool): specifies whether to perform a gradient update
          step for the meta-policies
        * update_meta_actor (bool): similar to the `update_actor` term, but
          for the meta-policies. Note that, if `update_meta` is set to False,
          this term is void.

        **Note**: The target soft updates for all policies occur at the same
        frequency as their respective actor updates.

        Parameters
        ----------
        update_actor : bool
            specifies whether to update the actor policy. The critic policy is
            still updated if this value is set to False.

        Returns
        -------
        tuple of [float, float]
            the critic loss for every policy in the hierarchy
        tuple of float
            the actor loss for every policy in the hierarchy
        """
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return tuple([[0, 0] for _ in range(self.num_levels)]), \
                tuple([0 for _ in range(self.num_levels)])

        # Specifies whether to remove additional data from the replay buffer
        # sampling procedure. Since only a subset of algorithms use additional
        # data, removing it can speed up the other algorithms.
        with_additional = self.off_policy_corrections

        # Get a batch.
        obs0, obs1, act, rew, done, additional = self.replay_buffer.sample(
            with_additional)

        # Update the higher-level policies.
        actor_loss = []
        critic_loss = []

        if kwargs['update_meta']:
            # Replace the goals with the most likely goals.
            if self.off_policy_corrections:
                meta_act = self._sample_best_meta_action(
                    meta_obs0=obs0[0],
                    meta_obs1=obs1[0],
                    meta_action=act[0],
                    worker_obses=additional["worker_obses"],
                    worker_actions=additional["worker_actions"],
                    k=8)
                act[0] = meta_act

            for i in range(self.num_levels - 1):
                if self.connected_gradients:
                    # Perform the connected gradients update procedure.
                    vf_loss, pi_loss = self._connected_gradients_update(
                        obs0=obs0,
                        actions=act,
                        rewards=rew,
                        obs1=obs1,
                        terminals1=done,
                        update_actor=kwargs['update_meta_actor'],
                    )
                else:
                    # Perform the regular meta update procedure.
                    vf_loss, pi_loss = self.policy[i].update_from_batch(
                        obs0=obs0[i],
                        actions=act[i],
                        rewards=rew[i],
                        obs1=obs1[i],
                        terminals1=done[i],
                        update_actor=kwargs['update_meta_actor'],
                    )

                actor_loss.append(pi_loss)
                critic_loss.append(vf_loss)
        else:
            for i in range(self.num_levels - 1):
                actor_loss.append(0)
                critic_loss.append([0, 0])

        # Update the lowest level policy.
        w_critic_loss, w_actor_loss = self.policy[-1].update_from_batch(
            obs0=obs0[-1],
            actions=act[-1],
            rewards=rew[-1],
            obs1=obs1[-1],
            terminals1=done[-1],
            update_actor=update_actor,
        )
        critic_loss.append(w_critic_loss)
        actor_loss.append(w_actor_loss)

        return tuple(critic_loss), tuple(actor_loss)

    def get_action(self, obs, context, apply_noise, random_actions):
        """See parent class."""
        # Loop through the policies in the hierarchy.
        for i in range(self.num_levels - 1):
            if self._update_meta(i):
                context_i = context if i == 0 else self._meta_action[i - 1]

                # Update the meta action based on the output from the policy if
                # the time period requires it.
                self._meta_action[i] = self.policy[i].get_action(
                    obs, context_i, apply_noise, random_actions)
            else:
                # Update the meta-action in accordance with a fixed transition
                # function.
                self._meta_action[i] = self.goal_transition_fn(
                    obs0=np.array([self._observations[-1][self.goal_indices]]),
                    goal=self._meta_action[i],
                    obs1=obs[:, self.goal_indices])

        # Return the action to be performed within the environment (i.e. the
        # action by the lowest level policy).
        action = self.policy[-1].get_action(obs, self._meta_action[-1],
                                            apply_noise, random_actions)

        return action

    def store_transition(self,
                         obs0,
                         context0,
                         action,
                         reward,
                         obs1,
                         context1,
                         done,
                         is_final_step,
                         evaluate=False):
        """See parent class."""
        # the number of environment steps collected since the current sample
        # (i.e. the current highest-level meta-period) began
        t_start = len(self._observations)

        for i in range(1, self.num_levels):
            # Actions and intrinsic rewards for the high-level policies are
            # only updated when the action is recomputed by the graph.
            if t_start % self.meta_period**(i - 1) == 0:
                self._rewards[-i].append(0)
                self._actions[-i - 1].append(self._meta_action[-i].flatten())

            # Compute the intrinsic rewards and append them to the list of
            # rewards.
            self._rewards[-i][-1] += \
                self.intrinsic_reward_scale / self.meta_period ** (i-1) * \
                self.intrinsic_reward_fn(
                    states=obs0,
                    goals=self._meta_action[-i].flatten(),
                    next_states=obs1
                )

        # The highest level policy receives the sum of environmental rewards.
        self._rewards[0][0] += reward

        # The lowest level policy's actions are received from the algorithm.
        self._actions[-1].append(action)

        # Add the environmental observations and contextual terms to their
        # respective lists.
        self._observations.append(obs0)
        if t_start == 0:
            self._contexts.append(context0)

        # Modify the done mask in accordance with the TD3 algorithm. Done masks
        # that correspond to the final step are set to False.
        self._dones.append(done and not is_final_step)

        # Add a sample to the replay buffer.
        if len(self._observations) == \
                self.meta_period ** (self.num_levels - 1) or done:
            # Add the last observation and context.
            self._observations.append(obs1)
            self._contexts.append(context1)

            # Compute the current state goals to add to the final observation.
            for i in range(self.num_levels - 1):
                self._actions[i].append(
                    self.goal_transition_fn(
                        obs0=obs0[self.goal_indices],
                        goal=self._meta_action[i],
                        obs1=obs1[self.goal_indices]).flatten())

            # Avoid storing samples when performing evaluations.
            if not evaluate:
                if not self.hindsight \
                        or random.random() < self.subgoal_testing_rate:
                    # Store a sample in the replay buffer.
                    self.replay_buffer.add(
                        obs_t=self._observations,
                        context_t=self._contexts,
                        action_t=self._actions,
                        reward_t=self._rewards,
                        done_t=self._dones,
                    )

                if self.hindsight:
                    # Some temporary attributes.
                    worker_obses = [
                        self._get_obs(self._observations[i],
                                      self._actions[0][i], 0)
                        for i in range(len(self._observations))
                    ]
                    intrinsic_rewards = self._rewards[-1]

                    # Implement hindsight action and goal transitions.
                    goal, rewards = self._hindsight_actions_goals(
                        initial_observations=worker_obses,
                        initial_rewards=intrinsic_rewards)
                    new_actions = deepcopy(self._actions)
                    new_actions[0] = goal
                    new_rewards = deepcopy(self._rewards)
                    new_rewards[-1] = rewards

                    # Store the hindsight sample in the replay buffer.
                    self.replay_buffer.add(
                        obs_t=self._observations,
                        context_t=self._contexts,
                        action_t=new_actions,
                        reward_t=new_rewards,
                        done_t=self._dones,
                    )

            # Clear the memory that has been stored in the replay buffer.
            self.clear_memory()

    def _update_meta(self, level):
        """Determine whether a meta-policy should update its action.

        This is done by checking the length of the observation lists that are
        passed to the replay buffer, which are cleared whenever the highest
        level meta-period has been met or the environment has been reset.

        Parameters
        ----------
        level : int
            the level of the policy

        Returns
        -------
        bool
            True if the action should be updated by the meta-policy at the
            given level
        """
        return len(self._observations) % \
            (self.meta_period ** (self.num_levels - level - 1)) == 0

    def clear_memory(self):
        """Clear internal memory that is used by the replay buffer."""
        self._actions = [[] for _ in range(self.num_levels)]
        self._rewards = [[0]] + [[] for _ in range(self.num_levels - 1)]
        self._observations = []
        self._contexts = []
        self._dones = []

    def get_td_map(self):
        """See parent class."""
        # Not enough samples in the replay buffer.
        if not self.replay_buffer.can_sample():
            return {}

        # Get a batch.
        obs0, obs1, act, rew, done, _ = self.replay_buffer.sample(False)

        td_map = {}
        for i in range(self.num_levels):
            td_map.update(self.policy[i].get_td_map_from_batch(
                obs0=obs0[i],
                actions=act[i],
                rewards=rew[i],
                obs1=obs1[i],
                terminals1=done[i]))

        return td_map

    # ======================================================================= #
    #                       Auxiliary methods for HIRO                        #
    # ======================================================================= #

    def _sample_best_meta_action(self,
                                 meta_obs0,
                                 meta_obs1,
                                 meta_action,
                                 worker_obses,
                                 worker_actions,
                                 k=10):
        """Return meta-actions that approximately maximize low-level log-probs.

        Parameters
        ----------
        meta_obs0 : array_like
            (batch_size, m_obs_dim) matrix of meta observations
        meta_obs1 : array_like
            (batch_size, m_obs_dim) matrix of next time step meta observations
        meta_action : array_like
            (batch_size, m_ac_dim) matrix of meta actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period+1) matrix of current Worker
            state observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) matrix of current Worker
            environmental actions
        k : int, optional
            number of goals returned, excluding the initial goal and the mean
            value

        Returns
        -------
        array_like
            (batch_size, m_ac_dim) matrix of most likely meta actions
        """
        batch_size, goal_dim = meta_action.shape

        # Collect several samples of potentially optimal goals.
        sampled_actions = self._sample(meta_obs0, meta_obs1, meta_action, k)
        assert sampled_actions.shape == (batch_size, goal_dim, k)

        # Compute the fitness of each candidate goal. The fitness is the sum of
        # the log-probabilities of each action for the given goal.
        fitness = self._log_probs(sampled_actions, worker_obses,
                                  worker_actions)
        assert fitness.shape == (batch_size, k)

        # For each sample, choose the meta action that maximizes the fitness.
        indx = np.argmax(fitness, 1)
        best_goals = np.asarray(
            [sampled_actions[i, :, indx[i]] for i in range(batch_size)])

        return best_goals

    def _sample(self, meta_obs0, meta_obs1, meta_action, num_samples, sc=0.5):
        """Sample different goals.

        The goals are sampled as follows:

        * The first num_samples-2 goals are acquired from a random Gaussian
          distribution centered at s_{t+c} - s_t.
        * The second to last goal is s_{t+c} - s_t.
        * The last goal is the originally sampled goal g_t.

        Parameters
        ----------
        meta_obs0 : array_like
            (batch_size, m_obs_dim) matrix of meta observations
        meta_obs1 : array_like
            (batch_size, m_obs_dim) matrix of next time step meta observations
        meta_action : array_like
            (batch_size, m_ac_dim) matrix of meta actions
        num_samples : int
            number of samples
        sc : float
            scaling factor for the normal distribution.

        Returns
        -------
        array_like
            (batch_size, goal_dim, num_samples) matrix of sampled goals

        Helps
        -----
        * _sample_best_meta_action(self)
        """
        batch_size, goal_dim = meta_action.shape
        goal_space = self.policy[0].ac_space
        spec_range = goal_space.high - goal_space.low
        random_samples = num_samples - 2

        # Compute the mean and std for the Gaussian distribution to sample
        # from, as well as the maxima and minima.
        loc = meta_obs1[:, self.goal_indices] - meta_obs0[:, self.goal_indices]
        scale = [sc * spec_range / 2]
        minimum, maximum = [goal_space.low], [goal_space.high]

        new_loc = np.zeros((batch_size, goal_dim, random_samples))
        new_scale = np.zeros((batch_size, goal_dim, random_samples))
        for i in range(random_samples):
            new_loc[:, :, i] = loc
            new_scale[:, :, i] = scale

        new_minimum = np.zeros((batch_size, goal_dim, num_samples))
        new_maximum = np.zeros((batch_size, goal_dim, num_samples))
        for i in range(num_samples):
            new_minimum[:, :, i] = minimum
            new_maximum[:, :, i] = maximum

        # Generate random samples for the above distribution.
        normal_samples = np.random.normal(size=(random_samples * batch_size *
                                                goal_dim))
        normal_samples = normal_samples.reshape(
            (batch_size, goal_dim, random_samples))

        samples = np.zeros((batch_size, goal_dim, num_samples))
        samples[:, :, :-2] = new_loc + normal_samples * new_scale
        samples[:, :, -2] = loc
        samples[:, :, -1] = meta_action

        # Clip the values based on the meta action space range.
        samples = np.minimum(np.maximum(samples, new_minimum), new_maximum)

        return samples

    def _log_probs(self, meta_actions, worker_obses, worker_actions):
        """Calculate the log probability of the next goal by the meta-policies.

        Parameters
        ----------
        meta_actions : array_like
            (batch_size, m_ac_dim, num_samples) matrix of candidate higher-
            level policy actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period + 1) matrix of lower-level
            policy observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) list of lower-level policy
            actions

        Returns
        -------
        array_like
            (batch_size, num_samples) fitness associated with every state /
            action / goal pair

        Helps
        -----
        * _sample_best_meta_action(self):
        """
        raise NotImplementedError

    # ======================================================================= #
    #                       Auxiliary methods for HAC                         #
    # ======================================================================= #

    def _hindsight_actions_goals(self, initial_observations, initial_rewards):
        """Calculate hindsight goal and action transitions.

        These are then stored in the replay buffer along with the original
        (non-hindsight) sample.

        See the README at the front page of this repository for an in-depth
        description of this procedure.

        Parameters
        ----------
        initial_observations : array_like
            the original worker observations with the non-hindsight goals
            appended to them
        initial_rewards : array_like
            the original intrinsic rewards

        Returns
        -------
        array_like
            the goal at every step in hindsight
        array_like
            the modified intrinsic rewards taking into account the hindsight
            goals

        Helps
        -----
        * store_transition(self):
        """
        new_goals = []
        observations = deepcopy(initial_observations)
        rewards = deepcopy(initial_rewards)
        hindsight_goal = 0 if self.relative_goals \
            else observations[-1][self.goal_indices]
        obs_tp1 = observations[-1]

        for i in range(1, len(observations) + 1):
            obs_t = observations[-i]

            # When using relative goals, accumulate the hindsight goal
            # backwards through the trajectory. Otherwise, the hindsight goal
            # is simply a subset of the final state observation.
            if self.relative_goals:
                hindsight_goal += \
                    obs_tp1[self.goal_indices] - obs_t[self.goal_indices]

            # Modify the Worker intrinsic rewards based on the new
            # hindsight goal.
            if i > 1:
                rewards[-(i - 1)] = self.intrinsic_reward_scale \
                    * self.intrinsic_reward_fn(obs_t, hindsight_goal, obs_tp1)

            obs_tp1 = deepcopy(obs_t)
            new_goals = [deepcopy(hindsight_goal)] + new_goals

        return new_goals, rewards

    # ======================================================================= #
    #                      Auxiliary methods for HRL-CG                       #
    # ======================================================================= #

    def _setup_connected_gradients(self):
        """Create the connected gradients meta-policy optimizer."""
        raise NotImplementedError

    def _connected_gradients_update(self,
                                    obs0,
                                    actions,
                                    rewards,
                                    obs1,
                                    terminals1,
                                    update_actor=True):
        """Perform the gradient update procedure for the HRL-CG algorithm.

        This procedure is similar to update_from_batch, except it runs the
        self.cg_optimizer operation instead of the policy object's optimizer,
        and utilizes some information from the worker samples as well.

        Parameters
        ----------
        obs0 : list of array_like
            (batch_size, obs_dim) matrix of observations for every level in the
            hierarchy
        actions : list of array_like
            (batch_size, ac_dim) matrix of actions for every level in the
            hierarchy
        obs1 : list of array_like
            (batch_size, obs_dim) matrix of next step observations for every
            level in the hierarchy
        rewards : list of array_like
            (batch_size,) vector of rewards for every level in the hierarchy
        terminals1 : list of numpy bool
            (batch_size,) vector of done masks for every level in the hierarchy
        update_actor : bool
            specifies whether to update the actor policy of the meta policy.
            The critic policy is still updated if this value is set to False.

        Returns
        -------
        [float, float]
            meta-policy critic loss
        float
            meta-policy actor loss
        """
        raise NotImplementedError
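
A rough, self-contained sketch of how the "negative_distance" intrinsic reward
used above is evaluated. The helper below is a simplified stand-in for
hbaselines' `negative_distance` function, and the goal indices and array
values are made up for illustration:

import numpy as np

def negative_distance(states, goals, next_states, relative_context, offset):
    # Simplified stand-in: the negative L2 distance between the (possibly
    # relative) goal and the next state, shifted by a constant offset.
    target = states + goals if relative_context else goals
    return -np.linalg.norm(target - next_states) + offset

goal_indices = np.array([0, 1])         # hypothetical goal dimensions
states = np.array([0.0, 0.0, 5.0])      # current observation
next_states = np.array([0.4, 0.1, 5.0])
goals = np.array([1.0, 0.0])            # relative goal: move +1 along x

reward = negative_distance(
    states=states[goal_indices],
    goals=goals,
    next_states=next_states[goal_indices],
    relative_context=True,
    offset=0.0)
print(reward)  # approximately -0.61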
Example #9
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 num_levels,
                 meta_period,
                 intrinsic_reward_type,
                 intrinsic_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 connected_gradients,
                 cg_weights,
                 use_fingerprints,
                 fingerprint_range,
                 centralized_value_functions,
                 env_name="",
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the size of the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        num_levels : int
            number of levels within the hierarchy. Must be greater than 1. Two
            levels correspond to a Manager/Worker paradigm.
        meta_period : int
            meta-policy action period
        intrinsic_reward_type : str
            the reward function to be used by the worker. Must be one of:

            * "negative_distance": the negative two norm between the states and
              desired absolute or relative goals.
            * "scaled_negative_distance": similar to the negative distance
              reward where the states, goals, and next states are scaled by the
              inverse of the action space of the manager policy
            * "non_negative_distance": the negative two norm between the states
              and desired absolute or relative goals offset by the maximum goal
              space (to ensure non-negativity)
            * "scaled_non_negative_distance": similar to the non-negative
              distance reward where the states, goals, and next states are
              scaled by the inverse of the action space of the manager policy
            * "exp_negative_distance": equal to exp(-negative_distance^2). The
              result is a reward between 0 and 1. This is useful for policies
              that terminate early.
            * "scaled_exp_negative_distance": similar to the previous worker
              reward type but with states, actions, and next states that are
              scaled.
        intrinsic_reward_scale : float
            the value that the intrinsic reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the higher-level policies is
            meant to be a relative or absolute goal, i.e. specific state or
            change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        connected_gradients : bool
            whether to use the connected gradient actor update procedure for
            the higher-level policies. See: https://arxiv.org/abs/1912.02368v1
        cg_weights : float
            weights for the gradients of the loss of the lower-level policies
            with respect to the parameters of the higher-level policies. Only
            used if `connected_gradients` is set to True.
        use_fingerprints : bool
            specifies whether to add a time-dependent fingerprint to the
            observations
        fingerprint_range : (list of float, list of float)
            the low and high values for each fingerprint element, if they are
            being used
        centralized_value_functions : bool
            specifies whether to use centralized value functions
        meta_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use for the meta policies
        worker_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use for the worker policy
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(sess=sess,
                                                    ob_space=ob_space,
                                                    ac_space=ac_space,
                                                    co_space=co_space,
                                                    buffer_size=buffer_size,
                                                    batch_size=batch_size,
                                                    actor_lr=actor_lr,
                                                    critic_lr=critic_lr,
                                                    verbose=verbose,
                                                    tau=tau,
                                                    gamma=gamma,
                                                    layer_norm=layer_norm,
                                                    layers=layers,
                                                    act_fun=act_fun,
                                                    use_huber=use_huber)

        assert num_levels >= 2, "num_levels must be greater than or equal to 2"

        self.num_levels = num_levels
        self.meta_period = meta_period
        self.intrinsic_reward_type = intrinsic_reward_type
        self.intrinsic_reward_scale = intrinsic_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.connected_gradients = connected_gradients
        self.cg_weights = cg_weights
        self.use_fingerprints = use_fingerprints
        self.fingerprint_range = fingerprint_range
        self.fingerprint_dim = (len(self.fingerprint_range[0]), )
        self.centralized_value_functions = centralized_value_functions

        # Get the observation and action space of the higher level policies.
        meta_ac_space = get_meta_ac_space(ob_space=ob_space,
                                          relative_goals=relative_goals,
                                          env_name=env_name,
                                          use_fingerprints=use_fingerprints,
                                          fingerprint_dim=self.fingerprint_dim)

        # =================================================================== #
        # Step 1: Create the policies for the individual levels.              #
        # =================================================================== #

        self.policy = []

        # The policies are ordered from the highest level to lowest level
        # policies in the hierarchy.
        for i in range(num_levels):
            # Determine the appropriate parameters to use for the policy in the
            # current level.
            policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
            ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
            co_space_i = co_space if i == 0 else meta_ac_space
            ob_space_i = ob_space
            zero_fingerprint_i = i == (num_levels - 1)

            # Create the policy for the current level within its own variable
            # scope.
            with tf.compat.v1.variable_scope("level_{}".format(i)):
                self.policy.append(
                    policy_fn(
                        sess=sess,
                        ob_space=ob_space_i,
                        ac_space=ac_space_i,
                        co_space=co_space_i,
                        buffer_size=buffer_size,
                        batch_size=batch_size,
                        actor_lr=actor_lr,
                        critic_lr=critic_lr,
                        verbose=verbose,
                        tau=tau,
                        gamma=gamma,
                        layer_norm=layer_norm,
                        layers=layers,
                        act_fun=act_fun,
                        use_huber=use_huber,
                        scope="level_{}".format(i),
                        zero_fingerprint=zero_fingerprint_i,
                        fingerprint_dim=self.fingerprint_dim[0],
                        **(additional_params or {}),
                    ))

        # =================================================================== #
        # Step 2: Create attributes for the replay buffer.                    #
        # =================================================================== #

        # Create the replay buffer.
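        # Note (a hedged reading, not from the original code): each entry of
        # HierReplayBuffer spans a full meta-period of transitions, which is
        # presumably why the number of entries is buffer_size / meta_period,
        # keeping the total number of stored environment steps close to
        # buffer_size.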
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            obs_dim=ob_space.shape[0],
            ac_dim=ac_space.shape[0],
            co_dim=None if co_space is None else co_space.shape[0],
            goal_dim=meta_ac_space.shape[0],
            num_levels=num_levels)

        # current action by the meta-level policies
        self._meta_action = [None for _ in range(num_levels - 1)]

        # a list of all the actions performed by each level in the hierarchy,
        # ordered from highest to lowest level policy
        self._actions = None

        # a list of the rewards (intrinsic or other) experienced by every level
        # in the hierarchy, ordered from highest to lowest level policy
        self._rewards = None

        # a list of observations spanning the dilated horizon chosen for the
        # highest level policy
        self._observations = None

        # the first and last contextual term
        self._contexts = None

        # a list of done masks at every time step
        self._dones = None

        # Collect the state indices for the intrinsic rewards.
        self.goal_indices = get_state_indices(
            ob_space=ob_space,
            env_name=env_name,
            use_fingerprints=use_fingerprints,
            fingerprint_dim=self.fingerprint_dim)

        # Define the intrinsic reward function.
        if intrinsic_reward_type in [
                "negative_distance", "scaled_negative_distance",
                "non_negative_distance", "scaled_non_negative_distance",
                "exp_negative_distance", "scaled_exp_negative_distance"
        ]:
            # Offset the distance measure by the maximum possible distance to
            # ensure non-negativity.
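            # (A hedged reading: this offset is the diagonal of the
            # meta-action-space box, which upper-bounds the distance term as
            # long as goals and the indexed states remain inside that box.)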
            if "non_negative" in intrinsic_reward_type:
                offset = np.sqrt(
                    np.sum(np.square(meta_ac_space.high - meta_ac_space.low),
                           -1))
            else:
                offset = 0

            # Divide the indexed states and goals by half the meta-action
            # space range when a scaled variant of the intrinsic reward is
            # requested.
            if intrinsic_reward_type.startswith("scaled"):
                scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
            else:
                scale = 1

            def intrinsic_reward_fn(states, goals, next_states):
                return negative_distance(
                    states=states[self.goal_indices] / scale,
                    goals=goals / scale,
                    next_states=next_states[self.goal_indices] / scale,
                    relative_context=relative_goals,
                    offset=0.0) + offset
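            # Worked example (a hedged sketch, assuming negative_distance
            # returns the negative Euclidean distance between the goal and
            # the indexed next state): with scale=1, offset=0 and
            # relative_goals=False, a goal of [3, 4] and an indexed next
            # state of [0, 0] would give an intrinsic reward of
            # -sqrt(3**2 + 4**2) = -5, rising toward 0 as the goal is reached.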

            # Perform the exponential and squashing operations to keep the
            # intrinsic reward between 0 and 1.
            if "exp" in intrinsic_reward_type:

                def exp_intrinsic_reward_fn(states, goals, next_states):
                    return np.exp(
                        -1 *
                        intrinsic_reward_fn(states, goals, next_states)**2)

                self.intrinsic_reward_fn = exp_intrinsic_reward_fn
            else:
                self.intrinsic_reward_fn = intrinsic_reward_fn
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                intrinsic_reward_type))

        # =================================================================== #
        # Step 3: Create algorithm-specific features.                         #
        # =================================================================== #

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn
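        # Worked example for the relative_goals=True case: if obs0 = [0] and
        # goal = [5] (an absolute target of 0 + 5 = 5), then after moving to
        # obs1 = [2] the transitioned goal becomes 0 + 5 - 2 = [3], which
        # still points at the same absolute position 2 + 3 = 5.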

        # Utility method for indexing the goal out of an observation variable.
        self.crop_to_goal = lambda g: tf.gather(
            g,
            tf.tile(tf.expand_dims(np.array(self.goal_indices), 0),
                    [self.batch_size, 1]),
            batch_dims=1,
            axis=1)
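        # The gather above maps a (batch_size, obs_dim) tensor to a
        # (batch_size, len(goal_indices)) tensor by selecting the goal-related
        # entries of every observation in the batch.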

        if self.connected_gradients:
            self._setup_connected_gradients()
Example No. 10
class TestHierReplayBuffer(unittest.TestCase):
    """Tests for the HierReplayBuffer object."""
    def setUp(self):
        self.replay_buffer = HierReplayBuffer(buffer_size=2,
                                              batch_size=1,
                                              meta_period=1,
                                              meta_obs_dim=2,
                                              meta_ac_dim=3,
                                              worker_obs_dim=4,
                                              worker_ac_dim=5)
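        # Note: this example exercises a two-level HierReplayBuffer variant
        # whose constructor takes separate meta-*/worker-* dimensions rather
        # than the multi-level (num_levels) signature used above.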

    def tearDown(self):
        del self.replay_buffer

    def test_init(self):
        """Validate that all the attributes were initialize properly."""
        self.assertTupleEqual(self.replay_buffer.meta_obs0.shape, (1, 2))
        self.assertTupleEqual(self.replay_buffer.meta_obs1.shape, (1, 2))
        self.assertTupleEqual(self.replay_buffer.meta_act.shape, (1, 3))
        self.assertTupleEqual(self.replay_buffer.meta_rew.shape, (1, ))
        self.assertTupleEqual(self.replay_buffer.meta_done.shape, (1, ))
        self.assertTupleEqual(self.replay_buffer.worker_obs0.shape, (1, 4))
        self.assertTupleEqual(self.replay_buffer.worker_obs1.shape, (1, 4))
        self.assertTupleEqual(self.replay_buffer.worker_act.shape, (1, 5))
        self.assertTupleEqual(self.replay_buffer.worker_rew.shape, (1, ))
        self.assertTupleEqual(self.replay_buffer.worker_done.shape, (1, ))

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods the replay buffer."""
        """Test the `add` and `sample` methods the replay buffer."""
        # Add an element.
        self.replay_buffer.add(
            obs_t=[np.array([0, 0, 0, 0]),
                   np.array([1, 1, 1, 1])],
            goal_t=np.array([2, 2, 2]),
            action_t=[np.array([3, 3, 3, 3, 3])],
            reward_t=[4],
            done=[False],
            meta_obs_t=(np.array([5, 5]), np.array([6, 6])),
            meta_reward_t=7,
        )

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add an element.
        self.replay_buffer.add(
            obs_t=[np.array([0, 0, 0, 0]),
                   np.array([1, 1, 1, 1])],
            goal_t=np.array([2, 2, 2]),
            action_t=[np.array([3, 3, 3, 3, 3])],
            reward_t=[4],
            done=[False],
            meta_obs_t=(np.array([5, 5]), np.array([6, 6])),
            meta_reward_t=7,
        )

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        meta_obs0, meta_obs1, meta_act, meta_rew, meta_done, worker_obs0, \
            worker_obs1, worker_act, worker_rew, worker_done, _ = \
            self.replay_buffer.sample()
        np.testing.assert_array_almost_equal(meta_obs0, [[5, 5]])
        np.testing.assert_array_almost_equal(meta_obs1, [[6, 6]])
        np.testing.assert_array_almost_equal(meta_act, [[2, 2, 2]])
        np.testing.assert_array_almost_equal(meta_rew, [7])
        np.testing.assert_array_almost_equal(meta_done, [0])
        np.testing.assert_array_almost_equal(worker_obs0, [[0, 0, 0, 0]])
        np.testing.assert_array_almost_equal(worker_obs1, [[1, 1, 1, 1]])
        np.testing.assert_array_almost_equal(worker_act, [[3, 3, 3, 3, 3]])
        np.testing.assert_array_almost_equal(worker_rew, [4])
        np.testing.assert_array_almost_equal(worker_done, [0])