Example #1
    def test_state_indices(self):
        # non-relevant parameters for most tests
        params = dict(
            ob_space=Box(-1, 1, shape=(2, ), dtype=np.float32),
            use_fingerprints=False,
            fingerprint_dim=1,
        )

        # test for AntMaze
        self.assertListEqual(
            get_state_indices(env_name="AntMaze", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for AntGather
        self.assertListEqual(
            get_state_indices(env_name="AntGather", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for AntPush
        self.assertListEqual(
            get_state_indices(env_name="AntPush", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for AntFall
        self.assertListEqual(
            get_state_indices(env_name="AntFall", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for UR5
        self.assertIsNone(get_state_indices(env_name="UR5", **params))

        # test for Pendulum
        self.assertListEqual(get_state_indices(env_name="Pendulum", **params),
                             [0, 2])

        # test for ring
        self.assertListEqual(get_state_indices(env_name="ring", **params),
                             [0, 5, 10, 15, 20])

        # test for ring_small
        self.assertListEqual(
            get_state_indices(env_name="ring_small", **params), [0])

        # test for merge0
        self.assertListEqual(get_state_indices(env_name="merge0", **params),
                             [0, 5, 10, 15, 20])

        # test for merge1
        self.assertListEqual(
            get_state_indices(env_name="merge1", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60])

        # test for merge2
        self.assertListEqual(
            get_state_indices(env_name="merge2", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80])

        # test for figureeight0
        self.assertListEqual(
            get_state_indices(env_name="figureeight0", **params), [13])

        # test for figureeight1
        self.assertListEqual(
            get_state_indices(env_name="figureeight1", **params),
            [1, 3, 5, 7, 9, 11, 13])

        # test for figureeight2
        self.assertListEqual(
            get_state_indices(env_name="figureeight2", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

        # test for highway-single
        self.assertListEqual(
            get_state_indices(env_name="highway-single", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

        # test for Point2DEnv
        self.assertListEqual(
            get_state_indices(env_name="Point2DEnv", **params), [0, 1])

        # test for Point2DImageEnv
        self.assertListEqual(
            get_state_indices(env_name="Point2DImageEnv", **params),
            [1024, 1025])
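
For reference, here is a minimal standalone sketch of calling get_state_indices directly, outside of a test case. The import path is an assumption and may differ between hbaselines versions; the expected output follows from the assertions above.

import numpy as np
from gym.spaces import Box

# Assumed import location; adjust to wherever get_state_indices lives in your
# version of hbaselines.
from hbaselines.utils.env_util import get_state_indices

# Only env_name determines the result for most environments; the remaining
# arguments mirror the ones used in the test above.
indices = get_state_indices(
    env_name="ring",
    ob_space=Box(-1, 1, shape=(2, ), dtype=np.float32),
    use_fingerprints=False,
    fingerprint_dim=1,
)
print(indices)  # per the test above: [0, 5, 10, 15, 20]
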
Example #2
    def test_state_indices(self):
        # non-relevant parameters for most tests
        params = dict(
            ob_space=Box(-1, 1, shape=(2, )),
            use_fingerprints=False,
            fingerprint_dim=1,
        )

        # test for AntMaze
        self.assertListEqual(
            get_state_indices(env_name="AntMaze", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for AntGather
        self.assertListEqual(
            get_state_indices(env_name="AntGather", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for AntPush
        self.assertListEqual(
            get_state_indices(env_name="AntPush", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for AntFall
        self.assertListEqual(
            get_state_indices(env_name="AntFall", **params),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

        # test for UR5
        self.assertIsNone(get_state_indices(env_name="UR5", **params))

        # test for Pendulum
        self.assertListEqual(get_state_indices(env_name="Pendulum", **params),
                             [0, 2])

        # test for ring-v0
        self.assertListEqual(get_state_indices(env_name="ring-v0", **params),
                             [0, 5, 10, 15, 20])

        # test for ring-v1
        self.assertListEqual(get_state_indices(env_name="ring-v1", **params),
                             [0, 5, 10, 15, 20])

        # test for ring-v2
        self.assertListEqual(get_state_indices(env_name="ring-v2", **params),
                             [0, 5, 10, 15, 20])

        # test for ring-imitation
        self.assertListEqual(
            get_state_indices(env_name="ring-imitation", **params),
            [0, 5, 10, 15, 20])

        # test for merge-v0
        self.assertListEqual(get_state_indices(env_name="merge-v0", **params),
                             [0, 5, 10, 15, 20])

        # test for merge-v1
        self.assertListEqual(
            get_state_indices(env_name="merge-v1", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60])

        # test for merge-v2
        self.assertListEqual(
            get_state_indices(env_name="merge-v2", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80])

        # test for highway-v0
        self.assertListEqual(
            get_state_indices(env_name="highway-v0", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

        # test for highway-v1
        self.assertListEqual(
            get_state_indices(env_name="highway-v1", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

        # test for highway-v2
        self.assertListEqual(
            get_state_indices(env_name="highway-v2", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

        # test for highway-imitation
        self.assertListEqual(
            get_state_indices(env_name="highway-imitation", **params),
            [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

        # test for i210-v0
        self.assertListEqual(get_state_indices(env_name="i210-v0", **params), [
            0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
            85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
            155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215,
            220, 225, 230, 235, 240, 245
        ])

        # test for i210-v1
        self.assertListEqual(get_state_indices(env_name="i210-v1", **params), [
            0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
            85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
            155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215,
            220, 225, 230, 235, 240, 245
        ])

        # test for i210-v2
        self.assertListEqual(get_state_indices(env_name="i210-v2", **params), [
            0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
            85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
            155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215,
            220, 225, 230, 235, 240, 245
        ])

        # test for Point2DEnv
        self.assertListEqual(
            get_state_indices(env_name="Point2DEnv", **params), [0, 1])

        # test for Point2DImageEnv
        self.assertListEqual(
            get_state_indices(env_name="Point2DImageEnv", **params),
            [1024, 1025])
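
The assertions above are highly repetitive. A hypothetical table-driven variant of a few of these checks, using unittest's subTest and assuming the same module-level imports (Box, get_state_indices), could look like the following sketch.

    def test_state_indices_table_driven(self):
        # Hypothetical condensed variant of a few of the checks above.
        params = dict(
            ob_space=Box(-1, 1, shape=(2, )),
            use_fingerprints=False,
            fingerprint_dim=1,
        )
        expected = {
            "ring-v0": [0, 5, 10, 15, 20],
            "merge-v0": [0, 5, 10, 15, 20],
            "highway-v0": [0, 5, 10, 15, 20, 25, 30, 35, 40, 45],
            "Point2DEnv": [0, 1],
        }
        for env_name, indices in expected.items():
            with self.subTest(env_name=env_name):
                self.assertListEqual(
                    get_state_indices(env_name=env_name, **params), indices)
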
Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 use_huber,
                 l2_penalty,
                 model_params,
                 num_levels,
                 meta_period,
                 intrinsic_reward_type,
                 intrinsic_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 cooperative_gradients,
                 cg_weights,
                 cg_delta,
                 pretrain_worker,
                 pretrain_path,
                 pretrain_ckpt,
                 total_steps,
                 scope=None,
                 env_name="",
                 num_envs=1,
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        l2_penalty : float
            L2 regularization penalty. This is applied to the policy network.
        model_params : dict
            dictionary of model-specific parameters. See parent class.
        num_levels : int
            number of levels within the hierarchy. Must be greater than 1. Two
            levels correspond to a Manager/Worker paradigm.
        meta_period : int
            meta-policy action period
        intrinsic_reward_type : str
            the reward function to be used by the worker. Must be one of:

            * "negative_distance": the negative two norm between the states and
              desired absolute or relative goals.
            * "scaled_negative_distance": similar to the negative distance
              reward where the states, goals, and next states are scaled by the
              inverse of the action space of the manager policy
            * "non_negative_distance": the negative two norm between the states
              and desired absolute or relative goals offset by the maximum goal
              space (to ensure non-negativity)
            * "scaled_non_negative_distance": similar to the non-negative
              distance reward where the states, goals, and next states are
              scaled by the inverse of the action space of the manager policy
            * "exp_negative_distance": equal to exp(-negative_distance^2). The
              result is a reward between 0 and 1. This is useful for policies
              that terminate early.
            * "scaled_exp_negative_distance": similar to the previous worker
              reward type but with states, actions, and next states that are
              scaled.
        intrinsic_reward_scale : float
            the value that the intrinsic reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the higher-level policies is
            meant to be a relative or absolute goal, i.e. specific state or
            change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        cooperative_gradients : bool
            whether to use the cooperative gradient update procedure for the
            higher-level policy. See: https://arxiv.org/abs/1912.02368v1
        cg_weights : float
            weights for the gradients of the loss of the lower-level policies
            with respect to the parameters of the higher-level policies. Only
            used if `cooperative_gradients` is set to True.
        cg_delta : float
            the desired lower-level expected returns. If set to None, a fixed
            Lagrangian specified by cg_weights is used instead. Only used if
            `cooperative_gradients` is set to True.
        pretrain_worker : bool
            specifies whether you are pre-training the lower-level policies.
            Actions by the high-level policy are randomly sampled from the
            action space.
        pretrain_path : str or None
            path to the pre-trained worker policy checkpoints
        pretrain_ckpt : int or None
            checkpoint number to use within the worker policy path. If set to
            None, the most recent checkpoint is used.
        total_steps : int
            Total number of timesteps used during training. Used by a subset of
            algorithms.
        meta_policy : type [ hbaselines.base_policies.Policy ]
            the policy model to use for the meta policies
        worker_policy : type [ hbaselines.base_policies.Policy ]
            the policy model to use for the worker policy
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(
            sess=sess,
            ob_space=ob_space,
            ac_space=ac_space,
            co_space=co_space,
            verbose=verbose,
            l2_penalty=l2_penalty,
            model_params=model_params,
            num_envs=num_envs,
        )

        assert num_levels >= 2, "num_levels must be greater than or equal to 2"

        self.num_levels = num_levels
        self.meta_period = meta_period
        self.intrinsic_reward_type = intrinsic_reward_type
        self.intrinsic_reward_scale = intrinsic_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.cooperative_gradients = cooperative_gradients
        self.cg_weights = cg_weights
        self.cg_delta = cg_delta
        self.pretrain_worker = pretrain_worker
        self.pretrain_path = pretrain_path
        self.pretrain_ckpt = pretrain_ckpt
        self.total_steps = total_steps

        # Get the observation and action space of the higher level policies.
        meta_ac_space = get_meta_ac_space(
            ob_space=ob_space,
            relative_goals=relative_goals,
            env_name=env_name,
        )

        # =================================================================== #
        # Step 1: Create the policies for the individual levels.              #
        # =================================================================== #

        self.policy = []

        # The policies are ordered from the highest level to lowest level
        # policies in the hierarchy.
        for i in range(num_levels):
            # Determine the appropriate parameters to use for the policy in the
            # current level.
            policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
            ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
            co_space_i = co_space if i == 0 else meta_ac_space
            ob_space_i = ob_space

            # Build the policy for this level under its own variable scope.
            with tf.compat.v1.variable_scope("level_{}".format(i)):
                # Compute the scope name based on any outer scope term.
                scope_i = "level_{}".format(i)
                if scope is not None:
                    scope_i = "{}/{}".format(scope, scope_i)

                # Lower-level policies always ignore image inputs and keep all
                # flat channels; the provided ignore_flat_channels and
                # ignore_image settings only apply to the top-level policy.
                model_params_i = model_params.copy()
                model_params_i.update({
                    "ignore_flat_channels":
                    model_params["ignore_flat_channels"] if i < 1 else [],
                    "ignore_image":
                    model_params["ignore_image"] if i < 1 else True,
                })

                # Create the next policy.
                self.policy.append(
                    policy_fn(
                        sess=sess,
                        ob_space=ob_space_i,
                        ac_space=ac_space_i,
                        co_space=co_space_i,
                        buffer_size=buffer_size,
                        batch_size=batch_size,
                        actor_lr=actor_lr,
                        critic_lr=critic_lr,
                        verbose=verbose,
                        tau=tau,
                        gamma=gamma,
                        use_huber=use_huber,
                        l2_penalty=l2_penalty,
                        model_params=model_params_i,
                        scope=scope_i,
                        **(additional_params or {}),
                    ))

        # =================================================================== #
        # Step 2: Create attributes for the replay buffer.                    #
        # =================================================================== #

        # Create the replay buffer.
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            obs_dim=ob_space.shape[0],
            ac_dim=ac_space.shape[0],
            co_dim=None if co_space is None else co_space.shape[0],
            goal_dim=meta_ac_space.shape[0],
            num_levels=num_levels)

        # current action by the meta-level policies
        self.meta_action = [[None for _ in range(num_levels - 1)]
                            for _ in range(num_envs)]

        # a list of all the actions performed by each level in the hierarchy,
        # ordered from highest to lowest level policy. A separate element is
        # used for each environment.
        self._actions = [[[] for _ in range(self.num_levels)]
                         for _ in range(num_envs)]

        # a list of the rewards (intrinsic or other) experienced by every level
        # in the hierarchy, ordered from highest to lowest level policy. A
        # separate element is used for each environment.
        self._rewards = [[[0]] + [[] for _ in range(self.num_levels - 1)]
                         for _ in range(num_envs)]

        # a list of observations that stretch as long as the dilated horizon
        # chosen for the highest level policy. A separate element is used for
        # each environment.
        self._observations = [[] for _ in range(num_envs)]

        # the first and last contextual term. A separate element is used for
        # each environment.
        self._contexts = [[] for _ in range(num_envs)]

        # a list of done masks at every time step. A separate element is used
        # for each environment.
        self._dones = [[] for _ in range(num_envs)]

        # Collect the state indices for the intrinsic rewards.
        self.goal_indices = get_state_indices(ob_space, env_name)

        # Define the intrinsic reward function.
        if intrinsic_reward_type in [
                "negative_distance", "scaled_negative_distance",
                "non_negative_distance", "scaled_non_negative_distance",
                "exp_negative_distance", "scaled_exp_negative_distance"
        ]:
            # Offset the distance measure by the maximum possible distance to
            # ensure non-negativity.
            if "non_negative" in intrinsic_reward_type:
                offset = np.sqrt(
                    np.sum(np.square(meta_ac_space.high - meta_ac_space.low),
                           -1))
            else:
                offset = 0

            # Scale the outputs from the state by the meta-action space if you
            # wish to scale the worker reward.
            if intrinsic_reward_type.startswith("scaled"):
                scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
            else:
                scale = 1

            def intrinsic_reward_fn(states, goals, next_states):
                return negative_distance(
                    states=states[self.goal_indices] / scale,
                    goals=goals / scale,
                    next_states=next_states[self.goal_indices] / scale,
                    relative_context=relative_goals,
                    offset=0.0,
                ) + offset

            # Perform the exponential and squashing operations to keep the
            # intrinsic reward between 0 and 1.
            if "exp" in intrinsic_reward_type:

                def exp_intrinsic_reward_fn(states, goals, next_states):
                    # Temporary heuristic: normalize the reward by a fraction
                    # of the meta action-space span (sum of squared ranges)
                    # before squashing.
                    span = sum(
                        np.square(self.policy[0].ac_space.high -
                                  self.policy[0].ac_space.low))
                    rew = intrinsic_reward_fn(states, goals, next_states)
                    return np.exp(-(rew / (span / 40))**2)

                self.intrinsic_reward_fn = exp_intrinsic_reward_fn
            else:
                self.intrinsic_reward_fn = intrinsic_reward_fn
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                intrinsic_reward_type))

        # =================================================================== #
        # Step 3: Create algorithm-specific features.                         #
        # =================================================================== #

        # the number of get_action calls that have been performed. This is used
        # when pretraining the worker to incrementally train different levels
        # of the policy.
        self._steps = 0

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn

        if self.cooperative_gradients:
            if scope is None:
                self._setup_cooperative_gradients()
            else:
                with tf.compat.v1.variable_scope(scope):
                    self._setup_cooperative_gradients()
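
As a rough standalone illustration of the reward shapes listed in the docstring (this is not the library's negative_distance helper, and all values below are made up), the distance-based intrinsic rewards reduce to a two-norm over the goal-indexed state dimensions:

import numpy as np

# Toy values chosen purely for illustration.
goal_indices = [0, 1, 2]
next_state = np.array([0.2, -0.1, 0.4, 9.9])  # last entry is not goal-relevant
goal = np.array([0.0, 0.0, 0.5])

# "negative_distance": -||next_state[goal_indices] - goal||_2
neg_dist = -np.linalg.norm(next_state[goal_indices] - goal)

# "exp_negative_distance": exp(-negative_distance^2), squashed into (0, 1]
exp_reward = np.exp(-neg_dist ** 2)

print(neg_dist, exp_reward)
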
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 co_space,
                 buffer_size,
                 batch_size,
                 actor_lr,
                 critic_lr,
                 verbose,
                 tau,
                 gamma,
                 layer_norm,
                 layers,
                 act_fun,
                 use_huber,
                 num_levels,
                 meta_period,
                 intrinsic_reward_type,
                 intrinsic_reward_scale,
                 relative_goals,
                 off_policy_corrections,
                 hindsight,
                 subgoal_testing_rate,
                 connected_gradients,
                 cg_weights,
                 use_fingerprints,
                 fingerprint_range,
                 centralized_value_functions,
                 env_name="",
                 meta_policy=None,
                 worker_policy=None,
                 additional_params=None):
        """Instantiate the goal-conditioned hierarchical policy.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            the current TensorFlow session
        ob_space : gym.spaces.*
            the observation space of the environment
        ac_space : gym.spaces.*
            the action space of the environment
        co_space : gym.spaces.*
            the context space of the environment
        buffer_size : int
            the max number of transitions to store
        batch_size : int
            SGD batch size
        actor_lr : float
            actor learning rate
        critic_lr : float
            critic learning rate
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        tau : float
            target update rate
        gamma : float
            discount factor
        layer_norm : bool
            enable layer normalisation
        layers : list of int or None
            the size of the neural network for the policy
        act_fun : tf.nn.*
            the activation function to use in the neural network
        use_huber : bool
            specifies whether to use the huber distance function as the loss
            for the critic. If set to False, the mean-squared error metric is
            used instead
        num_levels : int
            number of levels within the hierarchy. Must be greater than 1. Two
            levels correspond to a Manager/Worker paradigm.
        meta_period : int
            meta-policy action period
        intrinsic_reward_type : str
            the reward function to be used by the worker. Must be one of:

            * "negative_distance": the negative two norm between the states and
              desired absolute or relative goals.
            * "scaled_negative_distance": similar to the negative distance
              reward where the states, goals, and next states are scaled by the
              inverse of the action space of the manager policy
            * "non_negative_distance": the negative two norm between the states
              and desired absolute or relative goals offset by the maximum goal
              space (to ensure non-negativity)
            * "scaled_non_negative_distance": similar to the non-negative
              distance reward where the states, goals, and next states are
              scaled by the inverse of the action space of the manager policy
            * "exp_negative_distance": equal to exp(-negative_distance^2). The
              result is a reward between 0 and 1. This is useful for policies
              that terminate early.
            * "scaled_exp_negative_distance": similar to the previous worker
              reward type but with states, actions, and next states that are
              scaled.
        intrinsic_reward_scale : float
            the value that the intrinsic reward should be scaled by
        relative_goals : bool
            specifies whether the goal issued by the higher-level policies is
            meant to be a relative or absolute goal, i.e. specific state or
            change in state
        off_policy_corrections : bool
            whether to use off-policy corrections during the update procedure.
            See: https://arxiv.org/abs/1805.08296
        hindsight : bool
            whether to include hindsight action and goal transitions in the
            replay buffer. See: https://arxiv.org/abs/1712.00948
        subgoal_testing_rate : float
            rate at which the original (non-hindsight) sample is stored in the
            replay buffer as well. Used only if `hindsight` is set to True.
        connected_gradients : bool
            whether to use the connected gradient update procedure for the
            higher-level policy. See: https://arxiv.org/abs/1912.02368v1
        cg_weights : float
            weights for the gradients of the loss of the lower-level policies
            with respect to the parameters of the higher-level policies. Only
            used if `connected_gradients` is set to True.
        use_fingerprints : bool
            specifies whether to add a time-dependent fingerprint to the
            observations
        fingerprint_range : (list of float, list of float)
            the low and high values for each fingerprint element, if they are
            being used
        centralized_value_functions : bool
            specifies whether to use centralized value functions
        meta_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use for the meta policies
        worker_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use for the worker policy
        additional_params : dict
            additional algorithm-specific policy parameters. Used internally by
            the class when instantiating other (child) policies.
        """
        super(GoalConditionedPolicy, self).__init__(sess=sess,
                                                    ob_space=ob_space,
                                                    ac_space=ac_space,
                                                    co_space=co_space,
                                                    buffer_size=buffer_size,
                                                    batch_size=batch_size,
                                                    actor_lr=actor_lr,
                                                    critic_lr=critic_lr,
                                                    verbose=verbose,
                                                    tau=tau,
                                                    gamma=gamma,
                                                    layer_norm=layer_norm,
                                                    layers=layers,
                                                    act_fun=act_fun,
                                                    use_huber=use_huber)

        assert num_levels >= 2, "num_levels must be greater than or equal to 2"

        self.num_levels = num_levels
        self.meta_period = meta_period
        self.intrinsic_reward_type = intrinsic_reward_type
        self.intrinsic_reward_scale = intrinsic_reward_scale
        self.relative_goals = relative_goals
        self.off_policy_corrections = off_policy_corrections
        self.hindsight = hindsight
        self.subgoal_testing_rate = subgoal_testing_rate
        self.connected_gradients = connected_gradients
        self.cg_weights = cg_weights
        self.use_fingerprints = use_fingerprints
        self.fingerprint_range = fingerprint_range
        self.fingerprint_dim = (len(self.fingerprint_range[0]), )
        self.centralized_value_functions = centralized_value_functions

        # Get the observation and action space of the higher level policies.
        meta_ac_space = get_meta_ac_space(ob_space=ob_space,
                                          relative_goals=relative_goals,
                                          env_name=env_name,
                                          use_fingerprints=use_fingerprints,
                                          fingerprint_dim=self.fingerprint_dim)

        # =================================================================== #
        # Step 1: Create the policies for the individual levels.              #
        # =================================================================== #

        self.policy = []

        # The policies are ordered from the highest level to lowest level
        # policies in the hierarchy.
        for i in range(num_levels):
            # Determine the appropriate parameters to use for the policy in the
            # current level.
            policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
            ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
            co_space_i = co_space if i == 0 else meta_ac_space
            ob_space_i = ob_space
            zero_fingerprint_i = i == (num_levels - 1)

            # Build the policy for this level under its own variable scope.
            with tf.compat.v1.variable_scope("level_{}".format(i)):
                self.policy.append(
                    policy_fn(
                        sess=sess,
                        ob_space=ob_space_i,
                        ac_space=ac_space_i,
                        co_space=co_space_i,
                        buffer_size=buffer_size,
                        batch_size=batch_size,
                        actor_lr=actor_lr,
                        critic_lr=critic_lr,
                        verbose=verbose,
                        tau=tau,
                        gamma=gamma,
                        layer_norm=layer_norm,
                        layers=layers,
                        act_fun=act_fun,
                        use_huber=use_huber,
                        scope="level_{}".format(i),
                        zero_fingerprint=zero_fingerprint_i,
                        fingerprint_dim=self.fingerprint_dim[0],
                        **(additional_params or {}),
                    ))

        # =================================================================== #
        # Step 2: Create attributes for the replay buffer.                    #
        # =================================================================== #

        # Create the replay buffer.
        self.replay_buffer = HierReplayBuffer(
            buffer_size=int(buffer_size / meta_period),
            batch_size=batch_size,
            meta_period=meta_period,
            obs_dim=ob_space.shape[0],
            ac_dim=ac_space.shape[0],
            co_dim=None if co_space is None else co_space.shape[0],
            goal_dim=meta_ac_space.shape[0],
            num_levels=num_levels)

        # current action by the meta-level policies
        self._meta_action = [None for _ in range(num_levels - 1)]

        # a list of all the actions performed by each level in the hierarchy,
        # ordered from highest to lowest level policy
        self._actions = None

        # a list of the rewards (intrinsic or other) experienced by every level
        # in the hierarchy, ordered from highest to lowest level policy
        self._rewards = None

        # a list of observations that stretch as long as the dilated horizon
        # chosen for the highest level policy
        self._observations = None

        # the first and last contextual term
        self._contexts = None

        # a list of done masks at every time step
        self._dones = None

        # Collect the state indices for the intrinsic rewards.
        self.goal_indices = get_state_indices(
            ob_space=ob_space,
            env_name=env_name,
            use_fingerprints=use_fingerprints,
            fingerprint_dim=self.fingerprint_dim)

        # Define the intrinsic reward function.
        if intrinsic_reward_type in [
                "negative_distance", "scaled_negative_distance",
                "non_negative_distance", "scaled_non_negative_distance",
                "exp_negative_distance", "scaled_exp_negative_distance"
        ]:
            # Offset the distance measure by the maximum possible distance to
            # ensure non-negativity.
            if "non_negative" in intrinsic_reward_type:
                offset = np.sqrt(
                    np.sum(np.square(meta_ac_space.high - meta_ac_space.low),
                           -1))
            else:
                offset = 0

            # Scale the outputs from the state by the meta-action space if you
            # wish to scale the worker reward.
            if intrinsic_reward_type.startswith("scaled"):
                scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
            else:
                scale = 1

            def intrinsic_reward_fn(states, goals, next_states):
                return negative_distance(
                    states=states[self.goal_indices] / scale,
                    goals=goals / scale,
                    next_states=next_states[self.goal_indices] / scale,
                    relative_context=relative_goals,
                    offset=0.0) + offset

            # Perform the exponential and squashing operations to keep the
            # intrinsic reward between 0 and 1.
            if "exp" in intrinsic_reward_type:

                def exp_intrinsic_reward_fn(states, goals, next_states):
                    return np.exp(
                        -1 *
                        intrinsic_reward_fn(states, goals, next_states)**2)

                self.intrinsic_reward_fn = exp_intrinsic_reward_fn
            else:
                self.intrinsic_reward_fn = intrinsic_reward_fn
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                intrinsic_reward_type))

        # =================================================================== #
        # Step 3: Create algorithm-specific features.                         #
        # =================================================================== #

        # a fixed goal transition function for the meta-actions in between meta
        # periods. This is used when relative_goals is set to True in order to
        # maintain a fixed absolute position of the goal.
        if relative_goals:

            def goal_transition_fn(obs0, goal, obs1):
                return obs0 + goal - obs1
        else:

            def goal_transition_fn(obs0, goal, obs1):
                return goal

        self.goal_transition_fn = goal_transition_fn

        # Utility method for indexing the goal out of an observation variable.
        self.crop_to_goal = lambda g: tf.gather(
            g,
            tf.tile(tf.expand_dims(np.array(self.goal_indices), 0),
                    [self.batch_size, 1]),
            batch_dims=1,
            axis=1)

        if self.connected_gradients:
            self._setup_connected_gradients()
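
Finally, a small numeric check (with illustrative values only) of the relative-goal transition used in both constructors above: re-issuing obs0 + goal - obs1 at every step keeps the implied absolute target obs0 + goal fixed as the agent moves.

import numpy as np

obs0 = np.array([1.0, 2.0])   # observation when the goal was issued
goal = np.array([0.5, -1.0])  # relative goal: desired change in state
obs1 = np.array([1.2, 1.5])   # observation one step later

# goal_transition_fn for relative_goals=True
new_goal = obs0 + goal - obs1

# The absolute target is unchanged: obs0 + goal == obs1 + new_goal
assert np.allclose(obs0 + goal, obs1 + new_goal)
print(new_goal)  # [ 0.3 -0.5]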