Example #1
    def __init__(self, input_dim):
        """Initializes a TD3 critic network built from two DDPG critics.

        Args:
          input_dim: size of the input space.
        """
        super(CriticTD3, self).__init__()
        self.critic1 = CriticDDPG(input_dim)
        self.critic2 = CriticDDPG(input_dim)
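The two heads exist to support TD3's clipped double Q-learning: the Bellman target is built from the pointwise minimum of the two Q-estimates, which curbs value overestimation. Below is a minimal standalone sketch of that rule; the tensors are placeholders rather than outputs of CriticTD3, whose call method is not shown in this excerpt.

import tensorflow as tf

# Placeholder Q-estimates standing in for the outputs of critic1 and critic2.
q1 = tf.constant([1.2, 0.7])
q2 = tf.constant([1.0, 0.9])
rewards = tf.constant([0.1, 0.0])
not_done = tf.constant([1.0, 1.0])  # 1.0 for non-terminal transitions
discount = 0.99

# Clipped double Q-learning: use the pointwise minimum of the two estimates.
target_q = rewards + discount * not_done * tf.minimum(q1, q2)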
Example #2
    def __init__(self,
                 input_dim,
                 action_dim,
                 discount=0.99,
                 tau=0.005,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 use_td3=True,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 policy_update_freq=2,
                 get_reward=None,
                 use_absorbing_state=False):
        """Initializes actor, critic, target networks and optimizers.

    The class handles absorbing state properly. Absorbing state corresponds to
    a state which a policy gets in after reaching a goal state and stays there
    forever. For most RL problems, we can just assign 0 to all reward after
    the goal. But for GAIL, we need to have an actual absorbing state.

    Args:
       input_dim: size of the observation space.
       action_dim: size of the action space.
       discount: reward discount.
       tau: target networks update coefficient.
       actor_lr: actor learning rate.
       critic_lr: critic learning rate.
       use_td3: whether to use standard ddpg or td3.
       policy_noise: std of gaussian added to critic action input.
       policy_noise_clip: clip added gaussian noise.
       policy_update_freq: perform policy update once per n steps.
       get_reward: a function that given (s,a,s') returns a reward.
       use_absorbing_state: whether to use an absorbing state or not.
    """
        self.discount = discount
        self.tau = tau

        self.use_td3 = use_td3
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.policy_update_freq = policy_update_freq
        self.get_reward = get_reward
        self.use_absorbing_state = use_absorbing_state

        with tf.variable_scope('actor'):
            self.actor = Actor(input_dim, action_dim)
            with tf.variable_scope('target'):
                self.actor_target = Actor(input_dim, action_dim)

            self.initial_actor_lr = actor_lr
            self.actor_lr = contrib_eager_python_tfe.Variable(actor_lr,
                                                              name='lr')
            self.actor_step = contrib_eager_python_tfe.Variable(0,
                                                                dtype=tf.int64,
                                                                name='step')
            self.actor_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.actor_lr)
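            # Create the Adam slot variables (moment accumulators) for the
            # actor weights up front, rather than lazily at the first update.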
            self.actor_optimizer._create_slots(self.actor.variables)  # pylint: disable=protected-access

        soft_update(self.actor.variables, self.actor_target.variables)

        with tf.variable_scope('critic'):
            if self.use_td3:
                self.critic = CriticTD3(input_dim + action_dim)
                with tf.variable_scope('target'):
                    self.critic_target = CriticTD3(input_dim + action_dim)
            else:
                self.critic = CriticDDPG(input_dim + action_dim)
                with tf.variable_scope('target'):
                    self.critic_target = CriticDDPG(input_dim + action_dim)

            self.critic_step = contrib_eager_python_tfe.Variable(
                0, dtype=tf.int64, name='step')
            self.critic_optimizer = tf.train.AdamOptimizer(
                learning_rate=critic_lr)
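            # Likewise pre-create the Adam slot variables for the critic
            # weights.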
            self.critic_optimizer._create_slots(self.critic.variables)  # pylint: disable=protected-access

        soft_update(self.critic.variables, self.critic_target.variables)
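The soft_update helper called above is not part of this excerpt. A minimal sketch of what it presumably does, assuming Polyak averaging with a tau keyword that defaults to a hard copy (consistent with the two-argument calls right after the networks are created):

# Hypothetical sketch of the soft_update helper used above; the actual
# implementation is not included in this excerpt.
def soft_update(variables, target_variables, tau=1.0):
    """Polyak-averages `variables` into `target_variables`.

    With the default tau=1.0 this is a hard copy; during training it would
    typically be called with a small tau (e.g. 0.005) so the target networks
    track the online networks slowly.
    """
    for var, target_var in zip(variables, target_variables):
        target_var.assign(tau * var + (1.0 - tau) * target_var)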