  def __init__(self, input_dim):
    """Initializes a critic network.

    Args:
      input_dim: size of the input space.
    """
    super(CriticTD3, self).__init__()
    self.critic1 = CriticDDPG(input_dim)
    self.critic2 = CriticDDPG(input_dim)
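
  # The two heads above are the twin critics of TD3's clipped double
  # Q-learning: bootstrap targets use the elementwise minimum of both
  # estimates to curb single-critic overestimation. A minimal sketch of the
  # intended use inside a forward pass, assuming each CriticDDPG maps a
  # concatenated (observation, action) batch to one Q-value per row
  # (`obs` and `action` are illustrative names, not this module's API):
  #
  #   inputs = tf.concat([obs, action], -1)
  #   q_min = tf.minimum(self.critic1(inputs), self.critic2(inputs))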
  def __init__(self,
               input_dim,
               action_dim,
               discount=0.99,
               tau=0.005,
               actor_lr=1e-3,
               critic_lr=1e-3,
               use_td3=True,
               policy_noise=0.2,
               policy_noise_clip=0.5,
               policy_update_freq=2,
               get_reward=None,
               use_absorbing_state=False):
    """Initializes actor, critic, target networks and optimizers.

    The class handles the absorbing state properly. An absorbing state is a
    state that a policy enters after reaching a goal state and then stays in
    forever. For most RL problems we can simply assign a reward of 0 to every
    step after the goal, but for GAIL we need an actual absorbing state.

    Args:
      input_dim: size of the observation space.
      action_dim: size of the action space.
      discount: reward discount.
      tau: target networks update coefficient.
      actor_lr: actor learning rate.
      critic_lr: critic learning rate.
      use_td3: whether to use TD3 or standard DDPG.
      policy_noise: std of the Gaussian noise added to the critic's action
        input.
      policy_noise_clip: magnitude for clipping the added Gaussian noise.
      policy_update_freq: perform a policy update once every n critic steps.
      get_reward: a function that, given (s, a, s'), returns a reward.
      use_absorbing_state: whether to use an absorbing state or not.
    """
    self.discount = discount
    self.tau = tau
    self.use_td3 = use_td3
    self.policy_noise = policy_noise
    self.policy_noise_clip = policy_noise_clip
    self.policy_update_freq = policy_update_freq
    self.get_reward = get_reward
    self.use_absorbing_state = use_absorbing_state

    with tf.variable_scope('actor'):
      self.actor = Actor(input_dim, action_dim)
      with tf.variable_scope('target'):
        self.actor_target = Actor(input_dim, action_dim)

      self.initial_actor_lr = actor_lr
      self.actor_lr = contrib_eager_python_tfe.Variable(actor_lr, name='lr')
      self.actor_step = contrib_eager_python_tfe.Variable(
          0, dtype=tf.int64, name='step')
      self.actor_optimizer = tf.train.AdamOptimizer(
          learning_rate=self.actor_lr)
      self.actor_optimizer._create_slots(self.actor.variables)  # pylint: disable=protected-access

    # Initialize the target actor's weights from the online actor.
    soft_update(self.actor.variables, self.actor_target.variables)

    with tf.variable_scope('critic'):
      if self.use_td3:
        self.critic = CriticTD3(input_dim + action_dim)
        with tf.variable_scope('target'):
          self.critic_target = CriticTD3(input_dim + action_dim)
      else:
        self.critic = CriticDDPG(input_dim + action_dim)
        with tf.variable_scope('target'):
          self.critic_target = CriticDDPG(input_dim + action_dim)

      self.critic_step = contrib_eager_python_tfe.Variable(
          0, dtype=tf.int64, name='step')
      self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=critic_lr)
      self.critic_optimizer._create_slots(self.critic.variables)  # pylint: disable=protected-access

    # Initialize the target critic's weights from the online critic.
    soft_update(self.critic.variables, self.critic_target.variables)
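
  # For orientation, the TD3-specific arguments above feed the usual
  # training-step recipe. A minimal sketch under stated assumptions:
  # `next_obs`, `reward`, and `mask` are illustrative names (with `mask` 0 at
  # terminal states), actions are assumed to lie in [-1, 1], and
  # `soft_update` is assumed to accept a Polyak coefficient:
  #
  #   # Target policy smoothing: perturb the target action with clipped noise.
  #   next_action = self.actor_target(next_obs)
  #   noise = tf.clip_by_value(
  #       tf.random_normal(tf.shape(next_action), stddev=self.policy_noise),
  #       -self.policy_noise_clip, self.policy_noise_clip)
  #   next_action = tf.clip_by_value(next_action + noise, -1.0, 1.0)
  #
  #   # Clipped double-Q bootstrap target from the twin target critics.
  #   inputs = tf.concat([next_obs, next_action], -1)
  #   target_q = reward + self.discount * mask * tf.minimum(
  #       self.critic_target.critic1(inputs),
  #       self.critic_target.critic2(inputs))
  #
  #   # Delayed policy updates: once every `policy_update_freq` critic steps,
  #   # update the actor and Polyak-average both target networks, e.g.
  #   #   soft_update(self.critic.variables, self.critic_target.variables,
  #   #               self.tau)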