def build_model(self):
    """
    Build the main networks.

    To improve the critic network, we want to compute the cross-entropy loss
    between the projection on the support z of the target
    y = r_t + gamma * Q_target( s_{t+1}, A(s_{t+1}) )
    and the Q-value at the time t, Q(s_t, a_t)
    (with A(.) the output of the actor network).

    To improve the actor network, we apply the policy gradient:
    Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
    """
    # Compute A(s_t)
    self.actions = build_actor(self.state_ph, trainable=True,
                               scope='learner_actor')

    # Compute Q(s_t, a_t)
    self.Q_distrib_given_actions = build_critic(
        self.state_ph, self.action_ph,
        trainable=True, reuse=False, scope='learner_critic')

    # Compute Q(s_t, A(s_t)) with the same network
    self.Q_distrib_suggested_actions = build_critic(
        self.state_ph, self.actions,
        trainable=True, reuse=True, scope='learner_critic')

    # Turn the distribution into the value Qval(s_t, A(s_t))
    self.Q_values_suggested_actions = tf.reduce_sum(
        self.z * self.Q_distrib_suggested_actions, axis=1)
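# A minimal sketch (not the repository's exact code) of the two training
# objectives described in the docstring above. It assumes a hypothetical
# attribute self.target_distrib_projected holding the target distribution
# already projected onto the support z.
def build_losses_sketch(self):
    # Critic: cross-entropy between the projected target distribution and
    # the predicted distribution of the taken action, Q(s_t, a_t)
    # (the small epsilon avoids log(0))
    self.critic_loss = -tf.reduce_mean(tf.reduce_sum(
        self.target_distrib_projected *
        tf.log(self.Q_distrib_given_actions + 1e-10), axis=1))

    # Actor: maximizing the expected value Qval(s_t, A(s_t)) with respect to
    # the actor weights applies exactly grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
    self.actor_loss = -tf.reduce_mean(self.Q_values_suggested_actions)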
def build_model(self):
    """
    Build the main networks.

    To improve the critic network, we want to compute the classical TD-error
    TDerr = [ r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) - Q(s_t, a_t) ]²
    (with A(.) the output of the actor network).

    To improve the actor network, we apply the policy gradient:
    Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
    """
    # Compute A(s_t)
    self.actions = build_actor(self.state_ph, trainable=True, scope='actor')

    # Compute Q(s_t, a_t)
    self.q_values_of_given_actions = build_critic(
        self.state_ph, self.action_ph,
        trainable=True, reuse=False, scope='critic')

    # Compute Q(s_t, A(s_t)) with the same network
    self.q_values_of_suggested_actions = build_critic(
        self.state_ph, self.actions,
        trainable=True, reuse=True, scope='critic')
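# A minimal sketch (assumed names, not the original code) of the training
# operations matching the docstring above. self.reward_ph and self.done_ph
# are assumed placeholders, self.q_values_of_next_actions is assumed to hold
# Q_target(s_{t+1}, A(s_{t+1})) from the target networks, and
# Settings.DISCOUNT / Settings.ACTOR_LR are assumed hyperparameters.
def build_train_operations_sketch(self):
    # Critic: mean squared TD-error against the bootstrapped target
    targets = tf.stop_gradient(
        self.reward_ph + Settings.DISCOUNT * (1.0 - self.done_ph) *
        self.q_values_of_next_actions)
    self.critic_loss = tf.reduce_mean(
        tf.square(targets - self.q_values_of_given_actions))

    # Actor: explicit chain rule grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
    action_gradients = tf.gradients(self.q_values_of_suggested_actions,
                                    self.actions)[0]
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')
    # Negate dQ/da so that applying the gradients performs gradient ascent on Q
    actor_gradients = tf.gradients(self.actions, actor_vars,
                                   -action_gradients)
    optimizer = tf.train.AdamOptimizer(Settings.ACTOR_LR)
    self.actor_train_op = optimizer.apply_gradients(
        list(zip(actor_gradients, actor_vars)))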
def build_networks(self):
    """
    Build the main network that predicts the Q-value distribution of a
    given state. Also build the operation to compute Q(s_t, a_t) for the
    gradient descent.

    Reminder:
    if simple DQN:
        y_t = r_t + gamma * max_a Q_target(s_{t+n}, a)
            = r_t + gamma * Q_target( s_{t+n}, argmax_a Q_target(s_{t+n}, a) )
    elif double DQN:
        y_t = r_t + gamma * Q_target( s_{t+n}, argmax_a Q(s_{t+n}, a) )
    TD-error = y_t - Q(s_t, a_t)
    """
    # Compute Q(s_t, .)
    self.Q_st = build_critic(self.state_ph, trainable=True, reuse=False,
                             scope='main_network')

    # Compute Q(s_t, a_t)
    ind = tf.stack((tf.range(Settings.BATCH_SIZE), self.action_ph), axis=1)
    self.Q_st_at = tf.gather_nd(self.Q_st, ind)

    # Compute Q_target(s_{t+n}, .)
    Q_target_st_n = build_critic(self.next_state_ph, trainable=False,
                                 reuse=False, scope='target_network')

    # If not double DQN, choose the best next action according to the target
    # network; if double DQN, choose it according to the main network
    if not Settings.DOUBLE_DQN:
        # Reuse Q_target(s_{t+n}, .)
        Q_st_n_max_a = Q_target_st_n
    else:
        # Compute Q(s_{t+n}, .)
        Q_st_n_max_a = build_critic(self.next_state_ph, trainable=True,
                                    reuse=True, scope='main_network')

    # Transform the distribution into the value to get the argmax
    if Settings.DISTRIBUTIONAL:
        Q_st_n_max_a = tf.reduce_sum(self.z * Q_st_n_max_a, axis=2)

    # Compute argmax_a Q[_target](s_{t+n}, a)
    best_at_n = tf.argmax(Q_st_n_max_a, 1, output_type=tf.int32)

    # Compute Q_target(s_{t+n}, argmax_a Q[_target](s_{t+n}, a))
    ind = tf.stack((tf.range(Settings.BATCH_SIZE), best_at_n), axis=1)
    self.Q_target_st_n_at_n = tf.gather_nd(Q_target_st_n, ind)
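# A minimal sketch (assumed names) of how the target y_t and the loss could
# be built from the tensors above in the non-distributional case.
# self.reward_ph and self.not_done_ph are assumed placeholders,
# Settings.DISCOUNT_N is an assumed n-step discount gamma^n and
# Settings.LEARNING_RATE an assumed learning rate; the distributional case
# would instead project the target distribution onto the support z and use a
# cross-entropy loss.
def build_loss_sketch(self):
    # y_t = r_t + gamma^n * Q_target(s_{t+n}, argmax_a Q[_target](s_{t+n}, a))
    y_t = tf.stop_gradient(
        self.reward_ph +
        Settings.DISCOUNT_N * self.not_done_ph * self.Q_target_st_n_at_n)

    # Mean squared TD-error over the batch
    self.loss = tf.reduce_mean(tf.square(y_t - self.Q_st_at))
    self.train_op = tf.train.AdamOptimizer(
        Settings.LEARNING_RATE).minimize(self.loss)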
def build_target(self):
    """
    Build the operation to compute max_a Q(s_{t+1}, a) for the gradient
    descent.

    Reminder:
    TD-error = r_t + gamma * max_a Q(s_{t+1}, a) - Q(s_t, a_t)
    """
    # Compute Q(s_{t+1}, .)
    self.target_Q_distrib = build_critic(self.next_state_ph, trainable=False,
                                         scope='target_network')

    # Distribution -> value, then selection of the action that maximizes
    # the target Q-value: a* = argmax_a Q(s_{t+1}, a)
    self.target_Q_value = tf.reduce_sum(self.z * self.target_Q_distrib,
                                        axis=2)
    self.target_action = tf.argmax(self.target_Q_value, 1,
                                   output_type=tf.int32)

    # Selection of the maximum target Q-value distribution
    # max_a Q(s_{t+1}, a) == Q(s_{t+1}, a*)
    ind = tf.stack((tf.range(self.batch_size), self.target_action), axis=1)
    self.target_Q_distrib_optimal_action = tf.gather_nd(
        self.target_Q_distrib, ind)
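# A standalone sketch (assumed names, not the repository's exact code) of the
# categorical projection that usually follows the selection above: each atom
# of the target distribution Q(s_{t+1}, a*) is shifted by r_t + gamma * z_i,
# clipped to [v_min, v_max], and its mass is redistributed onto the two
# nearest atoms of the fixed support z. rewards, not_done, gamma, v_min,
# v_max and nb_atoms are assumed inputs.
def project_target_distribution(rewards, not_done, target_distrib, z,
                                gamma, v_min, v_max, nb_atoms):
    delta_z = (v_max - v_min) / (nb_atoms - 1)
    rewards = tf.expand_dims(rewards, 1)            # [batch_size, 1]
    not_done = tf.expand_dims(not_done, 1)          # [batch_size, 1]

    # Bellman update of every atom, clipped to the support
    Tz = tf.clip_by_value(rewards + gamma * not_done * z, v_min, v_max)

    # Fractional position of each updated atom on the support
    b = (Tz - v_min) / delta_z                      # [batch_size, nb_atoms]
    lower = tf.floor(b)
    upper = tf.ceil(b)

    # Split the mass of each atom between its two neighbours; when b falls
    # exactly on an atom (lower == upper), give all the mass to 'lower'
    m_lower = target_distrib * (upper - b +
                                tf.cast(tf.equal(lower, upper), tf.float32))
    m_upper = target_distrib * (b - lower)

    # Scatter the mass back onto the support and sum the contributions
    lower_mask = tf.one_hot(tf.cast(lower, tf.int32), nb_atoms)
    upper_mask = tf.one_hot(tf.cast(upper, tf.int32), nb_atoms)
    return tf.reduce_sum(tf.expand_dims(m_lower, 2) * lower_mask +
                         tf.expand_dims(m_upper, 2) * upper_mask, axis=1)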
def build_target(self):
    """
    Build the target networks.
    """
    # Compute A(s_{t+1})
    self.target_next_actions = build_actor(
        self.next_state_ph, trainable=False, scope='learner_target_actor')

    # Compute Q_target( s_{t+1}, A(s_{t+1}) )
    self.Q_distrib_next = build_critic(
        self.next_state_ph, self.target_next_actions,
        trainable=False, reuse=False, scope='learner_target_critic')
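# A minimal sketch (assumed names) of the update operations that keep these
# target networks in sync with the main networks. It assumes the main scopes
# 'learner_actor' / 'learner_critic' used in build_model, that both network
# pairs create their variables in the same order, and an assumed soft-update
# coefficient Settings.UPDATE_TARGET_RATE.
def build_target_update_sketch(self):
    main_vars = (
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='learner_actor') +
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='learner_critic'))
    target_vars = (
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='learner_target_actor') +
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='learner_target_critic'))

    # Hard copy at initialization, then soft updates
    # target <- tau * main + (1 - tau) * target
    tau = Settings.UPDATE_TARGET_RATE
    self.init_target_op = [t.assign(m)
                           for m, t in zip(main_vars, target_vars)]
    self.update_target_op = [t.assign(tau * m + (1.0 - tau) * t)
                             for m, t in zip(main_vars, target_vars)]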
def build_target(self):
    """
    Build the target network and select the target distribution of the best
    next action (chosen by the main network).
    """
    # Compute Q_target(s_{t+1}, .)
    self.Q_distrib_next_target = build_critic(
        self.next_state_ph, trainable=False, reuse=False,
        scope='target_network')

    # Compute Q_target(s_{t+1}, argmax_a Q(s_{t+1}, a))
    ind = tf.stack((tf.range(Settings.BATCH_SIZE), self.best_next_action),
                   axis=1)
    self.Q_distrib_next_target_best_action = tf.gather_nd(
        self.Q_distrib_next_target, ind)
def build_main_network(self):
    """
    Build the main network that predicts the Q-value distribution of a given
    state, the operation to compute Q(s_t, a_t), and the choice of the best
    next action argmax_a Q(s_{t+1}, a).
    """
    # Compute Q(s_t, .)
    self.Q_distrib = build_critic(self.state_ph, trainable=True, reuse=False,
                                  scope='main_network')

    # Compute Q(s_t, a_t)
    ind = tf.stack((tf.range(Settings.BATCH_SIZE), self.action_ph), axis=1)
    self.Q_distrib_main_action = tf.gather_nd(self.Q_distrib, ind)

    # Compute Q(s_{t+1}, .)
    self.Q_distrib_next = build_critic(self.next_state_ph, trainable=True,
                                       reuse=True, scope='main_network')
    self.Q_value_next = tf.reduce_sum(self.z * self.Q_distrib_next, axis=2)

    # Compute argmax_a Q(s_{t+1}, a)
    self.best_next_action = tf.argmax(self.Q_value_next, 1,
                                      output_type=tf.int32)
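# A short sketch (assumed attribute names) of the loss tying the two
# functions above together: cross-entropy between the target distribution
# projected onto the support z (assumed available as
# self.Q_distrib_projected) and the predicted distribution Q(s_t, a_t).
# Settings.LEARNING_RATE is an assumed hyperparameter.
def build_loss_sketch(self):
    self.loss = -tf.reduce_mean(tf.reduce_sum(
        self.Q_distrib_projected *
        tf.log(self.Q_distrib_main_action + 1e-10), axis=1))
    self.train_op = tf.train.AdamOptimizer(
        Settings.LEARNING_RATE).minimize(self.loss)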
def build_main_network(self):
    """
    Build the main network that predicts the Q-value distribution of a given
    state. Also build the operation to compute Q(s_t, a_t) for the gradient
    descent.

    Reminder:
    TD-error = r_t + gamma * max_a Q(s_{t+1}, a) - Q(s_t, a_t)
    """
    # Compute Q(s_t, .)
    self.Q_distrib = build_critic(self.state_ph, trainable=True,
                                  scope='main_network')

    # Select only the Q-distribution of the action given in the experience,
    # i.e. compute Q(s_t, a_t)
    ind = tf.stack((tf.range(self.batch_size), self.action_ph), axis=1)
    self.Q_distrib_taken_action = tf.gather_nd(self.Q_distrib, ind)
def build_target(self):
    """
    Build the target networks.
    """
    # Compute A(s_{t+1})
    self.target_next_actions = build_actor(
        self.next_state_ph, trainable=False, scope='learner_target_actor')

    # Insert an extra dimension at axis 1 so the target critic receives the
    # actions in the shape it expects
    self.target_next_actions = tf.expand_dims(self.target_next_actions, 1)

    # Compute Q_target( s_{t+1}, A(s_{t+1}) )
    self.Q_distrib_next = build_critic(
        self.next_state_ph, self.target_next_actions,
        trainable=False, reuse=False, scope='learner_target_critic',
        sess=self.sess)