def init_network(self):
    # The model structure
    # TODO: how to handle reuse, it would register loss two times
    self.train_policy = self.policy(self.sess, self.X_input_train_shape,
                                    self.num_actions, self.layer_collection,
                                    reuse=False)
    self.step_policy = self.policy(self.sess, self.X_input_step_shape,
                                   self.num_actions, reuse=True)

    with tf.variable_scope('train_output'):
        negative_log_prob_action = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.train_policy.policy_logits,
            labels=self.actions)
        self.policy_gradient_loss = tf.reduce_mean(
            self.advantage * negative_log_prob_action)
        self.value_function_loss = tf.reduce_mean(
            mse(tf.squeeze(self.train_policy.value_function), self.reward))
        self.entropy = tf.reduce_mean(
            openai_entropy(self.train_policy.policy_logits))
        self.loss = self.policy_gradient_loss \
            - self.entropy * self.entropy_coeff \
            + self.value_function_loss * self.vf_coeff

        # Gradients (no explicit norm clipping here; the K-FAC
        # norm_constraint below acts as a KL-based clip)
        params = find_trainable_variables("policy")
        grads = tf.gradients(self.loss, params)

        # Apply Gradients
        grads = list(zip(grads, params))
        optimizer = opt.KfacOptimizer(
            learning_rate=self.learning_rate,
            cov_ema_decay=self.moving_average,
            damping=self.damping,
            layer_collection=self.layer_collection,
            norm_constraint=self.kl_clip,
            momentum=self.momentum)
        self.optimize = optimizer.apply_gradients(grads)
        self.cov_update_op = optimizer.cov_update_op
        self.inv_update_op = optimizer.inv_update_op
        self.inv_update_dict = optimizer.inv_updates_dict
        self.factors = self.layer_collection.get_factors()
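# --- Usage sketch (not from the source): one way the K-FAC ops exposed by
# init_network() above could be driven during training. The covariance
# (Fisher-factor) running averages are cheap and are typically refreshed on
# every update, while the factor inverses are recomputed only every few
# iterations because they are expensive. `train_policy.X_input`, the keyword
# batch names, and `inv_update_interval` below are assumptions, not taken
# from the source.
def kfac_train_step(model, obs, actions, advantages, rewards, step,
                    inv_update_interval=10):
    feed = {
        model.train_policy.X_input: obs,  # assumed input placeholder on the policy
        model.actions: actions,
        model.advantage: advantages,
        model.reward: rewards,
    }
    # Preconditioned gradient step plus covariance update on every iteration.
    loss, _, _ = model.sess.run(
        [model.loss, model.optimize, model.cov_update_op], feed_dict=feed)
    # Refresh the factor inverses only periodically.
    if step % inv_update_interval == 0:
        model.sess.run(model.inv_update_op)
    return loss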
def init_network(self):
    # The model structure
    self.step_policy = self.policy(self.sess, self.X_input_step_shape,
                                   self.num_actions, reuse=False,
                                   is_training=False)
    self.train_policy = self.policy(self.sess, self.X_input_train_shape,
                                    self.num_actions, reuse=True,
                                    is_training=self.is_training)

    with tf.variable_scope('train_output'):
        negative_log_prob_action = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.train_policy.policy_logits,
            labels=self.actions)
        self.policy_gradient_loss = tf.reduce_mean(
            self.advantage * negative_log_prob_action)
        self.value_function_loss = tf.reduce_mean(
            mse(tf.squeeze(self.train_policy.value_function), self.reward))
        self.entropy = tf.reduce_mean(
            openai_entropy(self.train_policy.policy_logits))
        self.loss = self.policy_gradient_loss \
            - self.entropy * self.entropy_coeff \
            + self.value_function_loss * self.vf_coeff

        # Gradient Clipping
        params = find_trainable_variables("policy")
        grads = tf.gradients(self.loss, params)
        if self.max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)

        # Apply Gradients
        grads = list(zip(grads, params))
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.learning_rate,
            decay=self.alpha,
            epsilon=self.epsilon)
        # ADDED
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            self.optimize = optimizer.apply_gradients(grads)
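# --- Usage sketch (not from the source): why init_network() builds two policy
# instances. `step_policy` (reuse=False, is_training=False) serves inference on
# the per-step batch during rollout collection, while `train_policy` reuses the
# same variables on the flattened rollout batch for the update. The
# `step_policy.step()` helper and the vectorized-env API below are assumptions
# about the surrounding code, not confirmed by the source.
def collect_transition(model, envs, observations):
    # Query the inference head once per environment step.
    actions, values = model.step_policy.step(observations)  # assumed helper
    next_observations, rewards, dones, _ = envs.step(actions)
    return next_observations, actions, values, rewards, dones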
def prepare_loss(self):
    self.X_input_train_shape = (None, self.img_height, self.img_width,
                                self.num_classes * self.num_stack)
    self.X_input_step_shape = (None, self.img_height, self.img_width,
                               self.num_classes * self.num_stack)

    self.actions = tf.placeholder(tf.int32, [None])        # actions
    self.advantage = tf.placeholder(tf.float32, [None])    # advantage function
    self.reward = tf.placeholder(tf.float32, [None])       # reward
    self.learning_rate = tf.placeholder(tf.float32, [])    # learning rate
    self.is_training = tf.placeholder(tf.bool)             # is_training

    # The model structure
    self.actor_network = self.policy(self.sess, self.X_input_step_shape,
                                     self.num_actions, reuse=False,
                                     is_training=False)
    self.critic_network = self.policy(self.sess, self.X_input_train_shape,
                                      self.num_actions, reuse=True,
                                      is_training=self.is_training)

    with tf.variable_scope('train_output'):
        negative_log_prob_action = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.critic_network.policy_logits,
            labels=self.actions)
        self.policy_gradient_loss = tf.reduce_mean(
            self.advantage * negative_log_prob_action)
        self.value_function_loss = tf.reduce_mean(
            mse(tf.squeeze(self.critic_network.value_function), self.reward))
        self.entropy = tf.reduce_mean(
            openai_entropy(self.critic_network.policy_logits))
        self.loss = self.policy_gradient_loss \
            - self.entropy * self.entropy_coeff \
            + self.value_function_loss * self.vf_coeff

        # Gradient Clipping
        params = find_trainable_variables("policy")
        grads = tf.gradients(self.loss, params)
        if self.max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
        else:
            # Keep the grad-norm summary below defined when clipping is disabled.
            grad_norm = tf.global_norm(grads)

        # Apply Gradients
        grads = list(zip(grads, params))
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.learning_rate,
            decay=self.alpha,
            epsilon=self.epsilon)
        self.optimize = optimizer.apply_gradients(grads)

    # monitor training
    summaries = []
    summaries.append(tf.summary.scalar('loss/policy_gradient_loss',
                                       self.policy_gradient_loss))
    summaries.append(tf.summary.scalar('loss/value_function_loss',
                                       self.value_function_loss))
    summaries.append(tf.summary.scalar('loss/entropy', self.entropy))
    summaries.append(tf.summary.scalar('loss/total_loss', self.loss))
    summaries.append(tf.summary.scalar('train/gradnorm', grad_norm))
    self.summary = tf.summary.merge(summaries)
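# --- Usage sketch (not from the source): feeding the placeholders built by
# prepare_loss() and logging the merged summaries. Advantages are assumed to be
# computed outside the graph as discounted returns minus the baseline value
# estimates; `critic_network.X_input`, `writer`, and `global_step` are
# hypothetical names for the training-loop plumbing.
def train_and_log(model, writer, global_step, obs, actions, returns, values, lr):
    advantages = returns - values  # A(s, a) approximated as R - V(s)
    feed = {
        model.critic_network.X_input: obs,  # assumed input placeholder on the policy
        model.actions: actions,
        model.advantage: advantages,
        model.reward: returns,
        model.learning_rate: lr,
        model.is_training: True,
    }
    summary, loss, _ = model.sess.run(
        [model.summary, model.loss, model.optimize], feed_dict=feed)
    writer.add_summary(summary, global_step)  # e.g. a tf.summary.FileWriter
    return loss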