def init_load_weights(self):
    """
    Builds the assign placeholders and ops needed to load externally
    provided weights into this policy's graph via load_weights().
    """
    with self.graph.as_default():
        _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        values = [v.eval(session=self.sess) for v in _vars]
        for var, value in zip(_vars, values):
            # One placeholder and one assign op per global variable.
            assign_ph = tf.placeholder(var.dtype, shape=value.shape)
            self.assign_phs.append(assign_ph)
            self.assign_ops.append(tf.assign(var, assign_ph))
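# For reference, a minimal sketch of the load_weights() counterpart that
# consumes the placeholders built above. The signature is assumed from its
# use in start_learning() below; `values` is the list returned by
# get_weights(), in the same order as tf.GraphKeys.GLOBAL_VARIABLES.
def load_weights(self, values):
    with self.graph.as_default():
        feed_dict = dict(zip(self.assign_phs, values))
        self.sess.run(self.assign_ops, feed_dict=feed_dict)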
def _create_sac_optimizer_ops(self) -> None:
    """
    Creates the Adam optimizers and update ops for SAC, including the policy,
    value, and entropy updates, as well as the target network update.
    """
    policy_optimizer = self.create_optimizer_op(
        learning_rate=self.learning_rate, name="sac_policy_opt"
    )
    entropy_optimizer = self.create_optimizer_op(
        learning_rate=self.learning_rate, name="sac_entropy_opt"
    )
    value_optimizer = self.create_optimizer_op(
        learning_rate=self.learning_rate, name="sac_value_opt"
    )

    # Soft (Polyak) update: move the target network toward the source network
    # by a factor of tau each step.
    self.target_update_op = [
        tf.assign(target, (1 - self.tau) * target + self.tau * source)
        for target, source in zip(
            self.target_network.value_vars, self.policy_network.value_vars
        )
    ]
    logger.debug("value_vars")
    self.print_all_vars(self.policy_network.value_vars)
    logger.debug("targvalue_vars")
    self.print_all_vars(self.target_network.value_vars)
    logger.debug("critic_vars")
    self.print_all_vars(self.policy_network.critic_vars)
    logger.debug("q_vars")
    self.print_all_vars(self.policy_network.q_vars)
    logger.debug("policy_vars")
    policy_vars = self.policy.get_trainable_variables()
    self.print_all_vars(policy_vars)

    # Hard copy of the source values into the target network at initialization.
    self.target_init_op = [
        tf.assign(target, source)
        for target, source in zip(
            self.target_network.value_vars, self.policy_network.value_vars
        )
    ]

    self.update_batch_policy = policy_optimizer.minimize(
        self.policy_loss, var_list=policy_vars
    )
    # Make sure policy is updated first, then value, then entropy.
    with tf.control_dependencies([self.update_batch_policy]):
        self.update_batch_value = value_optimizer.minimize(
            self.total_value_loss, var_list=self.policy_network.critic_vars
        )
        # Add entropy coefficient optimization operation
        with tf.control_dependencies([self.update_batch_value]):
            self.update_batch_entropy = entropy_optimizer.minimize(
                self.entropy_loss, var_list=self.log_ent_coef
            )
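# A minimal usage sketch (hypothetical trainer-side code; everything except
# the ops created above is an assumption): because the three minimizers are
# chained via control dependencies, running the entropy update also executes
# the policy and value updates, in that order. The soft target update is run
# separately afterwards.
def _sac_update_step(self, feed_dict):
    self.sess.run(self.update_batch_entropy, feed_dict=feed_dict)
    self.sess.run(self.target_update_op)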
def create_normalizer_update(
    vector_input: tf.Tensor,
    steps: tf.Tensor,
    running_mean: tf.Tensor,
    running_variance: tf.Tensor,
) -> Tuple[tf.Operation, tf.Operation]:
    """
    Creates the update operations for the normalizer.
    :param vector_input: Vector observation to use for updating the running
        mean and variance.
    :param steps: Tensorflow tensor representing the current number of steps
        that have been normalized.
    :param running_mean: Tensorflow tensor representing the current running mean.
    :param running_variance: Tensorflow tensor representing the current
        running variance.
    :return: A tuple of TF operations: the first initializes the normalizer
        from the current batch, the second updates it based on vector_input.
    """
    # Based on Welford's algorithm for running mean and standard deviation,
    # for batch updates. Discussion here:
    # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
    steps_increment = tf.shape(vector_input)[0]
    total_new_steps = tf.add(steps, tf.cast(steps_increment, dtype=tf.int64))

    # Compute the incremental update and divide by the number of new steps.
    input_to_old_mean = tf.subtract(vector_input, running_mean)
    new_mean = running_mean + tf.reduce_sum(
        input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
    )
    # Compute difference of input to the new mean for Welford update.
    input_to_new_mean = tf.subtract(vector_input, new_mean)
    new_variance = running_variance + tf.reduce_sum(
        input_to_new_mean * input_to_old_mean, axis=0
    )
    update_mean = tf.assign(running_mean, new_mean)
    update_variance = tf.assign(running_variance, new_variance)
    update_norm_step = tf.assign(steps, total_new_steps)

    # First mean and variance calculated normally.
    initial_mean, initial_variance = tf.nn.moments(vector_input, axes=[0])
    initialize_mean = tf.assign(running_mean, initial_mean)
    # Multiplied by total_new_steps because it is divided by total_new_steps
    # in the normalization.
    initialize_variance = tf.assign(
        running_variance,
        (initial_variance + EPSILON) * tf.cast(total_new_steps, dtype=tf.float32),
    )
    return (
        tf.group([initialize_mean, initialize_variance, update_norm_step]),
        tf.group([update_mean, update_variance, update_norm_step]),
    )
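# A minimal usage sketch, assuming `vector_input` is a placeholder and that
# `sess`, `steps`, `running_mean`, and `running_variance` were created
# elsewhere: the first op seeds the statistics from the very first batch,
# the second folds every later batch into the running Welford estimate.
init_op, update_op = create_normalizer_update(
    vector_input, steps, running_mean, running_variance
)
sess.run(init_op, feed_dict={vector_input: first_batch})
sess.run(update_op, feed_dict={vector_input: next_batch})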
def make_beta_update(self) -> None:
    """
    Creates the updater for the beta parameter used by GAIL's
    variational discriminator bottleneck.
    """
    # Dual gradient step: grow beta when the KL term exceeds the mutual
    # information target, shrink it (down to EPSILON) when it does not.
    new_beta = tf.maximum(
        self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
    )
    with tf.control_dependencies([self.update_batch]):
        self.update_beta = tf.assign(self.beta, new_beta)
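# Worked example of the beta step (numbers hypothetical): with beta = 0.5,
# alpha = 5e-5, kl_loss = 0.3, and mutual_information = 0.5, the new value is
# max(0.5 + 5e-5 * (0.3 - 0.5), EPSILON) = 0.49999. Beta therefore shrinks
# while the KL term sits below the mutual-information target and grows once
# the KL term exceeds it.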
def create_global_steps():
    """Creates TF ops to track and increment global training step."""
    global_step = tf.Variable(
        0, name="global_step", trainable=False, dtype=tf.int32
    )
    steps_to_increment = tf.placeholder(
        shape=[], dtype=tf.int32, name="steps_to_increment"
    )
    increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment))
    return global_step, increment_step, steps_to_increment
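# A minimal usage sketch (TF 1.x, as in the code above; session setup
# assumed): feed the number of newly collected experiences to advance the
# counter.
global_step, increment_step, steps_to_increment = create_global_steps()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    new_step = sess.run(increment_step, feed_dict={steps_to_increment: 64})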
def create_normalizer_update(self, vector_input):
    # Based on Welford's algorithm for running mean and standard deviation,
    # for batch updates. Discussion here:
    # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
    steps_increment = tf.shape(vector_input)[0]
    total_new_steps = tf.add(self.normalization_steps, steps_increment)

    # Compute the incremental update and divide by the number of new steps.
    input_to_old_mean = tf.subtract(vector_input, self.running_mean)
    new_mean = self.running_mean + tf.reduce_sum(
        input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
    )
    # Compute difference of input to the new mean for Welford update.
    input_to_new_mean = tf.subtract(vector_input, new_mean)
    new_variance = self.running_variance + tf.reduce_sum(
        input_to_new_mean * input_to_old_mean, axis=0
    )
    update_mean = tf.assign(self.running_mean, new_mean)
    update_variance = tf.assign(self.running_variance, new_variance)
    update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
    return tf.group([update_mean, update_variance, update_norm_step])
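# For context, a sketch of how these running statistics are typically
# consumed (attribute names match the method above; the clipping range is an
# assumption): the variance accumulator is a sum of squared deviations, so it
# is divided by the step count before taking the square root, mirroring the
# "divided by total_new_steps in the normalization" comment in
# create_normalizer_update above.
def normalize_vector_obs(self, vector_obs):
    denom = tf.sqrt(
        self.running_variance
        / (tf.cast(self.normalization_steps, tf.float32) + 1)
    )
    return tf.clip_by_value((vector_obs - self.running_mean) / denom, -5.0, 5.0)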
def start_learning(self, env_manager: EnvManager, initial_weights):
    self._create_output_path(self.output_path)
    global_step = 0
    last_brain_behavior_ids: Set[str] = set()
    try:
        # Initial reset
        self._reset_env(env_manager)
        first_step = True
        while self._not_done_training():
            external_brain_behavior_ids = set(env_manager.external_brains.keys())
            new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
            self._create_trainers_and_managers(env_manager, new_behavior_ids)
            # Load initial weights once, on the first pass through the loop.
            if initial_weights is not None and first_step:
                self.logger.info("Loading initial weights.")
                policy = self.trainers['Brain'].get_policy(0)
                # Build the assign placeholders/ops, then feed in the weights.
                with policy.graph.as_default():
                    _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                    values = [v.eval(session=policy.sess) for v in _vars]
                    policy.assign_phs = []
                    policy.assign_ops = []
                    for var, value in zip(_vars, values):
                        assign_ph = tf.placeholder(var.dtype, shape=value.shape)
                        policy.assign_phs.append(assign_ph)
                        policy.assign_ops.append(tf.assign(var, assign_ph))
                policy.load_weights(initial_weights)
                self.logger.info("Initial weights loaded successfully.")
            last_brain_behavior_ids = external_brain_behavior_ids
            n_steps = self.advance(env_manager)
            for _ in range(n_steps):
                global_step += 1
                self.reset_env_if_ready(env_manager, global_step)
            first_step = False
        # Stop advancing trainers and kill the trainer threads.
        self.step = self.trainers['Brain'].step
        self.join_threads()
    except (
        KeyboardInterrupt,
        UnityCommunicationException,
        UnityEnvironmentException,
        UnityCommunicatorStoppedException,
    ) as ex:
        self.join_threads()
        self.logger.info(
            "Learning was interrupted. Please wait while the graph is generated."
        )
        if isinstance(ex, (KeyboardInterrupt, UnityCommunicatorStoppedException)):
            pass
        else:
            # If the environment failed, raise the exception so we exit
            # the process with a return code of 1.
            raise ex
    finally:
        if self.train_model:
            self._save_model()
            self._export_graph()
    return self.trainers['Brain'].get_policy(0).get_weights()
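# Sketch of the weight hand-off this return value enables (controller and
# environment construction are assumed): the weights returned by one training
# run seed the next, so learning can continue across separately built graphs.
weights = controller.start_learning(env_manager, None)
# ... tear down, rebuild a fresh controller and environment ...
weights = next_controller.start_learning(next_env_manager, weights)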