def diag_normal_kl(mean0, logstd0, mean1, logstd1):
  """Analytic KL divergence between two normals with diagonal covariance."""
  logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
  return 0.5 * (
      tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
      tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
      tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
      mean0.shape[-1].value)
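# Illustrative sketch (not part of the code base): the same closed-form
# diagonal-Gaussian KL written in NumPy, handy for sanity-checking the graph
# version above. The helper name `diag_normal_kl_np` is made up for this
# example.
import numpy as np


def diag_normal_kl_np(mean0, logstd0, mean1, logstd1):
  """KL(N0 || N1) for diagonal Gaussians, mirroring the TensorFlow version."""
  logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
  return 0.5 * (
      np.sum(np.exp(logstd0_2 - logstd1_2), -1) +
      np.sum((mean1 - mean0) ** 2 / np.exp(logstd1_2), -1) +
      np.sum(logstd1_2, -1) - np.sum(logstd0_2, -1) -
      mean0.shape[-1])


mean = np.zeros((4, 3))
logstd = np.zeros((4, 3))
assert np.allclose(diag_normal_kl_np(mean, logstd, mean, logstd), 0.0)
assert np.all(diag_normal_kl_np(mean, logstd, mean + 1.0, logstd) >= 0.0)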
def perform(self, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    network = self._network(
        observ[:, None], tf.ones(observ.shape[0]), self._last_state)
    action = tf.cond(
        self._is_training, network.policy.sample, lambda: network.mean)
    logprob = network.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', network.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    with tf.control_dependencies([
        utility.assign_nested_vars(self._last_state, network.state),
        self._last_action.assign(action[:, 0]),
        self._last_mean.assign(network.mean[:, 0]),
        self._last_logstd.assign(network.logstd[:, 0])]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
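# Illustrative sketch (plain NumPy, no TF graph): the action-selection rule
# expressed by the tf.cond above — sample from N(mean, exp(logstd)) while
# training, fall back to the deterministic mean at evaluation time. The
# function name `select_action` is made up for this example.
import numpy as np


def select_action(mean, logstd, is_training, rng=np.random):
  """Mirror of tf.cond(is_training, policy.sample, lambda: mean)."""
  if is_training:
    return mean + np.exp(logstd) * rng.standard_normal(mean.shape)
  return mean


mean = np.zeros(6)
logstd = np.full(6, -1.0)
print(select_action(mean, logstd, is_training=True))   # noisy sample
print(select_action(mean, logstd, is_training=False))  # always the mean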
def recurrent_gaussian(config, action_size, observations, length, state=None):
  """Independent recurrent policy and feed-forward value networks.

  The policy network outputs the mean action; the log standard deviation is
  learned as an independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.

  Args:
    config: Configuration object.
    action_size: Length of the action vector.
    observations: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.

  Returns:
    NetworkOutput tuple.
  """
  mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer(
      factor=config.init_mean_factor)
  logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
  cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])
  flat_observations = tf.reshape(observations, [
      tf.shape(observations)[0], tf.shape(observations)[1],
      functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)])
  with tf.variable_scope('policy'):
    x = flat_observations
    for size in config.policy_layers[:-1]:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    x, state = tf.nn.dynamic_rnn(cell, x, length, state, tf.float32)
    mean = tf.contrib.layers.fully_connected(
        x, action_size, tf.tanh, weights_initializer=mean_weights_initializer)
    logstd = tf.get_variable(
        'logstd', mean.shape[2:], tf.float32, logstd_initializer)
    logstd = tf.tile(
        logstd[None, None],
        [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
  with tf.variable_scope('value'):
    x = flat_observations
    for size in config.value_layers:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0]
  mean = tf.check_numerics(mean, 'mean')
  logstd = tf.check_numerics(logstd, 'logstd')
  value = tf.check_numerics(value, 'value')
  policy = tf.contrib.distributions.MultivariateNormalDiag(
      mean, tf.exp(logstd))
  # assert state.shape.as_list()[0] is not None
  return NetworkOutput(policy, mean, logstd, value, state)
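# Illustrative sketch (NumPy only, hypothetical shapes): how the single
# learned `logstd` vector of shape [action_size] is tiled to the
# [batch, time, action_size] shape of `mean`, matching the
# tf.tile(logstd[None, None], ...) call above.
import numpy as np

batch, time, action_size = 2, 5, 3
mean = np.zeros((batch, time, action_size))
logstd = np.linspace(-1.0, 0.0, action_size)  # one parameter per action dim
tiled = np.tile(logstd[None, None], (batch, time, 1))
assert tiled.shape == mean.shape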
def perform(self, agent_indices, observ):
  """Compute a batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tf.contrib.framework.nest.map_structure(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
    action = tf.cond(
        self._is_training, output.policy.sample, lambda: output.mean)
    logprob = output.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', output.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    with tf.control_dependencies([
        assign_state,
        tf.scatter_update(self._last_action, agent_indices, action[:, 0]),
        tf.scatter_update(self._last_mean, agent_indices, output.mean[:, 0]),
        tf.scatter_update(
            self._last_logstd, agent_indices, output.logstd[:, 0])]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
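# Illustrative sketch (NumPy stand-in for tf.gather / tf.scatter_update):
# only the rows of the cached per-agent tensors belonging to the agents in
# `agent_indices` are read before the forward pass and written back after it.
# All values below are hypothetical.
import numpy as np

last_action = np.zeros((4, 2))            # one cached action per agent
agent_indices = np.array([1, 3])          # agents stepped in this call
new_actions = np.ones((2, 2))             # actions just computed for them

gathered = last_action[agent_indices]     # tf.gather analogue
last_action[agent_indices] = new_actions  # tf.scatter_update analogue
print(last_action)                        # rows 1 and 3 updated, 0 and 2 untouched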
def _build_losses(self, json_data):
  """Build the critic regression loss and the clipped-surrogate actor loss."""
  actor_weight_decay = (
      json_data[self.ACTOR_WEIGHT_DECAY_KEY]
      if self.ACTOR_WEIGHT_DECAY_KEY in json_data else 0)
  critic_weight_decay = (
      json_data[self.CRITIC_WEIGHT_DECAY_KEY]
      if self.CRITIC_WEIGHT_DECAY_KEY in json_data else 0)

  norm_val_diff = (
      self.val_norm.normalize_tf(self.tar_val_tf) -
      self.val_norm.normalize_tf(self.critic_tf))
  self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))
  if critic_weight_decay != 0:
    self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss(
        'main/critic')

  norm_tar_a_tf = self.a_norm.normalize_tf(self.a_tf)
  self._norm_a_mean_tf = self.a_norm.normalize_tf(self.a_mean_tf)

  self.logp_tf = TFUtil.calc_logp_gaussian(
      norm_tar_a_tf, self._norm_a_mean_tf, self.norm_a_std_tf)
  ratio_tf = tf.exp(self.logp_tf - self.old_logp_tf)
  actor_loss0 = self.adv_tf * ratio_tf
  actor_loss1 = self.adv_tf * tf.clip_by_value(
      ratio_tf, 1.0 - self.ratio_clip, 1.0 + self.ratio_clip)
  self.actor_loss_tf = -tf.reduce_mean(tf.minimum(actor_loss0, actor_loss1))

  norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
  norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
  a_bound_loss = TFUtil.calc_bound_loss(
      self._norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
  self.actor_loss_tf += a_bound_loss

  if actor_weight_decay != 0:
    self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss(
        'main/actor')

  # For debugging: fraction of samples whose ratio fell outside the clip range.
  self.clip_frac_tf = tf.reduce_mean(
      tf.to_float(tf.greater(tf.abs(ratio_tf - 1.0), self.ratio_clip)))
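# Illustrative sketch (NumPy, hypothetical inputs): the clipped surrogate
# objective and clip fraction built above, evaluated on concrete numbers.
import numpy as np

ratio_clip = 0.2
adv = np.array([1.0, -1.0, 2.0])
logp = np.array([-1.0, -2.0, -1.5])
old_logp = np.array([-1.2, -1.8, -1.5])

ratio = np.exp(logp - old_logp)
loss0 = adv * ratio
loss1 = adv * np.clip(ratio, 1.0 - ratio_clip, 1.0 + ratio_clip)
actor_loss = -np.mean(np.minimum(loss0, loss1))
clip_frac = np.mean(np.abs(ratio - 1.0) > ratio_clip)
print(actor_loss, clip_frac)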
def _network(self, observ, length=None, state=None, reuse=True):
  """Compute the network output for a batched sequence of observations.

  Optionally, the initial state can be specified. The weights should be
  reused for all calls, except for the first one. Output is a named tuple
  containing the policy as a TensorFlow distribution, the policy mean and log
  standard deviation, the approximated state value, and the new recurrent
  state.

  Args:
    observ: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.
    reuse: Python boolean whether to reuse previous variables.

  Returns:
    NetworkOutput tuple.
  """
  with tf.variable_scope('network', reuse=reuse):
    observ = tf.convert_to_tensor(observ)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      observ = tf.check_numerics(observ, 'observ')
      cell = self._config.network(self._batch_env.action.shape[1].value)
      (mean, logstd, value), state = tf.nn.dynamic_rnn(
          cell, observ, length, state, tf.float32, swap_memory=True)
    mean = tf.check_numerics(mean, 'mean')
    logstd = tf.check_numerics(logstd, 'logstd')
    value = tf.check_numerics(value, 'value')
    policy = tf.contrib.distributions.MultivariateNormalDiag(
        mean, tf.exp(logstd))
    return _NetworkOutput(policy, mean, logstd, value, state)
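# Illustrative sketch (assumption: _NetworkOutput is a namedtuple with these
# five fields, as the surrounding code suggests): the tuple contract that
# _network() and every network constructor are expected to satisfy.
import collections

_NetworkOutput = collections.namedtuple(
    '_NetworkOutput', 'policy, mean, logstd, value, state')

# A caller relies only on these fields, e.g.:
#   output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
#   action = output.policy.sample()
#   baseline = output.value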
def _policy_loss(
    self, mean, logstd, old_mean, old_logstd, action, advantage, length):
  """Compute the policy loss composed of multiple components.

  1. The policy gradient loss is importance sampled from the data-collecting
     policy at the beginning of training.
  2. The second term is a KL penalty between the policy at the beginning of
     training and the current policy.
  3. Additionally, if this KL already changed more than twice the target
     amount, we activate a strong penalty discouraging further divergence.

  Args:
    mean: Sequences of action means of the current policy.
    logstd: Sequences of action log stddevs of the current policy.
    old_mean: Sequences of action means of the behavioral policy.
    old_logstd: Sequences of action log stddevs of the behavioral policy.
    action: Sequences of actions.
    advantage: Sequences of advantages.
    length: Batch of sequence lengths.

  Returns:
    Tuple of loss tensor and summary tensor.
  """
  with tf.name_scope('policy_loss'):
    entropy = utility.diag_normal_entropy(mean, logstd)
    kl = tf.reduce_mean(self._mask(
        utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
        length), 1)
    policy_gradient = tf.exp(
        utility.diag_normal_logpdf(mean, logstd, action) -
        utility.diag_normal_logpdf(old_mean, old_logstd, action))
    surrogate_loss = -tf.reduce_mean(self._mask(
        policy_gradient * tf.stop_gradient(advantage), length), 1)
    kl_penalty = self._penalty * kl
    cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
    cutoff_count = tf.reduce_sum(tf.cast(kl > cutoff_threshold, tf.int32))
    with tf.control_dependencies([tf.cond(
        cutoff_count > 0,
        lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
      kl_cutoff = (
          self._config.kl_cutoff_coef *
          tf.cast(kl > cutoff_threshold, tf.float32) *
          (kl - cutoff_threshold) ** 2)
    policy_loss = surrogate_loss + kl_penalty + kl_cutoff
    summary = tf.summary.merge([
        tf.summary.histogram('entropy', entropy),
        tf.summary.histogram('kl', kl),
        tf.summary.histogram('surrogate_loss', surrogate_loss),
        tf.summary.histogram('kl_penalty', kl_penalty),
        tf.summary.histogram('kl_cutoff', kl_cutoff),
        tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
        tf.summary.histogram('policy_loss', policy_loss),
        tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
        tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
        tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
    policy_loss = tf.reduce_mean(policy_loss, 0)
    return tf.check_numerics(policy_loss, 'policy_loss'), summary
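# Illustrative sketch (NumPy, hypothetical config values): how the three loss
# components above combine per batch element before the final mean.
import numpy as np

kl_target, kl_cutoff_factor, kl_cutoff_coef, penalty = 0.01, 2.0, 1000.0, 1.0
kl = np.array([0.005, 0.015, 0.030])
surrogate_loss = np.array([-0.10, -0.05, -0.20])

cutoff_threshold = kl_target * kl_cutoff_factor
kl_penalty = penalty * kl
kl_cutoff = (
    kl_cutoff_coef * (kl > cutoff_threshold) * (kl - cutoff_threshold) ** 2)
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
print(policy_loss.mean())  # mirrors tf.reduce_mean(policy_loss, 0)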
def diag_normal_logpdf(mean, logstd, loc):
  """Log density of a normal with diagonal covariance."""
  constant = -0.5 * math.log(2 * math.pi) - logstd
  value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
  return tf.reduce_sum(constant + value, -1)
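# Illustrative sketch (NumPy + SciPy, assumed available): the same per-element
# log density checked against scipy.stats.norm for a random batch, confirming
# the -logstd normalization term.
import numpy as np
from scipy.stats import norm

rng = np.random.RandomState(0)
mean = rng.randn(4, 3)
logstd = rng.randn(4, 3) * 0.1
loc = rng.randn(4, 3)

constant = -0.5 * np.log(2 * np.pi) - logstd
value = -0.5 * ((loc - mean) / np.exp(logstd)) ** 2
ours = np.sum(constant + value, -1)
reference = norm.logpdf(loc, mean, np.exp(logstd)).sum(-1)
assert np.allclose(ours, reference)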