def kl(self, other):
    """Calculate KL divergence between this and another categorical distribution."""
    assert isinstance(other, CategoricalDist), 'Distribution type not match.'
    # Shift logits by their per-row maximum for numerical stability before exponentiating.
    rescaled_logits_self = self.logits - tf.reduce_max(
        self.logits, axis=-1, keepdims=True)
    rescaled_logits_other = other.logits - tf.reduce_max(
        other.logits, axis=-1, keepdims=True)
    exp_logits_self = tf.exp(rescaled_logits_self)
    exp_logits_other = tf.exp(rescaled_logits_other)
    z_self = tf.reduce_sum(exp_logits_self, axis=-1, keepdims=True)
    z_other = tf.reduce_sum(exp_logits_other, axis=-1, keepdims=True)
    p = exp_logits_self / z_self
    return tf.reduce_sum(
        p * (rescaled_logits_self - tf.log(z_self)
             - rescaled_logits_other + tf.log(z_other)),
        axis=-1, keepdims=True)
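
For reference, with max-shifted logits \tilde{l} = l - \max_k l_k, normalizer Z = \sum_k e^{\tilde{l}_k} and probabilities p_k = e^{\tilde{l}_k}/Z, the quantity returned above is the standard categorical KL divergence:

D_{\mathrm{KL}}(p \,\|\, q) = \sum_k p_k \big[(\tilde{l}^{\,p}_k - \log Z_p) - (\tilde{l}^{\,q}_k - \log Z_q)\big] = \sum_k p_k (\log p_k - \log q_k)

The max shift changes \tilde{l} and \log Z by the same constant, so the result is unchanged while tf.exp is kept away from overflow.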
def entropy(self):
    rescaled_logits = self.logits - tf.reduce_max(
        self.logits, axis=-1, keepdims=True)
    exp_logits = tf.exp(rescaled_logits)
    z = tf.reduce_sum(exp_logits, axis=-1, keepdims=True)
    p = exp_logits / z
    return tf.reduce_sum(p * (tf.log(z) - rescaled_logits),
                         axis=-1, keepdims=True)
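
As a sanity check, the shifted computations above agree with the direct definitions of entropy and KL. A minimal standalone NumPy sketch (the softmax helper and the test logits are illustrative, not part of the library):

import numpy as np

def softmax(logits):
    shifted = logits - logits.max(axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=-1, keepdims=True)

logits_p = np.array([[2.0, 0.5, -1.0]])
logits_q = np.array([[1.0, 1.0, 0.0]])
p, q = softmax(logits_p), softmax(logits_q)

entropy_ref = -(p * np.log(p)).sum(axis=-1, keepdims=True)          # H(p)
kl_ref = (p * (np.log(p) - np.log(q))).sum(axis=-1, keepdims=True)  # KL(p || q)
print(entropy_ref, kl_ref)

Feeding the same logits through the entropy() and kl() methods above should reproduce these reference values.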
def actor_loss_with_entropy(adv, old_logits, behavior_action, out_logits):
    """Calculate actor loss with entropy."""
    old_neglogp = neglog_prob(behavior_action, old_logits)
    neglogp = neglog_prob(behavior_action, out_logits)
    # neglog_prob returns -log pi(a|s), so the probability ratio
    # pi_new / pi_old = exp(old_neglogp - neglogp).
    ratio = tf.exp(old_neglogp - neglogp)

    surr_loss_1 = ratio * adv
    surr_loss_2 = tf.clip_by_value(ratio, 1.0 - LOSS_CLIPPING,
                                   1.0 + LOSS_CLIPPING) * adv
    surr_loss = tf.reduce_mean(tf.minimum(surr_loss_1, surr_loss_2))

    ent = entropy(out_logits)
    ent = tf.reduce_mean(ent)

    return -surr_loss - ENTROPY_LOSS * ent
def actor_loss_with_entropy(dist, adv, old_log_p, behavior_action, clip_ratio, ent_coef):
    """Calculate actor loss with entropy."""
    action_log_prob = dist.log_prob(behavior_action)
    ratio = tf.exp(action_log_prob - old_log_p)

    surr_loss_1 = ratio * adv
    surr_loss_2 = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    surr_loss = tf.reduce_mean(tf.minimum(surr_loss_1, surr_loss_2))

    ent = dist.entropy()
    ent = tf.reduce_mean(ent)

    return -surr_loss - ent_coef * ent
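
Both variants return the negative of the PPO clipped surrogate objective plus an entropy bonus, so minimizing the returned loss maximizes

L^{CLIP+H}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big] + c\,\mathbb{E}_t\big[H(\pi_\theta(\cdot \mid s_t))\big],
\qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\mathrm{old}}(a_t \mid s_t)},

where \epsilon corresponds to LOSS_CLIPPING / clip_ratio and c to ENTROPY_LOSS / ent_coef in the two functions above.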
def from_logic_outputs(behaviour_policy_logic_outputs,
                       target_policy_logic_outputs,
                       actions,
                       discounts,
                       rewards,
                       values,
                       bootstrap_value,
                       clip_importance_sampling_threshold=1.0,
                       clip_pg_importance_sampling_threshold=1.0):
    """Calculate V-trace targets and policy-gradient advantages from policy logits.

    :param behaviour_policy_logic_outputs: logits of the behaviour policy, shape [T, B, num_actions]
    :param target_policy_logic_outputs: logits of the target (learner) policy, shape [T, B, num_actions]
    :param actions: actions sampled by the behaviour policy, shape [T, B]
    :param discounts: per-step discount factors, shape [T, B]
    :param rewards: per-step rewards, shape [T, B]
    :param values: value estimates V(x_t) along the trajectory, shape [T, B]
    :param bootstrap_value: value estimate for the state after the last step, shape [B]
    :param clip_importance_sampling_threshold: rho-bar, clip on the importance weights used for the value targets
    :param clip_pg_importance_sampling_threshold: clip on the importance weights used for the policy-gradient advantage
    :return: tuple of (value_of_states, pg_advantages), both with gradients stopped
    """
    behaviour_policy_logic_outputs = tf.convert_to_tensor(
        behaviour_policy_logic_outputs, dtype=tf.float32)
    target_policy_logic_outputs = tf.convert_to_tensor(
        target_policy_logic_outputs, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)

    # support [T, B, Action_dimension]
    behaviour_policy_logic_outputs.shape.assert_has_rank(3)
    target_policy_logic_outputs.shape.assert_has_rank(3)
    actions.shape.assert_has_rank(2)

    target_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=target_policy_logic_outputs, labels=actions)
    behaviour_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=behaviour_policy_logic_outputs, labels=actions)

    # importance sampling weight rho_t = pi(a_t|x_t) / mu(a_t|x_t)
    importance_sampling_weights = tf.exp(target_log_prob - behaviour_log_prob)
    clipped_importance_sampling_weight = tf.minimum(
        clip_importance_sampling_threshold, importance_sampling_weights)
    clipped_pg_importance_sampling_weight = tf.minimum(
        clip_pg_importance_sampling_threshold, importance_sampling_weights)

    # coefficient, similar to the 'trace cutting'
    coefficient = tf.minimum(1.0, importance_sampling_weights)

    next_values = tf.concat(
        [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)

    # temporal difference, as the fixed point
    deltas = clipped_importance_sampling_weight * (
        rewards + discounts * next_values - values)

    sequences = (deltas, discounts, coefficient)

    # calculate Vtrace with tf.scan, and set reverse: True, back --> begin
    def scan_fn(cumulative_value, sequence_item):
        _delta, _discount, _coefficient = sequence_item
        return _delta + _discount * _coefficient * cumulative_value

    last_values = tf.zeros_like(bootstrap_value)
    temporal_difference = tf.scan(
        fn=scan_fn,
        elems=sequences,
        initializer=last_values,
        parallel_iterations=1,
        back_prop=False,
        reverse=True,
    )

    value_of_states = tf.add(temporal_difference, values)

    # Advantage for policy gradient.
    value_of_next_state = tf.concat(
        [value_of_states[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
    pg_advantages = clipped_pg_importance_sampling_weight * (
        rewards + discounts * value_of_next_state - values)

    value_of_states = tf.stop_gradient(value_of_states)
    pg_advantages = tf.stop_gradient(pg_advantages)
    return value_of_states, pg_advantages
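
For reference, the backward tf.scan above accumulates the V-trace targets of IMPALA (Espeholt et al., 2018). With \rho_t and c_i the clipped importance weights computed in the function (here \bar{c} is fixed at 1.0 via coefficient, and the discounts tensor supplies the per-step \gamma), the returned quantities are

v_s = V(x_s) + \sum_{t=s}^{T-1} \gamma^{t-s} \Big(\prod_{i=s}^{t-1} c_i\Big)\,\delta_t V,
\qquad \delta_t V = \rho_t\big(r_t + \gamma V(x_{t+1}) - V(x_t)\big),

\mathrm{pg\_advantage}_s = \rho^{pg}_s\big(r_s + \gamma v_{s+1} - V(x_s)\big),

with \rho_t = \min(\bar{\rho},\ \pi(a_t|x_t)/\mu(a_t|x_t)) and c_i = \min(1,\ \pi(a_i|x_i)/\mu(a_i|x_i)).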
def init_by_param(self, param):
    self.param = param
    self.mean, self.log_std = tf.split(self.param, num_or_size_splits=2, axis=-1)
    self.std = tf.exp(self.log_std)
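
The parameter tensor is expected to be the concatenation [mean, log_std] along the last axis, i.e. twice the action dimension. A small illustrative NumPy sketch of the same split (the batch size, action dimension, and variable names here are made up for the example):

import numpy as np

act_dim = 3
param = np.random.randn(4, 2 * act_dim).astype(np.float32)  # e.g. output of the policy head
mean, log_std = np.split(param, 2, axis=-1)                  # each of shape [4, act_dim]
std = np.exp(log_std)
print(mean.shape, std.shape)  # (4, 3) (4, 3)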