Code example #1
File: tf_dist.py Project: shishouyuan/xingtian
def kl(self, other):
    """KL divergence between this categorical distribution and `other`."""
    assert isinstance(other,
                      CategoricalDist), 'Distribution type not match.'
    # Shift each set of logits by its per-row maximum; softmax is invariant to
    # this shift, and the exponentials stay numerically stable.
    rescaled_logits_self = self.logits - tf.reduce_max(
        self.logits, axis=-1, keepdims=True)
    rescaled_logits_other = other.logits - tf.reduce_max(
        other.logits, axis=-1, keepdims=True)
    exp_logits_self = tf.exp(rescaled_logits_self)
    exp_logits_other = tf.exp(rescaled_logits_other)
    z_self = tf.reduce_sum(exp_logits_self, axis=-1, keepdims=True)
    z_other = tf.reduce_sum(exp_logits_other, axis=-1, keepdims=True)
    p = exp_logits_self / z_self
    # KL(p || q) = sum_i p_i * (log p_i - log q_i).
    return tf.reduce_sum(p * (rescaled_logits_self - tf.log(z_self) -
                              rescaled_logits_other + tf.log(z_other)),
                         axis=-1,
                         keepdims=True)
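For reference, a minimal NumPy sketch of the same categorical KL computation (not part of the project; the function name below is illustrative) can serve as a cross-check of the method above:

import numpy as np

def categorical_kl(logits_p, logits_q):
    # Shift each row of logits by its maximum; the softmax probabilities are
    # unchanged and the exponentials stay numerically stable.
    a_p = logits_p - logits_p.max(axis=-1, keepdims=True)
    a_q = logits_q - logits_q.max(axis=-1, keepdims=True)
    exp_p, exp_q = np.exp(a_p), np.exp(a_q)
    z_p = exp_p.sum(axis=-1, keepdims=True)
    z_q = exp_q.sum(axis=-1, keepdims=True)
    p = exp_p / z_p
    # KL(p || q) = sum_i p_i * (log p_i - log q_i)
    return (p * (a_p - np.log(z_p) - a_q + np.log(z_q))).sum(axis=-1, keepdims=True)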
Code example #2
File: tf_dist.py Project: shishouyuan/xingtian
def entropy(self):
    """Entropy of the categorical distribution defined by `self.logits`."""
    # Shift the logits by their per-row maximum for numerical stability;
    # the resulting probabilities are unchanged.
    rescaled_logits = self.logits - tf.reduce_max(
        self.logits, axis=-1, keepdims=True)
    exp_logits = tf.exp(rescaled_logits)
    z = tf.reduce_sum(exp_logits, axis=-1, keepdims=True)
    p = exp_logits / z
    # H(p) = -sum_i p_i * log p_i = sum_i p_i * (log z - rescaled_logit_i).
    return tf.reduce_sum(p * (tf.log(z) - rescaled_logits),
                         axis=-1,
                         keepdims=True)
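A quick sanity check for the entropy above (illustration only, not project code): for uniform logits over n classes it should return log(n).

import numpy as np

n = 4
p = np.full(n, 1.0 / n)           # uniform categorical over n classes
print(-(p * np.log(p)).sum())     # 1.3862943611198906
print(np.log(n))                  # log(4) = 1.3862943611198906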
Code example #3
def actor_loss_with_entropy(adv, old_logits, behavior_action, out_logits):
    """Calculate actor loss with entropy."""
    old_log_p = neglog_prob(behavior_action, old_logits)
    action_log_prob = neglog_prob(behavior_action, out_logits)
    ratio = tf.exp(action_log_prob - old_log_p)

    surr_loss_1 = ratio * adv
    surr_loss_2 = tf.clip_by_value(ratio, 1.0 - LOSS_CLIPPING, 1.0 + LOSS_CLIPPING) * adv
    surr_loss = tf.reduce_mean(tf.minimum(surr_loss_1, surr_loss_2))

    ent = entropy(out_logits)
    ent = tf.reduce_mean(ent)

    # Negated because the optimizer minimizes the returned loss, while the
    # surrogate objective and the entropy bonus are to be maximized.
    return -surr_loss - ENTROPY_LOSS * ent
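The clipping in the snippet above caps how far the importance ratio can move the objective for any one sample. A small NumPy illustration (toy numbers, not project code):

import numpy as np

adv = np.array([1.0, 1.0, -1.0])
ratio = np.array([0.5, 1.5, 1.5])
clip = 0.2
surr1 = ratio * adv
surr2 = np.clip(ratio, 1.0 - clip, 1.0 + clip) * adv
# The element-wise minimum keeps the more pessimistic of the two terms, so
# samples whose ratio has drifted outside [0.8, 1.2] stop improving the objective.
print(np.minimum(surr1, surr2))   # [ 0.5  1.2 -1.5]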
Code example #4
File: __init__.py Project: shishouyuan/xingtian
def actor_loss_with_entropy(dist, adv, old_log_p, behavior_action, clip_ratio,
                            ent_coef):
    """Calculate actor loss with entropy."""
    action_log_prob = dist.log_prob(behavior_action)
    ratio = tf.exp(action_log_prob - old_log_p)

    surr_loss_1 = ratio * adv
    surr_loss_2 = tf.clip_by_value(ratio, 1.0 - clip_ratio,
                                   1.0 + clip_ratio) * adv
    surr_loss = tf.reduce_mean(tf.minimum(surr_loss_1, surr_loss_2))

    ent = dist.entropy()
    ent = tf.reduce_mean(ent)

    return -surr_loss - ent_coef * ent
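For readability, here is a NumPy mirror of the same loss formula (an illustrative sketch, assuming the log-probabilities and entropies have already been computed; names are not from the project):

import numpy as np

def clipped_actor_loss(adv, old_log_p, new_log_p, ent, clip_ratio, ent_coef):
    ratio = np.exp(new_log_p - old_log_p)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    # Same sign convention as above: the returned value is a loss to minimize.
    return -np.mean(np.minimum(surr1, surr2)) - ent_coef * np.mean(ent)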
Code example #5
File: vtrace.py Project: shishouyuan/xingtian
def from_logic_outputs(behaviour_policy_logic_outputs,
                       target_policy_logic_outputs,
                       actions,
                       discounts,
                       rewards,
                       values,
                       bootstrap_value,
                       clip_importance_sampling_threshold=1.0,
                       clip_pg_importance_sampling_threshold=1.0):
    """
    Calculate vtrace with logic outputs.

    :param behaviour_policy_logic_outputs: behaviour_policy_logic_outputs
    :param target_policy_logic_outputs: target_policy_logic_outputs
    :param actions:
    :param discounts:
    :param rewards:
    :param values:
    :param bootstrap_value:
    :param clip_importance_sampling_threshold:
    :param clip_pg_importance_sampling_threshold:
    :return:
    """
    behaviour_policy_logic_outputs = tf.convert_to_tensor(
        behaviour_policy_logic_outputs, dtype=tf.float32)
    target_policy_logic_outputs = tf.convert_to_tensor(
        target_policy_logic_outputs, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)

    # support [T, B, Action_dimension]
    behaviour_policy_logic_outputs.shape.assert_has_rank(3)
    target_policy_logic_outputs.shape.assert_has_rank(3)
    actions.shape.assert_has_rank(2)

    target_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=target_policy_logic_outputs, labels=actions)

    behaviour_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=behaviour_policy_logic_outputs, labels=actions)

    # importance sampling weight rho_t = pi(a_t | x_t) / mu(a_t | x_t)
    importance_sampling_weights = tf.exp(target_log_prob - behaviour_log_prob)

    clipped_importance_sampling_weight = tf.minimum(
        clip_importance_sampling_threshold, importance_sampling_weights)
    clipped_pg_importance_sampling_weight = tf.minimum(
        clip_pg_importance_sampling_threshold, importance_sampling_weights)

    # 'trace-cutting' coefficient c_t = min(1, rho_t)
    coefficient = tf.minimum(1.0, importance_sampling_weights)

    next_values = tf.concat(
        [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)

    # temporal differences delta_t, the terms of the v-trace fixed-point recursion
    deltas = clipped_importance_sampling_weight * (
        rewards + discounts * next_values - values)
    sequences = (deltas, discounts, coefficient)

    # Compute the v-trace corrections with tf.scan, scanning in reverse
    # (from the last step of the trajectory back to the first).
    def scan_fn(cumulative_value, sequence_item):
        _delta, _discount, _coefficient = sequence_item
        return _delta + _discount * _coefficient * cumulative_value

    last_values = tf.zeros_like(bootstrap_value)
    temporal_difference = tf.scan(
        fn=scan_fn,
        elems=sequences,
        initializer=last_values,
        parallel_iterations=1,
        back_prop=False,
        reverse=True,
    )

    value_of_states = tf.add(temporal_difference, values)
    # Advantage for policy gradient.
    value_of_next_state = tf.concat(
        [value_of_states[1:],
         tf.expand_dims(bootstrap_value, 0)], axis=0)
    pg_advantages = clipped_pg_importance_sampling_weight * (
        rewards + discounts * value_of_next_state - values)

    value_of_states = tf.stop_gradient(value_of_states)
    pg_advantages = tf.stop_gradient(pg_advantages)
    return value_of_states, pg_advantages
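The tf.scan call above evaluates a backward recursion over the trajectory; a minimal NumPy sketch of that same recursion (illustration only, assuming [T, B]-shaped inputs; names are not from the project):

import numpy as np

def vtrace_corrections(deltas, discounts, coefficients):
    # Inputs correspond to the `sequences` tuple above, each of shape [T, B].
    # Returns vs_t - V(x_t) for every step t, scanning from the last step back.
    acc = np.zeros_like(deltas[-1])
    out = np.zeros_like(deltas)
    for t in reversed(range(len(deltas))):
        acc = deltas[t] + discounts[t] * coefficients[t] * acc
        out[t] = acc
    return out   # adding `values` gives the v-trace value targets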
Code example #6
File: tf_dist.py Project: shishouyuan/xingtian
def init_by_param(self, param):
    """Initialise the distribution from a single parameter tensor."""
    self.param = param
    # The last axis of `param` holds the mean and the log standard deviation
    # concatenated, so splitting it into two equal halves recovers both.
    self.mean, self.log_std = tf.split(self.param,
                                       num_or_size_splits=2,
                                       axis=-1)
    self.std = tf.exp(self.log_std)
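For illustration (not project code), the split above assumes the last axis of `param` holds the mean and the log standard deviation concatenated; a small NumPy example:

import numpy as np

param = np.array([[0.1, -0.3, 0.0, -1.0]])    # batch of 1, two halves of size 2
mean, log_std = np.split(param, 2, axis=-1)
std = np.exp(log_std)
print(mean)   # [[ 0.1 -0.3]]
print(std)    # [[1.         0.36787944]]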