Example #1
    def _build_graph(self, inputs):
        state, action, futurereward, action_prob = inputs
        logits, value = self._get_NN_prediction(state)
        value = tf.squeeze(value, [1], name='pred_value')  # (B,)
        policy = tf.nn.softmax(logits, name='policy')
        is_training = get_current_tower_context().is_training
        if not is_training:
            return
        log_probs = tf.log(policy + 1e-6)

        log_pi_a_given_s = tf.reduce_sum(
            log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
        advantage = tf.subtract(tf.stop_gradient(value), futurereward, name='advantage')

        pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, NUM_ACTIONS), 1)  # (B,)
        importance = tf.stop_gradient(tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

        policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage * importance, name='policy_loss')
        xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
        value_loss = tf.nn.l2_loss(value - futurereward, name='value_loss')

        pred_reward = tf.reduce_mean(value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                       initializer=tf.constant_initializer(0.01), trainable=False)
        self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
        self.cost = tf.truediv(self.cost,
                               tf.cast(tf.shape(futurereward)[0], tf.float32),
                               name='cost')
        summary.add_moving_summary(policy_loss, xentropy_loss,
                                   value_loss, pred_reward, advantage,
                                   self.cost, tf.reduce_mean(importance, name='importance'))
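The loss above weights the log-probability of the taken action by a stop-gradient advantage and by an importance ratio against the behaviour policy, clipped to [0, 10]. A minimal NumPy sketch of the same arithmetic (function and variable names here are illustrative, not part of the library):

import numpy as np

def a3c_policy_loss(policy, action, action_prob, value, futurereward,
                    clip=10.0, eps=1e-8):
    # policy: (B, A) softmax output of the current network
    # action: (B,) integer actions actually taken
    # action_prob: (B,) probability the behaviour policy assigned to that action
    # value: (B,) predicted state value; futurereward: (B,) discounted return
    B = policy.shape[0]
    pi_a = policy[np.arange(B), action]               # pi(a|s) under the current policy
    log_pi_a = np.log(pi_a + 1e-6)
    advantage = value - futurereward                  # V - R: the summed term gets minimized
    importance = np.clip(pi_a / (action_prob + eps), 0.0, clip)
    return np.sum(log_pi_a * advantage * importance)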
Example #2
  def _build_graph(self, inputs):
    state, action, futurereward = inputs
    policy, self.value = self._get_NN_prediction(state)
    self.value = tf.squeeze(self.value, [1], name='pred_value') # (B,)
    self.logits = tf.nn.softmax(policy, name='logits')

    expf = tf.get_variable('explore_factor', shape=[],
        initializer=tf.constant_initializer(1), trainable=False)
    logitsT = tf.nn.softmax(policy * expf, name='logitsT')
    is_training = get_current_tower_context().is_training
    if not is_training:
      return
    log_probs = tf.log(self.logits + 1e-6)

    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage, name='policy_loss')
    xentropy_loss = tf.reduce_sum(
        self.logits * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(self.value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)
    entropy_beta = tf.get_variable('entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
    self.cost = tf.truediv(self.cost,
        tf.cast(tf.shape(futurereward)[0], tf.float32),
        name='cost')
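Here xentropy_loss is sum(p * log p), i.e. the negative policy entropy, so scaling it by entropy_beta inside the minimized cost rewards higher-entropy (more exploratory) policies. A quick NumPy check of that sign convention, using made-up probabilities:

import numpy as np

p = np.array([[0.7, 0.2, 0.1],       # confident policy
              [1/3, 1/3, 1/3]])      # uniform policy
neg_entropy = np.sum(p * np.log(p + 1e-6), axis=1)
print(neg_entropy)  # the uniform row is lower, so minimizing this term favours exploration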
Example #4
 def _mapper(self, grad, var):
     name = var.op.name
     if name not in _summaried_gradient:
         _summaried_gradient.add(name)
         tf.summary.histogram(name + '-grad', grad)
         from tensorpack.tfutils.symbolic_functions import rms
         tf.summary.scalar(name + '/rms', rms(grad))
     return grad
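A gradient mapper like this is normally applied to every (grad, var) pair returned by compute_gradients before apply_gradients is called. A hypothetical standalone sketch of that wiring (the optimizer choice, learning rate, and the helper name add_grad_summaries are illustrative assumptions):

import tensorflow as tf

def add_grad_summaries(loss, learning_rate=1e-3):
    # Attach a histogram summary to every gradient, as _mapper does above,
    # then apply the unmodified gradients.
    opt = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = opt.compute_gradients(loss)
    kept = []
    for grad, var in grads_and_vars:
        if grad is None:
            continue
        tf.summary.histogram(var.op.name + '-grad', grad)
        kept.append((grad, var))
    return opt.apply_gradients(kept)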
Example #5
    def _build_graph(self, inputs):
        state, action, futurereward1, futurereward2, updateweight1, updateweight2, action_prob = inputs
        logits, value1, value2 = self._get_NN_prediction(state)
        value1 = tf.squeeze(value1, [1], name='pred_value_1')  # (B,)
        value2 = tf.squeeze(value2, [1], name='pred_value_2')  # (B,)
        policy = tf.nn.softmax(logits, name='policy')
        is_training = get_current_tower_context().is_training
        if not is_training:
            return
        log_probs = tf.log(policy + 1e-6)

        log_pi_a_given_s = tf.reduce_sum(
            log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
        advantage1 = tf.subtract(tf.stop_gradient(value1), futurereward1, name='advantage_1')
        advantage2 = tf.subtract(tf.stop_gradient(value2), futurereward2, name='advantage_2')

        pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, NUM_ACTIONS), 1)  # (B,)
        importance = tf.stop_gradient(tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

        policy_loss1 = tf.reduce_sum(log_pi_a_given_s * advantage1 * importance * updateweight1, name='policy_loss_1')
        policy_loss2 = tf.reduce_sum(log_pi_a_given_s * advantage2 * importance * updateweight2, name='policy_loss_2')
        policy_loss = tf.add(policy_loss1, policy_loss2, name='policy_loss')
        xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
        value_loss1 = tf.nn.l2_loss((value1 - futurereward1) * tf.sqrt(updateweight1), name='value_loss_1')
        value_loss2 = tf.nn.l2_loss((value2 - futurereward2) * tf.sqrt(updateweight2), name='value_loss_2')
        value_loss = tf.add(value_loss1, value_loss2, name='value_loss')

        pred_reward1 = tf.reduce_mean(value1, name='predict_reward_1')
        pred_reward2 = tf.reduce_mean(value2, name='predict_reward_2')
        pred_reward_avg = tf.add(pred_reward1 * 0.5, pred_reward2 * 0.5, name='predict_reward_avg')
        advantage1 = symbf.rms(advantage1, name='rms_advantage_1')
        advantage2 = symbf.rms(advantage2, name='rms_advantage_2')
        advantage_avg = symbf.rms(advantage1 * 0.5 + advantage2 * 0.5, name='rms_advantage_avg')
        entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                       initializer=tf.constant_initializer(0.01), trainable=False)
        self.cost = tf.add_n([policy_loss1, policy_loss2, xentropy_loss * entropy_beta, value_loss1, value_loss2])
        self.cost = tf.truediv(self.cost,
                               tf.cast(tf.shape(futurereward1)[0], tf.float32),
                               name='cost')
        summary.add_moving_summary(policy_loss, xentropy_loss, value_loss,
                                   pred_reward1, pred_reward2, pred_reward_avg,
                                   advantage1, advantage2, advantage_avg,
                                   self.cost, tf.reduce_mean(importance, name='importance'))
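This two-head variant gives each sample a weight by scaling the value residual with sqrt(updateweight) before tf.nn.l2_loss; since l2_loss(t) computes sum(t**2) / 2, that is exactly a per-sample weighted squared error. A one-off NumPy check with arbitrary numbers:

import numpy as np

v = np.array([1.0, 2.0])      # predicted values
r = np.array([0.5, 3.0])      # future rewards
w = np.array([0.2, 0.8])      # per-sample update weights
lhs = 0.5 * np.sum(((v - r) * np.sqrt(w)) ** 2)   # l2_loss of the scaled residual
rhs = 0.5 * np.sum(w * (v - r) ** 2)              # the intended weighted loss
assert np.isclose(lhs, rhs)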
Example #6
    def _build_graph(self, inputs):
        state, action, futurereward = inputs
        policy, self.value = self._get_NN_prediction(state)
        self.value = tf.squeeze(self.value, [1], name='pred_value')  # (B,)
        self.logits = tf.nn.softmax(policy, name='logits')

        expf = tf.get_variable('explore_factor',
                               shape=[],
                               initializer=tf.constant_initializer(1),
                               trainable=False)
        logitsT = tf.nn.softmax(policy * expf, name='logitsT')
        is_training = get_current_tower_context().is_training
        if not is_training:
            return
        log_probs = tf.log(self.logits + 1e-6)

        log_pi_a_given_s = tf.reduce_sum(
            log_probs * tf.one_hot(action, self.number_of_actions), 1)
        advantage = tf.subtract(tf.stop_gradient(self.value),
                                futurereward,
                                name='advantage')
        policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage,
                                    name='policy_loss')
        xentropy_loss = tf.reduce_sum(self.logits * log_probs,
                                      name='xentropy_loss')
        value_loss = tf.nn.l2_loss(self.value - futurereward,
                                   name='value_loss')

        pred_reward = tf.reduce_mean(self.value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        summary.add_moving_summary(policy_loss, xentropy_loss, value_loss,
                                   pred_reward, advantage)
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        self.cost = tf.add_n(
            [policy_loss, xentropy_loss * entropy_beta, value_loss])
        self.cost = tf.truediv(self.cost,
                               tf.cast(tf.shape(futurereward)[0], tf.float32),
                               name='cost')

        # print "DEBUGGING INFO:{}".format(DEBUGING_INFO)
        # assert 1 == 0, "AAA"

        if DEBUGING_INFO:
            logits_mean, logits_var = tf.nn.moments(self.logits, axes=[1])
            # logits_mean_r = tf.reduce_sum(logits_mean)
            logits_var_r = tf.reduce_sum(logits_var)
            # tf.summary.scalar('logits_mean', logits_mean_r)
            tf.summary.scalar('logits_var', logits_var_r)

        tf.summary.scalar('entropy beta', entropy_beta)
        tf.summary.scalar('explore factor', expf)
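logitsT above is a temperature-scaled softmax: an explore_factor below 1 flattens the action distribution (more exploration) while a value above 1 sharpens it. A small NumPy illustration with made-up scores:

import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

scores = np.array([2.0, 1.0, 0.5])
for expf in (0.5, 1.0, 2.0):
    print(expf, softmax(scores * expf))  # larger expf concentrates mass on the best action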
Example #7
 def perform(var, action):
     ndim = var.get_shape().ndims
     name = var.name.replace(':0', '')
     if action == 'scalar':
         assert ndim == 0, "Scalar summary on high-dimension data. Maybe you want 'mean'?"
         tf.summary.scalar(name, var)
         return
     assert ndim > 0, "Cannot perform {} summary on scalar data".format(action)
     if action == 'histogram':
         tf.summary.histogram(name, var)
         return
     if action == 'sparsity':
         tf.summary.scalar(name + '-sparsity', tf.nn.zero_fraction(var))
         return
     if action == 'mean':
         tf.summary.scalar(name + '-mean', tf.reduce_mean(var))
         return
     if action == 'rms':
         tf.summary.scalar(name + '-rms', rms(var))
         return
     if action == 'absmax':
         tf.summary.scalar(name + '-absmax', tf.reduce_max(tf.abs(var)))
         return
     raise RuntimeError("Unknown summary type: {}".format(action))
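A hypothetical call pattern for the dispatcher above, assuming perform is available as a module-level function and its rms helper is importable in the same module (the variable name and shape are illustrative):

import tensorflow as tf

W = tf.get_variable('W', shape=[128, 64])
for summary_type in ('histogram', 'rms', 'absmax'):
    perform(W, summary_type)          # one summary op per requested type
perform(tf.reduce_mean(W), 'scalar')  # 'scalar' requires a 0-d tensor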
Example #8
    def _build_ad_nn(self, tensor_io):
        from drlutils.dataflow.tensor_io import TensorIO
        assert (isinstance(tensor_io, TensorIO))
        from drlutils.model.base import get_current_nn_context
        from tensorpack.tfutils.common import get_global_step_var
        global_step = get_global_step_var()
        nnc = get_current_nn_context()
        is_training = nnc.is_training
        i_state = tensor_io.getInputTensor('state')
        i_agentIdent = tensor_io.getInputTensor('agentIdent')
        i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
        i_resetRNN = tensor_io.getInputTensor('resetRNN')
        l = i_state
        # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
        # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
        # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
        # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
        with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

            def _get_cell():
                cell = tf.nn.rnn_cell.BasicLSTMCell(256)
                # if is_training:
                #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
                return cell

            cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
            rnn_outputs = self._buildRNN(
                l,
                cell,
                tensor_io.batchSize,
                i_agentIdent=i_agentIdent,
                i_sequenceLength=i_sequenceLength,
                i_resetRNN=i_resetRNN,
            )
            rnn_outputs = tf.reshape(
                rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
            l = rnn_outputs
            from ad_cur.autodrive.model.selu import fc_selu
            for lidx in range(2):
                l = fc_selu(
                    l,
                    200,
                    keep_prob=1.,  # we train only on sensor input, so key information must not be dropped
                    is_training=is_training,
                    name='fc-{}'.format(lidx))
            value = tf.layers.dense(l, 1, name='fc-value')
            value = tf.squeeze(value, [1], name="value")
            if not hasattr(self, '_weights_critic'):
                self._weights_critic = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
            l = tf.stop_gradient(l)
            l = tf.layers.dense(l,
                                128,
                                activation=tf.nn.relu6,
                                name='fc-actor')
            mu_steering = 0.5 * tf.layers.dense(
                l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
            mu_accel = tf.layers.dense(l,
                                       1,
                                       activation=tf.nn.tanh,
                                       name='fc-mu-accel')
            mus = tf.concat([mu_steering, mu_accel], axis=-1)

            # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
            # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
            # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)
            def saturating_sigmoid(x):
                """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
                with tf.name_scope("saturating_sigmoid", [x]):
                    y = tf.sigmoid(x)
                    return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

            sigma_steering_ = 0.1 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
            sigma_accel_ = 0.25 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')

            if not nnc.is_evaluating:
                sigma_beta_steering = tf.get_default_graph(
                ).get_tensor_by_name('actor/sigma_beta_steering:0')
                sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                    'actor/sigma_beta_accel:0')
                sigma_beta_steering = tf.constant(1e-4)
                # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
                # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
            else:
                sigma_beta_steering = tf.constant(1e-4)
                sigma_beta_accel = tf.constant(1e-4)
            sigma_steering = (sigma_steering_ + sigma_beta_steering)
            sigma_accel = (sigma_accel_ + sigma_beta_accel)

            sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
            # if is_training:
            #     pass
            #     # Without sigma_beta, convergence is slow and unstable; likely reasons:
            #     #   1. Plenty of exploration early in training keeps the network out of local optima
            #     #   2. A too-small sigma early on makes normal_dist's log_prob very large, so gradient
            #     #      updates blow up and the network is crippled from the start and hard to recover
            #
            # if is_training:
            #     sigmas += sigma_beta_steering
            # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
            # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
            # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
            # sigmas_orig = sigmas
            # sigmas = sigmas + sigma_beta_steering
            # sigmas = tf.minimum(sigmas + 0.1, 100)
            # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
            # sigma_steering += sigma_beta_steering
            # sigma_accel += sigma_beta_accel

            # mus = tf.concat([mu_steering, mu_accel], axis=-1)

            from tensorflow.contrib.distributions import Normal
            dists = Normal(mus, sigmas + 0.01)
            policy = tf.squeeze(dists.sample([1]), [0])
            # clip to within two standard deviations of the mean
            policy = tf.clip_by_value(policy, mus - 2 * sigmas,
                                      mus + 2 * sigmas)
            if is_training:
                self._addMovingSummary(
                    tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                    tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                    tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                    tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                    tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                    tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                    # sigma_beta_accel,
                    # sigma_beta_steering,
                )
            # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
            #                    'mu/sigma/sigma.orig/act=', summarize=4)
            if not hasattr(self, '_weights_actor'):
                self._weights_actor = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        if not is_training:
            tensor_io.setOutputTensors(policy, value, mus, sigmas)
            return

        i_actions = tensor_io.getInputTensor("action")
        # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
        i_actions = tf.reshape(i_actions,
                               [-1] + i_actions.get_shape().as_list()[2:])
        log_probs = dists.log_prob(i_actions)
        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        i_advantage = tensor_io.getInputTensor("advantage")
        i_advantage = tf.reshape(i_advantage,
                                 [-1] + i_advantage.get_shape().as_list()[2:])
        exp_v = log_probs * tf.expand_dims(i_advantage, -1)
        entropy = dists.entropy()
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        exp_v = entropy_beta * entropy + exp_v
        loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                     name='loss/policy')

        i_futurereward = tensor_io.getInputTensor("futurereward")
        i_futurereward = tf.reshape(i_futurereward, [-1] +
                                    i_futurereward.get_shape().as_list()[2:])
        loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))

        loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                      name='xentropy_loss')

        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                                   self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])

        self._addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(value, name='predict_reward')
        import tensorpack.tfutils.symbolic_functions as symbf
        advantage = symbf.rms(i_advantage, name='rms_advantage')
        self._addMovingSummary(
            loss_policy,
            loss_value,
            loss_entropy,
            pred_reward,
            advantage,
            loss_l2_regularizer,
            tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
            tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
        )
        return loss_policy, loss_value
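The actor head above samples a steering/acceleration pair from a diagonal Gaussian and clips the sample to two standard deviations around the mean. In plain NumPy the sampling step looks roughly like this (batch size, means, and sigmas are illustrative):

import numpy as np

mus = np.array([[0.10, -0.30]])        # (B, 2): steering and accel means in [-1, 1]
sigmas = np.array([[0.05, 0.20]])      # (B, 2): per-dimension standard deviations
sample = np.random.normal(mus, sigmas + 0.01)
action = np.clip(sample, mus - 2 * sigmas, mus + 2 * sigmas)  # keep within mu +/- 2*sigma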
Example #9
    def _build_graph(self, inputs):
        state, action, futurereward = inputs
        with tf.variable_scope('potential'):
            action_prediction_logits = self._get_human_action_prediction(state)
            action_prediction = tf.nn.softmax(action_prediction_logits)
            action_prediction = tf.stop_gradient(action_prediction)
        logits, self.value = self._get_NN_prediction(state)
        # reward shaping with negative cross_entropy
        # cross_entropy returns values close to 0 if labels and logits agree
        # and values growing more and more towards inf if labels and logits disagree
        mean_score = tf.get_variable('mean_score',
                                     shape=[],
                                     initializer=tf.constant_initializer(0),
                                     trainable=False)
        avg_cross_entropy = -np.log(1 / float(NUM_ACTIONS))
        avg_human_performance = tf.constant(self.avg_human_performance)
        temperature = tf.nn.relu(
            (avg_human_performance - mean_score) / avg_human_performance,
            name='temperature')
        shaping = tf.nn.softmax_cross_entropy_with_logits(
            labels=action_prediction, logits=logits)
        shaping_loss = temperature * tf.reduce_sum(shaping)
        shaping_beta = tf.get_variable(
            'shaping_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        #reward_shaping = -tf.clip_by_value(reward_shaping, 0.0, avg_cross_entropy)
        summary.add_moving_summary(
            tf.reduce_mean(futurereward, name='futurereward'),
            tf.reduce_mean(shaping, name='mean_shaping_loss'))

        self.value = tf.squeeze(self.value, [1], name='pred_value')  # (B,)
        self.policy = tf.nn.softmax(logits, name='policy')

        expf = tf.get_variable('explore_factor',
                               shape=[],
                               initializer=tf.constant_initializer(1),
                               trainable=False)
        policy_explore = tf.nn.softmax(logits * expf, name='policy_explore')
        is_training = get_current_tower_context().is_training
        if not is_training:
            return
        log_probs = tf.log(self.policy + 1e-6)

        log_pi_a_given_s = tf.reduce_sum(
            log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
        advantage = tf.subtract(tf.stop_gradient(self.value),
                                futurereward,
                                name='advantage')
        policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage,
                                    name='policy_loss')
        xentropy_loss = tf.reduce_sum(self.policy * log_probs,
                                      name='xentropy_loss')
        value_loss = tf.nn.l2_loss(self.value - futurereward,
                                   name='value_loss')

        pred_reward = tf.reduce_mean(self.value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        self.cost = tf.add_n([
            policy_loss, xentropy_loss * entropy_beta, value_loss,
            shaping_beta * shaping_loss
        ])
        self.cost = tf.truediv(self.cost,
                               tf.cast(tf.shape(futurereward)[0], tf.float32),
                               name='cost')
        summary.add_moving_summary(policy_loss, xentropy_loss, temperature,
                                   value_loss, pred_reward, advantage,
                                   self.cost)
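The shaping term in Example #9 is scaled by a temperature that falls linearly as the running mean_score approaches average human performance, and the relu holds it at zero once the agent surpasses that baseline. With illustrative numbers:

import numpy as np

avg_human_performance = 100.0
for mean_score in (0.0, 50.0, 100.0, 150.0):
    temperature = np.maximum((avg_human_performance - mean_score) / avg_human_performance, 0.0)
    print(mean_score, temperature)  # 1.0, 0.5, 0.0, 0.0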
Example #10
    def _build_graph(self, inputs):
        from tensorpack.tfutils.common import get_global_step_var
        state, action, futurereward, advantage = inputs
        is_training = get_current_tower_context().is_training
        policy, value, dists = self._get_NN_prediction(state)
        if not hasattr(self, '_weights_train'):
            self._weights_train = self._weights_critic + self._weights_actor
        self.value = tf.squeeze(value, [1], name='value')  # (B,)
        self.policy = tf.identity(policy, name='policy')

        with tf.variable_scope("Pred") as vs:
            __p, __v, _ = self._get_NN_prediction(state)
            __v = tf.squeeze(__v, [1], name='value')  # (B,)
            __p = tf.identity(__p, name='policy')
            if not hasattr(self, '_weights_pred'):
                self._weights_pred = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                assert (len(self._weights_train) == len(self._weights_pred))
                assert (not hasattr(self, '_sync_op'))
                self._sync_op = tf.group(*[d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02)) for d, s in zip(self._weights_pred, self._weights_train)])

        with tf.variable_scope('pre') as vs:
            pre_p, pre_v, pre_dists = self._get_NN_prediction(state)
            if not hasattr(self, 'pre_weights'):
                self.pre_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                self._td_sync_op = tf.group(*[d.assign(s) for d, s in zip(self.pre_weights, self._weights_train)])


        if not is_training:
            return

        # advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
        # advantage = tf.Print(advantage, [self.value, futurereward, action, advantage], 'value/reward/act/advantage=', summarize=4)
        log_probs = dists.log_prob(action)
        # add PPO clipped policy loss: ratio, surr1, surr2
        pre_probs = pre_dists.log_prob(action)
        ratio = tf.exp(log_probs - pre_probs)
        prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=ratio, axis=1), axis=1)
        clip_param = tf.train.exponential_decay(CLIP_PARAMETER, get_global_step_var(), 10000, 0.98, name='clip_param')


        # surr1 = prob_ratio * advantage
        surr1 = ratio * tf.expand_dims(advantage, -1)
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * tf.expand_dims(advantage, -1)

        # surr2 = tf.clip_by_value(prob_ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

        loss_policy = -tf.reduce_mean(tf.minimum(surr1, surr2))

        # critic clip loss
        v_loss1 = tf.square(value - futurereward)
        pre_value = pre_v + tf.clip_by_value(value - pre_v, -clip_param, clip_param)
        v_loss2 = tf.square(pre_v - futurereward)
        # loss_value = 0.5 * tf.reduce_mean(tf.maximum(v_loss1, v_loss2))
        loss_value = 0.5 * tf.reduce_mean(v_loss1)
        

        entropy = dists.entropy()
        entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                       initializer=tf.constant_initializer(0.01), trainable=False)
        exp_v = entropy_beta * entropy
        loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1), name='loss/policy')
        loss_policy = loss_policy + loss_entropy
        

        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        # exp_v = log_probs * tf.expand_dims(advantage, -1)
        # entropy = dists.entropy()
        # entropy_beta = tf.get_variable('entropy_beta', shape=[],
        #                                initializer=tf.constant_initializer(0.01), trainable=False)
        # exp_v = entropy_beta * entropy + exp_v
        
        # loss_value = tf.reduce_mean(0.5 * tf.square(self.value - futurereward))

        # loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')


        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4), self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
        self._cost = [loss_policy,
                      loss_value
                      ]
        from autodrive.trainer.summary import addParamSummary
        addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(self.value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        summary.add_moving_summary(loss_policy, loss_value,
                                   loss_entropy,
                                   pred_reward, advantage,
                                   loss_l2_regularizer,
                                   tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
                                   tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
                                    )
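The policy term above is the PPO clipped surrogate: the probability ratio against the previous ('pre') policy multiplies the advantage, is clipped to [1 - clip_param, 1 + clip_param], and the elementwise minimum of the two is averaged and negated. A compact NumPy sketch (clip_param taken as a plain float here):

import numpy as np

def ppo_surrogate_loss(log_probs, old_log_probs, advantage, clip_param=0.2):
    ratio = np.exp(log_probs - old_log_probs)        # pi_theta(a|s) / pi_theta_old(a|s)
    surr1 = ratio * advantage
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
    return -np.mean(np.minimum(surr1, surr2))        # negated because the trainer minimizes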
Example #11
    def _build_graph(self, inputs):
        from tensorpack.tfutils.common import get_global_step_var
        state, action, futurereward, advantage = inputs
        is_training = get_current_tower_context().is_training
        policy, value, dists = self._get_NN_prediction(state)
        if not hasattr(self, '_weights_train'):
            self._weights_train = self._weights_critic + self._weights_actor
        self.value = tf.squeeze(value, [1], name='value')  # (B,)
        self.policy = tf.identity(policy, name='policy')

        with tf.variable_scope("Pred") as vs:
            __p, __v, _ = self._get_NN_prediction(state)
            __v = tf.squeeze(__v, [1], name='value')  # (B,)
            __p = tf.identity(__p, name='policy')
            if not hasattr(self, '_weights_pred'):
                self._weights_pred = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                assert (len(self._weights_train) == len(self._weights_pred))
                assert (not hasattr(self, '_sync_op'))
                self._sync_op = tf.group(*[
                    d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02))
                    for d, s in zip(self._weights_pred, self._weights_train)
                ])

        with tf.variable_scope('pre') as vs:
            pre_p, pre_v, pre_dists = self._get_NN_prediction(state)
            if not hasattr(self, 'pre_weights'):
                self.pre_weights = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                self._td_sync_op = tf.group(*[
                    d.assign(s)
                    for d, s in zip(self.pre_weights, self._weights_train)
                ])

        if not is_training:
            return

        # advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
        # advantage = tf.Print(advantage, [self.value, futurereward, action, advantage], 'value/reward/act/advantage=', summarize=4)
        log_probs = dists.log_prob(action)
        # add PPO clipped policy loss: ratio, surr1, surr2
        pre_probs = pre_dists.log_prob(action)
        ratio = tf.exp(log_probs - pre_probs)
        prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=ratio,
                                                           axis=1),
                                    axis=1)
        clip_param = tf.train.exponential_decay(CLIP_PARAMETER,
                                                get_global_step_var(),
                                                10000,
                                                0.98,
                                                name='clip_param')

        # surr1=prob_ratio*advantage
        surr1 = ratio * tf.expand_dims(advantage, -1)
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 +
                                 clip_param) * tf.expand_dims(advantage, -1)

        # surr2=tf.clip_by_value(prob_ratio,1.0-clip_param,1.0+clip_param)*advantage

        loss_policy = -tf.reduce_mean(tf.minimum(surr1, surr2))

        # critic clip loss
        v_loss1 = tf.square(value - futurereward)
        pre_value = pre_v + tf.clip_by_value(value - pre_v, -clip_param,
                                             clip_param)
        v_loss2 = tf.square(pre_v - futurereward)
        # loss_value=0.5*tf.reduce_mean(tf.maximum(v_loss1,v_loss2))
        loss_value = 0.5 * tf.reduce_mean(v_loss1)

        entropy = dists.entropy()
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        exp_v = entropy_beta * entropy
        loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                      name='loss/policy')
        loss_policy = loss_policy + loss_entropy

        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        # exp_v = log_probs * tf.expand_dims(advantage, -1)
        # entropy = dists.entropy()
        # entropy_beta = tf.get_variable('entropy_beta', shape=[],
        #                                initializer=tf.constant_initializer(0.01), trainable=False)
        # exp_v = entropy_beta * entropy + exp_v

        # loss_value = tf.reduce_mean(0.5 * tf.square(self.value - futurereward))

        # loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')

        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                                   self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
        self._cost = [loss_policy, loss_value]
        from autodrive.trainer.summary import addParamSummary
        addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(self.value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        summary.add_moving_summary(
            loss_policy,
            loss_value,
            loss_entropy,
            pred_reward,
            advantage,
            loss_l2_regularizer,
            tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
            tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
        )