Example #1
def main_pendulum(logdir,
                  seed,
                  n_iter,
                  gamma,
                  min_timesteps_per_batch,
                  initial_stepsize,
                  desired_kl,
                  vf_type,
                  vf_params,
                  animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)

    ####
    # YOUR_CODE_HERE

    # batch of observations
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    # batch of actions
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32)
    # batch of advantage function estimates
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    # 2-layer network mapping the observation to a hidden representation of the state
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1",
                        weight_init=normc_initializer(1.0)))
    sy_h2 = lrelu(dense(sy_h1, 32, "h2", weight_init=normc_initializer(1.0)))
    # Mean control output
    sy_mean_na = dense(sy_h2,
                       ac_dim,
                       "mean",
                       weight_init=normc_initializer(0.1))
    # Variance
    logstd_a = tf.get_variable("logstdev", [ac_dim])

    # define action distribution
    sy_ac_distr = Normal(mu=tf.squeeze(sy_mean_na),
                         sigma=tf.exp(logstd_a),
                         validate_args=True)
    # sampled actions, used for defining the policy
    # (NOT computing the policy gradient)
    sy_sampled_ac = tf.squeeze(sy_ac_distr.sample(sample_shape=[ac_dim]))

    sy_n = tf.shape(sy_ob_no)[0]
    sy_logprob_n = sy_ac_distr.log_pdf(sy_ac_n)

    # used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES
    sy_oldmean_na = tf.placeholder(shape=[None, ac_dim],
                                   name='oldmean',
                                   dtype=tf.float32)
    sy_oldlogstd_a = tf.placeholder(shape=[ac_dim],
                                    name="oldlogstdev",
                                    dtype=tf.float32)
    sy_ac_olddistr = Normal(mu=tf.squeeze(sy_oldmean_na),
                            sigma=tf.exp(sy_oldlogstd_a),
                            validate_args=True)

    sy_kl = tf.reduce_mean(
        tf.contrib.distributions.kl(sy_ac_distr, sy_ac_olddistr))
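    # For univariate Gaussians, KL(new || old) has the closed form
    #   log(sigma_old / sigma_new) + (sigma_new^2 + (mu_new - mu_old)^2) / (2 * sigma_old^2) - 1/2;
    # tf.contrib.distributions.kl evaluates this elementwise and we average it over the batch.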
    sy_ent = tf.reduce_mean(sy_ac_distr.entropy())

    ####

    # Loss function that we'll differentiate to get the policy gradient
    # ("surr" is short for "surrogate loss").
    sy_surr = -tf.reduce_mean(sy_adv_n * sy_logprob_n)
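    # Score-function (REINFORCE) estimator: E[ grad_theta log pi_theta(a|s) * A_hat ] approximates
    # grad_theta E[return], so minimizing sy_surr with a gradient method ascends the expected return.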

    # Symbolic stepsize, in case you want to change it during optimization
    # (we're not doing that currently).
    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        ####
        # YOUR_CODE_HERE

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0)
                                    and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step([ac])
                rewards.append(rew)
                if done:
                    break

            path = {
                "observation": np.array(obs),
                "terminated": terminated,
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch

        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
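        # Standardize advantages to zero mean and unit std, a standard variance-reduction
        # trick for the policy-gradient estimator.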
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldmean_na, oldlogstdev = sess.run(
            [update_op, sy_mean_na, logstd_a],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_n: ac_n,
                sy_adv_n: standardized_adv_n,
                sy_stepsize: stepsize
            })
        kl, ent = sess.run(
            [sy_kl, sy_ent],
            feed_dict={
                sy_ob_no: ob_no,
                sy_oldmean_na: oldmean_na,
                sy_oldlogstd_a: oldlogstdev
            })

        ####

        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean",
                         np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean",
                         np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter",
                         explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit the value function AFTER using it to compute the
        # advantage estimates, to avoid introducing bias.
        logz.dump_tabular()
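
This example relies on helpers from the surrounding scaffolding that are not shown in the listing (lrelu, dense, normc_initializer, discount, pathlength, explained_variance_1d, logz, and the value-function classes). A minimal sketch of plausible definitions for the three small numeric helpers, written as an assumption about their behavior rather than the actual scaffolding code:

import numpy as np
import scipy.signal

def discount(x, gamma):
    # Discounted cumulative sum along the time axis:
    # y[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1]

def pathlength(path):
    # Number of timesteps in a rollout.
    return len(path["reward"])

def explained_variance_1d(ypred, y):
    # 1 - Var[y - ypred] / Var[y]: 1.0 is a perfect fit, <= 0 is no better than predicting the mean.
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary
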
Example #2
    def _build_ad_nn(self, tensor_io):
        from drlutils.dataflow.tensor_io import TensorIO
        assert (isinstance(tensor_io, TensorIO))
        from drlutils.model.base import get_current_nn_context
        from tensorpack.tfutils.common import get_global_step_var
        global_step = get_global_step_var()
        nnc = get_current_nn_context()
        is_training = nnc.is_training
        i_state = tensor_io.getInputTensor('state')
        i_agentIdent = tensor_io.getInputTensor('agentIdent')
        i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
        i_resetRNN = tensor_io.getInputTensor('resetRNN')
        l = i_state
        # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
        # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
        # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
        # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
        with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

            def _get_cell():
                cell = tf.nn.rnn_cell.BasicLSTMCell(256)
                # if is_training:
                #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
                return cell

            cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
            rnn_outputs = self._buildRNN(
                l,
                cell,
                tensor_io.batchSize,
                i_agentIdent=i_agentIdent,
                i_sequenceLength=i_sequenceLength,
                i_resetRNN=i_resetRNN,
            )
            rnn_outputs = tf.reshape(
                rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
            l = rnn_outputs
            from ad_cur.autodrive.model.selu import fc_selu
            for lidx in range(2):
                l = fc_selu(
                    l,
                    200,
                    keep_prob=1.,  # we train only on sensor input, so no key information should be dropped
                    is_training=is_training,
                    name='fc-{}'.format(lidx))
            value = tf.layers.dense(l, 1, name='fc-value')
            value = tf.squeeze(value, [1], name="value")
            if not hasattr(self, '_weights_critic'):
                self._weights_critic = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
            l = tf.stop_gradient(l)
            l = tf.layers.dense(l,
                                128,
                                activation=tf.nn.relu6,
                                name='fc-actor')
            mu_steering = 0.5 * tf.layers.dense(
                l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
            mu_accel = tf.layers.dense(l,
                                       1,
                                       activation=tf.nn.tanh,
                                       name='fc-mu-accel')
            mus = tf.concat([mu_steering, mu_accel], axis=-1)

            # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
            # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
            # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)
            def saturating_sigmoid(x):
                """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
                with tf.name_scope("saturating_sigmoid", [x]):
                    y = tf.sigmoid(x)
                    return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))
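            # NOTE: saturating_sigmoid is defined but not used below; the sigma heads use a
            # scaled tf.nn.sigmoid instead.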

            sigma_steering_ = 0.1 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
            sigma_accel_ = 0.25 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')

            if not nnc.is_evaluating:
                sigma_beta_steering = tf.get_default_graph(
                ).get_tensor_by_name('actor/sigma_beta_steering:0')
                sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                    'actor/sigma_beta_accel:0')
                sigma_beta_steering = tf.constant(1e-4)
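                # NOTE: this constant overrides the sigma_beta_steering tensor fetched from the
                # graph above, so that fetched tensor is effectively unused in this branch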
                # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
                # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
            else:
                sigma_beta_steering = tf.constant(1e-4)
                sigma_beta_accel = tf.constant(1e-4)
            sigma_steering = (sigma_steering_ + sigma_beta_steering)
            sigma_accel = (sigma_accel_ + sigma_beta_accel)

            sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
            # if is_training:
            #     pass
            #     # Without sigma_beta, convergence is slow and unstable; likely reasons:
            #     #   1. Ample exploration early in training keeps the network out of local optima
            #     #   2. A sigma that is too small early on makes normal_dist's log_prob very large,
            #     #      so gradient updates blow up and the network is crippled early and hard to recover
            #
            # if is_training:
            #     sigmas += sigma_beta_steering
            # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
            # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
            # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
            # sigmas_orig = sigmas
            # sigmas = sigmas + sigma_beta_steering
            # sigmas = tf.minimum(sigmas + 0.1, 100)
            # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
            # sigma_steering += sigma_beta_steering
            # sigma_accel += sigma_beta_accel

            # mus = tf.concat([mu_steering, mu_accel], axis=-1)

            from tensorflow.contrib.distributions import Normal
            dists = Normal(mus, sigmas + 0.01)
            policy = tf.squeeze(dists.sample([1]), [0])
            # clip sampled actions to within two standard deviations of the mean
            policy = tf.clip_by_value(policy, mus - 2 * sigmas,
                                      mus + 2 * sigmas)
            if is_training:
                self._addMovingSummary(
                    tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                    tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                    tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                    tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                    tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                    tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                    # sigma_beta_accel,
                    # sigma_beta_steering,
                )
            # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
            #                    'mu/sigma/sigma.orig/act=', summarize=4)
            if not hasattr(self, '_weights_actor'):
                self._weights_actor = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        if not is_training:
            tensor_io.setOutputTensors(policy, value, mus, sigmas)
            return

        i_actions = tensor_io.getInputTensor("action")
        # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
        i_actions = tf.reshape(i_actions,
                               [-1] + i_actions.get_shape().as_list()[2:])
        log_probs = dists.log_prob(i_actions)
        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        i_advantage = tensor_io.getInputTensor("advantage")
        i_advantage = tf.reshape(i_advantage,
                                 [-1] + i_advantage.get_shape().as_list()[2:])
        exp_v = log_probs * tf.expand_dims(i_advantage, -1)
        entropy = dists.entropy()
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        exp_v = entropy_beta * entropy + exp_v
        loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                     name='loss/policy')

        i_futurereward = tensor_io.getInputTensor("futurereward")
        i_futurereward = tf.reshape(i_futurereward, [-1] +
                                    i_futurereward.get_shape().as_list()[2:])
        loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))

        loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                      name='xentropy_loss')

        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                                   self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])

        self._addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(value, name='predict_reward')
        import tensorpack.tfutils.symbolic_functions as symbf
        advantage = symbf.rms(i_advantage, name='rms_advantage')
        self._addMovingSummary(
            loss_policy,
            loss_value,
            loss_entropy,
            pred_reward,
            advantage,
            loss_l2_regularizer,
            tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
            tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
        )
        return loss_policy, loss_value
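
The function above returns the two pieces of an A3C-style objective: an advantage-weighted log-probability term with an entropy bonus for the actor, and a squared-error regression to the discounted future reward (plus L2 regularization) for the critic. Below is a stripped-down, standalone sketch of the same objective; the tensor names are illustrative placeholders rather than code from this repo, and the 0.1 weighting mirrors the commented-out tf.add_n line above:

import tensorflow as tf
from tensorflow.contrib.distributions import Normal

# illustrative placeholders standing in for the tensors built in _build_ad_nn
mus = tf.placeholder(tf.float32, [None, 2], name='mus')                    # action means (steering, accel)
sigmas = tf.placeholder(tf.float32, [None, 2], name='sigmas')              # action stddevs
i_actions = tf.placeholder(tf.float32, [None, 2], name='actions')          # actions the agent actually took
i_advantage = tf.placeholder(tf.float32, [None], name='advantage')         # advantage estimates
i_futurereward = tf.placeholder(tf.float32, [None], name='futurereward')   # discounted returns
value = tf.placeholder(tf.float32, [None], name='value')                   # critic prediction

dists = Normal(mus, sigmas)
log_probs = dists.log_prob(i_actions)                         # shape [None, 2]
entropy_beta = 0.01                                           # weight of the exploration bonus
exp_v = log_probs * tf.expand_dims(i_advantage, -1) + entropy_beta * dists.entropy()
loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1))  # actor: maximize adv-weighted log-prob + entropy
loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))  # critic: regress to future reward
cost = loss_policy + 0.1 * loss_value                         # combined training objective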