Example 1
    def dump_tabular(self):
        u"""
        Write all of the diagnostics from the current iteration.

        Writes both to stdout, and to the output file.
        """
        if proc_id() == 0:
            vals = []
            key_lens = [len(key) for key in self.log_headers]
            max_key_len = max(15, max(key_lens))
            keystr = u'%' + u'%d' % max_key_len
            fmt = u"| " + keystr + u"s | %15s |"
            n_slashes = 22 + max_key_len
            print u"-" * n_slashes
            for key in self.log_headers:
                val = self.log_current_row.get(key, u"")
                valstr = u"%8.3g" % val if hasattr(val, u"__float__") else val
                print fmt % (key, valstr)
                vals.append(val)
            print u"-" * n_slashes
            if self.output_file is not None:
                if self.first_row:
                    self.output_file.write(u"\t".join(self.log_headers) +
                                           u"\n")
                self.output_file.write(u"\t".join(imap(unicode, vals)) + u"\n")
                self.output_file.flush()
        self.log_current_row.clear()
        self.first_row = False
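A minimal, hedged usage sketch of the tabular logging cycle. It assumes the EpochLogger constructed as in Example 3 and Example 6 accepts the Logger arguments from Example 5; the directory and metric names below are placeholders.

logger = EpochLogger(output_dir=u'/tmp/experiments/demo', exp_name=u'demo')
for epoch in xrange(3):
    logger.log_tabular(u'Epoch', epoch)            # the first row fixes the column headers
    logger.log_tabular(u'AverageReturn', 1.0)      # later rows must reuse the same keys
    logger.dump_tabular()                          # prints the table and appends one row to progress.txt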
Example 2
    def save_state(self, state_dict, itr=None):
        u"""
        Saves the state of an experiment.

        To be clear: this is about saving *state*, not logging diagnostics.
        All diagnostic logging is separate from this function. This function
        will save whatever is in ``state_dict``---usually just a copy of the
        environment---and the most recent parameters for the model you 
        previously set up saving for with ``setup_tf_saver``. 

        Call with any frequency you prefer. If you only want to maintain a
        single state and overwrite it at each call with the most recent 
        version, leave ``itr=None``. If you want to keep all of the states you
        save, provide unique (increasing) values for ``itr``.

        Args:
            state_dict (dict): Dictionary containing essential elements to
                describe the current state of training.

            itr: An int, or None. Current iteration of training.
        """
        if proc_id() == 0:
            fname = u'vars.pkl' if itr is None else u'vars%d.pkl' % itr
            try:
                joblib.dump(state_dict, osp.join(self.output_dir, fname))
            except:
                self.log(u'Warning: could not pickle state_dict.',
                         color=u'red')
            if hasattr(self, u'tf_saver_elements'):
                self._tf_simple_save(itr)
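Two hedged call patterns for save_state, mirroring the call in Example 6; ``logger`` and ``env`` are assumed to exist already.

logger.save_state({u'env': env}, None)      # overwrite a single snapshot: output_dir/vars.pkl
logger.save_state({u'env': env}, itr=3)     # keep numbered snapshots: output_dir/vars3.pkl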
Example 3
    def save_config(self, config):
        u"""
        Log an experiment configuration.

        Call this once at the top of your experiment, passing in all important
        config vars as a dict. This will serialize the config to JSON, while
        handling anything which can't be serialized in a graceful way (writing
        as informative a string as possible). 

        Example use:

        .. code-block:: python

            logger = EpochLogger(**logger_kwargs)
            logger.save_config(locals())
        """
        config_json = convert_json(config)
        if self.exp_name is not None:
            config_json[u'exp_name'] = self.exp_name
        if proc_id() == 0:
            output = json.dumps(config_json,
                                separators=(u',', u':\t'),
                                indent=4,
                                sort_keys=True)
            print colorize(u'Saving config:\n', color=u'cyan', bold=True)
            print output
            with open(osp.join(self.output_dir, u"config.json"), u'w') as out:
                out.write(output)
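Because save_config writes plain JSON, the stored configuration can be inspected later; a small sketch (the output_dir path is a placeholder).

import json
import os.path as osp

output_dir = u'/tmp/experiments/demo'            # placeholder
with open(osp.join(output_dir, u'config.json')) as f:
    config = json.load(f)
print(config.get(u'exp_name'))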
Example 4
 def _tf_simple_save(self, itr=None):
     u"""
     Uses simple_save to save a trained model, plus info to make it easy
     to associate tensors with variables after restore.
     """
     if proc_id() == 0:
         assert hasattr(self, u'tf_saver_elements'), \
             u"First have to setup saving with self.setup_tf_saver"
         fpath = u'simple_save' + (u'%d' % itr if itr is not None else u'')
         fpath = osp.join(self.output_dir, fpath)
         if osp.exists(fpath):
             # simple_save refuses to be useful if fpath already exists,
             # so just delete fpath if it's there.
             shutil.rmtree(fpath)
         tf.saved_model.simple_save(export_dir=fpath,
                                    **self.tf_saver_elements)
         joblib.dump(self.tf_saver_info, osp.join(fpath, u'model_info.pkl'))
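A hedged sketch of loading a model exported by _tf_simple_save with the standard TensorFlow 1.x SavedModel loader; the export path is a placeholder, and simple_save exports under the default SERVING tag.

import joblib
import os.path as osp
import tensorflow as tf

fpath = u'/tmp/experiments/demo/simple_save'     # placeholder export dir
sess = tf.Session()
tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], fpath)
# model_info.pkl (written above) records how logical names map to tensors in the graph
model_info = joblib.load(osp.join(fpath, u'model_info.pkl'))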
Example 5
    def __init__(self,
                 output_dir=None,
                 output_fname=u'progress.txt',
                 exp_name=None):
        u"""
        Initialize a Logger.

        Args:
            output_dir (string): A directory for saving results to. If 
                ``None``, defaults to a temp directory of the form
                ``/tmp/experiments/somerandomnumber``.

            output_fname (string): Name for the tab-separated-value file 
                containing metrics logged throughout a training run. 
                Defaults to ``progress.txt``. 

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them. (Use case: if you run the same
                hyperparameter configuration with multiple random seeds, you
                should give them all the same ``exp_name``.)
        """
        if proc_id() == 0:
            self.output_dir = output_dir or u"/tmp/experiments/%i" % int(
                time.time())
            if osp.exists(self.output_dir):
                print u"Warning: Log dir %s already exists! Storing info there anyway." % self.output_dir
            else:
                os.makedirs(self.output_dir)
            self.output_file = open(osp.join(self.output_dir, output_fname),
                                    u'w')
            atexit.register(self.output_file.close)
            print colorize(u"Logging data to %s" % self.output_file.name,
                           u'green',
                           bold=True)
        else:
            self.output_dir = None
            self.output_file = None
        self.first_row = True
        self.log_headers = []
        self.log_current_row = {}
        self.exp_name = exp_name
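A hedged construction sketch for the Logger; all argument values are placeholders, and the log() call assumes the method shown in Example 7 belongs to the same class.

logger = Logger(output_dir=u'/tmp/experiments/run1',
                output_fname=u'progress.txt',
                exp_name=u'cartpole_seed0')
logger.log(u'Logger ready')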
Example 6
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    u"""

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    # ROS setup: publisher for broadcasting learned policy parameters
    name = rospy.get_name() + "/ppo_rl_agent"
    params_topic = rospy.get_param("~topics/params")
    params_pub = rospy.Publisher(params_topic, LearnedParameters)

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs[u'action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: value and logprob (we'll get action from the env)
    get_action_ops = [v, logp]  # logp instead of logp_pi since we know a

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in [u'pi', u'v'])
    logger.log(u'\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
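    # (Clipped surrogate objective: the tf.where/minimum construction below caps
    #  the benefit of pushing ratio = pi(a|s) / pi_old(a|s) outside
    #  [1 - clip_ratio, 1 + clip_ratio], matching the clip_ratio docstring above.)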
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={u'x': x_ph},
                          outputs={
                              u'pi': pi,
                              u'v': v
                          })

    def update():
        inputs = dict((k, v) for k, v in izip(all_phs, buf.get()))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in xrange(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    u'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in xrange(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Publish ros parameters
        params_msg = LearnedParameters()
        params = [
            sess.run(v).flatten() for v in tf.trainable_variables()
            if u"pi" in v.name
        ]
        num_params_in_msg = sum([len(p) for p in params])
        assert (num_params_in_msg == core.count_vars(u'pi'))
        for p in params:
            msg = Parameters()
            if isinstance(p, np.ndarray):
                msg.params = list(p)
            else:
                msg.params = [p]
            params_msg.params.append(msg)
        params_pub.publish(params_msg)

        # params = [sess.run(v)[0] for v in tf.trainable_variables() if u"pi" in v.name]
        # for p in params:
        #     msg = Parameters()
        #     if isinstance(p, np.ndarray):
        #         msg.params = list(p)
        #     else:
        #         msg.params = [p]
        #     params_msg.params.append(msg)
        # params_pub.publish(params_msg)

    # RUN THIS THING!
    start_time = time.time()
    env.reset()
    #o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    ep_ret = 0
    ep_len = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in xrange(epochs):
        for t in xrange(local_steps_per_epoch):
            o, r, a, d, _ = env.step()

            ep_ret += r
            ep_len += 1

            # get log prob
            v_t, logp_t = sess.run(get_action_ops,
                                   feed_dict={
                                       x_ph: o.reshape(1, -1),
                                       a_ph: a.reshape(1, -1)
                                   })

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print u'Warning: trajectory cut off by epoch at %d steps.' % ep_len

                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)

                # NOTE: check this call
                #o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                env.reset()
                ep_ret = 0
                ep_len = 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({u'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular(u'Epoch', epoch)
        logger.log_tabular(u'EpRet', with_min_and_max=True)
        logger.log_tabular(u'EpLen', average_only=True)
        logger.log_tabular(u'VVals', with_min_and_max=True)
        logger.log_tabular(u'TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular(u'LossPi', average_only=True)
        logger.log_tabular(u'LossV', average_only=True)
        logger.log_tabular(u'DeltaLossPi', average_only=True)
        logger.log_tabular(u'DeltaLossV', average_only=True)
        logger.log_tabular(u'Entropy', average_only=True)
        logger.log_tabular(u'KL', average_only=True)
        logger.log_tabular(u'ClipFrac', average_only=True)
        logger.log_tabular(u'StopIter', average_only=True)
        logger.log_tabular(u'Time', time.time() - start_time)
        logger.dump_tabular()
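A hedged launch sketch for this ROS-flavored ppo. The environment factory, node name, and actor-critic kwargs are placeholders (hidden_sizes assumes a spinup-style mlp_actor_critic signature), and rospy.init_node is an assumption about how the surrounding node is started.

import rospy

if __name__ == u'__main__':
    rospy.init_node(u'rl_agent')                          # placeholder node name
    env_fn = lambda: MyRosEnv()                           # hypothetical Gym-style env factory
    ppo(env_fn,
        ac_kwargs={u'hidden_sizes': (64, 64)},            # assumed kwarg of core.mlp_actor_critic
        steps_per_epoch=4000,
        epochs=50,
        logger_kwargs={u'output_dir': u'/tmp/experiments/ppo_demo',
                       u'exp_name': u'ppo_demo'})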
Example 7
 def log(self, msg, color=u'green'):
     u"""Print a colorized message to stdout."""
     if proc_id() == 0:
         print colorize(msg, color, bold=True)
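Minimal usage of log(); the default color is green, and Example 2 shows u'red' being accepted.

logger.log(u'Starting epoch 0')
logger.log(u'Warning: NaN detected in returns', color=u'red')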
Example 8
def vpgpolynomial(env_fn,
                  actor_critic=core.polynomial_actor_critic,
                  ac_kwargs=dict(),
                  seed=0,
                  steps_per_epoch=4000,
                  epochs=50,
                  gamma=0.9,
                  pi_lr=2e-5,
                  vf_lr=1e-3,
                  train_v_iters=80,
                  lam=0.97,
                  max_ep_len=1000,
                  logger_kwargs=dict(),
                  save_freq=10,
                  l1_scaling=0.001):
    u"""

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        l1_scaling (float): Coefficient on the L1 penalty applied to the
            polynomial policy parameters (encourages sparse coefficients).

    """
    # ROS setup: publisher for broadcasting learned policy parameters
    name = rospy.get_name() + "/ppo_rl_agent"
    params_topic = rospy.get_param("~topics/params")
    params_pub = rospy.Publisher(params_topic, LearnedParameters)

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs[u'action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: value and logprob (action comes from the env)
    get_action_ops = [v, logp]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in [u'pi', u'v'])
    logger.log(u'\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    # (Use a distinct comprehension variable so the value tensor ``v`` above is
    #  not clobbered by the Python 2 list-comprehension scope leak.)
    pi_var = [p for p in tf.trainable_variables() if u"pi" in p.name][0]
    norm_loss = l1_scaling * tf.norm(pi_var, 1)
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    pi_optim = MpiAdamOptimizer(learning_rate=pi_lr)
    train_pi = pi_optim.minimize(pi_loss)
    train_pi_norm = pi_optim.minimize(norm_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={u'x': x_ph},
                          outputs={
                              u'pi': pi,
                              u'v': v
                          })

    def update():
        inputs = dict((k, v) for k, v in izip(all_phs, buf.get()))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # print "grads"
        # print sess.run(pi_optim.compute_gradients(pi_loss, tf.trainable_variables(u'pi')),
        #                feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)
        sess.run(train_pi_norm, feed_dict=inputs)

        # Alternative (kept for reference): penalize the number of polynomial terms directly
        # with tf.variable_scope('pi'):
        #     grads_and_vars = pi_optim.compute_gradients(tf.norm(tf.trainable_variables(),ord=1))
        #     pi_optim.apply_gradient(grads_and_vars)

        # Value function learning
        for _ in xrange(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, pi_l_norm = sess.run(
            [pi_loss, v_loss, approx_kl, norm_loss], feed_dict=inputs)
        logger.store(LossNorm=pi_l_norm,
                     LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Publish ros parameters
        params_msg = LearnedParameters()
        params = [
            sess.run(v).flatten() for v in tf.trainable_variables()
            if u"pi" in v.name
        ]
        num_params_in_msg = sum([len(p) for p in params])
        assert (num_params_in_msg == core.count_vars(u'pi'))
        for p in params:
            msg = Parameters()
            if isinstance(p, np.ndarray):
                msg.params = list(p)
            else:
                msg.params = [p]
            params_msg.params.append(msg)
        params_pub.publish(params_msg)

    start_time = time.time()
    #o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    env.reset()
    ep_ret = 0
    ep_len = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in xrange(epochs):
        for t in xrange(local_steps_per_epoch):
            o, r, a, d, _ = env.step()
            ep_ret += r
            ep_len += 1

            v_t, logp_t = sess.run(get_action_ops,
                                   feed_dict={
                                       x_ph: o.reshape(1, -1),
                                       a_ph: a.reshape(1, -1)
                                   })

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print u'Warning: trajectory cut off by epoch at %d steps.' % ep_len
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                _, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({u'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular(u'Epoch', epoch)
        logger.log_tabular(u'EpRet', with_min_and_max=True)
        logger.log_tabular(u'EpLen', average_only=True)
        logger.log_tabular(u'VVals', with_min_and_max=True)
        logger.log_tabular(u'TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular(u'LossPi', average_only=True)
        logger.log_tabular(u'LossV', average_only=True)
        logger.log_tabular(u'DeltaLossPi', average_only=True)
        logger.log_tabular(u'DeltaLossV', average_only=True)
        logger.log_tabular(u'Entropy', average_only=True)
        logger.log_tabular(u'KL', average_only=True)
        logger.log_tabular(u'Time', time.time() - start_time)
        logger.dump_tabular()
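A similarly hedged launch sketch for vpgpolynomial; as with ppo, the env factory and node name are placeholders, and ac_kwargs are left to core.polynomial_actor_critic's defaults.

import rospy

if __name__ == u'__main__':
    rospy.init_node(u'rl_agent')                          # placeholder node name
    env_fn = lambda: MyRosEnv()                           # hypothetical Gym-style env factory
    vpgpolynomial(env_fn,
                  gamma=0.9,
                  pi_lr=2e-5,
                  l1_scaling=0.001,                       # weight on the L1 sparsity penalty
                  logger_kwargs={u'output_dir': u'/tmp/experiments/vpg_poly',
                                 u'exp_name': u'vpg_poly'})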