Example #1
 def test_restrict_invalid_scope(self):
     tf.Variable(tf.zeros((3, 2)), trainable=True)
     with tf.variable_scope('foo'):
         tf.Variable(tf.zeros((5, 2)), trainable=True)
         with tf.variable_scope('bar'):
             tf.Variable(tf.zeros((1, 2)), trainable=True)
     self.assertEqual(0, count_weights('bar'))
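
For reference, a minimal sketch of what a count_weights helper consistent with these tests might look like (a hypothetical reimplementation, the helper in the actual project may differ): it sums the parameter counts of all trainable variables, optionally restricted to names under a given scope and optionally excluding names that match a regular expression.

import re
import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x

def count_weights(scope=None, exclude=None, graph=None):
    # Hypothetical sketch of the helper exercised by these tests.
    graph = graph or tf.get_default_graph()
    variables = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if scope:
        scope = scope if scope.endswith('/') else scope + '/'
        variables = [var for var in variables if var.name.startswith(scope)]
    if exclude:
        pattern = re.compile(exclude)
        variables = [var for var in variables if not pattern.match(var.name)]
    # Sum the number of elements of every remaining variable.
    return int(sum(np.prod(var.shape.as_list()) for var in variables))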
Example #2
    def _build_normalizers(self):
        with self.sess.as_default(), self.graph.as_default(), \
                tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                self.s_norm = TFNormalizer(
                    self.sess, 's_norm', self.get_state_size(),
                    self.world.env.build_state_norm_groups(self.id))
                state_offset = -self.world.env.build_state_offset(self.id)
                print("state_offset=", state_offset)
                state_scale = 1 / self.world.env.build_state_scale(self.id)
                print("state_scale=", state_scale)
                self.s_norm.set_mean_std(
                    -self.world.env.build_state_offset(self.id),
                    1 / self.world.env.build_state_scale(self.id))

                self.g_norm = TFNormalizer(
                    self.sess, 'g_norm', self.get_goal_size(),
                    self.world.env.build_goal_norm_groups(self.id))
                self.g_norm.set_mean_std(
                    -self.world.env.build_goal_offset(self.id),
                    1 / self.world.env.build_goal_scale(self.id))

                self.a_norm = TFNormalizer(self.sess, 'a_norm',
                                           self.get_action_size())
                self.a_norm.set_mean_std(
                    -self.world.env.build_action_offset(self.id),
                    1 / self.world.env.build_action_scale(self.id))
        return
Example #3
  def _restore_policy(self, network, policy_layers, value_layers, action_size, checkpoint):
    """Restore the PPO policy from a TensorFlow checkpoint.

    Args:
      network: The neural network definition.
      policy_layers: A tuple specifying the number of layers and the number of
        neurons in each layer of the policy network.
      value_layers: A tuple specifying the number of layers and the number of
        neurons in each layer of the value network.
      action_size: The dimension of the action space.
      checkpoint: The checkpoint path.
    """
    observ = self._observ_filter.transform(self.observation_placeholder)
    with tf.variable_scope("network/rnn"):
      self.network = network(policy_layers=policy_layers,
                             value_layers=value_layers,
                             action_size=action_size)

    with tf.variable_scope("temporary"):
      self.last_state = tf.Variable(self.network.zero_state(1, tf.float32), False)
      self.sess.run(self.last_state.initializer)

    with tf.variable_scope("network"):
      (mean_action, _, _), new_state = tf.nn.dynamic_rnn(self.network,
                                                         observ[:, None],
                                                         tf.ones(1),
                                                         self.last_state,
                                                         tf.float32,
                                                         swap_memory=True)
      self.mean_action = mean_action
      self.update_state = self.last_state.assign(new_state)

    saver = utility.define_saver(exclude=(r"temporary/.*",))
    saver.restore(self.sess, checkpoint)
Example #4
 def test_exclude_by_regex(self):
     tf.Variable(tf.zeros((3, 2)), trainable=True)
     with tf.variable_scope('foo'):
         tf.Variable(tf.zeros((5, 2)), trainable=True)
         with tf.variable_scope('bar'):
             tf.Variable(tf.zeros((1, 2)), trainable=True)
     self.assertEqual(0, count_weights(exclude=r'.*'))
     self.assertEqual(6, count_weights(exclude=r'(^|/)foo/.*'))
     self.assertEqual(16, count_weights(exclude=r'.*/bar/.*'))
Example #5
 def _build_normalizers(self):
     super()._build_normalizers()
     with self.sess.as_default(), self.graph.as_default(), \
             tf.variable_scope(self.tf_scope):
         with tf.variable_scope(self.RESOURCE_SCOPE):
             val_offset, val_scale = self._calc_val_offset_scale(
                 self.discount)
             self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
             self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
     return
Example #6
    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = (
            json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
            if self.ACTOR_INIT_OUTPUT_SCALE_KEY in json_data else 1)

        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
        self.tar_val_tf = tf.placeholder(tf.float32,
                                         shape=[None],
                                         name="tar_val")
        self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
        self.g_tf = tf.placeholder(
            tf.float32,
            shape=([None, g_size] if self.has_goal() else None),
            name="g")
        self.old_logp_tf = tf.placeholder(tf.float32,
                                          shape=[None],
                                          name="old_logp")
        self.exp_mask_tf = tf.placeholder(tf.float32,
                                          shape=[None],
                                          name="exp_mask")

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.a_mean_tf = self._build_net_actor(
                    actor_net_name, actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if self.a_mean_tf is not None:
            Logger.print2('Built actor net: ' + actor_net_name)

        if self.critic_tf is not None:
            Logger.print2('Built critic net: ' + critic_net_name)

        self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
        norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(
            shape=tf.shape(self.a_mean_tf))
        norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
        self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
        self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(
            x_tf=norm_a_noise_tf, mean_tf=None, std_tf=self.norm_a_std_tf)

        return
Example #7
    def _build_graph(self, json_data):
        with self.sess.as_default(), self.graph.as_default():
            with tf.variable_scope(self.tf_scope):
                self._build_nets(json_data)

                with tf.variable_scope(self.SOLVER_SCOPE):
                    self._build_losses(json_data)
                    self._build_solvers(json_data)

                self._initialize_vars()
                self._build_saver()
        return
Example #8
def recurrent_gaussian(config, action_size, observations, length, state=None):
  """Independent recurrent policy and feed forward value networks.

  The policy network outputs the mean action and the log standard deviation
  is learned as independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.

  Args:
    config: Configuration object.
    action_size: Length of the action vector.
    observations: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.

  Returns:
    NetworkOutput tuple.
  """
  mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer(
      factor=config.init_mean_factor)
  logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
  cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])
  flat_observations = tf.reshape(observations, [
      tf.shape(observations)[0],
      tf.shape(observations)[1],
      functools.reduce(operator.mul,
                       observations.shape.as_list()[2:], 1)
  ])
  with tf.variable_scope('policy'):
    x = flat_observations
    for size in config.policy_layers[:-1]:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    x, state = tf.nn.dynamic_rnn(cell, x, length, state, tf.float32)
    mean = tf.contrib.layers.fully_connected(x,
                                             action_size,
                                             tf.tanh,
                                             weights_initializer=mean_weights_initializer)
    logstd = tf.get_variable('logstd', mean.shape[2:], tf.float32, logstd_initializer)
    logstd = tf.tile(logstd[None, None],
                     [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
  with tf.variable_scope('value'):
    x = flat_observations
    for size in config.value_layers:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0]
  mean = tf.check_numerics(mean, 'mean')
  logstd = tf.check_numerics(logstd, 'logstd')
  value = tf.check_numerics(value, 'value')
  policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
  # assert state.shape.as_list()[0] is not None
  return NetworkOutput(policy, mean, logstd, value, state)
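
A hypothetical usage sketch for the factory above (the _Config class, the 24-dimensional observations, and the action size of 4 are illustrative placeholders): wrapping it in tf.make_template, as the PPO constructor later on this page does, creates the 'policy' and 'value' variable scopes once and reuses their variables on every subsequent call.

import functools
import tensorflow as tf  # assumes TensorFlow 1.x with tf.contrib

class _Config(object):  # illustrative stand-in for the real configuration object
    init_mean_factor = 0.1
    init_logstd = -1.0
    policy_layers = [200, 100]
    value_layers = [200, 100]

observations = tf.placeholder(tf.float32, [None, None, 24])  # batch x time x observ
lengths = tf.placeholder(tf.int32, [None])                   # batch of sequence lengths
network = tf.make_template(
    'network', functools.partial(recurrent_gaussian, _Config(), 4))
# Returns a NetworkOutput(policy, mean, logstd, value, state) tuple.
output = network(observations, lengths)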
Example #9
 def __call__(self, observation, state):
   with tf.variable_scope('policy'):
     x = tf.contrib.layers.flatten(observation)
     mean = tf.contrib.layers.fully_connected(x,
                                              self._action_size,
                                              tf.tanh,
                                              weights_initializer=self._mean_weights_initializer)
     logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
     logstd = tf.tile(logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
   with tf.variable_scope('value'):
     x = tf.contrib.layers.flatten(observation)
     for size in self._value_layers:
       x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
     value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
   return (mean, logstd, value), state
Example #10
    def __init__(self, batch_env):
        """Batch of environments inside the TensorFlow graph.

    Args:
      batch_env: Batch environment.
    """
        self._batch_env = batch_env
        observ_shape = self._parse_shape(self._batch_env.observation_space)
        observ_dtype = self._parse_dtype(self._batch_env.observation_space)
        action_shape = self._parse_shape(self._batch_env.action_space)
        action_dtype = self._parse_dtype(self._batch_env.action_space)
        with tf.variable_scope('env_temporary'):
            self._observ = tf.Variable(tf.zeros(
                (len(self._batch_env), ) + observ_shape, observ_dtype),
                                       name='observ',
                                       trainable=False)
            self._action = tf.Variable(tf.zeros(
                (len(self._batch_env), ) + action_shape, action_dtype),
                                       name='action',
                                       trainable=False)
            self._reward = tf.Variable(tf.zeros((len(self._batch_env), ),
                                                tf.float32),
                                       name='reward',
                                       trainable=False)
            self._done = tf.Variable(tf.cast(tf.ones((len(self._batch_env), )),
                                             tf.bool),
                                     name='done',
                                     trainable=False)
Example #11
    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = (
            json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
            if self.ACTOR_INIT_OUTPUT_SCALE_KEY in json_data else 1)

        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size],
                                   name="s")  # observations
        self.tar_val_tf = tf.placeholder(tf.float32,
                                         shape=[None],
                                         name="tar_val")  # target value s
        self.adv_tf = tf.placeholder(tf.float32, shape=[None],
                                     name="adv")  # advantage
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size],
                                   name="a")  # target actions
        self.g_tf = tf.placeholder(
            tf.float32,
            shape=([None, g_size] if self.has_goal() else None),
            name="g")  # goals

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.actor_tf = self._build_net_actor(actor_net_name,
                                                      actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if self.actor_tf is not None:
            Logger.print2('Built actor net: ' + actor_net_name)

        if self.critic_tf is not None:
            Logger.print2('Built critic net: ' + critic_net_name)

        return
Example #12
    def __init__(self,
                 sess,
                 scope,
                 size,
                 groups_ids=None,
                 eps=0.02,
                 clip=np.inf):
        self.sess = sess
        self.scope = scope
        super().__init__(size, groups_ids, eps, clip)

        with tf.variable_scope(self.scope):
            self._build_resource_tf()
        return
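
The _build_resource_tf call above creates the normalizer's TensorFlow state inside its own variable scope. A minimal, hypothetical sketch of what such a method might allocate (the real TFNormalizer may store different or additional variables; self.size is assumed to be set by the parent class):

    def _build_resource_tf(self):
        # Hypothetical sketch: non-trainable variables holding the running
        # count, mean, and standard deviation used for normalization.
        self.count_tf = tf.get_variable('count', shape=[], dtype=tf.int32,
                                        initializer=tf.zeros_initializer(),
                                        trainable=False)
        self.mean_tf = tf.get_variable('mean', shape=[self.size], dtype=tf.float32,
                                       initializer=tf.zeros_initializer(),
                                       trainable=False)
        self.std_tf = tf.get_variable('std', shape=[self.size], dtype=tf.float32,
                                      initializer=tf.ones_initializer(),
                                      trainable=False)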
Example #13
def fc_net(input,
           layers_sizes,
           activation,
           reuse=None,
           flatten=False):  # build fully connected network
    curr_tf = input
    for i, size in enumerate(layers_sizes):
        with tf.variable_scope(str(i), reuse=reuse):
            curr_tf = tf.layers.dense(
                inputs=curr_tf,
                units=size,
                kernel_initializer=xavier_initializer,
                activation=activation if i < len(layers_sizes) - 1 else None)
    if flatten:
        assert layers_sizes[-1] == 1
        curr_tf = tf.reshape(curr_tf, [-1])

    return curr_tf
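
A hypothetical usage sketch for fc_net (the observation size of 8 and the scope name 'critic' are illustrative; assumes TensorFlow 1.x and the module-level xavier_initializer referenced above): two hidden ReLU layers followed by a linear scalar output, flattened to shape [batch].

import tensorflow as tf  # assumes TensorFlow 1.x

obs_tf = tf.placeholder(tf.float32, shape=[None, 8], name='obs')
with tf.variable_scope('critic'):
    # Layer scopes '0', '1', '2' are created under 'critic' by fc_net.
    val_tf = fc_net(obs_tf, layers_sizes=[64, 32, 1],
                    activation=tf.nn.relu, flatten=True)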
Example #14
def define_batch_env(constructor, num_agents, env_processes):
    """Create environments and apply all desired wrappers.

  Args:
    constructor: Constructor of an OpenAI gym environment.
    num_agents: Number of environments to combine in the batch.
    env_processes: Whether to step environment in external processes.

  Returns:
    In-graph environments object.
  """
    with tf.variable_scope('environments'):
        if env_processes:
            envs = [
                tools.wrappers.ExternalProcess(constructor)
                for _ in range(num_agents)
            ]
        else:
            envs = [constructor() for _ in range(num_agents)]
        batch_env = tools.BatchEnv(envs, blocking=not env_processes)
        batch_env = tools.InGraphBatchEnv(batch_env)
    return batch_env
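
A hypothetical usage sketch (the 'Pendulum-v0' environment name and batch size are illustrative): the constructor is a zero-argument callable that is invoked once per agent, either in-process or inside external processes.

import functools
import gym  # assumes OpenAI Gym is installed

constructor = functools.partial(gym.make, 'Pendulum-v0')
# Four copies of the environment, stepped in-process and wrapped in-graph.
batch_env = define_batch_env(constructor, num_agents=4, env_processes=False)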
Example #15
    def _network(self, observ, length=None, state=None, reuse=True):
        """Compute the network output for a batched sequence of observations.

    Optionally, the initial state can be specified. The weights should be
    reused for all calls, except for the first one. Output is a named tuple
    containing the policy as a TensorFlow distribution, the policy mean and log
    standard deviation, the approximated state value, and the new recurrent
    state.

    Args:
      observ: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.
      reuse: Python boolean indicating whether to reuse previously created
        variables.

    Returns:
      NetworkOutput tuple.
    """
        with tf.variable_scope('network', reuse=reuse):
            observ = tf.convert_to_tensor(observ)
            use_gpu = self._config.use_gpu and utility.available_gpus()
            with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
                observ = tf.check_numerics(observ, 'observ')
                cell = self._config.network(
                    self._batch_env.action.shape[1].value)
                (mean, logstd,
                 value), state = tf.nn.dynamic_rnn(cell,
                                                   observ,
                                                   length,
                                                   state,
                                                   tf.float32,
                                                   swap_memory=True)
            mean = tf.check_numerics(mean, 'mean')
            logstd = tf.check_numerics(logstd, 'logstd')
            value = tf.check_numerics(value, 'value')
            policy = tf.contrib.distributions.MultivariateNormalDiag(
                mean, tf.exp(logstd))
            return _NetworkOutput(policy, mean, logstd, value, state)
Example #16
    def __init__(self, template, capacity, max_length, scope):
        """Create a memory that stores episodes.

    Each transition tuple consists of quantities specified by the template.
    These quantities would typically be observations, actions, rewards, and
    done indicators.

    Args:
      template: List of tensors to derive shapes and dtypes of each transition.
      capacity: Number of episodes, or rows, held by the memory.
      max_length: Allocated sequence length for the episodes.
      scope: Variable scope to use for internal variables.
    """
        self._capacity = capacity
        self._max_length = max_length
        with tf.variable_scope(scope) as var_scope:
            self._scope = var_scope
            self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
            self._buffers = [
                tf.Variable(
                    tf.zeros([capacity, max_length] + elem.shape.as_list(),
                             elem.dtype), False) for elem in template
            ]
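
A hypothetical usage sketch, assuming this constructor belongs to the EpisodeMemory class used by the PPO constructors later on this page (the shapes are illustrative): the template tensors only provide shapes and dtypes, so zero-valued tensors are sufficient.

import tensorflow as tf  # assumes TensorFlow 1.x

# One transition = (observ, action, mean, logstd, reward), mirroring the
# template used by the PPO agent on this page.
template = (tf.zeros([24], tf.float32),  # observation
            tf.zeros([4], tf.float32),   # action
            tf.zeros([4], tf.float32),   # action mean
            tf.zeros([4], tf.float32),   # action log stddev
            tf.zeros([], tf.float32))    # reward
episode_memory = EpisodeMemory(template, capacity=8, max_length=100,
                               scope='memory')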
Example #17
 def test_include_scopes(self):
     tf.Variable(tf.zeros((3, 2)), trainable=True)
     with tf.variable_scope('foo'):
         tf.Variable(tf.zeros((5, 2)), trainable=True)
     self.assertEqual(6 + 10, count_weights())
Example #18
    def __init__(self, batch_env, step, is_training, should_log, config):
        """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
        self._batch_env = batch_env
        self._step = step
        self._is_training = is_training
        self._should_log = should_log
        self._config = config
        self._observ_filter = normalize.StreamingNormalize(
            self._batch_env.observ[0],
            center=True,
            scale=True,
            clip=5,
            name='normalize_observ')
        self._reward_filter = normalize.StreamingNormalize(
            self._batch_env.reward[0],
            center=False,
            scale=True,
            clip=10,
            name='normalize_reward')
        # Memory stores tuple of observ, action, mean, logstd, reward.
        template = (self._batch_env.observ[0], self._batch_env.action[0],
                    self._batch_env.action[0], self._batch_env.action[0],
                    self._batch_env.reward[0])
        self._memory = memory.EpisodeMemory(template, config.update_every,
                                            config.max_length, 'memory')
        self._memory_index = tf.Variable(0, False)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            # Create network variables for later calls to reuse.
            action_size = self._batch_env.action.shape[1].value
            self._network = tf.make_template(
                'network',
                functools.partial(config.network, config, action_size))
            output = self._network(
                tf.zeros_like(self._batch_env.observ)[:, None],
                tf.ones(len(self._batch_env)))
            with tf.variable_scope('ppo_temporary'):
                self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                      config.max_length,
                                                      'episodes')
                if output.state is None:
                    self._last_state = None
                else:
                    # Ensure the batch dimension is set.
                    tf.contrib.framework.nest.map_structure(
                        lambda x: x.set_shape(
                            [len(batch_env)] + x.shape.as_list()[1:]),
                        output.state)
                    # pylint: disable=undefined-variable
                    self._last_state = tf.contrib.framework.nest.map_structure(
                        lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
                        output.state)
                self._last_action = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_action')
                self._last_mean = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                              False,
                                              name='last_mean')
                self._last_logstd = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_logstd')
        self._penalty = tf.Variable(self._config.kl_init_penalty,
                                    False,
                                    dtype=tf.float32)
        self._optimizer = self._config.optimizer(self._config.learning_rate)
Example #19
def simulate(batch_env, algo, log=True, reset=False):
    """Simulation step of a vecrotized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    def _define_begin_episode(agent_indices):
        """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        zero_scores = tf.zeros_like(agent_indices, tf.float32)
        zero_durations = tf.zeros_like(agent_indices)
        reset_ops = [
            batch_env.reset(agent_indices),
            tf.scatter_update(score, agent_indices, zero_scores),
            tf.scatter_update(length, agent_indices, zero_durations)
        ]
        with tf.control_dependencies(reset_ops):
            return algo.begin_episode(agent_indices)

    def _define_step():
        """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
        prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
        action, step_summary = algo.perform(prevob)
        action.set_shape(batch_env.action.shape)
        with tf.control_dependencies([batch_env.simulate(action)]):
            add_score = score.assign_add(batch_env.reward)
            inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
        with tf.control_dependencies([add_score, inc_length]):
            experience_summary = algo.experience(prevob, batch_env.action,
                                                 batch_env.reward,
                                                 batch_env.done,
                                                 batch_env.observ)
        return tf.summary.merge([step_summary, experience_summary])

    def _define_end_episode(agent_indices):
        """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
        assert agent_indices.shape.ndims == 1
        submit_score = mean_score.submit(tf.gather(score, agent_indices))
        submit_length = mean_length.submit(
            tf.cast(tf.gather(length, agent_indices), tf.float32))
        with tf.control_dependencies([submit_score, submit_length]):
            return algo.end_episode(agent_indices)

    def _define_summaries():
        """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
        score_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
            lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
        length_summary = tf.cond(
            tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
            lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
        return tf.summary.merge([score_summary, length_summary])

    with tf.name_scope('simulate'):
        log = tf.convert_to_tensor(log)
        reset = tf.convert_to_tensor(reset)
        with tf.variable_scope('simulate_temporary'):
            score = tf.Variable(tf.zeros(len(batch_env), dtype=tf.float32),
                                False,
                                name='score')
            length = tf.Variable(tf.zeros(len(batch_env), dtype=tf.int32),
                                 False,
                                 name='length')
        mean_score = streaming_mean.StreamingMean((), tf.float32)
        mean_length = streaming_mean.StreamingMean((), tf.float32)
        agent_indices = tf.cond(
            reset, lambda: tf.range(len(batch_env)),
            lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
        begin_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                lambda: _define_begin_episode(agent_indices),
                                str)
        with tf.control_dependencies([begin_episode]):
            step = _define_step()
        with tf.control_dependencies([step]):
            agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
            end_episode = tf.cond(tf.cast(tf.shape(agent_indices)[0], tf.bool),
                                  lambda: _define_end_episode(agent_indices),
                                  str)
        with tf.control_dependencies([end_episode]):
            summary = tf.summary.merge(
                [_define_summaries(), begin_episode, step, end_episode])
        with tf.control_dependencies([summary]):
            done, score = tf.identity(batch_env.done), tf.identity(score)
        return done, score, summary
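
A hypothetical usage sketch, assuming batch_env and algo were constructed as in the other examples on this page: the returned tensors are fetched repeatedly inside a session loop, and every sess.run performs one synchronized step of all environments.

import tensorflow as tf  # assumes TensorFlow 1.x

done, score, summary = simulate(batch_env, algo, log=True, reset=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(1000):
        sess.run([done, score, summary])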
Example #20
    def __init__(self, batch_env, step, is_training, should_log, config):
        """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
        self._batch_env = batch_env
        self._step = step
        self._is_training = is_training
        self._should_log = should_log
        self._config = config
        self._observ_filter = normalize.StreamingNormalize(
            self._batch_env.observ[0],
            center=True,
            scale=True,
            clip=5,
            name='normalize_observ')
        self._reward_filter = normalize.StreamingNormalize(
            self._batch_env.reward[0],
            center=False,
            scale=True,
            clip=10,
            name='normalize_reward')
        # Memory stores tuple of observ, action, mean, logstd, reward.
        template = (self._batch_env.observ[0], self._batch_env.action[0],
                    self._batch_env.action[0], self._batch_env.action[0],
                    self._batch_env.reward[0])
        self._memory = memory.EpisodeMemory(template, config.update_every,
                                            config.max_length, 'memory')
        self._memory_index = tf.Variable(0, False)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            # Create network variables for later calls to reuse.
            self._network(tf.zeros_like(self._batch_env.observ)[:, None],
                          tf.ones(len(self._batch_env)),
                          reuse=None)
            cell = self._config.network(self._batch_env.action.shape[1].value)
            with tf.variable_scope('ppo_temporary'):
                self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                      config.max_length,
                                                      'episodes')
                self._last_state = utility.create_nested_vars(
                    cell.zero_state(len(batch_env), tf.float32))
                self._last_action = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_action')
                self._last_mean = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                              False,
                                              name='last_mean')
                self._last_logstd = tf.Variable(tf.zeros_like(
                    self._batch_env.action),
                                                False,
                                                name='last_logstd')
        self._penalty = tf.Variable(self._config.kl_init_penalty,
                                    False,
                                    dtype=tf.float32)
        self._policy_optimizer = self._config.policy_optimizer(
            self._config.policy_lr, name='policy_optimizer')
        self._value_optimizer = self._config.value_optimizer(
            self._config.value_lr, name='value_optimizer')