Example #1
  def _network(self, observ, length=None, state=None, reuse=True):
    """Compute the network output for a batched sequence of observations.

    Optionally, the initial state can be specified. The weights should be
    reused for all calls, except for the first one. Output is a named tuple
    containing the policy as a TensorFlow distribution, the policy mean and log
    standard deviation, the approximated state value, and the new recurrent
    state.

    Args:
      observ: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.
      reuse: Python boolean indicating whether to reuse previous variables.

    Returns:
      NetworkOutput tuple.
    """
    with tf.variable_scope('network', reuse=reuse):
      observ = tf.convert_to_tensor(observ)
      use_gpu = self._config.use_gpu and utility.available_gpus()
      with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
        observ = tf.check_numerics(observ, 'observ')
        cell = self._config.network(self._batch_env.action.shape[1].value)
        (mean, logstd, value), state = tf.nn.dynamic_rnn(
            cell, observ, length, state, tf.float32, swap_memory=True)
      mean = tf.check_numerics(mean, 'mean')
      logstd = tf.check_numerics(logstd, 'logstd')
      value = tf.check_numerics(value, 'value')
      policy = tf.contrib.distributions.MultivariateNormalDiag(
          mean, tf.exp(logstd))
      return _NetworkOutput(policy, mean, logstd, value, state)
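
The docstring's note that the weights are reused for every call after the first relies on the reuse argument of tf.variable_scope. Below is a minimal, self-contained TensorFlow 1.x sketch of that pattern; it is not part of the agent, and toy_network is a made-up stand-in for self._network.

import tensorflow as tf


def toy_network(observ, reuse=True):
  # Hypothetical stand-in for self._network: a single dense layer whose
  # variables live under the 'network' scope.
  with tf.variable_scope('network', reuse=reuse):
    return tf.layers.dense(observ, 4, name='hidden')


observ = tf.zeros([2, 8])
first = toy_network(observ, reuse=None)   # first call creates the variables
second = toy_network(observ, reuse=True)  # later calls reuse the same weights

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  first_out, second_out = sess.run([first, second])
  assert (first_out == second_out).all()  # same weights, same outputs
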
Example #2
    def _training(self):
        """Perform multiple training iterations of both policy and value baseline.

    Training on the episodes collected in the memory. Reset the memory
    afterwards. Always returns a summary string.

    Returns:
      Summary tensor.
    """
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            with tf.name_scope('training'):
                assert_full = tf.assert_equal(self._memory_index,
                                              self._config.update_every)
                with tf.control_dependencies([assert_full]):
                    data = self._memory.data()
                (observ, action, old_mean, old_logstd, reward), length = data
                with tf.control_dependencies([tf.assert_greater(length, 0)]):
                    length = tf.identity(length)
                observ = self._observ_filter.transform(observ)
                reward = self._reward_filter.transform(reward)
                update_summary = self._perform_update_steps(
                    observ, action, old_mean, old_logstd, reward, length)
                with tf.control_dependencies([update_summary]):
                    penalty_summary = self._adjust_penalty(
                        observ, old_mean, old_logstd, length)
                with tf.control_dependencies([penalty_summary]):
                    clear_memory = tf.group(self._memory.clear(),
                                            self._memory_index.assign(0))
                with tf.control_dependencies([clear_memory]):
                    weight_summary = utility.variable_summaries(
                        tf.trainable_variables(),
                        self._config.weight_summaries)
                    return tf.summary.merge(
                        [update_summary, penalty_summary, weight_summary])
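
The nested tf.control_dependencies blocks are what force the policy/value update, the penalty adjustment, the memory reset, and the weight summaries to run in that exact order inside a single session call. Here is a toy TensorFlow 1.x sketch of the same sequencing pattern, using stand-in ops rather than the agent's actual update ops.

import tensorflow as tf

value = tf.Variable(0, trainable=False)

set_to_one = value.assign(1)                      # stands in for the update step
with tf.control_dependencies([set_to_one]):
  doubled = value.assign(value.read_value() * 2)  # stands in for the penalty step
with tf.control_dependencies([doubled]):
  result = value.read_value()                     # stands in for the merged summary

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(result))  # prints 2: assign, then doubling, then the read

In _training the same trick makes clear_memory wait for penalty_summary, which in turn waits for update_summary.
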
Example #3
  def __init__(self, batch_env, step, is_training, should_log, config):
    """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
    self._batch_env = batch_env
    self._step = step
    self._is_training = is_training
    self._should_log = should_log
    self._config = config
    self._observ_filter = normalize.StreamingNormalize(
        self._batch_env.observ[0], center=True, scale=True, clip=5,
        name='normalize_observ')
    self._reward_filter = normalize.StreamingNormalize(
        self._batch_env.reward[0], center=False, scale=True, clip=10,
        name='normalize_reward')
    # Memory stores tuple of observ, action, mean, logstd, reward.
    template = (
        self._batch_env.observ[0], self._batch_env.action[0],
        self._batch_env.action[0], self._batch_env.action[0],
        self._batch_env.reward[0])
    self._memory = memory.EpisodeMemory(
        template, config.update_every, config.max_length, 'memory')
    self._memory_index = tf.Variable(0, False)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      # Create network variables for later calls to reuse.
      self._network(
          tf.zeros_like(self._batch_env.observ)[:, None],
          tf.ones(len(self._batch_env)), reuse=None)
      cell = self._config.network(self._batch_env.action.shape[1].value)
      with tf.variable_scope('ppo_temporary'):
        self._episodes = memory.EpisodeMemory(
            template, len(batch_env), config.max_length, 'episodes')
        self._last_state = utility.create_nested_vars(
            cell.zero_state(len(batch_env), tf.float32))
        self._last_action = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_action')
        self._last_mean = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_mean')
        self._last_logstd = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_logstd')
    self._penalty = tf.Variable(
        self._config.kl_init_penalty, False, dtype=tf.float32)
    self._policy_optimizer = self._config.policy_optimizer(
        self._config.policy_lr, name='policy_optimizer')
    self._value_optimizer = self._config.value_optimizer(
        self._config.value_lr, name='value_optimizer')
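
memory.EpisodeMemory is internal to the agents codebase, so its exact API is not shown here, but the idea behind the template tuple is simple: each template element is a single-timestep tensor whose shape and dtype describe one storage slot. The sketch below illustrates that idea under those assumptions; ToyMemory is hypothetical and not the real EpisodeMemory.

import tensorflow as tf


class ToyMemory(object):
  """Hypothetical fixed-size buffer derived from a per-step template tuple."""

  def __init__(self, template, capacity, max_length, scope):
    with tf.variable_scope(scope):
      # One non-trainable variable per template element, with room for
      # `capacity` episodes of up to `max_length` steps each.
      self._buffers = [
          tf.Variable(
              tf.zeros([capacity, max_length] + elem.shape.as_list(),
                       elem.dtype),
              trainable=False)
          for elem in template]

  def data(self):
    return tuple(buf.read_value() for buf in self._buffers)


observ = tf.zeros([4, 8])  # batch of 4 environments, observation size 8
reward = tf.zeros([4])
template = (observ[0], reward[0])
toy_memory = ToyMemory(template, capacity=10, max_length=200, scope='memory')
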
Example #4
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            use_gpu = self._config.use_gpu and utility.available_gpus()
            with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
                output = self._network(observ[:, None],
                                       tf.ones(observ.shape[0]), state)
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
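
The tf.scatter_update calls only overwrite the rows of the last_* variables that belong to the agents listed in agent_indices; every other agent's entry stays untouched. A small TensorFlow 1.x sketch of that pattern with toy shapes (these are not the agent's actual variables):

import tensorflow as tf

# One row of remembered state per agent; only agents 1 and 3 act this step.
last_action = tf.Variable(tf.zeros([4, 2]), trainable=False)
agent_indices = tf.constant([1, 3])
new_actions = tf.constant([[1.0, 1.0], [2.0, 2.0]])

update = tf.scatter_update(last_action, agent_indices, new_actions)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(update))  # rows 1 and 3 updated, rows 0 and 2 still zero

The tf.cond(self._is_training, output.policy.sample, lambda: output.mean) line above is the in-graph equivalent of an if statement: stochastic actions while training, the deterministic policy mean otherwise.
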
Example #5
    def _network(self, observ, length=None, state=None, reuse=True):
        """Compute the network output for a batched sequence of observations.

    Optionally, the initial state can be specified. The weights should be
    reused for all calls, except for the first one. Output is a named tuple
    containing the policy as a TensorFlow distribution, the policy mean and log
    standard deviation, the approximated state value, and the new recurrent
    state.

    Args:
      observ: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.
      reuse: Python boolean whether to reuse previous variables.

    Returns:
      NetworkOutput tuple.
    """
        with tf.variable_scope('network', reuse=reuse):
            observ = tf.convert_to_tensor(observ)
            use_gpu = self._config.use_gpu and utility.available_gpus()
            with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
                observ = tf.check_numerics(observ, 'observ')
                # The argument to self._config.network,
                # self._batch_env.action.shape[1].value, is the output
                # (action) dimension of the network.
                # How does one network produce actions for all environments?
                # The same network is simply applied to a batch: given each
                # environment's observations and initial state, it returns
                # different values, just like calling one function several
                # times with different inputs. The outputs (mean, logstd,
                # value) therefore all carry the environment id in their
                # first dimension.
                cell = self._config.network(
                    self._batch_env.action.shape[1].value)

                (mean, logstd, value), state = tf.nn.dynamic_rnn(
                    cell=cell, inputs=observ, sequence_length=length,
                    initial_state=state, dtype=tf.float32, swap_memory=True)
            mean = tf.check_numerics(mean, 'mean')
            logstd = tf.check_numerics(logstd, 'logstd')
            value = tf.check_numerics(value, 'value')
            policy = tf.contrib.distributions.MultivariateNormalDiag(
                mean, tf.exp(logstd))
            return _NetworkOutput(policy, mean, logstd, value, state)
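
The long comment above comes down to batching: one set of recurrent weights is applied to every environment's sequence, and the leading dimension of (mean, logstd, value) indexes the environment. A minimal TensorFlow 1.x illustration with a plain GRU cell (the agent uses its own cell from config.network, not this one):

import tensorflow as tf

batch_size, time_steps, observ_size, units = 3, 5, 8, 16
observ = tf.random_normal([batch_size, time_steps, observ_size])
length = tf.constant([5, 3, 4])  # per-environment sequence lengths

cell = tf.nn.rnn_cell.GRUCell(units)
output, state = tf.nn.dynamic_rnn(
    cell, observ, sequence_length=length, dtype=tf.float32, swap_memory=True)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(output).shape)  # (3, 5, 16): one slice of outputs per environment
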
Example #6
    def __init__(self, batch_env, step, is_training, should_log, config):
        """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
        self._batch_env = batch_env
        self._step = step
        self._is_training = is_training
        self._should_log = should_log
        self._config = config
        self._observ_filter = normalize.StreamingNormalize(
            self._batch_env.observ[0],
            center=True,
            scale=True,
            clip=5,
            name='normalize_observ')
        self._reward_filter = normalize.StreamingNormalize(
            self._batch_env.reward[0],
            center=False,
            scale=True,
            clip=10,
            name='normalize_reward')
        # Memory stores tuple of observ, action, mean, logstd, reward.
        template = (self._batch_env.observ[0], self._batch_env.action[0],
                    self._batch_env.action[0], self._batch_env.action[0],
                    self._batch_env.reward[0])
        self._memory = memory.EpisodeMemory(template, config.update_every,
                                            config.max_length, 'memory')
        self._memory_index = tf.Variable(0, False)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            # Create network variables for later calls to reuse.
            action_size = self._batch_env.action.shape[1].value
            self._network = tf.make_template(
                'network',
                functools.partial(config.network, config, action_size))
            output = self._network(
                tf.zeros_like(self._batch_env.observ)[:, None],
                tf.ones(len(self._batch_env)))
            with tf.variable_scope('ppo_temporary'):
                self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                      config.max_length,
                                                      'episodes')
                if output.state is None:
                    self._last_state = None
                else:
                    # Ensure the batch dimension is set.
                    tf.contrib.framework.nest.map_structure(
                        lambda x: x.set_shape(
                            [len(batch_env)] + x.shape.as_list()[1:]),
                        output.state)
                    # pylint: disable=undefined-variable
                    self._last_state = tf.contrib.framework.nest.map_structure(
                        lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
                        output.state)
                self._last_action = tf.Variable(
                    tf.zeros_like(self._batch_env.action), False,
                    name='last_action')
                self._last_mean = tf.Variable(
                    tf.zeros_like(self._batch_env.action), False,
                    name='last_mean')
                self._last_logstd = tf.Variable(
                    tf.zeros_like(self._batch_env.action), False,
                    name='last_logstd')
        self._penalty = tf.Variable(self._config.kl_init_penalty,
                                    False,
                                    dtype=tf.float32)
        self._optimizer = self._config.optimizer(self._config.learning_rate)
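
Unlike examples #1 and #3, this version wraps the network in tf.make_template instead of passing an explicit reuse flag: the template creates its variables on the first call and transparently reuses them on every later call. A small TensorFlow 1.x sketch of that behaviour (toy_layer is made up, not the agent's network):

import tensorflow as tf


def toy_layer(observ):
  # Variables created in here are owned by the template wrapping this function.
  return tf.layers.dense(observ, 4, name='hidden')


network = tf.make_template('network', toy_layer)
first = network(tf.zeros([2, 8]))  # creates network/hidden/{kernel,bias}
second = network(tf.ones([2, 8]))  # reuses exactly the same variables

print([v.name for v in tf.trainable_variables()])
# two variables, network/hidden/kernel and network/hidden/bias, created once
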