Code example #1
  def __init__(self, batch_env, step, is_training, should_log, config):
    """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
    self._batch_env = batch_env
    self._step = step
    self._is_training = is_training
    self._should_log = should_log
    self._config = config
    self._observ_filter = normalize.StreamingNormalize(
        self._batch_env.observ[0], center=True, scale=True, clip=5,
        name='normalize_observ')
    self._reward_filter = normalize.StreamingNormalize(
        self._batch_env.reward[0], center=False, scale=True, clip=10,
        name='normalize_reward')
    # Memory stores tuple of observ, action, mean, logstd, reward.
    template = (
        self._batch_env.observ[0], self._batch_env.action[0],
        self._batch_env.action[0], self._batch_env.action[0],
        self._batch_env.reward[0])
    self._memory = memory.EpisodeMemory(
        template, config.update_every, config.max_length, 'memory')
    self._memory_index = tf.Variable(0, False)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      # Create network variables for later calls to reuse.
      self._network(
          tf.zeros_like(self._batch_env.observ)[:, None],
          tf.ones(len(self._batch_env)), reuse=None)
      cell = self._config.network(self._batch_env.action.shape[1].value)
      with tf.variable_scope('ppo_temporary'):
        self._episodes = memory.EpisodeMemory(
            template, len(batch_env), config.max_length, 'episodes')
        self._last_state = utility.create_nested_vars(
            cell.zero_state(len(batch_env), tf.float32))
        self._last_action = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_action')
        self._last_mean = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_mean')
        self._last_logstd = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_logstd')
    self._penalty = tf.Variable(
        self._config.kl_init_penalty, False, dtype=tf.float32)
    self._policy_optimizer = self._config.policy_optimizer(
        self._config.policy_lr, name='policy_optimizer')
    self._value_optimizer = self._config.value_optimizer(
        self._config.value_lr, name='value_optimizer')
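
The constructor in example #1 reads its hyperparameters from the `config` argument: `update_every`, `max_length`, `use_gpu`, `kl_init_penalty`, the `network` constructor, and separate policy and value optimizers with their learning rates. Below is a minimal sketch, not taken from the project, of a config object exposing those attributes; the `AttrDict` helper, `example_network`, and all concrete values are assumptions for illustration only.

import tensorflow as tf  # TensorFlow 1.x, as used by the snippets above


class AttrDict(dict):
    """Hypothetical helper: a dict whose keys can also be read as attributes."""
    __getattr__ = dict.__getitem__


def example_network(action_size):
    # Stand-in for the `config.network` callable in example #1, which is
    # expected to return an RNN cell sized by the action dimension.
    return tf.nn.rnn_cell.LSTMCell(action_size)


config = AttrDict(
    update_every=30,        # assumed: episodes collected per update
    max_length=1000,        # assumed: maximum stored episode length
    use_gpu=False,
    kl_init_penalty=1.0,    # assumed: initial KL penalty coefficient
    network=example_network,
    policy_optimizer=tf.train.AdamOptimizer,
    value_optimizer=tf.train.AdamOptimizer,
    policy_lr=1e-4,         # assumed learning rates
    value_lr=3e-4,
)
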
Code example #2
File: algorithm.py  Project: plexzhang/agents
    def __init__(self, batch_env, step, is_training, should_log, config):
        """Create an instance of the PPO algorithm.

        Args:
          batch_env: In-graph batch environment.
          step: Integer tensor holding the current training step.
          is_training: Boolean tensor for whether the algorithm should train.
          should_log: Boolean tensor for whether summaries should be returned.
          config: Object containing the agent configuration as attributes.
        """
        self._batch_env = batch_env
        self._step = step
        self._is_training = is_training
        self._should_log = should_log
        self._config = config
        self._observ_filter = normalize.StreamingNormalize(
            self._batch_env.observ[0],
            center=True,
            scale=True,
            clip=5,
            name='normalize_observ')
        self._reward_filter = normalize.StreamingNormalize(
            self._batch_env.reward[0],
            center=False,
            scale=True,
            clip=10,
            name='normalize_reward')
        # Memory stores tuple of observ, action, mean, logstd, reward.
        template = (self._batch_env.observ[0], self._batch_env.action[0],
                    self._batch_env.action[0], self._batch_env.action[0],
                    self._batch_env.reward[0])
        self._memory = memory.EpisodeMemory(template, config.update_every,
                                            config.max_length, 'memory')
        self._memory_index = tf.Variable(0, False)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            # Create network variables for later calls to reuse.
            action_size = self._batch_env.action.shape[1].value
            self._network = tf.make_template(
                'network',
                functools.partial(config.network, config, action_size))
            output = self._network(
                tf.zeros_like(self._batch_env.observ)[:, None],
                tf.ones(len(self._batch_env)))
            with tf.variable_scope('ppo_temporary'):
                self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                      config.max_length,
                                                      'episodes')
                if output.state is None:
                    self._last_state = None
                else:
                    # Ensure the batch dimension is set.
                    tf.contrib.framework.nest.map_structure(
                        lambda x: x.set_shape(
                            [len(batch_env)] + x.shape.as_list()[1:]),
                        output.state)
                    # pylint: disable=undefined-variable
                    self._last_state = tf.contrib.framework.nest.map_structure(
                        lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
                        output.state)
                self._last_action = tf.Variable(
                    tf.zeros_like(self._batch_env.action), False,
                    name='last_action')
                self._last_mean = tf.Variable(
                    tf.zeros_like(self._batch_env.action), False,
                    name='last_mean')
                self._last_logstd = tf.Variable(
                    tf.zeros_like(self._batch_env.action), False,
                    name='last_logstd')
        self._penalty = tf.Variable(self._config.kl_init_penalty,
                                    False,
                                    dtype=tf.float32)
        self._optimizer = self._config.optimizer(self._config.learning_rate)
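
Compared with example #1, this version wraps the network construction in `tf.make_template`, which creates the network variables on the first call and transparently reuses them on later calls, handles feed-forward networks whose `output.state` is `None`, and uses a single `config.optimizer` with one `config.learning_rate` instead of separate policy and value optimizers. The following is a minimal sketch, not from the project, of the variable sharing that `tf.make_template` provides; `tiny_network` is a made-up function.

import tensorflow as tf  # TensorFlow 1.x


def tiny_network(observ):
    # Hypothetical two-unit linear layer, just to demonstrate variable sharing.
    return tf.layers.dense(observ, 2, name='head')


net = tf.make_template('network', tiny_network)
out_collect = net(tf.zeros([4, 8]))  # first call creates variables under 'network/'
out_train = net(tf.ones([4, 8]))     # later calls reuse the same variables

print([v.name for v in tf.trainable_variables()])
# Expect a single 'network/head/...' kernel and bias shared by both calls.
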