Example #1
    def simulate(self, action):
        """Step the environment.

        The result of the step can be accessed from the variables defined below.

        Args:
          action: Tensor holding the action to apply.

        Returns:
          Operation.
        """
        with tf.name_scope('environment/simulate'):
            if action.dtype in (tf.float16, tf.float32, tf.float64):
                action = tf.check_numerics(action, 'action')
            observ_dtype = self._parse_dtype(self._env.observation_space)
            observ, reward, done = tf.py_func(
                lambda a: self._env.step(a)[:3], [action],
                [observ_dtype, tf.float32, tf.bool],
                name='step')
            observ = tf.check_numerics(observ, 'observ')
            reward = tf.check_numerics(reward, 'reward')
            return tf.group(self._observ.assign(observ),
                            self._action.assign(action),
                            self._reward.assign(reward),
                            self._done.assign(done), self._step.assign_add(1))
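For orientation, here is a rough sketch of how a simulate op like this could be driven from a TF 1.x session loop. The `env` wrapper instance, its `observ` and `done` attributes, the action shape, and the zero-action stand-in policy are illustrative assumptions, not part of the excerpt above.

import numpy as np
import tensorflow as tf

# Assumed: `env` is an instance of the in-graph environment wrapper above,
# exposing `observ` and `done` variables alongside reset() and simulate().
action = tf.placeholder(tf.float32, [3], name='action')  # example action size
simulate_op = env.simulate(action)  # build the step op once, reuse every step
reset_op = env.reset()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(reset_op)
  done = False
  while not done:
    act = np.zeros(3, np.float32)  # stand-in policy: always the zero action
    sess.run(simulate_op, {action: act})
    done = sess.run(env.done)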
def recurrent_gaussian(config, action_size, observations, length, state=None):
  """Independent recurrent policy and feed forward value networks.

  The policy network outputs the mean action, and the log standard deviation
  is learned as an independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.

  Args:
    config: Configuration object.
    action_size: Length of the action vector.
    observations: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.

  Returns:
    NetworkOutput tuple.
  """
  mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer(
      factor=config.init_mean_factor)
  logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
  cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])
  flat_observations = tf.reshape(observations, [
      tf.shape(observations)[0],
      tf.shape(observations)[1],
      functools.reduce(operator.mul,
                       observations.shape.as_list()[2:], 1)
  ])
  with tf.variable_scope('policy'):
    x = flat_observations
    for size in config.policy_layers[:-1]:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    x, state = tf.nn.dynamic_rnn(cell, x, length, state, tf.float32)
    mean = tf.contrib.layers.fully_connected(x,
                                             action_size,
                                             tf.tanh,
                                             weights_initializer=mean_weights_initializer)
    logstd = tf.get_variable('logstd', mean.shape[2:], tf.float32, logstd_initializer)
    logstd = tf.tile(logstd[None, None],
                     [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
  with tf.variable_scope('value'):
    x = flat_observations
    for size in config.value_layers:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0]
  mean = tf.check_numerics(mean, 'mean')
  logstd = tf.check_numerics(logstd, 'logstd')
  value = tf.check_numerics(value, 'value')
  policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
  # assert state.shape.as_list()[0] is not None
  return NetworkOutput(policy, mean, logstd, value, state)
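The NetworkOutput tuple returned above is not defined in this excerpt; presumably it is a simple namedtuple along these lines (an assumption based on the fields used here):

import collections

# Presumed container matching the fields produced above: the policy
# distribution, its mean and log standard deviation, the value estimate,
# and the new recurrent state.
NetworkOutput = collections.namedtuple(
    'NetworkOutput', 'policy, mean, logstd, value, state')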
Example #3
    def transform(self, value):
        """Normalize a single or batch tensor.

        Applies the transformations activated in the constructor, using the
        current estimates of mean and variance.

        Args:
          value: Batch or single value tensor.

        Returns:
          Normalized batch or single value tensor.
        """
        with tf.name_scope(self._name + '/transform'):
            no_batch_dim = value.shape.ndims == self._mean.shape.ndims
            if no_batch_dim:
                # Add a batch dimension if necessary.
                value = value[None, ...]
            if self._center:
                value -= self._mean[None, ...]
            if self._scale:
                # We cannot scale before seeing at least two samples.
                value /= tf.cond(self._count > 1, lambda: self._std() + 1e-8,
                                 lambda: tf.ones_like(self._var_sum))[None]
            if self._clip:
                value = tf.clip_by_value(value, -self._clip, self._clip)
            # Remove batch dimension if necessary.
            if no_batch_dim:
                value = value[0]
            return tf.check_numerics(value, 'value')
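Conceptually, transform applies a standard streaming normalization; here is a minimal NumPy sketch of the same arithmetic, with illustrative names and assuming running mean/std estimates are already available:

import numpy as np

def normalize(value, mean, std, center=True, scale=True, clip=5.0):
  value = np.asarray(value, np.float32)
  if center:
    value = value - mean                 # subtract the running mean
  if scale:
    value = value / (std + 1e-8)         # divide by the running std deviation
  if clip:
    value = np.clip(value, -clip, clip)  # limit the influence of outliers
  return value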
    def _value_loss(self, observ, reward, length):
        """Compute the loss function for the value baseline.

        The value loss is half the squared difference between empirical and
        approximated returns over the collected episodes. Returns the loss
        tensor and a summary string.

        Args:
          observ: Sequences of observations.
          reward: Sequences of rewards.
          length: Batch of sequence lengths.

        Returns:
          Tuple of loss tensor and summary tensor.
        """
        with tf.name_scope('value_loss'):
            value = self._network(observ, length).value
            return_ = utility.discounted_return(reward, length,
                                                self._config.discount)
            advantage = return_ - value
            value_loss = 0.5 * self._mask(advantage**2, length)
            summary = tf.summary.merge([
                tf.summary.histogram('value_loss', value_loss),
                tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))
            ])
            value_loss = tf.reduce_mean(value_loss)
            return tf.check_numerics(value_loss, 'value_loss'), summary
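The computation boils down to a masked squared error between the Monte-Carlo return and the value prediction; a NumPy sketch with illustrative names:

import numpy as np

def value_loss_np(value, return_, length):
  # value, return_: [batch, time] arrays; length: [batch] of valid lengths.
  mask = (np.arange(value.shape[1])[None, :] < length[:, None]).astype(np.float32)
  return np.mean(0.5 * mask * (return_ - value) ** 2)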
Example #5
    def perform(self, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      observ: Tensor of a batch of observations for all algorithms.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            network = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                    self._last_state)
            action = tf.cond(self._is_training, network.policy.sample,
                             lambda: network.mean)
            logprob = network.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', network.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            with tf.control_dependencies([
                    utility.assign_nested_vars(self._last_state,
                                               network.state),
                    self._last_action.assign(action[:, 0]),
                    self._last_mean.assign(network.mean[:, 0]),
                    self._last_logstd.assign(network.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
Example #6
def discounted_return(reward, length, discount):
  """Discounted Monte-Carlo returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur + discount * agg,
                  tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
                  tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
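The reversed tf.scan implements the usual backwards recursion R_t = r_t + discount * R_{t+1}; for reference, an illustrative single-sequence NumPy version (not part of the library code):

import numpy as np

def discounted_return_np(reward, discount):
  # reward: [time] array for one episode; returns R_t for every timestep.
  return_ = np.zeros(len(reward), np.float32)
  running = 0.0
  for t in reversed(range(len(reward))):
    running = reward[t] + discount * running
    return_[t] = running
  return return_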
Example #7
def fixed_step_return(reward, value, length, discount, window):
  """N-step discounted return."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.zeros_like(reward)
  for _ in range(window):
    return_ += reward
    reward = discount * tf.concat([reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
  return_ += discount**window * tf.concat(
      [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
  return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
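This accumulates the truncated N-step return, i.e. the discounted sum of the next `window` rewards plus the discounted value estimate `window` steps ahead; an illustrative single-sequence NumPy version for comparison:

import numpy as np

def fixed_step_return_np(reward, value, discount, window):
  # reward, value: [time] arrays for one episode.
  horizon = len(reward)
  return_ = np.zeros(horizon, np.float32)
  for t in range(horizon):
    for k in range(window):
      if t + k < horizon:
        return_[t] += discount ** k * reward[t + k]
    if t + window < horizon:
      return_[t] += discount ** window * value[t + window]
  return return_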
Example #8
def lambda_advantage(reward, value, length, discount):
  """Generalized Advantage Estimation."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  advantage = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur + discount * agg,
                  tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]), tf.zeros_like(delta[:, -1]),
                  1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
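This is the backwards accumulation of one-step TD errors delta_t = r_t + discount * V(s_{t+1}) - V(s_t); since the scan decays by `discount` alone, it corresponds to lambda = 1 in the usual GAE formulation. An illustrative single-sequence NumPy version:

import numpy as np

def lambda_advantage_np(reward, value, discount, lambda_=1.0):
  # reward, value: [time] arrays for one episode; V after the last step is 0.
  next_value = np.append(value[1:], 0.0)
  delta = reward + discount * next_value - value   # one-step TD errors
  advantage = np.zeros(len(delta), np.float32)
  running = 0.0
  for t in reversed(range(len(delta))):
    running = delta[t] + discount * lambda_ * running
    advantage[t] = running
  return advantage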
Example #9
def lambda_return(reward, value, length, discount, lambda_):
  """TD-lambda returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  sequence = mask * reward + discount * value * (1 - lambda_)
  discount = mask * discount * lambda_
  sequence = tf.stack([sequence, discount], 2)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur[0] + cur[1] * agg,
                  tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]), tf.zeros_like(value[:, -1]),
                  1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
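Stacking the reward-plus-bootstrap term with the per-step decay lets a single scan run the recursion R_t = r_t + discount * (1 - lambda) * v_t + discount * lambda * R_{t+1}; an illustrative single-sequence NumPy version of that recursion:

import numpy as np

def lambda_return_np(reward, value, discount, lambda_):
  # reward, value: [time] arrays for one episode.
  return_ = np.zeros(len(reward), np.float32)
  running = 0.0
  for t in reversed(range(len(reward))):
    running = (reward[t] + discount * (1 - lambda_) * value[t]
               + discount * lambda_ * running)
    return_[t] = running
  return return_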
Example #10
    def _network(self, observ, length=None, state=None, reuse=True):
        """Compute the network output for a batched sequence of observations.

        Optionally, the initial state can be specified. The weights should be
        reused for all calls, except for the first one. Output is a named tuple
        containing the policy as a TensorFlow distribution, the policy mean and log
        standard deviation, the approximated state value, and the new recurrent
        state.

        Args:
          observ: Sequences of observations.
          length: Batch of sequence lengths.
          state: Batch of initial recurrent states.
          reuse: Python boolean whether to reuse previous variables.

        Returns:
          NetworkOutput tuple.
        """
        with tf.variable_scope('network', reuse=reuse):
            observ = tf.convert_to_tensor(observ)
            use_gpu = self._config.use_gpu and utility.available_gpus()
            with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
                observ = tf.check_numerics(observ, 'observ')
                cell = self._config.network(
                    self._batch_env.action.shape[1].value)
                (mean, logstd,
                 value), state = tf.nn.dynamic_rnn(cell,
                                                   observ,
                                                   length,
                                                   state,
                                                   tf.float32,
                                                   swap_memory=True)
            mean = tf.check_numerics(mean, 'mean')
            logstd = tf.check_numerics(logstd, 'logstd')
            value = tf.check_numerics(value, 'value')
            policy = tf.contrib.distributions.MultivariateNormalDiag(
                mean, tf.exp(logstd))
            return _NetworkOutput(policy, mean, logstd, value, state)
Example #11
    def reset(self):
        """Reset the environment.

        Returns:
          Tensor of the current observation.
        """
        observ_dtype = self._parse_dtype(self._env.observation_space)
        observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
        observ = tf.check_numerics(observ, 'observ')
        with tf.control_dependencies([
                self._observ.assign(observ),
                self._reward.assign(0),
                self._done.assign(False)
        ]):
            return tf.identity(observ)
    def perform(self, agent_indices, observ):
        """Compute batch of actions and a summary for a batch of observation.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
        with tf.name_scope('perform/'):
            observ = self._observ_filter.transform(observ)
            if self._last_state is None:
                state = None
            else:
                state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.gather(x, agent_indices), self._last_state)
            output = self._network(observ[:, None], tf.ones(observ.shape[0]),
                                   state)
            action = tf.cond(self._is_training, output.policy.sample,
                             lambda: output.mean)
            logprob = output.policy.log_prob(action)[:, 0]
            # pylint: disable=g-long-lambda
            summary = tf.cond(
                self._should_log, lambda: tf.summary.merge([
                    tf.summary.histogram('mean', output.mean[:, 0]),
                    tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                    tf.summary.histogram('action', action[:, 0]),
                    tf.summary.histogram('logprob', logprob)
                ]), str)
            # Remember current policy to append to memory in the experience callback.
            if self._last_state is None:
                assign_state = tf.no_op()
            else:
                assign_state = utility.assign_nested_vars(
                    self._last_state, output.state, agent_indices)
            with tf.control_dependencies([
                    assign_state,
                    tf.scatter_update(self._last_action, agent_indices,
                                      action[:, 0]),
                    tf.scatter_update(self._last_mean, agent_indices,
                                      output.mean[:, 0]),
                    tf.scatter_update(self._last_logstd, agent_indices,
                                      output.logstd[:, 0])
            ]):
                return tf.check_numerics(action[:, 0],
                                         'action'), tf.identity(summary)
    def _mask(self, tensor, length):
        """Set padding elements of a batch of sequences to zero.

        Useful to then safely sum along the time dimension.

        Args:
          tensor: Tensor of sequences.
          length: Batch of sequence lengths.

        Returns:
          Masked sequences.
        """
        with tf.name_scope('mask'):
            range_ = tf.range(tensor.shape[1].value)
            mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
            masked = tensor * mask
            return tf.check_numerics(masked, 'masked')
Example #14
    def reset(self, indices=None):
        """Reset the batch of environments.

        Args:
          indices: The batch indices of the environments to reset; defaults to all.

        Returns:
          Batch tensor of the new observations.
        """
        if indices is None:
            indices = tf.range(len(self._batch_env))
        observ_dtype = self._parse_dtype(self._batch_env.observation_space)
        observ = tf.py_func(self._batch_env.reset, [indices],
                            observ_dtype,
                            name='reset')
        observ = tf.check_numerics(observ, 'observ')
        reward = tf.zeros_like(indices, tf.float32)
        done = tf.zeros_like(indices, tf.bool)
        with tf.control_dependencies([
                tf.scatter_update(self._observ, indices, observ),
                tf.scatter_update(self._reward, indices, reward),
                tf.scatter_update(self._done, indices, done)
        ]):
            return tf.identity(observ)
    def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                     advantage, length):
        """Compute the policy loss composed of multiple components.

        1. The policy gradient loss is importance sampled from the data-collecting
           policy at the beginning of training.
        2. The second term is a KL penalty between the policy at the beginning of
           training and the current policy.
        3. Additionally, if this KL already changed more than twice the target
           amount, we activate a strong penalty discouraging further divergence.

        Args:
          mean: Sequences of action means of the current policy.
          logstd: Sequences of action log stddevs of the current policy.
          old_mean: Sequences of action means of the behavioral policy.
          old_logstd: Sequences of action log stddevs of the behavioral policy.
          action: Sequences of actions.
          advantage: Sequences of advantages.
          length: Batch of sequence lengths.

        Returns:
          Tuple of loss tensor and summary tensor.
        """
        with tf.name_scope('policy_loss'):
            entropy = utility.diag_normal_entropy(mean, logstd)
            kl = tf.reduce_mean(
                self._mask(
                    utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
                    length), 1)
            policy_gradient = tf.exp(
                utility.diag_normal_logpdf(mean, logstd, action) -
                utility.diag_normal_logpdf(old_mean, old_logstd, action))
            surrogate_loss = -tf.reduce_mean(
                self._mask(policy_gradient * tf.stop_gradient(advantage),
                           length), 1)
            kl_penalty = self._penalty * kl
            cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
            cutoff_count = tf.reduce_sum(
                tf.cast(kl > cutoff_threshold, tf.int32))
            with tf.control_dependencies([
                    tf.cond(cutoff_count > 0,
                            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
                            int)
            ]):
                kl_cutoff = (self._config.kl_cutoff_coef *
                             tf.cast(kl > cutoff_threshold, tf.float32) *
                             (kl - cutoff_threshold)**2)
            policy_loss = surrogate_loss + kl_penalty + kl_cutoff
            summary = tf.summary.merge([
                tf.summary.histogram('entropy', entropy),
                tf.summary.histogram('kl', kl),
                tf.summary.histogram('surrogate_loss', surrogate_loss),
                tf.summary.histogram('kl_penalty', kl_penalty),
                tf.summary.histogram('kl_cutoff', kl_cutoff),
                tf.summary.histogram('kl_penalty_combined',
                                     kl_penalty + kl_cutoff),
                tf.summary.histogram('policy_loss', policy_loss),
                tf.summary.scalar('avg_surr_loss',
                                  tf.reduce_mean(surrogate_loss)),
                tf.summary.scalar('avg_kl_penalty',
                                  tf.reduce_mean(kl_penalty)),
                tf.summary.scalar('avg_policy_loss',
                                  tf.reduce_mean(policy_loss))
            ])
            policy_loss = tf.reduce_mean(policy_loss, 0)
            return tf.check_numerics(policy_loss, 'policy_loss'), summary
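Taken together, the objective is the importance-sampled surrogate plus an adaptive KL penalty plus a quadratic cutoff barrier. A condensed NumPy-style sketch with illustrative names, assuming the per-example surrogate loss and KL values have already been computed:

import numpy as np

def policy_loss_np(surrogate_loss, kl, penalty, kl_target, cutoff_factor, cutoff_coef):
  # surrogate_loss, kl: [batch] arrays; the remaining arguments are scalars.
  cutoff = kl_target * cutoff_factor
  kl_penalty = penalty * kl                            # adaptive KL penalty
  over = (kl > cutoff).astype(np.float32)
  kl_cutoff = cutoff_coef * over * (kl - cutoff) ** 2  # barrier beyond the cutoff
  return np.mean(surrogate_loss + kl_penalty + kl_cutoff)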