Example #1
def get_fullbatch_average(
        dataset: OffpolicyDataset,
        limit: Optional[int] = None,
        by_steps: bool = True,
        truncate_episode_at: Optional[int] = None,
        reward_fn: Callable = None,
        weight_fn: Callable = None,
        gamma: Union[float, tf.Tensor] = 1.0) -> Union[float, tf.Tensor]:
    """Computes average reward over full dataset.

    Args:
      dataset: The dataset to sample experience from.
      limit: If specified, the maximum number of steps/episodes to take from the
        dataset.
      by_steps: Whether to sample batches of steps (default) or episodes.
      truncate_episode_at: If sampling by episodes, where to truncate episodes
        from the environment, if at all.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward. When
        sampling by episode, valid_steps is also passed into reward_fn.
      weight_fn: A function that takes in an EnvStep and returns a weight for
        that step. If not specified, defaults to gamma ** step_num. When
        sampling by episode, valid_steps is also passed into weight_fn.
      gamma: The discount factor to use for the default reward/weight functions.

    Returns:
      An estimate of the average reward.
    """
    if reward_fn is None:
        if by_steps:
            reward_fn = _default_by_steps_reward_fn
        else:
            reward_fn = lambda *args: _default_by_episodes_reward_fn(
                *args, gamma=gamma)

    if weight_fn is None:
        if by_steps:
            weight_fn = lambda *args: _default_by_steps_weight_fn(*args,
                                                                  gamma=gamma)
        else:
            weight_fn = _default_by_episodes_weight_fn

    if by_steps:
        steps = dataset.get_all_steps(limit=limit)
        rewards = reward_fn(steps)
        weights = weight_fn(steps)
    else:
        episodes, valid_steps = dataset.get_all_episodes(
            truncate_episode_at=truncate_episode_at, limit=limit)
        rewards = reward_fn(episodes, valid_steps)
        weights = weight_fn(episodes, valid_steps)

    rewards = common_lib.reverse_broadcast(rewards, weights)
    weights = common_lib.reverse_broadcast(weights, rewards)
    if tf.rank(weights) < 2:
        return (tf.reduce_sum(rewards * weights, axis=0) /
                tf.reduce_sum(weights, axis=0))
    return (tf.linalg.matmul(weights, rewards) /
            tf.reduce_sum(tf.math.reduce_mean(weights, axis=0)))
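A minimal stand-alone sketch of what Example #1 computes in the default by-steps case: with reward_fn and weight_fn left as None, rewards come straight from each step and weights default to gamma ** step_num, so the estimate reduces to sum(reward * weight) / sum(weight). The tensors below are synthetic stand-ins for what dataset.get_all_steps() would return.

import tensorflow as tf

# Synthetic stand-ins for the step numbers and rewards an OffpolicyDataset
# would provide via get_all_steps().
step_num = tf.constant([0., 1., 2., 3.])
rewards = tf.constant([1.0, 0.5, 0.25, 0.0])
gamma = 0.9

# Default by-steps weighting described in the docstring: gamma ** step_num.
weights = gamma ** step_num

# Rank-1 branch of the final reduction in get_fullbatch_average.
estimate = tf.reduce_sum(rewards * weights) / tf.reduce_sum(weights)
print(float(estimate))  # discounted average reward over the sampled steps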
Example #2
    def _eval_constraint_and_regs(self, dataset: dataset_lib.OffpolicyDataset,
                                  target_policy: tf_policy.TFPolicy):
        """Get the residual term and the primal and dual regularizers during eval.

        Args:
          dataset: The dataset to sample experience from.
          target_policy: The policy whose value we want to estimate.

        Returns:
          The residual term (weighted by zeta), primal, and dual reg values.
        """

        experience = dataset.get_all_steps(num_steps=2)
        env_step = tf.nest.map_structure(lambda t: t[:, 0, ...], experience)
        next_env_step = tf.nest.map_structure(lambda t: t[:, 1, ...],
                                              experience)
        nu_values, _, _ = self._sample_value(self._nu_network, env_step)
        next_nu_values, _, _ = self._sample_average_value(
            self._nu_network, next_env_step, target_policy)
        zeta_values, neg_kl, _ = self._sample_value(self._zeta_network,
                                                    env_step)
        discounts = self._gamma * env_step.discount
        bellman_residuals = (
            common_lib.reverse_broadcast(discounts, nu_values) * next_nu_values
            - nu_values - self._norm_regularizer * self._lam)

        # Always include reward during eval
        bellman_residuals += self._reward_fn(env_step)
        constraint = tf.reduce_mean(zeta_values * bellman_residuals)

        f_nu = tf.reduce_mean(self._f_fn(nu_values))
        f_zeta = tf.reduce_mean(self._f_fn(zeta_values))

        return constraint, f_nu, f_zeta, tf.reduce_mean(neg_kl)
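common_lib.reverse_broadcast appears in almost every snippet on this page to align a per-step tensor (e.g. discounts, shape [batch]) with a value tensor (e.g. next_nu_values, shape [batch, k]). Its implementation is not shown here; the helper below is a hypothetical stand-in that only illustrates the shape alignment, not the library's actual code.

import tensorflow as tf

def reverse_broadcast_sketch(tensor, target):
    # Hypothetical stand-in for common_lib.reverse_broadcast: append trailing
    # singleton dims to `tensor` until its rank matches `target`, so a [batch]
    # vector can scale a [batch, k] tensor elementwise.
    while tensor.shape.rank < target.shape.rank:
        tensor = tensor[..., None]
    return tensor

discounts = tf.constant([0.99, 0.0, 0.99])   # shape [3]
next_nu_values = tf.ones([3, 2])             # shape [3, 2]
scaled = reverse_broadcast_sketch(discounts, next_nu_values) * next_nu_values
print(scaled.shape)  # (3, 2): each row scaled by its step's discount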
Example #3
    def train_loss(self, initial_env_step, env_step, next_env_step, policy):
        nu_values = self._get_value(self._nu_network, env_step)
        initial_nu_values = self._get_average_value(self._nu_network,
                                                    initial_env_step, policy)
        next_nu_values = self._get_average_value(self._nu_network,
                                                 next_env_step, policy)
        zeta_values = self._get_value(self._zeta_network, env_step)

        discounts = self._gamma * next_env_step.discount
        policy_ratio = 1.0
        if not self._solve_for_state_action_ratio:
            tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
            policy_log_probabilities = policy.distribution(
                tfagents_step).action.log_prob(env_step.action)
            policy_ratio = tf.exp(policy_log_probabilities -
                                  env_step.get_log_probability())

        bellman_residuals = (nu_values - common_lib.reverse_broadcast(
            discounts * policy_ratio, nu_values) * next_nu_values)

        zeta_loss = self._fstar_fn(
            zeta_values) - bellman_residuals * zeta_values
        if self._primal_form:
            nu_loss = (self._f_fn(bellman_residuals) -
                       (1 - self._gamma) * initial_nu_values)
        else:
            nu_loss = -zeta_loss - (1 - self._gamma) * initial_nu_values

        return nu_loss, zeta_loss
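Example #3 pairs self._f_fn in the primal form with self._fstar_fn in the dual form; those functions are configured elsewhere in the library and are not shown on this page. As a purely illustrative check, a common convex-conjugate pair is f(x) = x^2 / 2 with f*(y) = y^2 / 2, which satisfies the Fenchel-Young inequality x * y <= f(x) + f*(y) with equality at y = x:

import tensorflow as tf

# Hypothetical f / f* pair; not necessarily the one used by _f_fn / _fstar_fn.
f = lambda x: 0.5 * tf.square(x)
fstar = lambda y: 0.5 * tf.square(y)

x = tf.linspace(-2.0, 2.0, 5)   # [-2, -1, 0, 1, 2]
y = tf.constant(1.0)

# Fenchel-Young gap: always >= 0, and exactly 0 where x == y.
gap = f(x) + fstar(y) - x * y
print(gap.numpy())  # [4.5, 2.0, 0.5, 0.0, 0.5]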
Example #4
  def _get_average_value(self, network, env_step, policy):
    if self._solve_for_state_action_ratio:
      tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
      if self._categorical_action and self._num_samples is None:
        action_weights = policy.distribution(
            tfagents_step).action.probs_parameter()
        action_dtype = self._dataset_spec.action.dtype
        batch_size = tf.shape(action_weights)[0]
        num_actions = tf.shape(action_weights)[-1]
        actions = (  # Broadcast actions
            tf.ones([batch_size, 1], dtype=action_dtype) *
            tf.range(num_actions, dtype=action_dtype)[None, :])
      else:
        batch_size = tf.shape(env_step.observation)[0]
        num_actions = self._num_samples
        action_weights = tf.ones([batch_size, num_actions]) / num_actions
        actions = tf.stack(
            [policy.action(tfagents_step).action for _ in range(num_actions)],
            axis=1)

      flat_actions = tf.reshape(actions, [batch_size * num_actions] +
                                actions.shape[2:].as_list())
      flat_observations = tf.reshape(
          tf.tile(env_step.observation[:, None, ...],
                  [1, num_actions] + [1] * len(env_step.observation.shape[1:])),
          [batch_size * num_actions] + env_step.observation.shape[1:].as_list())

      flat_values, _ = network((flat_observations, flat_actions))
      values = tf.reshape(flat_values, [batch_size, num_actions] +
                          flat_values.shape[1:].as_list())
      return tf.reduce_sum(
          values * common_lib.reverse_broadcast(action_weights, values), axis=1)
    else:
      return network((env_step.observation,))[0]
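Example #4 evaluates the value network at every candidate action by tiling observations, flattening to a [batch * num_actions] batch, and then averaging with per-action weights. A toy, self-contained version of that reshape pattern, with a hypothetical value function in place of network(...):

import tensorflow as tf

batch_size, num_actions, obs_dim = 2, 3, 4
observations = tf.random.normal([batch_size, obs_dim])
action_weights = tf.fill([batch_size, num_actions], 1.0 / num_actions)

# Tile each observation across all actions and flatten, as in Example #4.
flat_observations = tf.reshape(
    tf.tile(observations[:, None, :], [1, num_actions, 1]),
    [batch_size * num_actions, obs_dim])
flat_actions = tf.reshape(
    tf.tile(tf.range(num_actions, dtype=tf.float32)[None, :], [batch_size, 1]),
    [batch_size * num_actions])

# Hypothetical value function standing in for `network((obs, act))`.
flat_values = tf.reduce_sum(flat_observations, axis=-1) + flat_actions
values = tf.reshape(flat_values, [batch_size, num_actions])

# Weighted average over actions, mirroring the final reduce_sum above.
print(tf.reduce_sum(values * action_weights, axis=1).shape)  # (2,)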
Example #5
    def _get_nu_loss(self, initial_env_step, env_step, next_env_step, policy):
        """Get nu_loss for both upper and lower confidence intervals."""
        nu_index = self._get_index(env_step.observation, env_step.action)
        nu_values = tf.gather(self._nu, nu_index)

        initial_nu_values = self._get_average_value(self._nu, initial_env_step,
                                                    policy)
        next_nu_values = self._get_average_value(self._nu, next_env_step,
                                                 policy)

        rewards = self._reward_fn(env_step)

        discounts = self._gamma * env_step.discount
        policy_ratio = 1.0

        if not self._solve_for_state_action_ratio:
            tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
            policy_log_probabilities = policy.distribution(
                tfagents_step).action.log_prob(env_step.action)
            policy_ratio = tf.exp(policy_log_probabilities -
                                  env_step.get_log_probability())

        bellman_residuals = (
            -nu_values + common_lib.reverse_broadcast(
                rewards, tf.convert_to_tensor(nu_values)) +
            common_lib.reverse_broadcast(discounts * policy_ratio,
                                         tf.convert_to_tensor(nu_values)) *
            next_nu_values)
        bellman_residuals *= self._algae_alpha_sign

        init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                        self._algae_alpha_sign)

        nu_loss = (tf.math.abs(self._algae_alpha) * tf.math.square(
            bellman_residuals / tf.math.abs(self._algae_alpha)) / 2.0 +
                   init_nu_loss)

        if self._weight_by_gamma:
            weights = tf.expand_dims(self._gamma**tf.cast(
                env_step.step_num, tf.float32),
                                     axis=1)
            weights /= 1e-6 + tf.reduce_mean(weights)
            nu_loss *= weights

        return nu_loss
Example #6
  def train_loss(self, initial_env_step, env_step, next_env_step, policy):
    nu_values = self._get_value(self._nu_network, env_step)
    initial_nu_values = self._get_average_value(self._nu_network,
                                                initial_env_step, policy)
    next_nu_values = self._get_average_value(self._nu_network, next_env_step,
                                             policy)
    zeta_values = self._get_value(self._zeta_network, env_step)
    rewards = self._reward_fn(env_step)

    discounts = self._gamma * env_step.discount
    policy_ratio = 1.0
    if not self._solve_for_state_action_ratio:
      tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
      policy_log_probabilities = policy.distribution(
          tfagents_step).action.log_prob(env_step.action)
      policy_ratio = tf.exp(policy_log_probabilities -
                            env_step.get_log_probability())

    bellman_residuals = (
        -nu_values + common_lib.reverse_broadcast(rewards, nu_values) +
        common_lib.reverse_broadcast(discounts * policy_ratio, nu_values) *
        next_nu_values)
    bellman_residuals *= self._algae_alpha_sign

    zeta_loss = (
        self._algae_alpha_abs * self._fstar_fn(zeta_values) -
        bellman_residuals * zeta_values)

    init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                    self._algae_alpha_sign)
    if self._primal_form:
      nu_loss = (
          self._algae_alpha_abs *
          self._f_fn(bellman_residuals / self._algae_alpha_abs) + init_nu_loss)
    else:
      nu_loss = -zeta_loss + init_nu_loss

    if self._weight_by_gamma:
      weights = self._gamma**tf.cast(env_step.step_num, tf.float32)[:, None]
      weights /= 1e-6 + tf.reduce_mean(weights)
      nu_loss *= weights
      zeta_loss *= weights
    return nu_loss, zeta_loss
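The weight_by_gamma branch (also present in Examples #5 and #8) downweights late-episode samples by gamma ** step_num and then normalizes by the batch mean (plus 1e-6 for numerical safety) so the loss keeps roughly the same overall scale. A tiny numeric sketch with made-up step numbers:

import tensorflow as tf

gamma = 0.99
# Hypothetical batch of step indices, as env_step.step_num would provide.
step_num = tf.constant([0., 10., 50., 200.])

weights = gamma ** step_num[:, None]          # shape [batch, 1]
weights /= 1e-6 + tf.reduce_mean(weights)     # batch mean is now ~1
print(tf.reduce_mean(weights).numpy())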
Example #7
 def weight_fn(env_step):
   zeta = self._get_value(self._zeta_network, env_step)
   policy_ratio = 1.0
   if not self._solve_for_state_action_ratio:
     tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(env_step)
     target_log_probabilities = target_policy.distribution(
         tfagents_timestep).action.log_prob(env_step.action)
     policy_ratio = tf.exp(target_log_probabilities -
                           env_step.get_log_probability())
   return zeta * common_lib.reverse_broadcast(policy_ratio, zeta)
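The policy_ratio pattern in Example #7 (and in Examples #3, #5, #6, and #8) is the per-step importance weight exp(log pi_target(a|s) - log pi_behavior(a|s)). A stand-alone numeric sketch with made-up log-probabilities in place of target_policy.distribution(...) and env_step.get_log_probability():

import tensorflow as tf

# Hypothetical probabilities of the logged actions under the target policy
# and under the behavior policy that generated the data.
target_log_probs = tf.math.log(tf.constant([0.50, 0.10, 0.30]))
behavior_log_probs = tf.math.log(tf.constant([0.25, 0.20, 0.30]))

policy_ratio = tf.exp(target_log_probs - behavior_log_probs)
print(policy_ratio.numpy())  # approximately [2.0, 0.5, 1.0]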
Example #8
    def train_loss(self, initial_env_step, env_step, next_env_step, policy):
        nu_values, _, eps = self._sample_value(self._nu_network, env_step)
        initial_nu_values, _, _ = self._sample_average_value(
            self._nu_network, initial_env_step, policy)
        next_nu_values, _, _ = self._sample_average_value(
            self._nu_network, next_env_step, policy)

        zeta_values, zeta_neg_kl, _ = self._sample_value(
            self._zeta_network, env_step, eps)

        discounts = self._gamma * env_step.discount
        policy_ratio = 1.0
        if not self._solve_for_state_action_ratio:
            tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
            policy_log_probabilities = policy.distribution(
                tfagents_step).action.log_prob(env_step.action)
            policy_ratio = tf.exp(policy_log_probabilities -
                                  env_step.get_log_probability())

        bellman_residuals = (
            common_lib.reverse_broadcast(discounts * policy_ratio, nu_values) *
            next_nu_values - nu_values - self._norm_regularizer * self._lam)
        if not self._zero_reward:
            bellman_residuals += policy_ratio * self._reward_fn(env_step)

        zeta_loss = -zeta_values * bellman_residuals
        nu_loss = (1 - self._gamma) * initial_nu_values
        lam_loss = self._norm_regularizer * self._lam
        if self._primal_form:
            nu_loss += self._fstar_fn(bellman_residuals)
            lam_loss = lam_loss + self._fstar_fn(bellman_residuals)
        else:
            nu_loss += zeta_values * bellman_residuals
            lam_loss = lam_loss - self._norm_regularizer * zeta_values * self._lam

        nu_loss += self._primal_regularizer * self._f_fn(nu_values)
        zeta_loss += self._dual_regularizer * self._f_fn(zeta_values)
        zeta_loss -= self._kl_regularizer * tf.reduce_mean(zeta_neg_kl)

        if self._weight_by_gamma:
            weights = self._gamma**tf.cast(env_step.step_num, tf.float32)[:,
                                                                          None]
            weights /= 1e-6 + tf.reduce_mean(weights)
            nu_loss *= weights
            zeta_loss *= weights

        return nu_loss, zeta_loss, lam_loss
Example #9
def get_minibatch_average(
        dataset: Dataset,
        batch_size: int,
        num_batches: int = 1,
        by_steps: bool = True,
        truncate_episode_at: Optional[int] = None,
        reward_fn: Callable = None,
        weight_fn: Callable = None,
        gamma: Union[float, tf.Tensor] = 1.0) -> Union[float, tf.Tensor]:
    """Computes average reward via randomly sampled mini-batches.

    Samples steps or episodes from the dataset and computes average reward.

    Args:
      dataset: The dataset to sample experience from.
      batch_size: The number of steps/episodes to sample per batch.
      num_batches: The number of batches to use for estimation.
      by_steps: Whether to sample batches of steps (default) or episodes.
      truncate_episode_at: If sampling by episodes, where to truncate episodes
        from the environment, if at all.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward. When
        sampling by episode, valid_steps is also passed into reward_fn.
      weight_fn: A function that takes in an EnvStep and returns a weight for
        that step. If not specified, defaults to gamma ** step_num. When
        sampling by episode, valid_steps is also passed into weight_fn.
      gamma: The discount factor to use for the default reward/weight functions.

    Returns:
      An estimate of the average reward.
    """
    if reward_fn is None:
        if by_steps:
            reward_fn = _default_by_steps_reward_fn
        else:
            reward_fn = lambda *args: _default_by_episodes_reward_fn(
                *args, gamma=gamma)

    if weight_fn is None:
        if by_steps:
            weight_fn = lambda *args: _default_by_steps_weight_fn(*args,
                                                                  gamma=gamma)
        else:
            weight_fn = _default_by_episodes_weight_fn

    total_reward = 0.
    total_weight = 0.
    for _ in range(num_batches):
        if by_steps:
            if isinstance(dataset, OnpolicyDataset):
                steps = dataset.get_step(num_steps=batch_size)
            else:
                steps = dataset.get_step(batch_size)
            rewards = reward_fn(steps)
            weights = weight_fn(steps)
        else:
            episodes, valid_steps = dataset.get_episode(
                batch_size, truncate_episode_at=truncate_episode_at)
            rewards = reward_fn(episodes, valid_steps)
            weights = weight_fn(episodes, valid_steps)

        rewards = common_lib.reverse_broadcast(rewards, weights)
        weights = common_lib.reverse_broadcast(weights, rewards)
        total_reward += tf.reduce_sum(rewards * weights, axis=0)
        total_weight += tf.reduce_sum(weights, axis=0)

    return total_reward / total_weight
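A toy check, with synthetic tensors instead of a Dataset, that the streaming accumulation in get_minibatch_average (summing reward * weight and weight across batches before dividing) agrees with the one-shot weighted average of get_fullbatch_average in Example #1:

import tensorflow as tf

rewards = tf.constant([1.0, 0.0, 2.0, 1.0, 3.0, 1.0])
weights = tf.constant([1.0, 0.9, 0.81, 1.0, 0.9, 0.81])

# One-shot weighted average (rank-1 branch of Example #1).
full = tf.reduce_sum(rewards * weights) / tf.reduce_sum(weights)

# Streaming accumulation over two "mini-batches" of size 3, as in the loop above.
total_reward, total_weight = 0.0, 0.0
for batch_r, batch_w in zip(tf.split(rewards, 2), tf.split(weights, 2)):
    total_reward += tf.reduce_sum(batch_r * batch_w)
    total_weight += tf.reduce_sum(batch_w)

print(float(full), float(total_reward / total_weight))  # identical values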