Code Example #1
def get_fullbatch_average(
        dataset: OffpolicyDataset,
        limit: Optional[int] = None,
        by_steps: bool = True,
        truncate_episode_at: Optional[int] = None,
        reward_fn: Optional[Callable] = None,
        weight_fn: Optional[Callable] = None,
        gamma: Union[float, tf.Tensor] = 1.0) -> Union[float, tf.Tensor]:
    """Computes average reward over full dataset.

    Args:
      dataset: The dataset to sample experience from.
      limit: If specified, the maximum number of steps/episodes to take from the
        dataset.
      by_steps: Whether to sample batches of steps (default) or episodes.
      truncate_episode_at: If sampling by episodes, where to truncate episodes
        from the environment, if at all.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward. When
        sampling by episode, valid_steps is also passed into reward_fn.
      weight_fn: A function that takes in an EnvStep and returns a weight for
        that step. If not specified, defaults to gamma ** step_num. When
        sampling by episode, valid_steps is also passed into weight_fn.
      gamma: The discount factor to use for the default reward/weight functions.

    Returns:
      An estimate of the average reward.
    """
    if reward_fn is None:
        if by_steps:
            reward_fn = _default_by_steps_reward_fn
        else:
            reward_fn = lambda *args: _default_by_episodes_reward_fn(
                *args, gamma=gamma)

    if weight_fn is None:
        if by_steps:
            weight_fn = lambda *args: _default_by_steps_weight_fn(*args,
                                                                  gamma=gamma)
        else:
            weight_fn = _default_by_episodes_weight_fn

    if by_steps:
        steps = dataset.get_all_steps(limit=limit)
        rewards = reward_fn(steps)
        weights = weight_fn(steps)
    else:
        episodes, valid_steps = dataset.get_all_episodes(
            truncate_episode_at=truncate_episode_at, limit=limit)
        rewards = reward_fn(episodes, valid_steps)
        weights = weight_fn(episodes, valid_steps)

    rewards = common_lib.reverse_broadcast(rewards, weights)
    weights = common_lib.reverse_broadcast(weights, rewards)
    if tf.rank(weights) < 2:
        return (tf.reduce_sum(rewards * weights, axis=0) /
                tf.reduce_sum(weights, axis=0))
    return (tf.linalg.matmul(weights, rewards) /
            tf.reduce_sum(tf.math.reduce_mean(weights, axis=0)))
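A minimal usage sketch (not part of the library): it assumes my_dataset stands in for a real OffpolicyDataset whose get_all_steps returns batched EnvSteps with a reward field, and it passes explicit reward/weight functions so the module-level defaults are not needed.

import tensorflow as tf

# Hypothetical call; my_dataset is an assumed OffpolicyDataset instance.
average_reward = get_fullbatch_average(
    my_dataset,
    limit=10000,                                         # cap the steps read
    by_steps=True,
    reward_fn=lambda steps: steps.reward,                # plain per-step reward
    weight_fn=lambda steps: tf.ones_like(steps.reward),  # uniform weights
    gamma=0.99)
print(float(average_reward))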
Code Example #2
    def _eval_constraint_and_regs(self, dataset: dataset_lib.OffpolicyDataset,
                                  target_policy: tf_policy.TFPolicy):
        """Get the residual term and the primal and dual regularizers during eval.

        Args:
          dataset: The dataset to sample experience from.
          target_policy: The policy whose value we want to estimate.

        Returns:
          The residual term (weighted by zeta), the primal and dual regularizer
          values, and the mean negative KL term.
        """

        experience = dataset.get_all_steps(num_steps=2)
        env_step = tf.nest.map_structure(lambda t: t[:, 0, ...], experience)
        next_env_step = tf.nest.map_structure(lambda t: t[:, 1, ...],
                                              experience)
        nu_values, _, _ = self._sample_value(self._nu_network, env_step)
        next_nu_values, _, _ = self._sample_average_value(
            self._nu_network, next_env_step, target_policy)
        zeta_values, neg_kl, _ = self._sample_value(self._zeta_network,
                                                    env_step)
        discounts = self._gamma * env_step.discount
        bellman_residuals = (
            common_lib.reverse_broadcast(discounts, nu_values) * next_nu_values
            - nu_values - self._norm_regularizer * self._lam)

        # Always include reward during eval
        bellman_residuals += self._reward_fn(env_step)
        constraint = tf.reduce_mean(zeta_values * bellman_residuals)

        f_nu = tf.reduce_mean(self._f_fn(nu_values))
        f_zeta = tf.reduce_mean(self._f_fn(zeta_values))

        return constraint, f_nu, f_zeta, tf.reduce_mean(neg_kl)
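A toy numeric check of the residual term computed above, using made-up scalar values (not library outputs): gamma * discount * nu(s') - nu(s) - norm_regularizer * lam, plus the reward that is added back in during eval.

gamma, discount = 0.99, 1.0
nu_value, next_nu_value = 1.5, 1.4
norm_regularizer, lam, reward = 1.0, 0.1, 0.2

bellman_residual = (gamma * discount * next_nu_value
                    - nu_value - norm_regularizer * lam + reward)
print(bellman_residual)  # approximately -0.014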
Code Example #3
    def solve(self,
              dataset: dataset_lib.OffpolicyDataset,
              target_policy: tf_policy.TFPolicy,
              regularizer: float = 1e-8):
        """Solves for density ratios and then approximates target policy value.

        Args:
          dataset: The dataset to sample experience from.
          target_policy: The policy whose value we want to estimate.
          regularizer: A small constant to add to matrices before inverting them
            or to floats before taking square root.

        Returns:
          Estimated average per-step reward of the target policy.
        """
        td_residuals = np.zeros([self._dimension, self._dimension])
        total_weights = np.zeros([self._dimension])
        initial_weights = np.zeros([self._dimension])

        episodes, valid_steps = dataset.get_all_episodes(limit=None)
        tfagents_episodes = dataset_lib.convert_to_tfagents_timestep(episodes)

        for episode_num in range(tf.shape(valid_steps)[0]):
            # Precompute probabilities for this episode.
            this_episode = tf.nest.map_structure(lambda t: t[episode_num],
                                                 episodes)
            first_step = tf.nest.map_structure(lambda t: t[0], this_episode)
            this_tfagents_episode = dataset_lib.convert_to_tfagents_timestep(
                this_episode)
            episode_target_log_probabilities = target_policy.distribution(
                this_tfagents_episode).action.log_prob(this_episode.action)
            episode_target_probs = target_policy.distribution(
                this_tfagents_episode).action.probs_parameter()

            for step_num in range(tf.shape(valid_steps)[1] - 1):
                this_step = tf.nest.map_structure(
                    lambda t: t[episode_num, step_num], episodes)
                next_step = tf.nest.map_structure(
                    lambda t: t[episode_num, step_num + 1], episodes)
                if this_step.is_last() or not valid_steps[episode_num,
                                                          step_num]:
                    continue

                weight = 1.0
                nu_index = self._get_index(this_step.observation,
                                           this_step.action)
                td_residuals[nu_index, nu_index] += weight
                total_weights[nu_index] += weight

                policy_ratio = 1.0
                if not self._solve_for_state_action_ratio:
                    policy_ratio = tf.exp(
                        episode_target_log_probabilities[step_num] -
                        this_step.get_log_probability())

                # Need to weight next nu by importance weight.
                next_weight = (weight if self._solve_for_state_action_ratio
                               else policy_ratio * weight)
                next_probs = episode_target_probs[step_num + 1]
                for next_action, next_prob in enumerate(next_probs):
                    next_nu_index = self._get_index(next_step.observation,
                                                    next_action)
                    td_residuals[next_nu_index,
                                 nu_index] += (-next_prob * self._gamma *
                                               next_weight)

                initial_probs = episode_target_probs[0]
                for initial_action, initial_prob in enumerate(initial_probs):
                    initial_nu_index = self._get_index(first_step.observation,
                                                       initial_action)
                    initial_weights[initial_nu_index] += weight * initial_prob

        td_residuals /= np.sqrt(regularizer + total_weights)[None, :]
        td_errors = np.dot(td_residuals, td_residuals.T)
        self._nu = np.linalg.solve(
            td_errors + regularizer * np.eye(self._dimension),
            (1 - self._gamma) * initial_weights)
        self._zeta = np.dot(
            self._nu, td_residuals) / np.sqrt(regularizer + total_weights)
        return self.estimate_average_reward(dataset, target_policy)
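The sketch below replays the closing linear-algebra step of solve() on random toy inputs so the shapes and the regularized solve are easy to follow in isolation; td_residuals, total_weights, and initial_weights are placeholders for the accumulated statistics, not real data.

import numpy as np

dimension, regularizer, gamma = 4, 1e-8, 0.99
td_residuals = np.random.randn(dimension, dimension)   # accumulated residuals
total_weights = np.random.rand(dimension)              # visitation weights
initial_weights = np.random.rand(dimension)            # initial-state weights

# Normalize, form the squared-error matrix, and solve the regularized system.
td_residuals /= np.sqrt(regularizer + total_weights)[None, :]
td_errors = np.dot(td_residuals, td_residuals.T)
nu = np.linalg.solve(td_errors + regularizer * np.eye(dimension),
                     (1 - gamma) * initial_weights)
zeta = np.dot(nu, td_residuals) / np.sqrt(regularizer + total_weights)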
Code Example #4
    def solve_nu_zeta(self,
                      dataset: dataset_lib.OffpolicyDataset,
                      target_policy: tf_policy.TFPolicy,
                      regularizer: float = 1e-6):
        """Solves for density ratios and then approximates target policy value.

        Args:
          dataset: The dataset to sample experience from.
          target_policy: The policy whose value we want to estimate.
          regularizer: A small constant to add to matrices before inverting them
            or to floats before taking square root.

        Returns:
          Estimated average per-step reward of the target policy.
        """

        if not hasattr(self, '_td_mat'):
            # Set up env_steps.
            episodes, valid_steps = dataset.get_all_episodes(
                limit=self._limit_episodes)
            total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1
            num_episodes = tf.shape(valid_steps)[0]
            num_samples = num_episodes * total_num_steps_per_episode
            valid_and_not_last = tf.logical_and(valid_steps,
                                                episodes.discount > 0)
            valid_indices = tf.squeeze(
                tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

            initial_env_step = tf.nest.map_structure(
                lambda t: tf.squeeze(
                    tf.reshape(
                        tf.repeat(t[:, 0:1, ...],
                                  axis=1,
                                  repeats=total_num_steps_per_episode),
                        [num_samples, -1])), episodes)
            initial_env_step = tf.nest.map_structure(
                lambda t: tf.gather(t, valid_indices), initial_env_step)
            tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep(
                initial_env_step)

            env_step = tf.nest.map_structure(
                lambda t: tf.squeeze(
                    tf.reshape(t[:, 0:total_num_steps_per_episode, ...],
                               [num_samples, -1])), episodes)
            env_step = tf.nest.map_structure(
                lambda t: tf.gather(t, valid_indices), env_step)
            tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(
                env_step)

            next_env_step = tf.nest.map_structure(
                lambda t: tf.squeeze(
                    tf.reshape(t[:, 1:total_num_steps_per_episode + 1, ...],
                               [num_samples, -1])), episodes)
            next_env_step = tf.nest.map_structure(
                lambda t: tf.gather(t, valid_indices), next_env_step)
            tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep(
                next_env_step)

            # get probabilities
            initial_target_probs = target_policy.distribution(
                tfagents_initial_env_step).action.probs_parameter()
            next_target_probs = target_policy.distribution(
                tfagents_next_env_step).action.probs_parameter()

            # First, get the nu_loss and data weights
            #current_nu_loss = self._get_nu_loss(initial_env_step, env_step,
            #                                    next_env_step, target_policy)
            #data_weight, _ = self._get_weights(current_nu_loss)

            # # debug only and to reproduce dual dice result, DELETE
            # data_weight = tf.ones_like(data_weight)

            state_action_count = self._get_state_action_counts(env_step)
            counts = tf.reduce_sum(
                tf.one_hot(state_action_count, self._dimension), 0)
            gamma_sample = tf.pow(self._gamma,
                                  tf.cast(env_step.step_num, tf.float32))

            # # debug only and to reproduce dual dice result, DELETE
            # gamma_sample = tf.ones_like(gamma_sample)

            # now we need to expand_dims to include action space in extra dimensions
            #data_weights = tf.reshape(data_weight, [-1, self._num_limits])
            # both are data sample weights for L2 problem, needs to be normalized later
            #gamma_data_weights = tf.reshape(gamma_sample, [-1, 1]) * data_weights

            initial_states = tf.tile(
                tf.reshape(initial_env_step.observation, [-1, 1]),
                [1, self._num_actions])
            initial_actions = tf.tile(
                tf.reshape(tf.range(self._num_actions), [1, -1]),
                [initial_env_step.observation.shape[0], 1])
            initial_nu_indices = self._get_index(initial_states,
                                                 initial_actions)

            # linear term w.r.t. initial distribution
            #b_vec_2 = tf.stack([
            #    tf.reduce_sum(
            #        tf.reshape(
            #            data_weights[:, itr] / tf.reduce_sum(data_weights[:, itr]),
            #            [-1, 1]) * tf.reduce_sum(
            #                tf.one_hot(initial_nu_indices, self._dimension) *
            #                (1 - self._gamma) *
            #                tf.expand_dims(initial_target_probs, axis=-1),
            #                axis=1),
            #        axis=0) for itr in range(self._num_limits)
            #],
            #                   axis=0)

            next_states = tf.tile(
                tf.reshape(next_env_step.observation, [-1, 1]),
                [1, self._num_actions])
            next_actions = tf.tile(
                tf.reshape(tf.range(self._num_actions), [1, -1]),
                [next_env_step.observation.shape[0], 1])
            next_nu_indices = self._get_index(next_states, next_actions)
            next_nu_indices = tf.where(
                tf.expand_dims(next_env_step.is_absorbing(), -1),
                -1 * tf.ones_like(next_nu_indices), next_nu_indices)

            nu_indices = self._get_index(env_step.observation, env_step.action)

            target_log_probabilities = target_policy.distribution(
                tfagents_env_step).action.log_prob(env_step.action)
            if not self._solve_for_state_action_ratio:
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())
            else:
                policy_ratio = tf.ones([
                    target_log_probabilities.shape[0],
                ])
            policy_ratios = tf.tile(tf.reshape(policy_ratio, [-1, 1]),
                                    [1, self._num_actions])

            # the tabular feature vector
            a_vec = tf.one_hot(nu_indices, self._dimension) - tf.reduce_sum(
                self._gamma *
                tf.expand_dims(next_target_probs * policy_ratios, axis=-1) *
                tf.one_hot(next_nu_indices, self._dimension),
                axis=1)

            # linear term w.r.t. reward
            #b_vec_1 = tf.stack([
            #    tf.reduce_sum(
            #        tf.reshape(
            #            (gamma_data_weights[:, itr] /
            #             tf.reduce_sum(gamma_data_weights[:, itr])) * self._reward_fn(env_step), #/
            #            #tf.cast(state_action_count, tf.float32),
            #            [-1, 1]) * a_vec,
            #        axis=0) for itr in range(self._num_limits)
            #],
            #                   axis=0)
            # quadratic term of feature
            # Get weighted outer product by using einsum to save computing resource!
            #a_mat = tf.stack([
            #    tf.einsum(
            #        'ai, a, aj -> ij', a_vec,
            #        #1.0 / tf.cast(state_action_count, tf.float32),
            #        gamma_data_weights[:, itr] /
            #        tf.reduce_sum(gamma_data_weights[:, itr]),
            #        a_vec)
            #    for itr in range(self._num_limits)
            #],
            #                 axis=0)

            td_mat = tf.einsum('ai, a, aj -> ij',
                               tf.one_hot(nu_indices, self._dimension),
                               1.0 / tf.cast(state_action_count, tf.float32),
                               a_vec)

            weighted_rewards = policy_ratio * self._reward_fn(env_step)

            bias = tf.reduce_sum(
                tf.one_hot(nu_indices, self._dimension) *
                tf.reshape(weighted_rewards, [-1, 1]) * 1.0 /
                tf.cast(state_action_count, tf.float32)[:, None],
                axis=0)

            # Initialize
            self._nu = np.ones_like(self._nu) * bias[:, None]
            self._nu2 = np.ones_like(self._nu2) * bias[:, None]

            self._a_vec = a_vec
            self._td_mat = td_mat
            self._bias = bias
            self._weighted_rewards = weighted_rewards
            self._state_action_count = state_action_count
            self._nu_indices = nu_indices
            self._initial_nu_indices = initial_nu_indices
            self._initial_target_probs = initial_target_probs
            self._gamma_sample = gamma_sample
            self._gamma_sample = tf.ones_like(gamma_sample)

        saddle_bellman_residuals = (tf.matmul(self._a_vec, self._nu) -
                                    self._weighted_rewards[:, None])
        saddle_bellman_residuals *= -1 * self._algae_alpha_sign
        saddle_zetas = tf.gather(self._zeta, self._nu_indices)
        saddle_initial_nu_values = tf.reduce_sum(  # Average over actions.
            self._initial_target_probs[:, :, None] *
            tf.gather(self._nu, self._initial_nu_indices),
            axis=1)
        saddle_init_nu_loss = ((1 - self._gamma) * saddle_initial_nu_values *
                               self._algae_alpha_sign)

        saddle_bellman_residuals2 = (tf.matmul(self._a_vec, self._nu2) -
                                     self._weighted_rewards[:, None])
        saddle_bellman_residuals2 *= 1 * self._algae_alpha_sign
        saddle_zetas2 = tf.gather(self._zeta2, self._nu_indices)
        saddle_initial_nu_values2 = tf.reduce_sum(  # Average over actions.
            self._initial_target_probs[:, :, None] *
            tf.gather(self._nu2, self._initial_nu_indices),
            axis=1)
        saddle_init_nu_loss2 = ((1 - self._gamma) * saddle_initial_nu_values2 *
                                -1 * self._algae_alpha_sign)

        saddle_loss = 0.5 * (
            saddle_init_nu_loss + saddle_bellman_residuals * saddle_zetas +
            -tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas) +
            -saddle_init_nu_loss2 + -saddle_bellman_residuals2 * saddle_zetas2
            + tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas2))
        # Binary search to find best alpha.
        left = tf.constant([-8., -8.])
        right = tf.constant([32., 32.])
        for _ in range(16):
            mid = 0.5 * (left + right)
            self._alpha.assign(mid)
            weights, log_weights = self._get_weights(
                saddle_loss * self._gamma_sample[:, None])

            divergence = self._compute_divergence(weights, log_weights)
            divergence_violation = divergence - self._two_sided_limit
            left = tf.where(divergence_violation > 0., mid, left)
            right = tf.where(divergence_violation > 0., right, mid)
        self._alpha.assign(0.5 * (left + right))
        weights, log_weights = self._get_weights(saddle_loss *
                                                 self._gamma_sample[:, None])

        gamma_data_weights = tf.stop_gradient(weights *
                                              self._gamma_sample[:, None])
        #print(tf.concat([gamma_data_weights, saddle_loss], axis=-1))
        avg_saddle_loss = (
            tf.reduce_sum(gamma_data_weights * saddle_loss, axis=0) /
            tf.reduce_sum(gamma_data_weights, axis=0))

        weighted_state_action_count = tf.reduce_sum(
            tf.one_hot(self._nu_indices, self._dimension)[:, :, None] *
            weights[:, None, :],
            axis=0)
        weighted_state_action_count = tf.gather(weighted_state_action_count,
                                                self._nu_indices)
        my_td_mat = tf.einsum(
            'ai, ab, ab, aj -> bij',
            tf.one_hot(self._nu_indices, self._dimension),
            #1.0 / tf.cast(self._state_action_count, tf.float32),
            1.0 / weighted_state_action_count,
            weights,
            self._a_vec)
        my_bias = tf.reduce_sum(
            tf.transpose(weights)[:, :, None] *
            tf.one_hot(self._nu_indices, self._dimension)[None, :, :] *
            tf.reshape(self._weighted_rewards, [1, -1, 1]) *
            #1.0 / tf.cast(self._state_action_count, tf.float32)[None, :, None],
            1.0 / tf.transpose(weighted_state_action_count)[:, :, None],
            axis=1)

        #print('hello', saddle_initial_nu_values[:1], saddle_zetas[:3],
        #      self._nu[:2], my_bias[:, :2], saddle_loss[:4])

        with tf.GradientTape(watch_accessed_variables=False,
                             persistent=True) as tape:
            tape.watch([self._nu, self._nu2, self._alpha])
            bellman_residuals = tf.matmul(
                my_td_mat,
                tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
            bellman_residuals = tf.transpose(tf.squeeze(bellman_residuals, -1))
            bellman_residuals = tf.gather(bellman_residuals, self._nu_indices)
            initial_nu_values = tf.reduce_sum(  # Average over actions.
                self._initial_target_probs[:, :, None] *
                tf.gather(self._nu, self._initial_nu_indices),
                axis=1)

            bellman_residuals *= self._algae_alpha_sign

            init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                            self._algae_alpha_sign)

            nu_loss = (tf.math.square(bellman_residuals) / 2.0 +
                       tf.math.abs(self._algae_alpha) * init_nu_loss)

            loss = (gamma_data_weights * nu_loss /
                    tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

            bellman_residuals2 = tf.matmul(
                my_td_mat,
                tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
            bellman_residuals2 = tf.transpose(
                tf.squeeze(bellman_residuals2, -1))
            bellman_residuals2 = tf.gather(bellman_residuals2,
                                           self._nu_indices)
            initial_nu_values2 = tf.reduce_sum(  # Average over actions.
                self._initial_target_probs[:, :, None] *
                tf.gather(self._nu2, self._initial_nu_indices),
                axis=1)

            bellman_residuals2 *= -1 * self._algae_alpha_sign

            init_nu_loss2 = ((1 - self._gamma) * initial_nu_values2 * -1 *
                             self._algae_alpha_sign)

            nu_loss2 = (tf.math.square(bellman_residuals2) / 2.0 +
                        tf.math.abs(self._algae_alpha) * init_nu_loss2)

            loss2 = (gamma_data_weights * nu_loss2 /
                     tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

            divergence = self._compute_divergence(weights, log_weights)
            divergence_violation = divergence - self._two_sided_limit

            alpha_loss = (-tf.exp(self._alpha) *
                          tf.stop_gradient(divergence_violation))

            extra_loss = tf.reduce_sum(tf.math.square(self._nu[-1, :]))
            extra_loss2 = tf.reduce_sum(tf.math.square(self._nu2[-1, :]))
            nu_grad = tape.gradient(loss + extra_loss, [self._nu])[0]
            nu_grad2 = tape.gradient(loss2 + extra_loss2, [self._nu2])[0]
        avg_loss = tf.reduce_sum(0.5 * (loss - loss2) /
                                 tf.math.abs(self._algae_alpha),
                                 axis=0)
        nu_jacob = tape.jacobian(nu_grad, [self._nu])[0]
        nu_hess = tf.stack(
            [nu_jacob[:, i, :, i] for i in range(self._num_limits)], axis=0)

        nu_jacob2 = tape.jacobian(nu_grad2, [self._nu2])[0]
        nu_hess2 = tf.stack(
            [nu_jacob2[:, i, :, i] for i in range(self._num_limits)], axis=0)

        for idx, div in enumerate(divergence):
            tf.summary.scalar('divergence%d' % idx, div)

        #alpha_grads = tape.gradient(alpha_loss, [self._alpha])
        #alpha_grad_op = self._alpha_optimizer.apply_gradients(
        #    zip(alpha_grads, [self._alpha]))
        #self._alpha.assign(tf.minimum(8., tf.maximum(-8., self._alpha)))

        #print(self._alpha, tf.concat([weights, nu_loss], -1))
        #regularizer = 0.1
        nu_transformed = tf.transpose(
            tf.squeeze(
                tf.linalg.solve(
                    nu_hess + regularizer * tf.eye(self._dimension),
                    tf.expand_dims(-tf.transpose(nu_grad), axis=-1))))
        self._nu = self._nu + 0.1 * nu_transformed
        nu_transformed2 = tf.transpose(
            tf.squeeze(
                tf.linalg.solve(
                    nu_hess2 + regularizer * tf.eye(self._dimension),
                    tf.expand_dims(-tf.transpose(nu_grad2), axis=-1))))
        self._nu2 = self._nu2 + 0.1 * nu_transformed2

        print(avg_loss * self._algae_alpha_sign,
              avg_saddle_loss * self._algae_alpha_sign, self._nu[:2],
              divergence)
        #print(init_nu_loss[:8], init_nu_loss[-8:])
        #print(bellman_residuals[:8])
        #print(self._nu[:3], self._zeta[:3])

        zetas = tf.matmul(my_td_mat,
                          tf.transpose(self._nu)[:, :, None]) - my_bias[:, :,
                                                                        None]
        zetas = tf.transpose(tf.squeeze(zetas, -1))
        zetas *= -self._algae_alpha_sign
        zetas /= tf.math.abs(self._algae_alpha)
        self._zeta = self._zeta + 0.1 * (zetas - self._zeta)

        zetas2 = tf.matmul(my_td_mat,
                           tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :,
                                                                          None]
        zetas2 = tf.transpose(tf.squeeze(zetas2, -1))
        zetas2 *= 1 * self._algae_alpha_sign
        zetas2 /= tf.math.abs(self._algae_alpha)
        self._zeta2 = self._zeta2 + 0.1 * (zetas2 - self._zeta2)

        #self._zeta = (
        #    tf.einsum('ij,ja-> ia', self._td_mat, self._nu) -
        #    tf.transpose(my_bias))
        #self._zeta *= -tf.reshape(self._algae_alpha_sign, [1, self._num_limits])
        #self._zeta /= tf.math.abs(self._algae_alpha)
        return [
            avg_saddle_loss * self._algae_alpha_sign,
            avg_loss * self._algae_alpha_sign, divergence
        ]
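A generic sketch of the bisection pattern used above to tune alpha against the divergence limit. divergence_of is a stand-in callable (assumed to decrease monotonically as alpha increases), not a method of the estimator.

import tensorflow as tf

def bisect_alpha(divergence_of, limit, low=-8., high=32., iters=16):
    # Shrink [low, high] until divergence_of(mid) straddles the limit.
    left = tf.constant([low, low])
    right = tf.constant([high, high])
    for _ in range(iters):
        mid = 0.5 * (left + right)
        violation = divergence_of(mid) - limit
        left = tf.where(violation > 0., mid, left)
        right = tf.where(violation > 0., right, mid)
    return 0.5 * (left + right)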
Code Example #5
  def prepare_dataset(self, dataset: dataset_lib.OffpolicyDataset,
                      target_policy: tf_policy.TFPolicy):
    """Performs pre-computations on dataset to make solving easier."""
    episodes, valid_steps = dataset.get_all_episodes(limit=self._limit_episodes)
    total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1
    num_episodes = tf.shape(valid_steps)[0]
    num_samples = num_episodes * total_num_steps_per_episode
    valid_and_not_last = tf.logical_and(valid_steps, episodes.discount > 0)
    valid_indices = tf.squeeze(
        tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

    # Flatten all tensors so that each data sample is a tuple of
    # (initial_env_step, env_step, next_env_step).
    initial_env_step = tf.nest.map_structure(
        lambda t: tf.squeeze(
            tf.reshape(
                tf.repeat(
                    t[:, 0:1, ...], axis=1, repeats=total_num_steps_per_episode
                ), [num_samples, -1])), episodes)
    initial_env_step = tf.nest.map_structure(
        lambda t: tf.gather(t, valid_indices), initial_env_step)
    tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep(
        initial_env_step)

    env_step = tf.nest.map_structure(
        lambda t: tf.squeeze(
            tf.reshape(t[:, 0:total_num_steps_per_episode, ...],
                       [num_samples, -1])), episodes)
    env_step = tf.nest.map_structure(lambda t: tf.gather(t, valid_indices),
                                     env_step)
    tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(env_step)

    next_env_step = tf.nest.map_structure(
        lambda t: tf.squeeze(
            tf.reshape(t[:, 1:total_num_steps_per_episode + 1, ...],
                       [num_samples, -1])), episodes)
    next_env_step = tf.nest.map_structure(lambda t: tf.gather(t, valid_indices),
                                          next_env_step)
    tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep(
        next_env_step)

    # Get target probabilities for initial and next steps.
    initial_target_probs = target_policy.distribution(
        tfagents_initial_env_step).action.probs_parameter()
    next_target_probs = target_policy.distribution(
        tfagents_next_env_step).action.probs_parameter()

    # Map states and actions to indices into tabular representation.
    initial_states = tf.tile(
        tf.reshape(initial_env_step.observation, [-1, 1]),
        [1, self._num_actions])
    initial_actions = tf.tile(
        tf.reshape(tf.range(self._num_actions), [1, -1]),
        [initial_env_step.observation.shape[0], 1])
    initial_nu_indices = self._get_index(initial_states, initial_actions)

    next_states = tf.tile(
        tf.reshape(next_env_step.observation, [-1, 1]), [1, self._num_actions])
    next_actions = tf.tile(
        tf.reshape(tf.range(self._num_actions), [1, -1]),
        [next_env_step.observation.shape[0], 1])
    next_nu_indices = self._get_index(next_states, next_actions)
    next_nu_indices = tf.where(
        tf.expand_dims(next_env_step.is_absorbing(), -1),
        -1 * tf.ones_like(next_nu_indices), next_nu_indices)

    nu_indices = self._get_index(env_step.observation, env_step.action)

    target_log_probabilities = target_policy.distribution(
        tfagents_env_step).action.log_prob(env_step.action)
    if not self._solve_for_state_action_ratio:
      policy_ratio = tf.exp(target_log_probabilities -
                            env_step.get_log_probability())
    else:
      policy_ratio = tf.ones([
          target_log_probabilities.shape[0],
      ])
    policy_ratios = tf.tile(
        tf.reshape(policy_ratio, [-1, 1]), [1, self._num_actions])

    # Bellman residual matrix of size [n_data, n_dim].
    a_vec = tf.one_hot(nu_indices, self._dimension) - tf.reduce_sum(
        self._gamma *
        tf.expand_dims(next_target_probs * policy_ratios, axis=-1) *
        tf.one_hot(next_nu_indices, self._dimension),
        axis=1)

    state_action_count = self._get_state_action_counts(env_step)
    # Bellman residual matrix of size [n_dim, n_dim].
    td_mat = tf.einsum('ai, a, aj -> ij', tf.one_hot(nu_indices,
                                                     self._dimension),
                       1.0 / tf.cast(state_action_count, tf.float32), a_vec)

    # Reward vector of size [n_data].
    weighted_rewards = policy_ratio * self._reward_fn(env_step)

    # Reward vector of size [n_dim].
    bias = tf.reduce_sum(
        tf.one_hot(nu_indices, self._dimension) *
        tf.reshape(weighted_rewards, [-1, 1]) * 1.0 /
        tf.cast(state_action_count, tf.float32)[:, None],
        axis=0)

    # Initialize.
    self._nu = np.ones_like(self._nu) * bias[:, None]
    self._nu2 = np.ones_like(self._nu2) * bias[:, None]

    self._a_vec = a_vec
    self._td_mat = td_mat
    self._bias = bias
    self._weighted_rewards = weighted_rewards
    self._state_action_count = state_action_count
    self._nu_indices = nu_indices
    self._initial_nu_indices = initial_nu_indices
    self._initial_target_probs = initial_target_probs
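A toy sketch (assumed shapes and values, not library data) of the flatten-and-gather pattern used above: reshape per-episode tensors into a flat batch of transitions, then keep only the indices that are valid and not terminal.

import tensorflow as tf

observations = tf.constant([[0, 1, 2, 3], [4, 5, 6, 7]])
valid_steps = tf.constant([[True, True, True, True],
                           [True, True, False, False]])
discount = tf.constant([[1., 1., 1., 0.], [1., 1., 0., 0.]])

valid_and_not_last = tf.logical_and(valid_steps, discount > 0)
valid_indices = tf.squeeze(
    tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

obs = tf.gather(tf.reshape(observations[:, :-1], [-1]), valid_indices)
next_obs = tf.gather(tf.reshape(observations[:, 1:], [-1]), valid_indices)
print(obs.numpy(), next_obs.numpy())  # current/next observation per sample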
Code Example #6
  def solve(self,
            dataset: dataset_lib.OffpolicyDataset,
            target_policy: tf_policy.TFPolicy,
            regularizer: float = 1e-8):
    """Solves for Q-values and then approximates target policy value.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.
      regularizer: A small constant to add before dividing.

    Returns:
      Estimated average per-step reward of the target policy.
    """
    num_estimates = 1 + int(self._num_qvalues)
    transition_matrix = np.zeros(
        [self._dimension, self._dimension, num_estimates])
    reward_vector = np.zeros(
        [self._dimension, num_estimates, self._num_perturbations])
    total_weights = np.zeros([self._dimension, num_estimates])

    episodes, valid_steps = dataset.get_all_episodes(limit=self._limit_episodes)
    #all_rewards = self._reward_fn(episodes)
    #reward_std = np.ma.MaskedArray(all_rewards, valid_steps).std()
    tfagents_episodes = dataset_lib.convert_to_tfagents_timestep(episodes)

    sample_weights = np.array(valid_steps, dtype=np.int64)
    if not self._bootstrap or self._num_qvalues is None:
      sample_weights = (
          sample_weights[:, :, None] * np.ones([1, 1, num_estimates]))
    else:
      probs = np.reshape(sample_weights, [-1]) / np.sum(sample_weights)
      weights = np.random.multinomial(
          np.sum(sample_weights), probs,
          size=self._num_qvalues).astype(np.float32)
      weights = np.reshape(
          np.transpose(weights),
          list(np.shape(sample_weights)) + [self._num_qvalues])
      sample_weights = np.concatenate([sample_weights[:, :, None], weights],
                                      axis=-1)

    for episode_num in range(tf.shape(valid_steps)[0]):
      # Precompute probabilities for this episode.
      this_episode = tf.nest.map_structure(lambda t: t[episode_num], episodes)
      this_tfagents_episode = dataset_lib.convert_to_tfagents_timestep(
          this_episode)
      episode_target_log_probabilities = target_policy.distribution(
          this_tfagents_episode).action.log_prob(this_episode.action)
      episode_target_probs = target_policy.distribution(
          this_tfagents_episode).action.probs_parameter()

      for step_num in range(tf.shape(valid_steps)[1] - 1):
        this_step = tf.nest.map_structure(lambda t: t[episode_num, step_num],
                                          episodes)
        next_step = tf.nest.map_structure(
            lambda t: t[episode_num, step_num + 1], episodes)
        this_tfagents_step = dataset_lib.convert_to_tfagents_timestep(this_step)
        next_tfagents_step = dataset_lib.convert_to_tfagents_timestep(next_step)
        this_weights = sample_weights[episode_num, step_num, :]
        if this_step.is_last() or not valid_steps[episode_num, step_num]:
          continue

        weight = this_weights
        this_index = self._get_index(this_step.observation, this_step.action)

        reward_vector[this_index, :, :] += np.expand_dims(
            self._reward_fn(this_step) * weight, -1)
        if self._num_qvalues is not None:
          random_noise = np.random.binomial(this_weights[1:].astype('int64'),
                                            0.5)
          reward_vector[this_index, 1:, :] += (
              self._perturbation_scale[None, :] *
              (2 * random_noise - this_weights[1:])[:, None])

        total_weights[this_index] += weight

        policy_ratio = 1.0
        if not self._solve_for_state_action_value:
          policy_ratio = tf.exp(episode_target_log_probabilities[step_num] -
                                this_step.get_log_probability())

        # Need to weight next nu by importance weight.
        next_weight = (
            weight if self._solve_for_state_action_value else policy_ratio *
            weight)
        if next_step.is_absorbing():
          next_index = -1  # Absorbing state.
          transition_matrix[this_index, next_index] += next_weight
        else:
          next_probs = episode_target_probs[step_num + 1]
          for next_action, next_prob in enumerate(next_probs):
            next_index = self._get_index(next_step.observation, next_action)
            transition_matrix[this_index, next_index] += next_prob * next_weight
    print('Done processing data.')

    transition_matrix /= (regularizer + total_weights)[:, None, :]
    reward_vector /= (regularizer + total_weights)[:, :, None]
    reward_vector[np.where(np.equal(total_weights,
                                    0.0))] = self._default_reward_value
    reward_vector[-1, :, :] = 0.0  # Terminal absorbing state has 0 reward.

    self._point_qvalues = np.linalg.solve(
        np.eye(self._dimension) - self._gamma * transition_matrix[:, :, 0],
        reward_vector[:, 0])
    if self._num_qvalues is not None:
      self._ensemble_qvalues = np.linalg.solve(
          (np.eye(self._dimension) -
           self._gamma * np.transpose(transition_matrix, [2, 0, 1])),
          np.transpose(reward_vector, [1, 0, 2]))

    return self.estimate_average_reward(dataset, target_policy)
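A toy sketch of the final model-based step in solve(): with a tabular transition matrix P whose rows sum to one and a reward vector R, the Q-values satisfy (I - gamma * P) Q = R. The numbers below are made up for illustration.

import numpy as np

gamma = 0.9
P = np.array([[0.8, 0.2, 0.0],
              [0.1, 0.7, 0.2],
              [0.0, 0.0, 1.0]])  # last row models an absorbing state
R = np.array([1.0, 0.5, 0.0])    # absorbing state gets zero reward

Q = np.linalg.solve(np.eye(3) - gamma * P, R)
print(Q)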
Code Example #7
    def prepare_dataset(self, dataset: dataset_lib.OffpolicyDataset,
                        target_policy: tf_policy.TFPolicy):
        episodes, valid_steps = dataset.get_all_episodes()
        tfagents_episodes = dataset_lib.convert_to_tfagents_timestep(episodes)

        for episode_num in range(tf.shape(valid_steps)[0]):
            # Precompute probabilities for this episode.
            this_episode = tf.nest.map_structure(lambda t: t[episode_num],
                                                 episodes)
            first_step = tf.nest.map_structure(lambda t: t[0], this_episode)
            this_tfagents_episode = dataset_lib.convert_to_tfagents_timestep(
                this_episode)
            episode_target_log_probabilities = target_policy.distribution(
                this_tfagents_episode).action.log_prob(this_episode.action)
            episode_target_probs = target_policy.distribution(
                this_tfagents_episode).action.probs_parameter()

            for step_num in range(tf.shape(valid_steps)[1] - 1):
                this_step = tf.nest.map_structure(
                    lambda t: t[episode_num, step_num], episodes)
                next_step = tf.nest.map_structure(
                    lambda t: t[episode_num, step_num + 1], episodes)
                if this_step.is_last() or not valid_steps[episode_num,
                                                          step_num]:
                    continue

                weight = 1.0
                nu_index = self._get_index(this_step.observation,
                                           this_step.action)
                self._td_residuals[nu_index, nu_index] += -weight
                self._total_weights[nu_index] += weight

                policy_ratio = 1.0
                if not self._solve_for_state_action_ratio:
                    policy_ratio = tf.exp(
                        episode_target_log_probabilities[step_num] -
                        this_step.get_log_probability())

                # Need to weight next nu by importance weight.
                next_weight = (weight if self._solve_for_state_action_ratio
                               else policy_ratio * weight)
                next_probs = episode_target_probs[step_num + 1]
                for next_action, next_prob in enumerate(next_probs):
                    next_nu_index = self._get_index(next_step.observation,
                                                    next_action)
                    self._td_residuals[next_nu_index,
                                       nu_index] += (next_prob * self._gamma *
                                                     next_weight)

                initial_probs = episode_target_probs[0]
                for initial_action, initial_prob in enumerate(initial_probs):
                    initial_nu_index = self._get_index(first_step.observation,
                                                       initial_action)
                    self._initial_weights[
                        initial_nu_index] += weight * initial_prob

        self._initial_weights = tf.cast(self._initial_weights, tf.float32)
        self._total_weights = tf.cast(self._total_weights, tf.float32)
        self._td_residuals = self._td_residuals / np.sqrt(
            1e-8 + self._total_weights)[None, :]
        self._td_errors = tf.cast(
            np.dot(self._td_residuals, self._td_residuals.T), tf.float32)
        self._td_residuals = tf.cast(self._td_residuals, tf.float32)
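A toy illustration (made-up log-probabilities) of the per-step policy ratio used above when solving for state ratios rather than state-action ratios: the ratio is the exponentiated difference between the target and behavior log-probabilities.

import tensorflow as tf

target_log_prob = tf.constant(-0.7)    # log pi_target(a | s), assumed value
behavior_log_prob = tf.constant(-1.2)  # logged behavior log-probability
policy_ratio = tf.exp(target_log_prob - behavior_log_prob)
print(float(policy_ratio))  # exp(0.5), roughly 1.65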
Code Example #8
    def estimate_reward_ci(self,
                           dataset: dataset_lib.OffpolicyDataset,
                           target_policy: tf_policy.TFPolicy,
                           episode_limit: Optional[int] = None,
                           num_grid: Optional[int] = 100,
                           eps: Optional[float] = 1e-6,
                           num_bootstraps: Optional[int] = 10000,
                           num_bootstrap_samples: Optional[int] = 10000):
        """Estimate the confidence interval of reward."""
        is_weighted_reward_samples = self.get_is_weighted_reward_samples(
            dataset, target_policy, episode_limit)
        episodes, valid_steps = dataset.get_all_episodes(limit=episode_limit)
        num_episodes = tf.shape(valid_steps)[0]
        max_abs_reward = tf.reduce_max(
            tf.where(valid_steps, tf.abs(self._reward_fn(episodes)), 0.))

        # mean estimate
        center = self.estimate_average_reward(dataset, target_policy)
        delta_tail_half = self._delta_tail / 2.0
        num_episodes_float = tf.cast(num_episodes, tf.float32)

        if self._ci_method == 'CH':  # Chernoff-Hoeffding
            width = max_abs_reward * tf.math.sqrt(
                tf.math.log(1.0 / delta_tail_half) / num_episodes_float)
            lb = center - width
            ub = center + width
        elif self._ci_method == 'BE':  # Empirical Bernstein
            constant_term = 7 * max_abs_reward * tf.math.log(
                2.0 / delta_tail_half) / (3 * (num_episodes_float - 1))
            is_weighted_reward_samples_2d = tf.reshape(
                is_weighted_reward_samples, [-1, 1])
            variance_term = tf.reduce_sum(
                tf.square(
                    tf.tile(is_weighted_reward_samples_2d, [1, num_episodes]) -
                    is_weighted_reward_samples_2d))

            variance_term *= tf.math.log(
                2.0 / delta_tail_half) / (num_episodes_float - 1)
            width = constant_term + tf.math.sqrt(
                variance_term) / num_episodes_float
            lb = center - width
            ub = center + width
        elif self._ci_method == 'C-BE':  # Clipped empirical Bernstein
            # Need to search for the clipping constant c.
            def compute_center_width(c_const):
                """Compute the center and width of CI."""
                c_vec = c_const * tf.ones_like(is_weighted_reward_samples)
                c_is_weighted_reward_samples = tf.minimum(
                    is_weighted_reward_samples, c_vec) / c_vec
                c_is_weighted_reward_samples_2d = tf.reshape(
                    c_is_weighted_reward_samples, [-1, 1])
                constant_term = 7 * num_episodes_float * tf.math.log(
                    2.0 / delta_tail_half) / (3 * (num_episodes_float - 1))

                variance_term = tf.reduce_sum(
                    tf.square(
                        tf.tile(c_is_weighted_reward_samples_2d,
                                [1, num_episodes]) -
                        c_is_weighted_reward_samples_2d))
                variance_term *= tf.math.log(
                    2.0 / delta_tail_half) / (num_episodes_float - 1)

                width = (constant_term + tf.math.sqrt(variance_term)
                         ) / tf.reduce_sum(1.0 / c_vec)
                center = tf.reduce_sum(
                    c_is_weighted_reward_samples) / tf.reduce_sum(1.0 / c_vec)
                return center, width

            def compute_bdd(c_const):
                center, width = compute_center_width(c_const)
                return center - width, center + width

            def compute_obj(c_const, obj='width'):
                center, width = compute_center_width(c_const)
                if obj == 'lb':
                    return center - width
                elif obj == 'ub':  # minimize ub
                    return -(center + width)
                elif obj == 'width':
                    return width
                elif obj == 'lb_ub':
                    return -2 * width
                else:
                    raise ValueError('Objective is not implemented')

            c_grid = tf.linspace(eps, max_abs_reward, num_grid)
            objs = tf.map_fn(compute_obj, c_grid, dtype=tf.float32)

            star_index = tf.argmax(objs)
            c_star = tf.gather(c_grid, star_index)

            lb, ub = compute_bdd(c_star)

        elif self._ci_method == 'TT':  # Student-t test
            # Two-tailed confidence intervals
            t_statistic_quantile = stats.t.ppf(1 - delta_tail_half,
                                               num_episodes_float - 1)
            std_term = tf.math.sqrt(
                tf.reduce_sum(tf.square(is_weighted_reward_samples - center)) /
                (num_episodes_float - 1))
            width = t_statistic_quantile * std_term / tf.math.sqrt(
                num_episodes_float)
            lb = center - width
            ub = center + width
        elif self._ci_method == 'BCa':  # Bootstrap
            # see references
            # https://faculty.washington.edu/heagerty/Courses/b572/public/GregImholte-1.pdf
            # http://users.stat.umn.edu/~helwig/notes/bootci-Notes.pdf
            gaussian_rv = tfp.distributions.Normal(loc=0, scale=1)

            def _compute_bootstrap_lb_ub(reward_samples):
                """Compute Efron's bootstrap lb."""
                sample_mean = tf.reduce_mean(reward_samples)
                # Step 1, sample with replacement and compute subsampled mean
                uniform_log_prob = tf.tile(
                    tf.expand_dims(tf.zeros(num_episodes), 0),
                    [num_bootstraps, 1])
                ind = tf.random.categorical(uniform_log_prob,
                                            num_bootstrap_samples)
                bootstrap_subsamples = tf.gather(reward_samples, ind)
                subsample_means = tf.reduce_mean(bootstrap_subsamples, axis=1)

                # Step 2, sort subsample means, compute y, z_0, and a
                sorted_subsample_means = tf.sort(subsample_means,
                                                 axis=0,
                                                 direction='ASCENDING')

                # bias factor
                z_0 = gaussian_rv.quantile(
                    tf.reduce_sum(
                        tf.cast(
                            tf.greater(sample_mean, sorted_subsample_means),
                            tf.float32)) / float(num_bootstraps))
                # y is the leave-one-out, jackknife sample mean
                mask_matrix = tf.ones([num_episodes, num_episodes
                                       ]) - tf.eye(num_episodes)
                leave_one_out_subsample_sums = tf.einsum(
                    'j,jk->k', reward_samples, mask_matrix)
                ys = leave_one_out_subsample_sums / (num_episodes_float - 1)

                # average of jackknife estimate
                y_bar = tf.reduce_mean(ys)

                # acceleration factor
                d_ys = y_bar - ys
                a = tf.reduce_sum(tf.pow(d_ys, 3.0)) / tf.maximum(
                    eps, 6.0 * tf.pow(tf.reduce_sum(tf.pow(d_ys, 2.0)), 1.5))

                # Step 3, compute z_scores for lb and ub
                z_score_delta_tail = gaussian_rv.quantile(delta_tail_half)
                z_score_1_delta_tail = gaussian_rv.quantile(1.0 -
                                                            delta_tail_half)

                z_lb = z_0 + (z_score_delta_tail + z_0) / tf.maximum(
                    eps, 1 - a * (z_score_delta_tail + z_0))
                z_ub = z_0 + (z_score_1_delta_tail + z_0) / tf.maximum(
                    eps, 1 - a * (z_score_1_delta_tail + z_0))

                # Step 4, compute corresponding quantiles and get bootstrap intervals
                lb_index = tf.cast(
                    tf.maximum(
                        tf.minimum(
                            tf.floor(num_bootstraps * gaussian_rv.cdf(z_lb)),
                            num_bootstraps - 1), 1), tf.int64)
                ub_index = tf.cast(
                    tf.maximum(
                        tf.minimum(
                            tf.floor(num_bootstraps * gaussian_rv.cdf(z_ub)),
                            num_bootstraps - 1), 1), tf.int64)

                lb = tf.gather(sorted_subsample_means, lb_index)
                ub = tf.gather(sorted_subsample_means, ub_index)

                return lb, ub

            lb, ub = _compute_bootstrap_lb_ub(is_weighted_reward_samples)
        else:
            raise ValueError('Confidence interval is not implemented!')
        return [lb, ub]
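A standalone numeric sketch of the Student-t ('TT') branch above, using made-up samples and, for simplicity, the sample mean as the center: width = t_{1 - delta/2, n-1} * std / sqrt(n).

import numpy as np
from scipy import stats

samples = np.array([0.8, 1.1, 0.9, 1.3, 1.0])  # assumed IS-weighted rewards
delta_tail = 0.05
n = len(samples)
center = samples.mean()
std_term = np.sqrt(np.sum((samples - center) ** 2) / (n - 1))
width = stats.t.ppf(1 - delta_tail / 2, n - 1) * std_term / np.sqrt(n)
print(center - width, center + width)  # lower and upper confidence bounds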
Code Example #9
    def get_is_weighted_reward_samples(self,
                                       dataset: dataset_lib.OffpolicyDataset,
                                       target_policy: tf_policy.TFPolicy,
                                       episode_limit: Optional[int] = None,
                                       eps: Optional[float] = 1e-8):
        """Get the IS weighted reweard samples."""
        episodes, valid_steps = dataset.get_all_episodes(limit=episode_limit)
        total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1
        num_episodes = tf.shape(valid_steps)[0]
        num_samples = num_episodes * total_num_steps_per_episode

        init_env_step = tf.nest.map_structure(lambda t: t[:, 0, ...], episodes)
        env_step = tf.nest.map_structure(
            lambda t: tf.squeeze(
                tf.reshape(t[:, 0:total_num_steps_per_episode, ...],
                           [num_samples, -1])), episodes)
        next_env_step = tf.nest.map_structure(
            lambda t: tf.squeeze(
                tf.reshape(t[:, 1:1 + total_num_steps_per_episode, ...],
                           [num_samples, -1])), episodes)
        tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(env_step)

        gamma_weights = tf.reshape(
            tf.pow(self._gamma, tf.cast(env_step.step_num, tf.float32)),
            [num_episodes, total_num_steps_per_episode])

        rewards = (-self._get_q_value(env_step) + self._reward_fn(env_step) +
                   self._gamma * next_env_step.discount *
                   self._get_v_value(next_env_step, target_policy))
        rewards = tf.reshape(rewards,
                             [num_episodes, total_num_steps_per_episode])

        init_values = self._get_v_value(init_env_step, target_policy)
        init_offset = (1 - self._gamma) * init_values

        target_log_probabilities = target_policy.distribution(
            tfagents_env_step).action.log_prob(env_step.action)
        if tf.rank(target_log_probabilities) > 1:
            target_log_probabilities = tf.reduce_sum(target_log_probabilities,
                                                     -1)
        if self._policy_network is not None:
            baseline_policy_log_probability = self._get_log_prob(
                self._policy_network, env_step)
            if tf.rank(baseline_policy_log_probability) > 1:
                baseline_policy_log_probability = tf.reduce_sum(
                    baseline_policy_log_probability, -1)
            policy_log_ratios = tf.reshape(
                tf.maximum(
                    -1.0 / eps, target_log_probabilities -
                    baseline_policy_log_probability),
                [num_episodes, total_num_steps_per_episode])
        else:
            policy_log_ratios = tf.reshape(
                tf.maximum(
                    -1.0 / eps,
                    target_log_probabilities - env_step.get_log_probability()),
                [num_episodes, total_num_steps_per_episode])
        valid_steps_in = valid_steps[:, 0:total_num_steps_per_episode]
        mask = tf.cast(
            tf.logical_and(valid_steps_in, episodes.discount[:, :-1] > 0.),
            tf.float32)

        masked_rewards = tf.where(mask > 0, rewards, tf.zeros_like(rewards))
        clipped_policy_log_ratios = mask * self.clip_log_factor(
            policy_log_ratios)

        if self._mode in ['trajectory-wise', 'weighted-trajectory-wise']:
            trajectory_avg_rewards = tf.reduce_sum(
                masked_rewards * gamma_weights, axis=1) / tf.reduce_sum(
                    gamma_weights, axis=1)
            trajectory_log_ratios = tf.reduce_sum(clipped_policy_log_ratios,
                                                  axis=1)
            if self._mode == 'trajectory-wise':
                trajectory_avg_rewards *= tf.exp(trajectory_log_ratios)
                return init_offset + trajectory_avg_rewards
            else:
                offset = tf.reduce_max(trajectory_log_ratios)
                normalized_clipped_ratios = tf.exp(trajectory_log_ratios -
                                                   offset)
                normalized_clipped_ratios /= tf.maximum(
                    eps, tf.reduce_mean(normalized_clipped_ratios))
                trajectory_avg_rewards *= normalized_clipped_ratios
                return init_offset + trajectory_avg_rewards

        elif self._mode in ['step-wise', 'weighted-step-wise']:
            trajectory_log_ratios = mask * tf.cumsum(policy_log_ratios, axis=1)
            if self._mode == 'step-wise':
                trajectory_avg_rewards = tf.reduce_sum(
                    masked_rewards * gamma_weights *
                    tf.exp(trajectory_log_ratios),
                    axis=1) / tf.reduce_sum(gamma_weights, axis=1)
                return init_offset + trajectory_avg_rewards
            else:
                # Average over data, for each time step.
                offset = tf.reduce_max(trajectory_log_ratios,
                                       axis=0)  # TODO: Handle mask.
                normalized_imp_weights = tf.exp(trajectory_log_ratios - offset)
                normalized_imp_weights /= tf.maximum(
                    eps,
                    tf.reduce_sum(mask * normalized_imp_weights, axis=0) /
                    tf.maximum(eps, tf.reduce_sum(mask, axis=0)))[None, :]

                trajectory_avg_rewards = tf.reduce_sum(
                    masked_rewards * gamma_weights * normalized_imp_weights,
                    axis=1) / tf.reduce_sum(gamma_weights, axis=1)
                return init_offset + trajectory_avg_rewards
        else:
            raise ValueError('Estimator is not implemented!')
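A toy sketch (made-up numbers; masking and the init_offset term omitted) of the step-wise branch above: per-step log importance ratios are accumulated with a cumulative sum, exponentiated, and used to weight the discounted rewards.

import tensorflow as tf

gamma = 0.99
rewards = tf.constant([[0.0, 1.0, 0.5]])              # [episodes, steps]
policy_log_ratios = tf.constant([[0.1, -0.2, 0.05]])  # per-step log(pi/mu)
gamma_weights = tf.pow(gamma, tf.range(3, dtype=tf.float32))[None, :]

trajectory_log_ratios = tf.cumsum(policy_log_ratios, axis=1)
step_wise_estimate = tf.reduce_sum(
    rewards * gamma_weights * tf.exp(trajectory_log_ratios),
    axis=1) / tf.reduce_sum(gamma_weights, axis=1)
print(step_wise_estimate.numpy())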