Code example #1
File: particles.py  Project: adak32/bellman
def averaged_particle_returns(reward: Tensor, discount: Tensor,
                              number_of_particles: int) -> Tensor:
    """
    Compute the returns from a set of trajectories, averaging over a number of particles per
    element of the batch.

    :param reward: A batch of trajectories of step rewards. The batch size is the number of action
                   trajectories multiplied by the number of monte-carlo rollouts of each action
                   trajectory.
    :param discount: A batch of trajectories of step discounts. The batch size is the number of
                     action trajectories multiplied by the number of monte-carlo rollouts of each
                     action trajectory.
    :param number_of_particles: Number of monte-carlo rollouts of each action trajectory.

    :return: Monte-carlo estimate of the returns from each action trajectory.
    """
    # Looks weird but is correct: at first sight it may seem that the last reward signal
    # is dropped. It is not: get_contiguous_sub_episodes builds the mask with tf's cumprod
    # using the `exclusive=True` flag, so the last reward signal is included.
    mask = get_contiguous_sub_episodes(discount)

    particles_returns = tf.reduce_sum(reward * mask,
                                      axis=1)  # shape = (batch_size,)
    batch_particles_returns = reshape_create_particle_axis(
        particles_returns, number_of_particles)
    return tf.reduce_mean(batch_particles_returns, axis=1)
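
For illustration, here is a minimal, self-contained sketch of the same computation. It assumes that get_contiguous_sub_episodes builds the mask via an exclusive cumulative product over the discounts (as the comment above and code example #2 suggest), and that reshape_create_particle_axis simply reshapes the flat (batch_size,) vector of per-rollout returns into shape (number_of_action_trajectories, number_of_particles), with the rollouts of each action trajectory contiguous in the batch. The function name and the toy numbers below are hypothetical.

import tensorflow as tf


def averaged_particle_returns_sketch(reward, discount, number_of_particles):
    # Mask out steps after episode termination: a zero discount at step t
    # zeroes every reward strictly after t (the exclusive cumprod keeps step t).
    episode_continues = tf.cast(tf.not_equal(discount, 0.0), tf.float32)
    mask = tf.math.cumprod(episode_continues, axis=1, exclusive=True)

    # Masked sum of step rewards for each monte-carlo rollout, shape (batch_size,).
    particles_returns = tf.reduce_sum(reward * mask, axis=1)

    # Group the rollouts of each action trajectory and average over them,
    # assuming the rollouts of a trajectory are contiguous in the batch.
    batch_particles_returns = tf.reshape(particles_returns, (-1, number_of_particles))
    return tf.reduce_mean(batch_particles_returns, axis=1)


# Two action trajectories, two rollouts each, three steps per rollout.
reward = tf.constant([[1.0, 1.0, 1.0],
                      [1.0, 1.0, 1.0],
                      [2.0, 2.0, 2.0],
                      [2.0, 2.0, 2.0]])
discount = tf.constant([[0.9, 0.9, 0.9],
                        [0.9, 0.0, 0.9],  # rollout terminates on the second step
                        [0.9, 0.9, 0.9],
                        [0.9, 0.9, 0.9]])
print(averaged_particle_returns_sketch(reward, discount, 2))  # [2.5, 6.0]

In the second rollout of the first trajectory the mask is [1, 1, 0], so its masked return is 2.0; averaging with the first rollout's 3.0 gives the 2.5 per-trajectory estimate.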
Code example #2
File: common_test.py  Project: tonylibing/agents
    def testNumSteps(self):
        discounts = [
            [0.9, 0.9, 0.9, 0.9],  # No episode termination.
            [0.0, 0.9, 0.9, 0.9],  # Episode terminates on first step.
            [0.9, 0.9, 0.0, 0.9],  # Episode terminates on third step.
        ]

        tensor = tf.constant(discounts, dtype=tf.float32)
        result = common.get_contiguous_sub_episodes(tensor)

        expected_result = [[1.0, 1.0, 1.0, 1.0], [1.0, 0.0, 0.0, 0.0],
                           [1.0, 1.0, 1.0, 0.0]]

        self.assertAllClose(expected_result, self.evaluate(result))
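
Reading this test together with the comment in code example #1, a minimal sketch of the behaviour of get_contiguous_sub_episodes might look as follows. It is a paraphrase of what testNumSteps exercises (a zero discount marks episode termination; the terminating step itself stays in the mask), not necessarily the library's actual source.

import tensorflow as tf


def get_contiguous_sub_episodes_sketch(discount):
    # A zero discount marks an episode boundary. The exclusive cumulative
    # product keeps the terminating step itself in the mask (value 1.0) and
    # zeroes every step after it, matching expected_result in testNumSteps.
    episode_end = tf.equal(discount, 0.0)
    return tf.math.cumprod(1.0 - tf.cast(episode_end, tf.float32),
                           axis=1, exclusive=True)


discounts = tf.constant([[0.9, 0.9, 0.9, 0.9],
                         [0.0, 0.9, 0.9, 0.9],
                         [0.9, 0.9, 0.0, 0.9]], dtype=tf.float32)
print(get_contiguous_sub_episodes_sketch(discounts))
# [[1. 1. 1. 1.]
#  [1. 0. 0. 0.]
#  [1. 1. 1. 0.]]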