Ejemplo n.º 1
0
  def test_compute_fixed_budget(self):
    """Checks compute_fixed_budget costs against known values for one KL input."""
    # Fixed KL-per-step input and the budgets the solver is asked about.
    per_step_kl = np.array([5., 4., 3., 2.85, 2.80])
    step_budgets = [1, 2, 3, 4, 5]

    # Reference values the returned costs are compared against.
    expected_costs = [25, 19, 18, 17.7, 17.65]

    # Only the second return value (the costs) is checked here.
    unused_policies, actual_costs = dynamic_programming.compute_fixed_budget(
        per_step_kl, step_budgets)

    costs_match = np.allclose(actual_costs, expected_costs)
    self.assertTrue(costs_match)
Ejemplo n.º 2
0
    def compute_policies_and_costs(self, kl_per_t, budgets):
        """Runs the fixed-budget dynamic program on sorted KL values.

        Args:
            kl_per_t: Array of KL values; its leading dimension must equal
                `self.num_steps`.
            budgets: Budgets forwarded unchanged to
                `dynamic_programming.compute_fixed_budget`.

        Returns:
            The `(policies, costs)` tuple obtained from
            `dynamic_programming.compute_fixed_budget`.
        """
        assert kl_per_t.shape[0] == self.num_steps

        # The KL values are stochastic estimates and need not be monotone. The
        # dynamic program readily exploits such non-monotonicity, so it is fed
        # sorted values (largest first for a 1-D input) to reduce that bias.
        ascending_kls = jnp.sort(kl_per_t)
        descending_kls = ascending_kls[::-1]

        solved_policies, solved_costs = dynamic_programming.compute_fixed_budget(
            descending_kls, budgets)
        return solved_policies, solved_costs
Ejemplo n.º 3
0
    def compute_policies_and_costs(self, kl_per_t, budgets):
        """Solves the fixed-budget dynamic program one stage at a time.

        `kl_per_t` is cut into `self.num_stages` consecutive chunks of
        `self.num_steps_per_stage` entries. Each chunk is sorted, reversed and
        handed to `dynamic_programming.compute_fixed_budget` together with
        `budgets`; the per-stage policies are collected and the per-stage
        costs are summed.

        Args:
            kl_per_t: Array of KL values; its leading dimension must equal
                `self.num_steps`.
            budgets: Budgets passed unchanged to every per-stage solve.

        Returns:
            A `(policies, costs)` tuple. `policies` is a list with
            `len(budgets)` int32 arrays, each built from the per-stage
            policies collected for one slot; `costs` is a list with
            `len(budgets)` entries holding the per-stage costs summed over
            all stages.
        """
        assert kl_per_t.shape[0] == self.num_steps

        num_budgets = len(budgets)
        per_slot_policies = [[] for _ in range(num_budgets)]
        total_costs = [0] * num_budgets

        for stage_idx in range(self.num_stages):
            stage_len = self.num_steps_per_stage
            start = stage_idx * stage_len
            stage_kls = kl_per_t[start:start + stage_len]

            # The KL values are stochastic estimates and need not be monotone.
            # The dynamic program readily exploits such non-monotonicity, so
            # each stage is solved on sorted values to reduce that bias.
            stage_kls_desc = jnp.sort(stage_kls)[::-1]

            stage_policies, stage_costs = (
                dynamic_programming.compute_fixed_budget(
                    stage_kls_desc, budgets))

            # Append this stage's policy to each slot and accumulate its cost.
            for slot, stage_policy in enumerate(stage_policies):
                per_slot_policies[slot].append(stage_policy)
                total_costs[slot] += stage_costs[slot]

        stacked_policies = [
            jnp.asarray(stage_list, dtype=jnp.int32)
            for stage_list in per_slot_policies
        ]

        return stacked_policies, total_costs