Code Example #1
    def test_evaluate_mrp(self):
        start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})

        v = iterate.converged(
            evaluate_mrp(
                self.finite_flip_flop,
                γ=0.99,
                approx_0=start,
                non_terminal_states_distribution=Choose(
                    set(self.finite_flip_flop.states())),
                num_state_samples=5,
            ),
            done=lambda a, b: a.within(b, 1e-4),
        )

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            self.assertLess(abs(v(s) - 170), 1.0)

        v_finite = iterate.converged(
            evaluate_finite_mrp(self.finite_flip_flop, γ=0.99, approx_0=start),
            done=lambda a, b: a.within(b, 1e-4),
        )

        assert_allclose(v.evaluate([True, False]),
                        v_finite.evaluate([True, False]),
                        rtol=0.01)
Code Example #2
    def test_value_iteration(self):
        mdp_map: Mapping[NonTerminal[InventoryState],
                         float] = value_iteration_result(
                             self.si_mdp, self.gamma)[0]
        # print(mdp_map)
        mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])

        fa = Dynamic({s: 0.0 for s in self.states})
        mdp_finite_fa = iterate.converged(value_iteration_finite(
            self.si_mdp, self.gamma, fa),
                                          done=lambda a, b: a.within(b, 1e-5))
        # print(mdp_finite_fa.values_map)
        mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.01)

        mdp_fa = iterate.converged(value_iteration(self.si_mdp,
                                                   self.gamma,
                                                   fa,
                                                   Choose(self.states),
                                                   num_state_samples=30),
                                   done=lambda a, b: a.within(b, 1e-5))
        # print(mdp_fa.values_map)
        mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
        self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 0.01)
Code Example #3
    def test_evaluate_mrp(self):
        mrp_vf1: np.ndarray = self.implied_mrp.get_value_function_vec(
            self.gamma)
        # print({s: mrp_vf1[i] for i, s in enumerate(self.states)})

        fa = Dynamic({s: 0.0 for s in self.states})
        mrp_finite_fa = iterate.converged(
            evaluate_finite_mrp(self.implied_mrp, self.gamma, fa),
            done=lambda a, b: a.within(b, 1e-4),
        )
        # print(mrp_finite_fa.values_map)
        mrp_vf2: np.ndarray = mrp_finite_fa.evaluate(self.states)

        self.assertLess(max(abs(mrp_vf1 - mrp_vf2)), 0.001)

        mrp_fa = iterate.converged(
            evaluate_mrp(
                self.implied_mrp,
                self.gamma,
                fa,
                Choose(self.states),
                num_state_samples=30,
            ),
            done=lambda a, b: a.within(b, 0.1),
        )
        # print(mrp_fa.values_map)
        mrp_vf3: np.ndarray = mrp_fa.evaluate(self.states)
        self.assertLess(max(abs(mrp_vf1 - mrp_vf3)), 1.0)
Code Example #4
def value_iteration_result(mdp: FiniteMarkovDecisionProcess[S, A],
                           gamma: float) -> Tuple[V[S], FinitePolicy[S, A]]:
    # Iterate value-iteration updates until two successive value functions
    # are almost equal, then extract the greedy policy from the fixed point.
    opt_vf: V[S] = converged(value_iteration(mdp, gamma),
                             done=almost_equal_vfs)
    opt_policy: FinitePolicy[S, A] = greedy_policy_from_vf(mdp, opt_vf, gamma)

    return opt_vf, opt_policy
Code Example #5
    def solve(
            self,
            xy_vals_seq: Iterable[Tuple[X, float]],
            error_tolerance: Optional[float] = None
    ) -> LinearFunctionApprox[X]:
        if self.direct_solve:
            x_vals, y_vals = zip(*xy_vals_seq)
            feature_vals: np.ndarray = self.get_feature_values(x_vals)
            feature_vals_T: np.ndarray = feature_vals.T
            left: np.ndarray = np.dot(feature_vals_T, feature_vals) \
                + feature_vals.shape[0] * self.regularization_coeff * \
                np.eye(len(self.weights.weights))
            right: np.ndarray = np.dot(feature_vals_T, y_vals)
            ret = replace(self,
                          weights=Weights.create(
                              adam_gradient=self.weights.adam_gradient,
                              weights=np.dot(np.linalg.inv(left), right)))
        else:
            tol: float = 1e-6 if error_tolerance is None else error_tolerance

            def done(a: LinearFunctionApprox[X],
                     b: LinearFunctionApprox[X],
                     tol: float = tol) -> bool:
                return a.within(b, tol)

            ret = iterate.converged(self.iterate_updates(
                itertools.repeat(xy_vals_seq)),
                                    done=done)

        return ret
Code Example #6
File: assignment5_code.py  Project: sogipec/RL-book
def evaluate_mrp_result(
    mrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    approx_0: FunctionApprox[S],
) -> FunctionApprox[S]:
    v_star: FunctionApprox[S] = converged(
        evaluate_finite_mrp(mrp, gamma, approx_0),
        done=almost_equal_vf_approx)
    return v_star
Code Example #7
File: assignment5_code.py  Project: sogipec/RL-book
def approximate_policy_evaluation_result(mdp: FiniteMarkovDecisionProcess[S,
                                                                          A],
                                         policy: FinitePolicy[S, A],
                                         vf: FunctionApprox[S],
                                         gamma: float = 0.9):
    v_star: np.ndarray = converged(approximate_policy_evaluation(
        mdp, policy, vf, gamma),
                                   done=almost_equal_np_arrays)
    return {s: v_star[i] for i, s in enumerate(mdp.non_terminal_states)}
Code Example #8
    def test_evaluate_finite_mrp(self):
        start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})
        v = iterate.converged(
            evaluate_finite_mrp(self.finite_flip_flop, γ=0.99, approx_0=start),
            done=lambda a, b: a.within(b, 1e-4),
        )

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            self.assertLess(abs(v(s) - 170), 0.1)
Code Example #9
    def solve(self,
              xy_vals_seq: Iterable[Tuple[X, float]],
              error_tolerance: Optional[float] = None) -> DNNApprox[X]:
        tol: float = 1e-6 if error_tolerance is None else error_tolerance

        def done(a: DNNApprox[X], b: DNNApprox[X], tol: float = tol) -> bool:
            return a.within(b, tol)

        return iterate.converged(self.iterate_updates(
            itertools.repeat(xy_vals_seq)),
                                 done=done)
Code Example #10
    def test_converge(self):
        def close(a, b):
            return abs(a - b) < 0.1

        ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
        self.assertAlmostEqual(converged(ns, close), 0.33, places=2)

        ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
        all_ns = [1.0, 0.5, 0.33]
        for got, expected in zip(converge(ns, close), all_ns):
            self.assertAlmostEqual(got, expected, places=2)
Code Example #11
    def test_converge_end(self):
        """Check that converge ends the iterator at the right place when the
        underlying iterator ends before converging.

        """
        def close(a, b):
            return abs(a - b) < 0.1

        ns = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
        self.assertAlmostEqual(converged(iter(ns), close), 2.0)

        for got, expected in zip(converge(iter(ns), close), ns):
            self.assertAlmostEqual(got, expected)
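
Code Examples #10 and #11 pin down the contract of the convergence helpers themselves: converge yields values until two successive elements satisfy done (or until the underlying iterator ends), and converged returns the last value yielded. Below is a minimal sketch consistent with that behavior; the names iterate, converge and converged follow the snippets above, but the actual rl/iterate.py implementation may differ in its details.

from typing import Callable, Iterator, Optional, TypeVar

X = TypeVar('X')


def iterate(step: Callable[[X], X], start: X) -> Iterator[X]:
    # Yield start, step(start), step(step(start)), ... without end.
    state = start
    while True:
        yield state
        state = step(state)


def converge(values: Iterator[X], done: Callable[[X, X], bool]) -> Iterator[X]:
    # Yield values until two successive elements satisfy `done`
    # or the underlying iterator is exhausted.
    a: Optional[X] = next(values, None)
    if a is None:
        return
    yield a
    for b in values:
        if done(a, b):
            return
        a = b
        yield b


def converged(values: Iterator[X], done: Callable[[X, X], bool]) -> X:
    # Return the last element produced by `converge`.
    result: Optional[X] = None
    for x in converge(values, done):
        result = x
    if result is None:
        raise ValueError("converged called on an empty iterator")
    return result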
Code Example #12
    def test_evaluate_finite_mrp(self):
        start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
        traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
        v = iterate.converged(
            mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
            # Loose bound of 0.025 to speed up test.
            done=lambda a, b: a.within(b, 0.025))

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound; with a tighter one the test takes
            # >1s on my machine.
            self.assertLess(abs(v(s) - 170), 1.0)
Code Example #13
    def test_compare_to_backward_induction(self):
        finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

        start = Dynamic({s: 0.0 for s in finite_horizon.states()})
        v = iterate.converged(
            evaluate_finite_mrp(finite_horizon, γ=1, approx_0=start),
            done=lambda a, b: a.within(b, 1e-4),
        )

        self.assertEqual(len(v.values_map), 22)

        finite_v = list(
            evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

        for time in range(0, 10):
            self.assertAlmostEqual(v(WithTime(state=True, time=time)),
                                   finite_v[time][True])
            self.assertAlmostEqual(v(WithTime(state=False, time=time)),
                                   finite_v[time][False])
Code Example #14
File: prob_5.2.py  Project: lkourti/RL-book
    def update(vf_policy: Tuple[FunctionApprox[S], ThisPolicy[S, A]]) \
            -> Tuple[FunctionApprox[S], ThisPolicy[S, A]]:

        nt_states: Sequence[S] = non_terminal_states_distribution\
            .sample_n(num_state_samples)

        vf, pi = vf_policy
        mrp: MarkovRewardProcess[S] = mdp.apply_policy(pi)
        new_vf: FunctionApprox[S] = converged(
            evaluate_mrp(mrp, γ, vf, non_terminal_states_distribution, num_state_samples),
            done=lambda a, b: a.within(b, 1e-4)
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * new_vf.evaluate([s1]).item()

        return (new_vf.update([(s, max(mdp.step(s, a).expectation(return_)
                                       for a in mdp.actions(s))) for s in nt_states]),
                ThisPolicy(mdp, return_))
Code Example #15
File: td.py  Project: shenoy1/RL-book
def batch_td_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: ValueFunctionApprox[S],
        γ: float,
        convergence_tolerance: float = 1e-5) -> ValueFunctionApprox[S]:
    '''transitions is a finite iterable'''
    def step(v: ValueFunctionApprox[S],
             tr_seq: Sequence[mp.TransitionStep[S]]) -> ValueFunctionApprox[S]:
        return v.update([(tr.state,
                          tr.reward + γ * extended_vf(v, tr.next_state))
                         for tr in tr_seq])

    def done(a: ValueFunctionApprox[S],
             b: ValueFunctionApprox[S],
             convergence_tolerance=convergence_tolerance) -> bool:
        return b.within(a, convergence_tolerance)

    # Sweep the same finite batch of transitions repeatedly, folding the TD
    # updates into the value function, until successive approximations are
    # within the convergence tolerance.
    return iterate.converged(iterate.accumulate(itertools.repeat(
        list(transitions)),
                                                step,
                                                initial=approx_0),
                             done=done)
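
Code Example #15 also leans on iterate.accumulate, used here like itertools.accumulate with an initial value: starting from approx_0, it keeps folding the same batch of transitions into the value function, and converged cuts the stream off once an update stops changing the approximation. A rough sketch of such a helper, assumed rather than taken from the repository, is:

from typing import Callable, Iterable, Iterator, TypeVar

A = TypeVar('A')
B = TypeVar('B')


def accumulate(iterable: Iterable[A],
               func: Callable[[B, A], B],
               *,
               initial: B) -> Iterator[B]:
    # Yields initial, func(initial, x1), func(func(initial, x1), x2), ...
    # (itertools.accumulate only accepts `initial` on Python 3.8+).
    total = initial
    yield total
    for element in iterable:
        total = func(total, element)
        yield total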
Code Example #16
File: assignment5_code.py  Project: sogipec/RL-book
def approx_policy_iteration_result(
    mdp: FiniteMarkovDecisionProcess[S, A], gamma: float,
    approx0: FunctionApprox[S]
) -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:
    return converged(policy_iteration(mdp, gamma, approx0),
                     done=almost_equal_vf_approx_pi)
Code Example #17
def evaluate_mrp_result(mrp: FiniteMarkovRewardProcess[S],
                        gamma: float) -> V[S]:
    v_star: np.ndarray = converged(evaluate_mrp(mrp, gamma=gamma),
                                   done=almost_equal_np_arrays)
    return {s: v_star[i] for i, s in enumerate(mrp.non_terminal_states)}
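
The done predicate here, almost_equal_np_arrays, is not shown in these snippets. In the spirit of the a.within(b, tol) checks used in the other examples, it presumably compares two value-function vectors elementwise against a small tolerance, roughly along these lines (the tolerance value is an assumption, not taken from the repository):

import numpy as np

DEFAULT_TOLERANCE: float = 1e-5  # assumed default


def almost_equal_np_arrays(v1: np.ndarray, v2: np.ndarray) -> bool:
    # Treat two value-function vectors as converged once their largest
    # elementwise difference falls below the tolerance.
    return bool(max(abs(v1 - v2)) < DEFAULT_TOLERANCE)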
Code Example #18
def policy_iteration_result(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
) -> Tuple[V[S], FinitePolicy[S, A]]:
    return converged(policy_iteration(mdp, gamma), done=almost_equal_vf_pis)
Code Example #19
File: function_approx.py  Project: mindis/RL-book
    def converged(iterator: Iterator[FunctionApprox[X]],
                  tolerance: float = 0.0001) -> FunctionApprox[X]:
        def done(a, b):
            return a.within(b, tolerance)

        return iterate.converged(iterator, done=done)
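
The wrapper in Code Example #19 folds the done=lambda a, b: a.within(b, tol) boilerplate seen in Code Examples #1, #3, #8 and #12 into a single tolerance argument, so a call site only has to pass the iterator of FunctionApprox values and a tolerance. How the helper is exposed (for example, as a static method on FunctionApprox) is not visible in the snippet.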