def test_evaluate_mrp(self):
    start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})
    v = iterate.converged(
        evaluate_mrp(
            self.finite_flip_flop,
            γ=0.99,
            approx_0=start,
            non_terminal_states_distribution=Choose(
                set(self.finite_flip_flop.states())),
            num_state_samples=5,
        ),
        done=lambda a, b: a.within(b, 1e-4),
    )

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        self.assertLess(abs(v(s) - 170), 1.0)

    v_finite = iterate.converged(
        evaluate_finite_mrp(self.finite_flip_flop, γ=0.99, approx_0=start),
        done=lambda a, b: a.within(b, 1e-4),
    )

    assert_allclose(v.evaluate([True, False]),
                    v_finite.evaluate([True, False]),
                    rtol=0.01)
def test_value_iteration(self):
    mdp_map: Mapping[NonTerminal[InventoryState], float] = \
        value_iteration_result(self.si_mdp, self.gamma)[0]
    # print(mdp_map)
    mdp_vf1: np.ndarray = np.array([mdp_map[s] for s in self.states])

    fa = Dynamic({s: 0.0 for s in self.states})
    mdp_finite_fa = iterate.converged(
        value_iteration_finite(self.si_mdp, self.gamma, fa),
        done=lambda a, b: a.within(b, 1e-5)
    )
    # print(mdp_finite_fa.values_map)
    mdp_vf2: np.ndarray = mdp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf2)), 0.01)

    mdp_fa = iterate.converged(
        value_iteration(self.si_mdp, self.gamma, fa,
                        Choose(self.states), num_state_samples=30),
        done=lambda a, b: a.within(b, 1e-5)
    )
    # print(mdp_fa.values_map)
    mdp_vf3: np.ndarray = mdp_fa.evaluate(self.states)
    self.assertLess(max(abs(mdp_vf1 - mdp_vf3)), 0.01)
def test_evaluate_mrp(self):
    mrp_vf1: np.ndarray = self.implied_mrp.get_value_function_vec(self.gamma)
    # print({s: mrp_vf1[i] for i, s in enumerate(self.states)})

    fa = Dynamic({s: 0.0 for s in self.states})
    mrp_finite_fa = iterate.converged(
        evaluate_finite_mrp(self.implied_mrp, self.gamma, fa),
        done=lambda a, b: a.within(b, 1e-4),
    )
    # print(mrp_finite_fa.values_map)
    mrp_vf2: np.ndarray = mrp_finite_fa.evaluate(self.states)
    self.assertLess(max(abs(mrp_vf1 - mrp_vf2)), 0.001)

    mrp_fa = iterate.converged(
        evaluate_mrp(
            self.implied_mrp,
            self.gamma,
            fa,
            Choose(self.states),
            num_state_samples=30,
        ),
        done=lambda a, b: a.within(b, 0.1),
    )
    # print(mrp_fa.values_map)
    mrp_vf3: np.ndarray = mrp_fa.evaluate(self.states)
    self.assertLess(max(abs(mrp_vf1 - mrp_vf3)), 1.0)
def value_iteration_result(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float
) -> Tuple[V[S], FinitePolicy[S, A]]:
    opt_vf: V[S] = converged(value_iteration(mdp, gamma),
                             done=almost_equal_vfs)
    opt_policy: FinitePolicy[S, A] = greedy_policy_from_vf(mdp, opt_vf, gamma)
    return opt_vf, opt_policy
def solve(
    self,
    xy_vals_seq: Iterable[Tuple[X, float]],
    error_tolerance: Optional[float] = None
) -> LinearFunctionApprox[X]:
    if self.direct_solve:
        x_vals, y_vals = zip(*xy_vals_seq)
        feature_vals: np.ndarray = self.get_feature_values(x_vals)
        feature_vals_T: np.ndarray = feature_vals.T
        left: np.ndarray = (
            np.dot(feature_vals_T, feature_vals)
            + feature_vals.shape[0] * self.regularization_coeff
            * np.eye(len(self.weights.weights))
        )
        right: np.ndarray = np.dot(feature_vals_T, y_vals)
        ret = replace(self, weights=Weights.create(
            adam_gradient=self.weights.adam_gradient,
            weights=np.dot(np.linalg.inv(left), right)))
    else:
        tol: float = 1e-6 if error_tolerance is None else error_tolerance

        def done(
            a: LinearFunctionApprox[X],
            b: LinearFunctionApprox[X],
            tol: float = tol
        ) -> bool:
            return a.within(b, tol)

        ret = iterate.converged(
            self.iterate_updates(itertools.repeat(xy_vals_seq)),
            done=done)

    return ret
def evaluate_mrp_result(
    mrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    approx_0: FunctionApprox[S],
) -> FunctionApprox[S]:
    # converged yields a FunctionApprox here, not an np.ndarray as the
    # original annotation claimed.
    v_star: FunctionApprox[S] = converged(
        evaluate_finite_mrp(mrp, gamma, approx_0),
        done=almost_equal_vf_approx)
    return v_star
def approximate_policy_evaluation_result(
    mdp: FiniteMarkovDecisionProcess[S, A],
    policy: FinitePolicy[S, A],
    vf: FunctionApprox[S],
    gamma: float = 0.9
):
    v_star: np.ndarray = converged(
        approximate_policy_evaluation(mdp, policy, vf, gamma),
        done=almost_equal_np_arrays)
    return {s: v_star[i] for i, s in enumerate(mdp.non_terminal_states)}
def test_evaluate_finite_mrp(self):
    start = Dynamic({s: 0.0 for s in self.finite_flip_flop.states()})
    v = iterate.converged(
        evaluate_finite_mrp(self.finite_flip_flop, γ=0.99, approx_0=start),
        done=lambda a, b: a.within(b, 1e-4),
    )

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        self.assertLess(abs(v(s) - 170), 0.1)
def solve(
    self,
    xy_vals_seq: Iterable[Tuple[X, float]],
    error_tolerance: Optional[float] = None
) -> DNNApprox[X]:
    tol: float = 1e-6 if error_tolerance is None else error_tolerance

    def done(
        a: DNNApprox[X],
        b: DNNApprox[X],
        tol: float = tol
    ) -> bool:
        return a.within(b, tol)

    return iterate.converged(
        self.iterate_updates(itertools.repeat(xy_vals_seq)),
        done=done)
def test_converge(self):
    def close(a, b):
        return abs(a - b) < 0.1

    ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
    self.assertAlmostEqual(converged(ns, close), 0.33, places=2)

    ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
    all_ns = [1.0, 0.5, 0.33]
    for got, expected in zip(converge(ns, close), all_ns):
        self.assertAlmostEqual(got, expected, places=2)
def test_converge_end(self):
    """Check that converge ends the iterator at the right place when the
    underlying iterator ends before converging.
    """
    def close(a, b):
        return abs(a - b) < 0.1

    ns = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
    self.assertAlmostEqual(converged(iter(ns), close), 2.0)

    for got, expected in zip(converge(iter(ns), close), ns):
        self.assertAlmostEqual(got, expected)
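
# A minimal sketch of converge/converged consistent with the two tests above;
# the real rl.iterate definitions may differ in detail. `converge` yields
# values until two consecutive ones satisfy `done` (or the iterator runs out),
# and `converged` returns the last value that `converge` yields. The _sketch
# names are hypothetical, used only to avoid shadowing the library functions.
def converge_sketch(values, done):
    a = next(values, None)
    if a is None:
        return
    yield a
    for b in values:
        if done(a, b):
            return
        a = b
        yield b


def converged_sketch(values, done):
    result = None
    for result in converge_sketch(values, done):
        pass
    return result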
def test_evaluate_finite_mrp(self):
    start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
    traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
    v = iterate.converged(
        mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
        # Loose bound of 0.025 to speed up test.
        done=lambda a, b: a.within(b, 0.025))

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        # Intentionally loose bound—otherwise test is too slow.
        # Takes >1s on my machine otherwise.
        self.assertLess(abs(v(s) - 170), 1.0)
def test_compare_to_backward_induction(self):
    finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

    start = Dynamic({s: 0.0 for s in finite_horizon.states()})
    v = iterate.converged(
        evaluate_finite_mrp(finite_horizon, γ=1, approx_0=start),
        done=lambda a, b: a.within(b, 1e-4),
    )

    self.assertEqual(len(v.values_map), 22)

    finite_v = list(
        evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

    for time in range(0, 10):
        self.assertAlmostEqual(
            v(WithTime(state=True, time=time)), finite_v[time][True])
        self.assertAlmostEqual(
            v(WithTime(state=False, time=time)), finite_v[time][False])
def update(
    vf_policy: Tuple[FunctionApprox[S], ThisPolicy[S, A]]
) -> Tuple[FunctionApprox[S], ThisPolicy[S, A]]:
    nt_states: Sequence[S] = non_terminal_states_distribution.sample_n(
        num_state_samples)
    vf, pi = vf_policy
    mrp: MarkovRewardProcess[S] = mdp.apply_policy(pi)
    new_vf: FunctionApprox[S] = converged(
        evaluate_mrp(mrp, γ, vf, non_terminal_states_distribution,
                     num_state_samples),
        done=lambda a, b: a.within(b, 1e-4)
    )

    def return_(s_r: Tuple[S, float]) -> float:
        s1, r = s_r
        return r + γ * new_vf.evaluate([s1]).item()

    return (
        new_vf.update(
            [(s, max(mdp.step(s, a).expectation(return_)
                     for a in mdp.actions(s)))
             for s in nt_states]),
        ThisPolicy(mdp, return_)
    )
def batch_td_prediction(
    transitions: Iterable[mp.TransitionStep[S]],
    approx_0: ValueFunctionApprox[S],
    γ: float,
    convergence_tolerance: float = 1e-5
) -> ValueFunctionApprox[S]:
    '''transitions is a finite iterable'''

    def step(
        v: ValueFunctionApprox[S],
        tr_seq: Sequence[mp.TransitionStep[S]]
    ) -> ValueFunctionApprox[S]:
        return v.update(
            [(tr.state, tr.reward + γ * extended_vf(v, tr.next_state))
             for tr in tr_seq])

    def done(
        a: ValueFunctionApprox[S],
        b: ValueFunctionApprox[S],
        convergence_tolerance=convergence_tolerance
    ) -> bool:
        return b.within(a, convergence_tolerance)

    return iterate.converged(
        iterate.accumulate(
            itertools.repeat(list(transitions)), step, initial=approx_0),
        done=done)
def approx_policy_iteration_result(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    approx0: FunctionApprox[S]
) -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:
    return converged(policy_iteration(mdp, gamma, approx0),
                     done=almost_equal_vf_approx_pi)
def evaluate_mrp_result(
    mrp: FiniteMarkovRewardProcess[S],
    gamma: float
) -> V[S]:
    v_star: np.ndarray = converged(evaluate_mrp(mrp, gamma=gamma),
                                   done=almost_equal_np_arrays)
    return {s: v_star[i] for i, s in enumerate(mrp.non_terminal_states)}
def policy_iteration_result(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
) -> Tuple[V[S], FinitePolicy[S, A]]:
    return converged(policy_iteration(mdp, gamma), done=almost_equal_vf_pis)
def converged(iterator: Iterator[FunctionApprox[X]],
              tolerance: float = 0.0001) -> FunctionApprox[X]:
    def done(a, b):
        return a.within(b, tolerance)

    return iterate.converged(iterator, done=done)
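
# Usage sketch (hypothetical variable names, mirroring the solve methods
# above): this wrapper turns any stream of successively refined
# FunctionApprox objects, such as the one produced by iterate_updates,
# into its approximate fixed point.
#
#     fa_iterator = approx_0.iterate_updates(itertools.repeat(xy_vals_seq))
#     fa_fixed_point = converged(fa_iterator, tolerance=1e-5)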