# Inspect the MDP's transition model for one (state, action) pair so the
# learner can see next states, rewards, and transition probabilities.
print("mdp.get_next_states('s1', 'a0') = ", mdp.get_next_states('s1', 'a0'))
print("mdp.get_reward('s1', 'a0', 's0') = ", mdp.get_reward('s1', 'a0', 's0'))
print("mdp.get_transition_prob('s1', 'a0', 's0') = ", mdp.get_transition_prob('s1', 'a0', 's0'))

visualize = True

from mdp import has_graphviz

print('Graphviz available: ', has_graphviz)

# Render the MDP as a graph when graphviz is installed and visualization
# is requested; silently skipped otherwise.
if has_graphviz and visualize:
    from mdp import plot_graph, plot_graph_with_state_values, plot_graph_optimal_strategy_and_state_values
    plot_graph(mdp).render()

# Complete get_action_value().
# NOTE(review): the checker name suggests it validates session generation,
# not get_action_value — confirm this is the intended check function.
check_generate_session_func(mdp, get_action_value)

# Complete get_new_state_value()
check_get_new_state_value_func(mdp, get_new_state_value)

# Let's combine everything together
# Complete rl_value_iteration()
# Test rl_value_iteration()
num_iter = 100  # Maximum iterations, excluding initialization
# stop Value Iteration if new values are this close to old values (or closer)
min_difference = 0.001
from mdp_get_action_value import get_action_value
from mdp import FrozenLakeEnv
import numpy as np  # fix: np.isclose is used below but numpy was never imported
import matplotlib.pyplot as plt
from IPython.display import clear_output
from time import sleep
from mdp import has_graphviz
from IPython.display import display

print("Graphviz available:", has_graphviz)

# Build the toy MDP from transition probabilities and rewards defined earlier
# in the notebook/file (not visible in this chunk).
mdp = MDP(transition_probs, rewards, initial_state='s0')

if has_graphviz:
    from mdp import plot_graph, plot_graph_with_state_values, \
        plot_graph_optimal_strategy_and_state_values
    display(plot_graph(mdp))

# Sanity-check get_action_value() against hand-computed Q-values,
# using deterministic state values V(s_i) = i.
test_Vs = {s: i for i, s in enumerate(sorted(mdp.get_all_states()))}
assert np.isclose(get_action_value(mdp, test_Vs, 's2', 'a1', 0.9), 0.69)
assert np.isclose(get_action_value(mdp, test_Vs, 's1', 'a0', 0.9), 3.95)

# get_new_state_value() must be pure: it may read but never mutate the
# state-value dict it receives.
test_Vs_copy = dict(test_Vs)
assert np.isclose(get_new_state_value(mdp, test_Vs, 's0', 0.9), 1.8)
assert np.isclose(get_new_state_value(mdp, test_Vs, 's2', 0.9), 1.08)
assert test_Vs == test_Vs_copy, "please do not change state_values in get_new_state_value"

# parameters
gamma = 0.9  # discount for MDP
num_iter = 100  # maximum iterations, excluding initialization
# stop VI if new values are this close to old values (or closer)
min_difference = 0.001
# __Note:__ Installing graphviz on some OS (esp. Windows) may be tricky.
# However, you can ignore this part altogether and use the standard
# visualization.

# In[5]:

from mdp import has_graphviz
from IPython.display import display

print("Graphviz available:", has_graphviz)

# In[6]:

# Draw the MDP graph (larger canvas) when graphviz is available.
if has_graphviz:
    from mdp import plot_graph, plot_graph_with_state_values, \
        plot_graph_optimal_strategy_and_state_values
    display(plot_graph(mdp, graph_size="50,50"))

# ### Value Iteration
#
# Now let's build something to solve this MDP. The simplest algorithm so far
# is __V__alue __I__teration
#
# Here's the pseudo-code for VI:
#
# ---
#
# `1.` Initialize $V^{(0)}(s)=0$, for all $s$
#
# `2.` For $i=0, 1, 2, \dots$
#
# `3.` $ \quad V_{(i+1)}(s) = \max_a \sum_{s'} P(s' | s,a) \cdot [ r(s,a,s') + \gamma V_{i}(s')]$, for all $s$