# Visualize the MDP with the converged state values, then validate both the
# values and the greedy policy derived from them against the reference solution.
if has_graphviz:
    display(plot_graph_with_state_values(mdp, state_values))
print("Final state values:", state_values)

# Reference state values for this MDP/gamma (tolerance 0.01).
assert abs(state_values['s0'] - 3.781) < 0.01
assert abs(state_values['s1'] - 7.294) < 0.01
assert abs(state_values['s2'] - 4.202) < 0.01

# The greedy policy w.r.t. these values must pick the known-optimal actions.
assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'
assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'
assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a1'

if has_graphviz:
    try:
        display(plot_graph_optimal_strategy_and_state_values(
            mdp, state_values))
    except ImportError as err:
        # Fix: chain the original exception so the real import failure
        # stays visible in the traceback instead of being masked.
        raise ImportError(
            "Run the cell that starts with \"%%writefile mdp_get_action_value.py\""
        ) from err

# Roll out the greedy policy for 10k steps and check the empirical
# average reward falls in the expected band.
s = mdp.reset()
rewards = []
for _ in range(10000):
    s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)

# Fix: compute the mean once instead of twice (print + assert).
avg_reward = np.mean(rewards)
print("Average reward: ", avg_reward)
assert 0.40 < avg_reward < 0.55
# In[15]:

# The greedy policy derived from the learned state values must match the
# known-optimal action in every state.
for state, expected in (('s0', 'a1'), ('s1', 'a0'), ('s2', 'a1')):
    assert get_optimal_action(mdp, state_values, state, gamma) == expected

# Edge cases: very large negative values must not break the argmax.
assert get_optimal_action(
    mdp, {'s0': -1e10, 's1': 0, 's2': -2e10}, 's0', 0.9
) == 'a0', "Please ensure that you handle negative Q-values of arbitrary magnitude correctly"
assert get_optimal_action(
    mdp, {'s0': -2e10, 's1': 0, 's2': -1e10}, 's0', 0.9
) == 'a1', "Please ensure that you handle negative Q-values of arbitrary magnitude correctly"

# In[16]:

if has_graphviz:
    display(plot_graph_optimal_strategy_and_state_values(
        mdp, state_values, get_action_value))

# In[17]:

# Measure agent's average reward
s = mdp.reset()
rewards = []
for _ in range(10000):
    action = get_optimal_action(mdp, state_values, s, gamma)
    s, r, done, _ = mdp.step(action)
    rewards.append(r)
print("average reward: ", np.mean(rewards))
# Run value iteration starting from all-zero state values.
init_values = dict.fromkeys(mdp.get_all_states(), 0)
state_values, _ = rl_value_iteration(
    mdp, gamma, num_iter, min_difference, init_values)

# Draw state_values after training.
if has_graphviz and visualize:
    plot_graph_with_state_values(
        mdp, state_values).render(filename='MDP_with_states')
print('Final state values:', state_values)
check_state_values(state_values)

# Complete get_optimal_action function.
check_get_optimal_action(get_optimal_action, mdp, state_values, gamma)

# Visualize optimal strategy.
if has_graphviz and visualize:
    plot_graph_optimal_strategy_and_state_values(
        mdp, state_values, get_action_value,
        gamma).render(filename='MDP_with_opt_strategy')

# NOTE(review): gamma=0.9 is hard-coded here while the surrounding calls use
# the `gamma` variable — confirm the two agree.
print([
    get_optimal_action(mdp, state_values, s, gamma=0.9)
    for s in mdp.get_all_states()
])

# Test optimal strategy.
rewards = test_optimal_strategy(mdp, state_values, gamma, 10000)
avg_reward = np.mean(rewards)
print('Average reward: ', avg_reward)
assert 0.85 < avg_reward < 1.0
# Visualize the converged state values, validate them against the reference
# solution, then roll out the greedy policy and report its average reward.
if has_graphviz:
    plot_graph_with_state_values(mdp, state_values).render(view=True)
print("Final state values:", state_values)

# Reference state values for this MDP/gamma (tolerance 0.01).
assert abs(state_values['s0'] - 8.032) < 0.01
assert abs(state_values['s1'] - 11.169) < 0.01
assert abs(state_values['s2'] - 8.921) < 0.01

# The greedy policy w.r.t. these values must pick the known-optimal actions.
assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'
assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'
assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a0'

if has_graphviz:
    try:
        plot_graph_optimal_strategy_and_state_values(
            mdp, state_values).render(view=True)
    except ImportError as err:
        # Fix: chain the original exception so the real import failure
        # stays visible in the traceback instead of being masked.
        raise ImportError(
            "Run the cell that starts with \"%%writefile mdp_get_action_value.py\""
        ) from err

# Measure agent's average reward
s = mdp.reset()
rewards = []
for _ in range(10000):
    s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)
print("average reward: ", np.mean(rewards))