state_values = {state: 0 for state in mdp.get_all_states()}  # Initialize state_values

# Run value iteration algo!
state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

# See how our agent performs, e.g. render what is going on when the agent chooses the `optimal` action
s = mdp.reset()
mdp.render()
rewards = []  # Save all rewards to compute the mean reward.
for _ in range(num_iter):  # reuse num_iter as the rollout step budget
    action = get_optimal_action(mdp, state_values, s, gamma)
    new_state, reward, done, _ = mdp.step(action)
    rewards += [reward]
    s = new_state
    mdp.render()
    if done:
        break

print('Done!')
print(reward)  # last reward received
print('Average reward: ', np.mean(rewards))
# if visualize:
#     draw_policy(mdp, state_values)
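# For reference, a minimal sketch of the helpers the cell above relies on. This
# is an assumption about their behavior, not the notebook's actual code: it
# presumes the MDP exposes get_all_states(), get_possible_actions(state),
# get_next_states(state, action) -> {next_state: probability}, and
# get_reward(state, action, next_state), and that rl_value_iteration's second
# return value is a history of value changes (here: diff_history).

def get_action_value(mdp, state_values, state, action, gamma):
    # Q(s, a) = sum_{s'} P(s' | s, a) * (r(s, a, s') + gamma * V(s'))
    return sum(prob * (mdp.get_reward(state, action, next_state)
                       + gamma * state_values[next_state])
               for next_state, prob in mdp.get_next_states(state, action).items())

def get_optimal_action(mdp, state_values, state, gamma):
    # Greedy action w.r.t. one-step lookahead Q-values; None in terminal states.
    actions = mdp.get_possible_actions(state)
    if not actions:
        return None
    return max(actions,
               key=lambda a: get_action_value(mdp, state_values, state, a, gamma))

def rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values):
    # Bellman backups V(s) <- max_a Q(s, a), stopping early once the largest
    # per-state change drops below min_difference.
    diff_history = []
    for _ in range(num_iter):
        new_values = {
            s: max((get_action_value(mdp, state_values, s, a, gamma)
                    for a in mdp.get_possible_actions(s)),
                   default=0.0)  # terminal states keep V = 0
            for s in mdp.get_all_states()
        }
        diff = max(abs(new_values[s] - state_values[s]) for s in state_values)
        diff_history.append(diff)
        state_values = new_values
        if diff < min_difference:
            break
    return state_values, diff_history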
# Play in Frozen Lake Env
state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

# Run value iteration algo!
state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

# See how our agent performs, e.g. render what is going on when the agent chooses the `optimal` action
s = mdp.reset()
mdp.render()
rewards = []  # Save all rewards to compute the mean reward.

# Your code here!
for t in range(num_iter):
    s, r, done, _ = mdp.step(
        get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)
    if done:
        break

print('Average reward: ', np.mean(rewards))

if visualize:
    draw_policy(mdp, state_values)

# Let's see how it improves over time.
visualize_step_by_step(mdp, gamma, num_iter, min_difference)

# Express test!
mass_gaming(mdp, gamma, num_iter, 1000, 100)
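# mass_gaming is defined elsewhere in the notebook; judging by the call
# mass_gaming(mdp, gamma, num_iter, 1000, 100), a plausible reading is "solve
# the MDP once, then play many episodes greedily and report the mean return".
# The sketch below is hypothetical: the argument meanings (n_games, max_steps)
# are assumptions, and it reuses the assumed helpers sketched above.
import numpy as np

def mass_gaming(mdp, gamma, num_iter, n_games, max_steps):
    state_values = {s: 0 for s in mdp.get_all_states()}
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, 0, state_values)
    episode_returns = []
    for _ in range(n_games):
        s = mdp.reset()
        total = 0.0
        for _ in range(max_steps):
            s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
            total += r
            if done:
                break
        episode_returns.append(total)
    print('Average reward over %d games: %.3f' % (n_games, np.mean(episode_returns)))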
state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

# Run value iteration algo!
state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

# See how our agent performs, e.g. render what is going on when the agent chooses the `optimal` action
s = mdp.reset()
mdp.render()
rewards = []  # Save all rewards to compute the mean reward.

# Your code here!
max_steps = 10000
for _ in range(max_steps):
    act = get_optimal_action(mdp, state_values, s, gamma)
    if act is None:  # no actions available, i.e. a terminal state
        break
    s, r, done, _ = mdp.step(act)
    rewards.append(r)
    if done:
        break

print('Average reward: ', np.mean(rewards))

if visualize:
    draw_policy(mdp, state_values)

# Let's see how it improves over time.
visualize_step_by_step(mdp, gamma, num_iter, min_difference)

# Express test!
mass_gaming(mdp, gamma, num_iter, 1000, 100)
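# visualize_step_by_step is also defined elsewhere; presumably it replays the
# convergence of value iteration sweep by sweep. A hypothetical sketch, reusing
# the assumed helpers above (draw_policy is the notebook's own plotting helper,
# taken on faith here):

def visualize_step_by_step(mdp, gamma, num_iter, min_difference):
    state_values = {s: 0 for s in mdp.get_all_states()}
    for sweep in range(num_iter):
        # One value-iteration sweep at a time, starting from the current values.
        state_values, diffs = rl_value_iteration(mdp, gamma, 1, 0, state_values)
        print('Sweep %d, max value change: %.5f' % (sweep + 1, diffs[-1]))
        draw_policy(mdp, state_values)
        if diffs[-1] < min_difference:
            break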
# Play in Frozen Lake Env
state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

# Run value iteration algo!
state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values)

# See how our agent performs, e.g. render what is going on when the agent chooses the `optimal` action
s = mdp.reset()
mdp.render()
rewards = []  # Save all rewards to compute the mean reward.
for step in range(1000):
    s, current_reward, is_done, _ = mdp.step(
        get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(current_reward)
    if is_done:
        break

print('Average reward: ', np.mean(rewards))

if visualize:
    draw_policy(mdp, state_values)

# Let's see how it improves over time.
# visualize_step_by_step(mdp, gamma, num_iter, min_difference)

# Express test!
mass_gaming(mdp, gamma, num_iter, 1000, 100)