def evaluate_policy(mdp, state_values, gamma, num_games=1000, t_max=100):
    # Reconstructed header: the source only preserves the tail of this loop.
    total_rewards = []
    for _ in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(t_max):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))

    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert np.mean(total_rewards) == 1.0  # deterministic lake: every game should be won
    else:
        assert 0.8 <= np.mean(total_rewards) <= 0.95
    print('Well done!')
    return total_rewards


if __name__ == '__main__':
    visualize = True
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()
    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in the Frozen Lake env: initialize state values to zero...
    state_values = {s: 0 for s in mdp.get_all_states()}
    # ...and run the value iteration algorithm.
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter,
                                         min_difference, state_values)

    # See how the agent performs, i.e. render what happens when it picks
    # the "optimal" action in each state.
    s = mdp.reset()
    mdp.render()
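# `rl_value_iteration` is called above but never defined in this fragment.
# Below is a minimal sketch consistent with how it is called, assuming the
# Bellman helper `get_new_state_value` (sketched after the next fragment).
# The second return value, a list of per-sweep diffs, is an assumption --
# the call site discards it.
def rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values):
    diff_history = []  # assumed second return value; discarded at the call site
    for i in range(num_iter):
        # One Bellman sweep over all states.
        new_state_values = {s: get_new_state_value(mdp, state_values, s, gamma)
                            for s in mdp.get_all_states()}
        diff = max(abs(new_state_values[s] - state_values[s])
                   for s in mdp.get_all_states())
        diff_history.append(diff)
        state_values = new_state_values
        if diff < min_difference:  # converged
            break
    return state_values, diff_history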
try:
    # Reconstructed try-line: the import guarded here is missing in the source;
    # importing get_action_value matches the writefile cell named below.
    from mdp_get_action_value import get_action_value
except ImportError:
    raise ImportError(
        "Run the cell that starts with \"%%writefile mdp_get_action_value.py\"")

# Estimate the average per-step reward of the greedy policy on the toy MDP.
s = mdp.reset()
rewards = []
for _ in range(10000):
    s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)

print("Average reward: ", np.mean(rewards))
assert 0.40 < np.mean(rewards) < 0.55

# Deterministic Frozen Lake: solve it, then replay the greedy policy step by step.
mdp = FrozenLakeEnv(slip_chance=0)
mdp.render()

state_values = value_iteration(mdp)

s = mdp.reset()
mdp.render()
for t in range(100):
    a = get_optimal_action(mdp, state_values, s, gamma)
    print(a, end='\n\n')
    s, r, done, _ = mdp.step(a)
    mdp.render()
    if done:
        break

state_values = {s: 0 for s in mdp.get_all_states()}  # reset values for the next experiment
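# The functions imported from mdp_get_action_value are written out by a notebook
# cell not shown in this fragment. A minimal sketch of the three Bellman helpers,
# assuming the course's mdp.py API: get_next_states(s, a) returns a
# {next_state: probability} dict, get_reward(s, a, s') the transition reward,
# plus get_possible_actions(s) and is_terminal(s).
def get_action_value(mdp, state_values, state, action, gamma):
    # Q(s, a) = sum_{s'} P(s' | s, a) * [r(s, a, s') + gamma * V(s')]
    return sum(prob * (mdp.get_reward(state, action, next_state)
                       + gamma * state_values[next_state])
               for next_state, prob in mdp.get_next_states(state, action).items())


def get_new_state_value(mdp, state_values, state, gamma):
    # V(s) = max_a Q(s, a); terminal states keep value 0.
    if mdp.is_terminal(state):
        return 0
    return max(get_action_value(mdp, state_values, state, a, gamma)
               for a in mdp.get_possible_actions(state))


def get_optimal_action(mdp, state_values, state, gamma):
    # Greedy policy: argmax_a Q(s, a); None for terminal states.
    if mdp.is_terminal(state):
        return None
    return max(mdp.get_possible_actions(state),
               key=lambda a: get_action_value(mdp, state_values, state, a, gamma))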
from mdp import FrozenLakeEnv
import matplotlib.pyplot as plt
import numpy as np
from mdp_get_action_value import (get_optimal_action, value_iteration,
                                  get_action_value, get_new_state_value)

mdp = FrozenLakeEnv(slip_chance=0)
mdp.render()
gamma = 0.9


def draw_policy(mdp, state_values):
    """Render state values as a grayscale grid with the greedy action as an arrow."""
    plt.figure(figsize=(3, 3))
    h, w = mdp.desc.shape
    states = sorted(mdp.get_all_states())
    V = np.array([state_values[s] for s in states])
    Pi = {s: get_optimal_action(mdp, state_values, s, gamma) for s in states}
    plt.imshow(V.reshape(w, h), cmap='gray', interpolation='none', clim=(0, 1))
    ax = plt.gca()
    ax.set_xticks(np.arange(h) - .5)
    ax.set_yticks(np.arange(w) - .5)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    Y, X = np.mgrid[0:4, 0:4]  # (unused in this fragment)
    # Arrow directions in plot coordinates. The source had 'up': (-1, 0), a
    # duplicate of 'left' that would draw upward moves as leftward arrows;
    # (0, 1) is the intended direction.
    a2uv = {'left': (-1, 0), 'down': (0, -1), 'right': (1, 0), 'up': (0, 1)}
    for y in range(h):
        for x in range(w):
            plt.text(x, y, str(mdp.desc[y, x].item()),
                     color='g', size=12, verticalalignment='center',
                     horizontalalignment='center', fontweight='bold')
            # The source cuts off mid-call above; the arrow drawing below is a
            # reconstruction of the rest of the loop body.
            a = Pi[y, x]
            if a is None:
                continue
            u, v = a2uv[a]
            plt.arrow(x, y, u * .3, -v * .3, color='m',
                      head_width=0.1, head_length=0.1)
    plt.grid(color='b', lw=2, ls='-')
    plt.show()
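# Example use of draw_policy (a sketch, not from the source): redraw the greedy
# policy after each value-iteration sweep to watch it converge. On the 4x4
# deterministic lake a handful of sweeps is typically enough to stabilize.
state_values = {s: 0 for s in mdp.get_all_states()}
for i in range(10):
    print("after iteration %i" % i)
    state_values = {s: get_new_state_value(mdp, state_values, s, gamma)
                    for s in mdp.get_all_states()}
    draw_policy(mdp, state_values)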
import sys
import numpy as np
import grading
from mdp import MDP, FrozenLakeEnv


def submit_assigment(get_action_value, get_new_state_value, get_optimal_action,
                     value_iteration, email, token):
    grader = grading.Grader("EheZDOgLEeenIA4g5qPHFA")
    sys.stdout = None  # silence prints from value_iteration while grading

    transition_probs = {
        's0': {'a0': {'s1': 0.8, 's2': 0.2},
               'a1': {'s1': 0.2, 's2': 0.8}},
        's1': {'a0': {'s0': 0.2, 's2': 0.8},
               'a1': {'s0': 0.8, 's2': 0.2}},
        's2': {'a0': {'s3': 0.5, 's4': 0.5},
               'a1': {'s3': 1.0}},
        's3': {'a0': {'s1': 0.9, 's2': 0.1},
               'a1': {'s1': 0.7, 's2': 0.3}},
        's4': {'a0': {'s3': 1.0},
               'a1': {'s3': 0.7, 's1': 0.3}},
    }
    rewards = {
        's0': {'a0': {'s1': 0, 's2': 1}, 'a1': {'s1': 0, 's2': 1}},
        's1': {'a0': {'s0': -1, 's2': 1}, 'a1': {'s0': -1, 's2': 1}},
        's2': {'a0': {'s3': 0, 's4': 1}, 'a1': {'s3': 0, 's4': 1}},
        's3': {'a0': {'s1': -3, 's2': -3}, 'a1': {'s1': -3, 's2': -3}},
        's4': {'a1': {'s1': +10}},
    }
    mdp = MDP(transition_probs, rewards, initial_state='s0')

    test_Vs = {s: i for i, s in enumerate(mdp.get_all_states())}
    qvalue1 = get_action_value(mdp, test_Vs, 's1', 'a0', 0.9)
    qvalue2 = get_action_value(mdp, test_Vs, 's4', 'a1', 0.9)
    grader.set_answer("F16dC", qvalue1 + qvalue2)
    # ---
    svalue1 = get_new_state_value(mdp, test_Vs, 's2', 0.9)
    svalue2 = get_new_state_value(mdp, test_Vs, 's4', 0.9)
    grader.set_answer("72cBp", svalue1 + svalue2)
    # ---
    state_values = {s: 0 for s in mdp.get_all_states()}
    gamma = 0.9
    # ---
    action1 = get_optimal_action(mdp, state_values, 's1', gamma)
    action2 = get_optimal_action(mdp, state_values, 's2', gamma)
    grader.set_answer("xIuti", action1 + action2)
    # ---
    s = mdp.reset()
    rewards = []
    for _ in range(10000):
        s, r, done, _ = mdp.step(
            get_optimal_action(mdp, state_values, s, gamma))
        rewards.append(r)
    grader.set_answer("Y8g0j", np.mean(rewards) + np.std(rewards))

    mdp = FrozenLakeEnv(slip_chance=0.25)
    state_values = value_iteration(mdp)
    gamma = 0.9
    total_rewards = []
    for game_i in range(1000):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    grader.set_answer("ABf1b", np.mean(total_rewards) + np.std(total_rewards))
    # ---
    mdp = FrozenLakeEnv(slip_chance=0.25, map_name='8x8')
    state_values = value_iteration(mdp)
    gamma = 0.9
    total_rewards = []
    for game_i in range(1000):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    grader.set_answer("U3RzE", np.mean(total_rewards) + np.std(total_rewards))

    sys.stdout = sys.__stdout__  # restore stdout before submitting
    grader.submit(email, token)
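# Typical call (a sketch): pass in your implementations plus your Coursera
# credentials. The email and token below are placeholders, not real values.
# submit_assigment(get_action_value, get_new_state_value, get_optimal_action,
#                  value_iteration, "you@example.com", "YOUR_TOKEN")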
def value_iteration(mdp, state_values=None, gamma=0.9, num_iter=1000,
                    min_difference=1e-5, logging=True):
    # Reconstructed header: only the loop tail below survives in the source.
    state_values = state_values or {s: 0 for s in mdp.get_all_states()}
    for i in range(num_iter):
        # One Bellman sweep, then measure how much the values moved.
        new_state_values = {s: get_new_state_value(mdp, state_values, s, gamma)
                            for s in mdp.get_all_states()}
        diff = max(abs(new_state_values[s] - state_values[s])
                   for s in mdp.get_all_states())
        if logging:
            print("iter %4i | diff: %6.5f | V(start): %.3f " %
                  (i, diff, new_state_values[mdp._initial_state]))
        state_values = new_state_values
        if diff < min_difference:
            break
    return state_values


# Frozen Lake
from mdp import FrozenLakeEnv

mdp = FrozenLakeEnv(slip_chance=0)
mdp.render()

state_values = value_iteration(mdp)

s = mdp.reset()
mdp.render()
for t in range(100):
    a = get_optimal_action(mdp, state_values, s, gamma)
    print(a, end='\n\n')
    s, r, done, _ = mdp.step(a)
    mdp.render()
    if done:
        break
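# A follow-up sketch (not from the source): track how quickly value iteration
# converges by running the sweep manually and recording the max state-value
# change per iteration. Uses the same get_new_state_value helper.
import matplotlib.pyplot as plt

gamma = 0.9
state_values = {s: 0 for s in mdp.get_all_states()}
diffs = []
for i in range(100):
    new_state_values = {s: get_new_state_value(mdp, state_values, s, gamma)
                        for s in mdp.get_all_states()}
    diffs.append(max(abs(new_state_values[s] - state_values[s])
                     for s in mdp.get_all_states()))
    state_values = new_state_values

plt.plot(diffs)
plt.xlabel('iteration')
plt.ylabel('max |V_new - V_old|')
plt.show()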