def policy_iteration_exercise():
    pp = pprint.PrettyPrinter(indent=2)
    env = GridworldEnv()
    policy, v = policy_improvement(env)

    print("Policy Probability Distribution:")
    print(policy)
    print("")

    print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
    print(np.reshape(np.argmax(policy, axis=1), env.shape))
    print("")

    print("Value Function:")
    print(v)
    print("")

    print("Reshaped Grid Value Function:")
    print(v.reshape(env.shape))
    print("")

    # Test the value function
    expected_v = np.array(
        [0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])
    np.testing.assert_array_almost_equal(v, expected_v, decimal=2)
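# policy_iteration_exercise() above calls policy_improvement, which is not
# defined in this file. Below is a minimal sketch, assuming the standard policy
# iteration algorithm and the policy_eval helper defined later in this document;
# the one_step_lookahead name and the defaults are illustrative, not the repo's
# actual implementation.
def policy_improvement(env, discount_factor=1.0):
    def one_step_lookahead(state, V):
        """Return the expected value of each action in `state` under estimate V."""
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    # Start with a uniform random policy
    policy = np.ones([env.nS, env.nA]) / env.nA
    while True:
        # Evaluate the current policy, then act greedily against its value function
        V = policy_eval(policy, env, discount_factor)
        policy_stable = True
        for s in range(env.nS):
            chosen_a = np.argmax(policy[s])
            best_a = np.argmax(one_step_lookahead(s, V))
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_a]
        if policy_stable:
            return policy, V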
import gym

def getEnv(domain):
    if domain == "Blackjack":
        return BlackjackEnv()
    elif domain == "Gridworld":
        return GridworldEnv()
    elif domain == "CliffWalking":
        return CliffWalkingEnv()
    elif domain == "WindyGridworld":
        return WindyGridworldEnv()
    else:
        try:
            return gym.make(domain)
        except Exception:
            assert False, "Domain must be a valid (and installed) Gym environment"
def main():
    env = GridworldEnv()
    random_policy = np.ones([env.nS, env.nA]) / env.nA
    v = policy_eval(random_policy, env)

    print("Value Function:")
    print(v)
    print("")

    print("Reshaped Grid Value Function:")
    print(v.reshape(env.shape))
    print("")

    # Test: Make sure the evaluated policy is what we expected
    expected_v = np.array([
        0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0
    ])
    np.testing.assert_array_almost_equal(v, expected_v, decimal=2)
def main():
    env = GridworldEnv()
    policy, v = value_iteration(env)

    print("Policy Probability Distribution:")
    print(policy)
    print("")

    print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
    print(np.reshape(np.argmax(policy, axis=1), env.shape))
    print("")

    print("Value Function:")
    print(v)
    print("")

    print("Reshaped Grid Value Function:")
    print(v.reshape(env.shape))
    print("")
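# main() above calls value_iteration, which is not defined in this file. Below
# is a minimal sketch, assuming the standard value iteration algorithm over
# env.P; the defaults and helper name are illustrative, not the repo's actual code.
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    def one_step_lookahead(state, V):
        """Return the expected value of each action in `state` under estimate V."""
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            # Back up each state with the best one-step lookahead value
            best_action_value = np.max(one_step_lookahead(s, V))
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        if delta < theta:
            break

    # Extract a deterministic policy from the converged value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        best_action = np.argmax(one_step_lookahead(s, V))
        policy[s, best_action] = 1.0
    return policy, V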
from lib.envs.gridworld import GridworldEnv

# initialize
env = GridworldEnv()

# render env
env._render()

print('State space:', env.nS)
print('Action space:', env.nA)

# P[state][action] returns a list of (probability, next_state, reward, is_terminated) tuples
print('Transitions for state 14, action 3:', env.P[14][3])
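# A small illustrative example of unpacking those transition tuples, assuming
# the (prob, next_state, reward, done) layout shown above; the print labels
# are arbitrary:
for prob, next_state, reward, done in env.P[14][3]:
    print(f"p={prob}  s'={next_state}  r={reward}  terminal={done}")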
import numpy as np
import pandas as pd
import sys
import random
from collections import namedtuple, defaultdict
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from lib.envs.gridworld import GridworldEnv
from lib.envs.windy_gridworld import WindyGridworldEnv
from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting

env = GridworldEnv()

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function mapping an observation to epsilon-greedy action probabilities."""
    def policy_fn(observation):
        # Spread epsilon uniformly, then give the greedy action the remaining mass
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def chosen_action(Q):
    best_action = np.argmax(Q)
    return best_action
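# Illustrative usage of make_epsilon_greedy_policy, assuming a defaultdict Q
# table; state 0 and epsilon=0.1 are arbitrary example values.
Q = defaultdict(lambda: np.zeros(env.nA))
policy = make_epsilon_greedy_policy(Q, epsilon=0.1, nA=env.nA)
action_probs = policy(0)  # epsilon-greedy probability distribution over actions
action = np.random.choice(np.arange(env.nA), p=action_probs)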
import numpy as np
import pprint
import sys

if "./" not in sys.path:
    sys.path.append("./")

from lib.envs.gridworld import GridworldEnv

pp = pprint.PrettyPrinter(indent=2)
shape = [4, 4]
env = GridworldEnv(shape)
# env.render()

def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Evaluate a policy given an environment and a full description of the
    environment's dynamics.

    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the
            environment. env.P[s][a] is a list of (prob, next_state, reward, done) tuples.
        discount_factor: gamma discount factor.
        theta: We stop evaluation once the value function change is less than
            theta for all states.

    Returns:
        Vector of length env.nS representing the value function.
    """
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in np.arange(env.nS):
            v = 0
            # Look at all possible actions under the policy
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at all possible next states
                for prob, next_state, reward, done in env.P[s][a]:
                    # Expected value per the Bellman expectation backup
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return np.array(V)
def setUpModule():
    global env
    env = GridworldEnv()
def policy_evaluation_exercise():
    env = GridworldEnv()
    random_policy = np.ones([env.nS, env.nA]) / env.nA
    v = policy_eval(random_policy, env)
    print(v)