Example #1
def args_handler(parser):
    args = parser.parse_args()
    config, size = {}, DEFAULT_GRID_SIZE
    if args.size:
        size = int(args.size)
    grid = GridworldEnv((size, size))
    if args.gamma:
        config['gamma'] = float(args.gamma)
    if args.exps:
        config['exps'] = int(args.exps)
    if args.eps:
        config['num_episodes'] = int(args.eps)
    if args.epsilon:
        config['epsilon'] = float(args.epsilon)
    if args.alpha:
        config['alpha'] = float(args.alpha)
    if args.lamb:
        config['l'] = float(args.lamb)
    if args.alg not in ['q', 's']:
        parser.print_help()
        return
    elif args.alg == 'q':
        learner = QLearner(grid, **config)
    elif args.alg == 's':
        learner = SarsaLambdaLearner(grid, **config)
    print "\n"
    print learner.learn()[0]
    def __init__(self, name, learner, size=5, **params):
        self.name = name
        self.grid = GridworldEnv((size, size))
        if learner == 'q':
            self.learner = QLearner(self.grid, **params)
        elif learner == 's':
            self.learner = SarsaLambdaLearner(self.grid, **params)
def main():
    np.random.seed(26)
    env = GridworldEnv(shape=[4,4])
    agent = Agent(env)
    td_zero_res = agent.td_zero(discount_factor=0.25, alpha=0.10)    
    print("Result TD(0) Value function:")
    print(td_zero_res.reshape((env.shape)))
    
    td_lambda_res_bw = agent.td_lambda(discount_factor=0.25, alpha=0.1, _lambda=0.5, backward=True)
    print("Result backward TD(Lambda=0.5) Value function:")
    print(td_lambda_res_bw.reshape((env.shape)))
    
    td_lambda_res_fw = agent.td_lambda(discount_factor=0.25, alpha=0.1, _lambda=0.5, backward=False)
    print("Result forward TD(Lambda=0.5) Value function:")
    print(td_lambda_res_fw.reshape((env.shape)))
    
    sarsa_gridworld = agent.sarsa(num_iter=1000, alpha=0.25, discount_factor=None, epsilon=0.1)
    print("Result SARSA (no Lambda) for gridworld. Optimal Q function:")
    print(np.round(sarsa_gridworld, 2))        
    
    sarsa_gridworld_lambda = agent.sarsa_lambda(num_iter=1000, alpha=0.25, lambda_=0.5,
                                                discount_factor=None, epsilon=0.1)
    print("Result SARSA (lambda=0.5) for gridworld. Optimal Q function:")
    print(np.round(sarsa_gridworld_lambda, 2))   
    
    windy_env = WindyGridworldEnv()
    agent_2 = Agent(windy_env)
    sarsa_windy_gridworld = agent_2.sarsa(num_iter=1000, alpha=0.25, discount_factor=None, epsilon=0.1)
    print("Result SARSA (no Lambda) for windy gridworld. Optimal Q function:")
    print(np.round(sarsa_windy_gridworld, 2))    

    sarsa_windy_gridworld_lambda = agent_2.sarsa_lambda(num_iter=1000, alpha=0.25, discount_factor=None,
                                                        epsilon=0.1, lambda_=0.5)
    print("Result SARSA (lambda=0.5) for windy gridworld. Optimal Q function:")
    print(np.round(sarsa_windy_gridworld_lambda, 2))    
Example #4
    def setState(self, observation):

        self.lstate = GridworldEnv.state2str(observation)

        if self.lstate not in self.Q.keys():
            self.Q[self.lstate] = np.zeros(self.nb_action)

        if random.uniform(0, 1) < self.epsilon:
            self.laction = np.random.randint(self.nb_action)

        else:
            self.laction = np.argmax(self.Q[self.lstate])
Example #5
def create_env(env_name):
    """
    Create/load the environment associated with :env_name
    """
    if env_name == "SimpleGridWorld":
        return GridworldEnv()
    elif env_name == "MediumGridWorld":
        return GridworldEnv(shape=[10,10])
    elif env_name == "LargeGridWorld":
        return GridworldEnv(shape=[20,20])
    elif env_name == "HugeGridWorld":
        return GridworldEnv(shape=[31,31])
    elif env_name == "SimpleRectangleWorld":
        return GridworldEnv(shape=[10,4])
    elif env_name == "LargeRectangleWorld":
        return GridworldEnv(shape=[15,31])
    elif env_name == "SimpleMazeWorld":
        return load_maze("SimpleMazeWorld")
    elif env_name == "MediumMazeWorld":
        return load_maze("MediumMazeWorld", (15, 15))
    elif env_name == "LargeMazeWorld":
        return load_maze("LargeMazeWorld", (25, 25))
    elif env_name == "SimpleWindyGridWorld":
        return create_windy_gridworld((7,10), ((0, 1, 2, 9), (3, 4, 5, 8), (6, 7)), (3,7))
    elif env_name == "MediumRectangularWindyGridWorld":
        return create_windy_gridworld((20,5), ((0, 1), (2, 3), (4)), (12, 3))
    elif env_name == "LargeRectangularWindyGridWorld":
        return create_windy_gridworld((30,15), ((5, 6, 7, 8, 12, 14), (0, 1, 2, 3, 13), (4), (9, 10, 11)), (7, 8))
    else:
        return gym.envs.make(env_name)
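

# Hypothetical usage of the factory above (a sketch, not part of the original example):
env = create_env("MediumGridWorld")   # a 10x10 GridworldEnv
state = env.reset()                   # initial state
print(env.nS, env.nA)                 # number of states and actions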
Example #6
def main():
    np.random.seed(26)
    env = GridworldEnv(shape=[4,4])
    agent = Agent(env)
    ## Sample one episode
    episode = agent.generate_episode(policy=agent.env.isap)
    print("Example: Sample episode for Monte Carlo:")
    print(episode)
    ## Do First-Visit-Monte-Carlo Prediction
    first_visit_MC_value_fnc = agent.monte_carlo_prediction(first_visit=True,
                                                 discount_factor=1.0,
                                                 num_iter=1000)
    first_visit_MC_value_fnc = np.round(first_visit_MC_value_fnc, 2)
    print("Result first-visit Monte Carlo:")
    print(first_visit_MC_value_fnc.reshape((env.shape)))

    ## Do Every-Visit Monte-Carlo Prediction
    every_visit_MC_value_fnc = agent.monte_carlo_prediction(first_visit=False,
                                                 discount_factor=1.0,
                                                 num_iter=1000)
    every_visit_MC_value_fnc = np.round(every_visit_MC_value_fnc, 2)
    print("Result every-visit Monte Carlo:")
    print(every_visit_MC_value_fnc.reshape((env.shape)))
    
    ## Do Every-Visit Monte-Carlo Control with Exploring Starts (no epsilon greedy method)
    Q_control_no_epsilon, policy_control_no_epsilon = agent.monte_carlo_control(policy=None, num_iter=200,
                                                                                discount_factor=None, epsilon_method=False,
                                                                                epsilon=0.1, on_policy=True)
    Q_control_no_epsilon = np.round(Q_control_no_epsilon, 2)
    policy_control_no_epsilon = np.round(policy_control_no_epsilon, 2)
    print("Result every-visit Monte Carlo Control Q-Function:")
    print(Q_control_no_epsilon)
    print("Result every-visit Monte Carlo Control optimal policy:")
    print(policy_control_no_epsilon)
    
    ## Do Every-Visit Monte-Carlo Control epsilon greedy method:
    Q_control_eps_greedy, policy_control_eps_greedy = agent.monte_carlo_control(policy=None, num_iter=500,
                                                                                discount_factor=None, epsilon_method=True,
                                                                                epsilon=0.1, on_policy=True)
    Q_control_eps_greedy = np.round(Q_control_eps_greedy, 2)
    policy_control_eps_greedy = np.round(policy_control_eps_greedy, 2)
    print("Result every-visit Monte Carlo Control Q-Function:")
    print(Q_control_eps_greedy)
    print("Result every-visit Monte Carlo Control optimal policy:")
    print(policy_control_eps_greedy)
Example #7
    def act(self, observation, reward, done):

        obs = GridworldEnv.state2str(observation)

        if obs not in self.Q.keys():
            self.Q[obs] = np.zeros(self.nb_action)

        if random.uniform(0, 1) < self.epsilon:
            self.laction = np.random.randint(self.nb_action)

        else:
            self.laction = np.argmax(self.Q[obs])

        self._update_Qvalue(reward, obs, done)

        self.lstate = obs

        return self.laction
Example #8
def main(shape=[4, 4]):
    env = GridworldEnv(shape=shape)
    agent = Agent(env=env)

    ## Policy Evaluation
    print("Do Policy Evaluation...:")
    policy = env.isap
    print("Initial value function:")
    print(agent.vFnc.reshape((env.shape)))
    print("")
    print("Random Policy uniformly distributed")
    print(policy)
    print("")
    optimal_value_fnc = agent.policy_evaluation(policy)
    optimal_value_fnc = np.round(optimal_value_fnc)
    print("Optimal value function:")
    print(optimal_value_fnc.reshape((env.shape)))
    print("")

    ## Policy Improvement
    print("Do Policy Improvement...:")
    print("Start with Random Policy uniformly distributed")
    ## Initialize random policy for each state and action from environment
    policy = env.isap
    print(policy)
    print("")
    policy_improvement_res, value_fnc_optimal = agent.policy_improvement(
        policy)
    print("Optimal Policy Probability Distribution:")
    print(policy_improvement_res)
    print("")

    print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
    print(np.reshape(np.argmax(policy_improvement_res, axis=1), env.shape))
    print("")

    print("Value Function:")
    print(value_fnc_optimal)
    print("")

    print("Reshaped Grid Value Function:")
    print(value_fnc_optimal.reshape(env.shape))
    print("")
Example #9
    def _total_reward(self):
        """Sum of rewards expected for every state"""
        return sum(self.value[state] for state in self.mdp.keys())

    def act(self, observation, reward, done):

        obs = GridworldEnv.state2str(observation)

        if obs not in self.Q.keys():
            self.Q[obs] = np.zeros(self.nb_action)

        if random.uniform(0, 1) < self.epsilon:
            self.laction = np.random.randint(self.nb_action)

        else:
            self.laction = np.argmax(self.Q[obs])

        self._update_Qvalue(reward, obs, done)

        self.lstate = obs

        return self.laction
Example #10
def main():
    np.random.seed(26)
    env = GridworldEnv(shape=[4, 4])
    agent = Agent(env)

    Q_learning_gridworld = agent.Q_learning(num_iter=1000,
                                            epsilon=0.10,
                                            alpha=0.20,
                                            discount_factor=0.30)

    print("Optimal Q-Function after 1000 iterations:")
    print(np.round(Q_learning_gridworld, 2))

    env2 = WindyGridworldEnv()
    agent2 = Agent(env2)
    Q_learning_windyworld = agent2.Q_learning(num_iter=1000,
                                              epsilon=0.10,
                                              alpha=0.20,
                                              discount_factor=0.30)

    print("Optimal Q-Function after 1000 iterations:")
    print(np.round(Q_learning_windyworld, 2))
Example #11
    # Create a deterministic policy using the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # One step lookahead to find the best action for this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # Always take the best action
        policy[s, best_action] = 1.0

    return policy, V


sizes = [5, 10, 20, 30, 50]
for size in sizes:
    print("Running VI Size: ", size)
    env = GridworldEnv(shape=[size, size])

    tic = time.time()
    policy, v = value_iteration(env)
    toc = time.time()
    elapsed_time = (toc - tic) * 1000
    print(f"Time to converge: {elapsed_time: 0.3} ms")

# print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
# print(np.reshape(np.argmax(policy, axis=1), env.shape))
# print("")

# print("Value Function:")
# print(v)
# print("")
import numpy as np
import pprint
import sys
if "./" not in sys.path:
    sys.path.append("./")
#from lib.envs.gridworld import GridworldEnv
from gridworld import GridworldEnv
from gridworld import print_policy

pp = pprint.PrettyPrinter(indent=2)
env = GridworldEnv(shape=(4, 4))  # 4x4 grid

print("env.nS:", env.nS, " env.nA:", env.nA, ' env.P[][]:', env.P)


def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
  Value Iteration Algorithm.

  Args:
      env: OpenAI env. env.P represents the transition probabilities of the environment.
          env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
          env.nS is a number of states in the environment.
          env.nA is a number of actions in the environment.
      theta: We stop evaluation once our value function change is less than theta for all states.
      discount_factor: Gamma discount factor.

  Returns:
      A tuple (policy, V) of the optimal policy and the optimal value function.
  """
    def one_step_lookahead(state, V):
        # Expected value of each action from `state` under the current value estimate V.
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A


from random import choice
from collections import defaultdict


def mc_policy_evaluation_random_policy(env, num_episodes):
    """First-visit Monte Carlo evaluation of a uniform random policy."""
    V = {}
    returns = defaultdict(list)
    for i in range(num_episodes):
        episodes = []
        init_state = choice(list(set(
            env.P.keys())))  # draw a random state to start
        # generate an episode
        while not env.is_terminal(init_state):
            action = choice(list(env.P[init_state].keys(
            )))  # random policy such that draw an action randomly
            next_state = env.P[init_state][action][0][1]
            reward = env.P[init_state][action][0][2]
            episodes.append([init_state, action, reward])
            init_state = next_state
        G = 0
        states_seen = set()
        for S, A, R in reversed(episodes):
            G = 1.0 * G + R  # assuming discount factor is 1.0
            if S not in states_seen:
                states_seen.add(S)
                returns[S].append(G)
                V[S] = np.mean(returns[S])
    V_sorted = sorted(V.items(), key=lambda x: x[0])  # sort by state
    return V_sorted


if __name__ == '__main__':
    env = GridworldEnv((9, 9))
    print(env.P)
    env._render(mode="human")
    V = mc_policy_evaluation_random_policy(env, 5000)
    print(V)
def main():
    print("Running DQN")

    if config.env == "GridWorldEnv":
        print("Playing: ", config.env)
        env = GridworldEnv()
    else:
        env_name = config.env
        print("Playing:", env_name)
        env = gym.make(env_name)

    # not 100 % sure this will work for all envs
    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n
    assert len(
        obs_shape) <= 1, "Not yet compatible with multi-dim observation space"
    if len(obs_shape) > 0:
        obs_size = obs_shape[0]
    else:
        obs_size = 1

    num_episodes = config.n_episodes
    batch_size = config.batch_size
    discount_factor = config.discount_factor
    learn_rate = config.learn_rate
    seed = config.seed
    num_hidden = config.num_hidden
    min_eps = config.min_eps
    max_eps = config.max_eps
    anneal_time = config.anneal_time
    clone_interval = config.clone_interval
    replay = not config.replay_off
    clipping = not config.clipping_off

    if config.memory_size is None:
        memory_size = 10 * batch_size
    else:
        memory_size = config.memory_size

    if not replay and (batch_size != 1 or memory_size != 1):
        print("Replay is turned off: adjusting memory and batch size to 1")
        batch_size = 1
        memory_size = 1

    memory = ReplayMemory(memory_size)

    # We will seed the algorithm (before initializing QNetwork!) for reproducibility
    random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    Q_net = QNetwork(obs_size, num_actions, num_hidden=num_hidden)
    policy = EpsilonGreedyPolicy(Q_net, num_actions)
    episode_durations, losses, max_qs = run_episodes(
        train, Q_net, policy, memory, env, num_episodes, batch_size,
        discount_factor, learn_rate, clone_interval, min_eps, max_eps,
        anneal_time, clipping)

    plot_smooth(episode_durations, 10, show=True)

    # This is just for now, to see results quickly. TODO: make a nicer plot function to test/compare multiple settings
    plt.plot(losses)
    plt.title(
        f"{config.env}, lr={learn_rate}, replay={replay}, clone_interval={clone_interval}"
    )
    plt.ylabel("Loss")
    plt.xlabel("Episode")
    plt.show()

    plt.plot(max_qs)
    if clipping:
        plt.axhline(y=1. / (1 - discount_factor), color='r', linestyle='-')
    plt.title(
        f"{config.env}, lr={learn_rate}, replay={replay}, clone_interval={clone_interval}"
    )
    plt.ylabel("max |Q|")
    plt.xlabel("Episode")
    plt.show()
Example #15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NAME = ''
if RANDOMIZE:
    NAME = 'rand'
    if REGULARIZE:
        NAME = 'reg'

for se in range(N_SEEDS):
    print('seed : ' + str(se))
    first_actions = []
    env = GridworldEnv(randomized_params=INITIAL_ADDITIONAL_PARAMS,
                       randomize=RANDOMIZE,
                       regularize=REGULARIZE,
                       randomization_space=RANDOMIZATION_SPACE,
                       goal_reward=GOAL_REWARD,
                       lava_reward=LAVA_REWARD,
                       step_reward=STEP_REWARD,
                       out_of_grid=OUT_OF_GRID_REWARD,
                       max_episode_steps=10)

    nb_steps = 4000

    agent = VPG(env,
                MLP_Multihead,
                gamma=1,
                verbose=False,
                learning_rate=1e-3,
                regularize=REGULARIZE,
                lam=LAMBDA)
    print(agent.seed)
Example #16
            result.append(element.observation)
        return result

    def print_trajectory(self):
        print('Trajectory:')
        for element in self.trajectory:
            print(element)
        print('Total trajectory steps: {0}'.format(len(self.trajectory)))


if __name__ == '__main__':

    size_x = 4
    size_y = 4

    env = GridworldEnv(size_x, size_y)
    env.make_start(0, 0)
    env.make_goal(0, 3)
    env.make_goal(3, 0)

    agent = GridworldAgent(size_x, size_y)

    total_episodes = 1000
    for i in range(total_episodes):

        obs = env.reset()
        agent.reset()

        agent.append_trajectory(t_step=0,
                                prev_action=None,
                                observation=obs,
Example #17
    def act(self, observation, reward, done):
        # get action for current state
        # obs = str(obs.tolist())
        obs = GridworldEnv.state2str(observation)
        action = self.policy[obs]
        return action
Example #18
    def setState(self, observation):

        self.lstate = GridworldEnv.state2str(observation)

        if self.lstate not in self.Q.keys():
            self.Q[self.lstate] = np.zeros(self.nb_action)
Example #19
def her_experiment():
    batch_size = 256
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    num_episodes = 2
    epochs = 200
    training_steps = 10
    memory_size = 100000
    # her = False
    # seeds = [42, 30, 2,19,99]  # This is not randomly chosen
    seeds = [42, 30, 2, 19, 99]
    shape = [30, 30]
    targets = lambda x, y: [0, x * y - 1, x - 1, (y - 1) * x]
    env = GridworldEnv(shape=shape, targets=targets(*shape))

    # functions for grid world
    def sample_goal():
        return np.random.choice(env.targets, 1)

    extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)

    def calc_reward(state, action, goal):
        if state == goal:
            return 0.0
        else:
            return -1.0
        # # maze
        #     def sample_goal():
        #         return env.maze.end_pos
        #     extract_goal = lambda state: np.reshape(np.array(np.argmax(state)),-1)
        #     def calc_reward(state, action, goal):
        #         if state == goal:
        #             return 0.0
        #         else:
        #             return -1.0

    means = []
    x_epochs = []
    l_stds = []
    h_stds = []
    for her in [True, False]:
        episode_durations_all = []
        for seed in seeds:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            env.seed(seed)
            print(env.reset())
            memory = ReplayMemory(memory_size)
            if her:
                # model = QNetwork(env.observation_space.shape[0]+2, num_hidden, env.action_space.n)
                model = QNetwork(2 * env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train,
                    model,
                    memory,
                    env,
                    num_episodes,
                    training_steps,
                    epochs,
                    batch_size,
                    discount_factor,
                    learn_rate,
                    sample_goal,
                    extract_goal,
                    calc_reward,
                    use_her=True)
            else:
                model = QNetwork(env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train,
                    model,
                    memory,
                    env,
                    num_episodes,
                    training_steps,
                    epochs,
                    batch_size,
                    discount_factor,
                    learn_rate,
                    sample_goal,
                    extract_goal,
                    calc_reward,
                    use_her=False)

            episode_durations_all.append(
                loop_environments.smooth(episode_durations, 10))
        mean = np.mean(episode_durations_all, axis=0)
        means.append(mean)
        std = np.std(episode_durations_all, ddof=1, axis=0)
        l_stds.append(mean - std)
        h_stds.append(mean + std)
        x_epochs.append(list(range(len(mean))))
        # print(len(mean),mean,std)
    line_plot_var(x_epochs, means, l_stds, h_stds, "Epoch", "Duration",
                  ["HindsightReplay", "RandomReplay"],
                  "Episode duration per epoch", ["orange", "blue"])
    name = "her_" + str(shape)
    file_name = os.path.join("./results", name)

    with open(file_name + ".pkl", "wb") as f:
        pickle.dump((x_epochs, means, l_stds, h_stds), f)
Example #20
from gridworld import GridworldEnv
import gym
import numpy as np
from collections import defaultdict
import plotting
import gym_minigrid

#env_id = 'MiniGrid-Empty-6x6-v0'

env = GridworldEnv()
#env = gym.make(env_id)

EPSILON = 1
GAMMA = 0.99
LEARNING_RATE = 0.3
NUM_EPISODES = 500

# initialize Q
Q = defaultdict(lambda: np.zeros(env.action_space.n))
M = np.zeros((env.observation_space.n, env.observation_space.n))
w = np.zeros(env.observation_space.n)

stats = plotting.EpisodeStats(episode_lengths=np.zeros(NUM_EPISODES),
                              episode_rewards=np.zeros(NUM_EPISODES))


def convert(state):

    ct = 0
    for i in range(4):
        for j in range(4):
def setUpModule():
  global env
  env = GridworldEnv()
Example #22
                sys.stdout.flush()
        avg_time_steps = time_steps_per_episode / self.exps
        avg_max_q = max_q_value_per_episode / self.exps
        return self._policy_directions(self._choose_policy()), avg_time_steps, avg_max_q


def plot(v):
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    min_val, max_val = 0, 5
    for i in range(5):
        for j in range(5):
            c = v[i][j]
            ax.text(i, j, str(c), va='center', ha='center')
    ax.matshow(v, cmap=plt.cm.Blues)

    ax.set_xlim(min_val, max_val)
    ax.set_ylim(min_val, max_val)
    ax.set_xticks(np.arange(max_val))
    ax.set_yticks(np.arange(max_val))
    ax.grid()
    plt.show()


if __name__ == '__main__':
    shape = (5, 5)
    grid = GridworldEnv(shape=shape)
    learner = SarsaLambdaLearner(grid, exps=2, l=0.8, num_episodes=400,
                                 gamma=0.99, alpha=0.1, epsilon=0.3)
    print(learner.learn())
import numpy as np
import sys
import gym.spaces
import timeit
if "../" not in sys.path:
    sys.path.append("../")
from gridworld import GridworldEnv

environment = GridworldEnv()


def value_iteration(environment, discountFactor=0.9, minError=0.1):
    def lookahead(V, a, s):

        [(prob, next_state, reward, done)] = environment.P[s][a]
        # Bellman backup for a single deterministic transition
        value = prob * (reward + discountFactor * V[next_state])
        return value

    # initial value function and policy
    V = np.zeros(environment.nS)
    policy = np.zeros([environment.nS, environment.nA])

    while True:

        error = 0

        #loop over states
        for s in range(environment.nS):

            actions_values = np.zeros(environment.nA)
Example #24
import numpy as np
from gridworld import GridworldEnv

env = GridworldEnv([6, 6])


def policy_iteration(env, theta=0.001, discount_factor=1.0):
    """
    Policy Iteration Algorithm.
    
    Args:
        env: gridWorld
        theta: Stopping threshold. 
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all actions in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """

        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
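
    # (Sketch, not the original code.) The remainder of policy_iteration,
    # consistent with the docstring above: repeatedly evaluate the current
    # policy via one_step_lookahead, then improve it greedily until stable.
    policy = np.ones([env.nS, env.nA]) / env.nA
    V = np.zeros(env.nS)
    while True:
        # policy evaluation: in-place sweeps until the values stop moving
        while True:
            delta = 0.0
            for s in range(env.nS):
                v = np.dot(policy[s], one_step_lookahead(s, V))
                delta = max(delta, abs(v - V[s]))
                V[s] = v
            if delta < theta:
                break
        # policy improvement: act greedily with respect to V
        policy_stable = True
        for s in range(env.nS):
            old_action = np.argmax(policy[s])
            best_action = np.argmax(one_step_lookahead(s, V))
            if old_action != best_action:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_action]
        if policy_stable:
            return policy, V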
import numpy as np
import gym.spaces
from gridworld import GridworldEnv

env = GridworldEnv()


def policy_eval(policy, env, discount_factor=1.0, epsilon=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.
    
    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is the number of states in the environment.
            env.nA is the number of actions in the environment.
        epsilon: We stop evaluation once the value function change is less than epsilon for all states.
        discount_factor: Gamma discount factor.
    
    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)

    while True:

        # keep a copy of the old value function for the stopping check
        V_old = V.copy()
        # stopping condition: no state's value should change by more than epsilon
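        # (Sketch, not the original code.) One full sweep of expected Bellman
        # backups under `policy`, updating V in place, followed by the epsilon
        # stopping check described in the docstring.
        for s in range(env.nS):
            v = 0.0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            V[s] = v
        if np.max(np.abs(V - V_old)) < epsilon:
            break

    return V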
import numpy as np
import itertools
from collections import defaultdict
from gridworld import GridworldEnv

ENV = GridworldEnv()
EQUIPROBABLE_POLICY = random_policy = np.ones([ENV.nS, ENV.nA]) / ENV.nA

# Chris Fenton
# CNC - AI
# Winter Final Programming Problems

'''
a) For the gridworld in example 4.1, Figure 4.1 shows a synchronous iterative
policy evaluation, although the text explains asynchronous. An asynchronous
iterative policy evaluation would go through the states (in numerical order
from 1 to 14) and update after each state based on the previous updates.
Program the asynchronous version, and write the value for each state after
2000 iterations.
'''
def asyncPolicyEvaluation(gamma=0.9, iterations=2000):
    # ENV.P[s][a] : prob, next_state, reward, terminal?) tuple
    # Actions: up=0, right=1, down=2, left=3
    Q = defaultdict(lambda: np.zeros(ENV.action_space.n))
    # Q is the optimal action-value function, a dictionary mapping state -> action values.
    A = [0,1,2,3] #Actions
    values = [0] * ENV.nS
    for i in range(iterations):
        state = ENV.reset() #get a random state
        action = np.random.choice(A, replace=False) #random action
        #Q(s,a) = EV[R(t+1) + γV(s')]
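

# (Sketch, not part of the original snippet.) A minimal in-place ("asynchronous")
# iterative policy evaluation as described in the problem statement above: sweep
# the states in numerical order and let each backup reuse values already updated
# earlier in the same sweep. Uses the ENV and EQUIPROBABLE_POLICY defined above.
def async_policy_evaluation_sketch(gamma=0.9, iterations=2000):
    V = np.zeros(ENV.nS)
    for _ in range(iterations):
        for s in range(ENV.nS):
            v = 0.0
            for a in range(ENV.nA):
                for prob, next_state, reward, done in ENV.P[s][a]:
                    v += EQUIPROBABLE_POLICY[s][a] * prob * (reward + gamma * V[next_state])
            V[s] = v  # written back immediately, so later states in the sweep see it
    return V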