Example #1
def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # The source snippet starts mid-function; this header and the outer loops
    # are assumed, following the identical evaluation loop in Example #7.
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')


if __name__ == '__main__':
    visualize = True
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)
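None of the listed snippets include rl_value_iteration itself. Below is a minimal sketch, assuming it simply sweeps the Bellman backup get_new_state_value over every state until the largest change drops below min_difference (the loop tail visible in Example #8 has this shape); the logging flag and the second return value are assumptions.

def rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values,
                       logging=False):
    """Sketch of value iteration: V(s) <- max_a sum_s' P(s'|s,a) [r + gamma V(s')]."""
    diff = None
    for i in range(num_iter):
        # One Bellman optimality backup for every state.
        new_state_values = {
            s: get_new_state_value(mdp, state_values, s, gamma)
            for s in mdp.get_all_states()
        }
        # The largest per-state change decides convergence.
        diff = max(abs(new_state_values[s] - state_values[s])
                   for s in mdp.get_all_states())
        if logging:
            print("iter %4i   |   diff: %6.5f" % (i, diff))
        state_values = new_state_values
        if diff < min_difference:
            break
    # Callers unpack two values; returning the final diff is an assumption.
    return state_values, diff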
Example #2
try:
    # The snippet is truncated above; this import is assumed from the error
    # message below and from the imports in Example #6.
    from mdp_get_action_value import get_action_value
except ImportError:
    raise ImportError(
        "Run the cell that starts with \"%%writefile mdp_get_action_value.py\""
    )

s = mdp.reset()
rewards = []
for _ in range(10000):
    s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)

print("Average reward: ", np.mean(rewards))

assert (0.40 < np.mean(rewards) < 0.55)

mdp = FrozenLakeEnv(slip_chance=0)
mdp.render()

state_values = value_iteration(mdp)
s = mdp.reset()
mdp.render()
for t in range(100):
    a = get_optimal_action(mdp, state_values, s, gamma)
    print(a, end='\n\n')
    s, r, done, _ = mdp.step(a)
    mdp.render()
    if done:
        break

state_values = {s: 0 for s in mdp.get_all_states()}
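get_optimal_action is imported from mdp_get_action_value.py but never shown in these snippets. A plausible sketch is a greedy argmax over get_action_value; the accessors is_terminal and get_possible_actions are assumptions about the MDP interface.

def get_optimal_action(mdp, state_values, state, gamma=0.9):
    """Sketch: return the action with the highest Q-value under the current V."""
    if mdp.is_terminal(state):  # assumed accessor
        return None
    return max(mdp.get_possible_actions(state),  # assumed accessor
               key=lambda a: get_action_value(mdp, state_values, state, a, gamma))

On the deterministic lake (slip_chance=0) such a greedy policy reaches the goal every time, which is what the exact-1.0 asserts in the other examples check.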
Example #3
def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # Header and outer loops assumed; the source snippet starts mid-function
    # (see the full evaluation loop in Example #7).
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')


if __name__ == '__main__':
    visualize = True
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {s: 0
                    for s in mdp.get_all_states()}  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)  # try

    # See how our agent performs, e.g. render what happens when the agent takes the `optimal` action
Example #4
def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # Header and outer loops assumed; the source snippet starts mid-function
    # (see the full evaluation loop in Example #7).
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')
    return total_rewards


if __name__ == '__main__':
    visualize = True
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {s: 0
                    for s in mdp.get_all_states()}  # Initialize state_values
    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)
Example #5

def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # Header and outer loops assumed; the source snippet starts mid-function
    # (see the full evaluation loop in Example #7).
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')


if __name__ == '__main__':
    visualize = False
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = mdp.get_all_states()  # List of states; per-state values are initialized below

    # Run value iteration algo!
    init_values = {s: 0 for s in state_values}
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         init_values)

    # See how our agent performs, e.g. render what happens when the agent takes the `optimal` action
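Each main block sets a visualize flag that never appears in the visible part of the snippets; presumably it gates the rendering loop shown in Examples #2 and #8. A sketch of how the truncated part likely continues (an assumption, not the original code):

    if visualize:
        s = mdp.reset()
        mdp.render()
        for t in range(100):
            a = get_optimal_action(mdp, state_values, s, gamma)
            s, r, done, _ = mdp.step(a)
            mdp.render()
            if done:
                break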
Example #6
from mdp import FrozenLakeEnv
import matplotlib.pyplot as plt
import numpy as np
from mdp_get_action_value import get_optimal_action, value_iteration, get_action_value, get_new_state_value

mdp = FrozenLakeEnv(slip_chance=0)
mdp.render()
gamma = 0.9


def draw_policy(mdp, state_values):
    plt.figure(figsize=(3, 3))
    h, w = mdp.desc.shape
    states = sorted(mdp.get_all_states())
    V = np.array([state_values[s] for s in states])
    Pi = {s: get_optimal_action(mdp, state_values, s, gamma) for s in states}
    plt.imshow(V.reshape(w, h), cmap='gray', interpolation='none', clim=(0, 1))
    ax = plt.gca()
    ax.set_xticks(np.arange(h) - .5)
    ax.set_yticks(np.arange(w) - .5)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    Y, X = np.mgrid[0:4, 0:4]
    a2uv = {'left': (-1, 0), 'down': (0, -1), 'right': (1, 0), 'up': (0, 1)}
    for y in range(h):
        for x in range(w):
            plt.text(x,
                     y,
                     str(mdp.desc[y, x].item()),
                     color='g',
                     size=12,
                     verticalalignment='center',
                     horizontalalignment='center',
                     fontweight='bold')
            # Arrow for the greedy action; assumes the sorted states enumerate
            # the grid cells in row-major order, as V.reshape(w, h) implies.
            a = Pi[states[y * w + x]]
            if a is None:
                continue
            u, v = a2uv[a]
            plt.arrow(x, y, u * .3, -v * .3,
                      color='m', head_width=0.1, head_length=0.1)
    plt.grid(color='b', lw=2, ls='-')
    plt.show()


Example #7

# Assumed imports for this snippet (truncated in the source listing):
import sys
import numpy as np
import grading
from mdp import MDP, FrozenLakeEnv


def submit_assigment(get_action_value, get_new_state_value, get_optimal_action,
                     value_iteration, email, token):
    grader = grading.Grader("EheZDOgLEeenIA4g5qPHFA")
    sys.stdout = None

    transition_probs = {
        's0': {
            'a0': {
                's1': 0.8,
                's2': 0.2
            },
            'a1': {
                's1': 0.2,
                's2': 0.8
            },
        },
        's1': {
            'a0': {
                's0': 0.2,
                's2': 0.8
            },
            'a1': {
                's0': 0.8,
                's2': 0.2
            },
        },
        's2': {
            'a0': {
                's3': 0.5,
                's4': 0.5
            },
            'a1': {
                's3': 1.0
            },
        },
        's3': {
            'a0': {
                's1': 0.9,
                's2': 0.1
            },
            'a1': {
                's1': 0.7,
                's2': 0.3
            },
        },
        's4': {
            'a0': {
                's3': 1.0
            },
            'a1': {
                's3': 0.7,
                's1': 0.3
            },
        }
    }
    rewards = {
        's0': {
            'a0': {
                's1': 0,
                's2': 1
            },
            'a1': {
                's1': 0,
                's2': 1
            }
        },
        's1': {
            'a0': {
                's0': -1,
                's2': 1
            },
            'a1': {
                's0': -1,
                's2': 1
            }
        },
        's2': {
            'a0': {
                's3': 0,
                's4': 1
            },
            'a1': {
                's3': 0,
                's4': 1
            }
        },
        's3': {
            'a0': {
                's1': -3,
                's2': -3
            },
            'a1': {
                's1': -3,
                's2': -3
            }
        },
        's4': {
            'a1': {
                's1': +10
            }
        }
    }

    mdp = MDP(transition_probs, rewards, initial_state='s0')

    test_Vs = {s: i for i, s in enumerate(mdp.get_all_states())}
    qvalue1 = get_action_value(mdp, test_Vs, 's1', 'a0', 0.9)
    qvalue2 = get_action_value(mdp, test_Vs, 's4', 'a1', 0.9)

    grader.set_answer("F16dC", qvalue1 + qvalue2)

    # ---

    svalue1 = get_new_state_value(mdp, test_Vs, 's2', 0.9)
    svalue2 = get_new_state_value(mdp, test_Vs, 's4', 0.9)

    grader.set_answer("72cBp", svalue1 + svalue2)

    # ---

    state_values = {s: 0 for s in mdp.get_all_states()}
    gamma = 0.9

    # ---

    action1 = get_optimal_action(mdp, state_values, 's1', gamma)
    action2 = get_optimal_action(mdp, state_values, 's2', gamma)

    grader.set_answer("xIuti", action1 + action2)

    # ---

    s = mdp.reset()
    rewards = []
    for _ in range(10000):
        s, r, done, _ = mdp.step(
            get_optimal_action(mdp, state_values, s, gamma))
        rewards.append(r)

    grader.set_answer("Y8g0j", np.mean(rewards) + np.std(rewards))

    mdp = FrozenLakeEnv(slip_chance=0.25)
    state_values = value_iteration(mdp)
    gamma = 0.9

    total_rewards = []
    for game_i in range(1000):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done: break
        total_rewards.append(np.sum(rewards))

    grader.set_answer("ABf1b", np.mean(total_rewards) + np.std(total_rewards))

    # ---

    mdp = FrozenLakeEnv(slip_chance=0.25, map_name='8x8')
    state_values = value_iteration(mdp)
    gamma = 0.9

    total_rewards = []
    for game_i in range(1000):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done: break
        total_rewards.append(np.sum(rewards))

    grader.set_answer("U3RzE", np.mean(total_rewards) + np.std(total_rewards))

    sys.stdout = sys.__stdout__
    grader.submit(email, token)
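The grader exercises get_action_value and get_new_state_value, neither of which appears in the listing. Minimal sketches of the two Bellman backups, assuming the MDP exposes get_next_states(s, a) returning a {next_state: probability} dict, plus get_reward, get_possible_actions and is_terminal (all four accessor names are assumptions):

def get_action_value(mdp, state_values, state, action, gamma):
    """Sketch: Q(s, a) = sum over s' of P(s'|s,a) * (r(s,a,s') + gamma * V(s'))."""
    return sum(prob * (mdp.get_reward(state, action, next_state)
                       + gamma * state_values[next_state])
               for next_state, prob in mdp.get_next_states(state, action).items())


def get_new_state_value(mdp, state_values, state, gamma):
    """Sketch: V_new(s) = max_a Q(s, a); terminal states keep value 0."""
    if mdp.is_terminal(state):
        return 0
    return max(get_action_value(mdp, state_values, state, a, gamma)
               for a in mdp.get_possible_actions(state))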
Example #8
def value_iteration(mdp, state_values=None, gamma=0.9,
                    num_iter=1000, min_difference=1e-5, logging=True):
    # Header and the start of the loop body are assumed; the source snippet
    # starts mid-function. Each sweep applies the Bellman backup to all states.
    state_values = state_values or {s: 0 for s in mdp.get_all_states()}
    for i in range(num_iter):
        new_state_values = {
            s: get_new_state_value(mdp, state_values, s, gamma)
            for s in mdp.get_all_states()
        }
        diff = max(abs(new_state_values[s] - state_values[s])
                   for s in mdp.get_all_states())
        if logging:
            print("iter %4i   |   diff: %6.5f   |   V(start): %.3f " %
                  (i, diff, new_state_values[mdp._initial_state]))

        state_values = new_state_values
        if diff < min_difference:
            break

    return state_values


# Frozen Lake

from mdp import FrozenLakeEnv
mdp = FrozenLakeEnv(slip_chance=0)

mdp.render()

state_values = value_iteration(mdp)

s = mdp.reset()
mdp.render()
for t in range(100):
    a = get_optimal_action(mdp, state_values, s, gamma)
    print(a, end='\n\n')
    s, r, done, _ = mdp.step(a)
    mdp.render()
    if done:
        break
Example #9

def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # Header and outer loops assumed; the source snippet starts mid-function
    # (see the full evaluation loop in Example #7).
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')


if __name__ == '__main__':
    visualize = True
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)

    # See how our agent performs, e.g. render what happens when the agent takes the `optimal` action
    s = mdp.reset()
    mdp.render()
Example #10
def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # Header and outer loops assumed; the source snippet starts mid-function
    # (see the full evaluation loop in Example #7).
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')


if __name__ == '__main__':
    visualize = True
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {s: 0
                    for s in mdp.get_all_states()}  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)

    # See how our agent performs, e.g. render what happens when the agent takes the `optimal` action
Example #11
def evaluate_policy(mdp, state_values, gamma, num_games=1000):
    # Header and outer loops assumed; the source snippet starts mid-function
    # (see the full evaluation loop in Example #7).
    total_rewards = []
    for game_i in range(num_games):
        s = mdp.reset()
        rewards = []
        for t in range(100):
            s, r, done, _ = mdp.step(
                get_optimal_action(mdp, state_values, s, gamma))
            rewards.append(r)
            if done:
                break
        total_rewards.append(np.sum(rewards))
    print('Average reward: ', np.mean(total_rewards))
    if mdp.slip_chance == 0:
        assert (1.0 <= np.mean(total_rewards) <= 1.0)
    else:
        assert (0.8 <= np.mean(total_rewards) <= 0.95)
    print('Well done!')


if __name__ == '__main__':
    visualize = False
    mdp = FrozenLakeEnv(map_name='8x8', slip_chance=0.1)
    mdp.render()

    gamma = 0.9
    num_iter = 100
    min_difference = 1e-5

    # Play in Frozen Lake Env
    state_values = {s: 0 for s in mdp.get_all_states()}  # Initialize state_values

    # Run value iteration algo!
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         state_values)

    # See how our agent performs, e.g. render what happens when the agent takes the `optimal` action
    s = mdp.reset()
    mdp.render()
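draw_policy from Example #6 is defined but never called in the listing. A hypothetical usage on the default 4x4 lake (its grid size matches the hard-coded np.mgrid[0:4, 0:4]), using the get_new_state_value sketch above to improve the values before drawing:

gamma = 0.9
mdp = FrozenLakeEnv(slip_chance=0.2)
state_values = {s: 0 for s in mdp.get_all_states()}
for sweep in range(30):
    # One full Bellman backup sweep over all states.
    state_values = {s: get_new_state_value(mdp, state_values, s, gamma)
                    for s in mdp.get_all_states()}
draw_policy(mdp, state_values)  # grey value map with one greedy arrow per cell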