def policy_iteration(env, policy, epsilon):
    q = init_state_action_map(env)
    visits_map = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, policy)
        on_policy_evaluation(episode, q, visits_map)
        epsilon_greedy_policy_improvement(env, episode, q, policy, epsilon)
    return q
def policy_iteration(env, policy):
    q = init_state_action_map(env)
    visits_map = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode_es(env, policy)
        on_policy_evaluation(episode, q, visits_map)
        greedy_deterministic_policy_improvement(env, episode, q, policy)
    return q
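The two policy_iteration variants above call evaluation and improvement helpers that are not part of this excerpt. Below is a minimal sketch of what on_policy_evaluation and epsilon_greedy_policy_improvement might look like, assuming an episode is a list of (state, action, reward) tuples, q and visits_map are {state: {action: value}} dicts, and policy is {state: {action: probability}}; the bodies are illustrative guesses, not the repo's actual code.

def on_policy_evaluation(episode, q, visits_map, gamma=1.0):
    # every-visit Monte Carlo: walk the episode backwards accumulating the return g
    # and keep an incremental average of the returns seen for each (state, action)
    g = 0.0
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        visits_map[state][action] += 1
        q[state][action] += (g - q[state][action]) / visits_map[state][action]

def epsilon_greedy_policy_improvement(env, episode, q, policy, epsilon):
    # env is unused here; it is kept only to match the call signature above.
    # Make the policy epsilon-greedy with respect to q in every visited state.
    for state, _, _ in episode:
        actions = list(q[state].keys())
        greedy = max(actions, key=lambda a: q[state][a])
        for a in actions:
            policy[state][a] = epsilon / len(actions)
        policy[state][greedy] += 1.0 - epsilon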
def policy_iteration(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
        greedy_stochastic_policy_improvement(env, episode, q, target_policy)
    return q
def main():
    env = Blackjack()
    target_policy = init_policy(env)
    behavior_policy = init_equiprobable_random_policy(env)
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
    env.visualize_action_value(q)
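Both off-policy snippets above pass a cumulative-weight map c into off_policy_evaluation. A plausible sketch of that helper using weighted importance sampling, in the style of Sutton and Barto's off-policy Monte Carlo prediction; it relies on the same assumed episode/q/c/policy representations as the sketch above and is not the repo's actual implementation.

def off_policy_evaluation(episode, q, c, target_policy, behavior_policy, gamma=1.0):
    g = 0.0  # return of the episode tail processed so far
    w = 1.0  # importance-sampling weight of that tail
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        c[state][action] += w
        # weighted importance sampling: move q toward g in proportion to w
        q[state][action] += (w / c[state][action]) * (g - q[state][action])
        w *= target_policy[state][action] / behavior_policy[state][action]
        if w == 0.0:
            break  # earlier transitions get zero weight, nothing left to update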
def policy_iteration2(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        fine_grained_off_policy_iteration(episode,
                                          q,
                                          c,
                                          target_policy,
                                          behavior_policy,
                                          gamma=1)
    return q
def double_q_learning(env, epsilon=0.1, alpha=0.5, gamma=1, num_episodes=1000):
    q1 = init_state_action_map(env)
    q2 = init_state_action_map(env)
    for i in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = choose_doubled_epsilon_greedy_action(q1, q2, state, epsilon)
            (next_state, reward, done, _) = env.step(action)
            if random.random() < 0.5:
                double_q_update(q1, q2, state, action, reward, next_state, alpha, gamma)
            else:
                double_q_update(q2, q1, state, action, reward, next_state, alpha, gamma)
            state = next_state
    return q1, q2
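double_q_learning leans on two helpers that are not shown. The sketch below assumes the same dict-of-dicts layout for q1 and q2 and that every state returned by env.step, including terminal ones, has an entry in both maps; the names match the calls above, but the bodies are assumptions.

import random

def choose_doubled_epsilon_greedy_action(q1, q2, state, epsilon):
    # act epsilon-greedily with respect to the sum of the two estimates
    actions = list(q1[state].keys())
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: q1[state][a] + q2[state][a])

def double_q_update(q_a, q_b, state, action, reward, next_state, alpha, gamma):
    # pick the greedy next action under q_a but evaluate it with q_b; terminal
    # states keep their initial zero values, so the bootstrap term vanishes there
    best_next = max(q_a[next_state], key=q_a[next_state].get)
    target = reward + gamma * q_b[next_state][best_next]
    q_a[state][action] += alpha * (target - q_a[state][action])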
def main():
    # define hyperparameters
    num_episodes = 1000
    epsilon = 1
    gamma = 0.9
    alpha = 0.1

    # create an env
    env = GridworldChase(12,
                         12,
                         p_goal_move=1,
                         agent_random_start=True,
                         goal_random_start=True)

    # init q and get baseline random performance
    q = init_state_action_map(env)

    estimate_performance(env, q, 1)

    # learn q
    print("running q-learning...")
    q = q_learning(env,
                   q,
                   epsilon=epsilon,
                   alpha=alpha,
                   gamma=gamma,
                   num_episodes=num_episodes)
    print("q-learning complete")

    # determine post-training performance
    estimate_performance(env, q, 0.01)
    visualize_performance(env, q, delay=0.15)
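The q_learning routine called above is not included in this excerpt. Here is a minimal sketch with the signature implied by the call, using inline epsilon-greedy action selection; the hyperparameter defaults and internals are assumptions, and every state (including terminal ones) is assumed to have an entry in q.

import random

def q_learning(env, q, epsilon=0.1, alpha=0.5, gamma=0.9, num_episodes=1000):
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # epsilon-greedy action selection over the actions known for this state
            actions = list(q[state].keys())
            if random.random() < epsilon:
                action = random.choice(actions)
            else:
                action = max(actions, key=lambda a: q[state][a])
            next_state, reward, done, _ = env.step(action)
            # one-step Q-learning update; terminal states keep their initial zero
            # values, so bootstrapping from them contributes nothing
            target = reward + gamma * max(q[next_state].values())
            q[state][action] += alpha * (target - q[state][action])
            state = next_state
    return q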
def n_step_sarsa(env, n=5, alpha=0.5, epsilon=0.1, gamma=0.9, num_episodes=10):
    q = init_state_action_map(env)
    for _ in range(num_episodes):
        # reset the state, action, and reward lists; rewards gets a placeholder
        # so that rewards[t + 1] holds the reward that follows action t
        states = []
        actions = []
        rewards = [None]
        # reset state, action
        state = env.reset()
        states.append(state)
        action = choose_epsilon_greedy_action(q, state, epsilon)
        actions.append(action)
        T = float("inf")
        t = 0
        while True:
            # if the episode has not yet terminated, take the next action
            if t < T:
                action = actions[t]
                next_state, reward, done, _ = env.step(action)
                states.append(next_state)
                rewards.append(reward)
                if done:
                    T = t + 1
                else:
                    action = choose_epsilon_greedy_action(
                        q, next_state, epsilon)
                    actions.append(action)
            # tau is the time step whose state-action pair gets updated
            tau = t - n + 1
            # if we are deep enough into an episode to perform an update
            if tau >= 0:
                # compute the target of the update (n-step return)
                G = sum([
                    gamma**(i - tau - 1) * rewards[i]
                    for i in range(tau + 1,
                                   min(tau + n, T) + 1)
                ])
                if tau + n < T:
                    G = G + gamma**n * q[states[tau + n]][actions[tau + n]]
                q_value = q[states[tau]][actions[tau]]
                # update the q function
                q[states[tau]][actions[tau]] = q_value + alpha * (G - q_value)
            # stop once the last pre-terminal state-action pair has been updated
            if tau == T - 1:
                break
            t = t + 1
    return q
def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running double q-learning...")
    q1, q2 = double_q_learning(env)
    print("double q-learning complete")

    # determine post-training performance
    estimate_performance(env, q2, 0.01)
    visualize_performance(env, q2)
def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # init q and get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running sarsa...")
    q = sarsa(env, q)
    print("sarsa complete")

    # determine post-training performance
    estimate_performance(env, q, 0.01)
    visualize_performance(env, q)
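The sarsa routine used above is likewise only referenced. Below is a minimal one-step Sarsa sketch; everything beyond the (env, q) arguments shown in the call is an assumption, and terminal states are again assumed to retain their initial zero action values.

import random

def sarsa(env, q, epsilon=0.1, alpha=0.5, gamma=0.9, num_episodes=1000):
    def epsilon_greedy(state):
        actions = list(q[state].keys())
        if random.random() < epsilon:
            return random.choice(actions)
        return max(actions, key=lambda a: q[state][a])

    for _ in range(num_episodes):
        state = env.reset()
        action = epsilon_greedy(state)
        done = False
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = epsilon_greedy(next_state)
            # on-policy target: bootstrap from the action actually chosen next
            target = reward + gamma * q[next_state][next_action]
            q[state][action] += alpha * (target - q[state][action])
            state, action = next_state, next_action
    return q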
def main():
    x_limit = 8
    y_limit = 5
    goals = [(0, 4)]
    walls = [(0, 2), (1, 2), (2, 2), (3, 2)]

    env = Maze(x_limit, y_limit, goals, walls)
    num_episodes = 10

    # determine the baseline performance that results from taking random moves
    avg = sum([len(generate_random_episode(env))
               for _ in range(num_episodes)]) / float(num_episodes)
    print "baseline random performance: " + str(avg)

    # learn q
    print "running tabular dyna-q..."
    q = init_state_action_map(env)
    q = tabular_dyna_q(env, q)
    print "tabular dyna-q complete"

    # evaluate performance
    avg = sum([
        len(generate_epsilon_greedy_episode(env, q))
        for _ in range(num_episodes)
    ]) / float(num_episodes)
    print "post learning performance: " + str(avg)

    # visualize post-training episode
    state = env.reset()
    while True:
        env.render()
        time.sleep(0.25)
        action = choose_epsilon_greedy_action(q, state, 0.1)
        state, _, done, _ = env.step(action)  # take the epsilon-greedy action
        if done:
            env.render(close=True)
            break
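tabular_dyna_q is also only referenced here. The following is a minimal sketch of Tabular Dyna-Q in the style of Sutton and Barto: one-step Q-learning on real experience plus a handful of planning updates replayed from a learned deterministic model. The signature beyond (env, q) and all hyperparameters are assumptions.

import random

def tabular_dyna_q(env, q, epsilon=0.1, alpha=0.5, gamma=0.95,
                   num_episodes=50, planning_steps=10):
    model = {}  # (state, action) -> (reward, next_state), learned from experience
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # epsilon-greedy action selection
            actions = list(q[state].keys())
            if random.random() < epsilon:
                action = random.choice(actions)
            else:
                action = max(actions, key=lambda a: q[state][a])
            next_state, reward, done, _ = env.step(action)
            # direct reinforcement-learning update from the real transition
            target = reward + gamma * max(q[next_state].values())
            q[state][action] += alpha * (target - q[state][action])
            # update the model, then plan with randomly replayed transitions
            model[(state, action)] = (reward, next_state)
            for _ in range(planning_steps):
                (s, a), (r, s2) = random.choice(list(model.items()))
                t = r + gamma * max(q[s2].values())
                q[s][a] += alpha * (t - q[s][a])
            state = next_state
    return q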