Example #1
def create_grid_world_state_graphs_q_learning(world: GridWorld,
                                              stats: IterationStats,
                                              name,
                                              base_folder,
                                              iterations=None):
    def get_policy(w: GridWorld, t, V):
        # V holds the recorded Q table for this iteration; pick the greedy action per state.
        policy = []
        Q = numpy.array(V)
        for i, state in enumerate(w.get_states()):
            if len(state.get_actions()) > 0:
                policy.append(numpy.nanargmax(Q[:, i]))
            else:
                policy.append(0)
        return policy

    def get_value(Q, i):
        # Best Q value for state i; 0.0 when every action's Q is NaN (unvisited state).
        q_values = []
        for a in range(0, len(Q)):
            q_values.append(Q[a][i])
        if numpy.sum(numpy.isnan(q_values)) == len(q_values):
            return 0.0
        return numpy.nanmax(q_values)

    on_iteration = functools.partial(__create_world_graph__, world, None,
                                     iterations, name, base_folder, get_policy,
                                     get_value)

    stats.load_and_run_analysis(on_iteration)
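The helper __create_world_graph__ is only referenced in these examples, never shown. A minimal sketch of the shape it would need, assuming functools.partial binds the first seven arguments and IterationStats.load_and_run_analysis then supplies (iteration_number, iteration_time, iteration_value, values); the body below is illustrative only, not the project's implementation.

# Hypothetical sketch -- the real __create_world_graph__ lives elsewhere in the project.
def __create_world_graph__(world, transitions, iterations, name, base_folder,
                           get_policy, get_value,
                           iteration_number, iteration_time, iteration_value, values):
    # Only render the iterations the caller asked for (None means render all of them).
    if iterations is not None and iteration_number not in iterations:
        return
    # Derive the greedy policy for this iteration and render the grid world with it.
    policy = get_policy(world, transitions, values)
    # ... draw each tile, annotate it with policy/value, save under base_folder/name ...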
Example #2
def find_converged_policy(world, transitions):
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    pi_policy = analysis.get_pi_policy(world, stats, transitions, 26)
    stats = IterationStats('stats/vi_simple_grid_0-99.csv')
    equal_iters = analysis.first_equal_iteration(world, stats, transitions,
                                                 pi_policy)
    print(equal_iters)
Example #3
def get_graphs_and_time_stats_forest_mdp():
    stats = IterationStats('stats/vi_forest_0-90.csv')
    time_taken = analysis.create_iteration_value_graph(
        stats, 'variation', 'Variation for Forest Value Iteration',
        'forest_results/vi')
    print('vi total time: {} ms'.format(sum(time_taken) * 1000))
    stats = IterationStats('stats/pi_forest.csv')
    time_taken = analysis.create_iteration_value_graph(
        stats, 'variation', 'Changed Elements for Forest Policy Iteration',
        'forest_results')
    print('pi total time: {} ms'.format(sum(time_taken) * 1000))
Example #4
def get_pi_policy(world: GridWorld, stats: IterationStats, transitions,
                  iteration):
    policy = []

    def get_policy(iteration_number, iteration_time, iteration_value, values):
        if iteration_number == iteration:
            policy.append(values)

    stats.load_and_run_analysis(get_policy)

    return policy[0]
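Every example here funnels through IterationStats.load_and_run_analysis, which replays a previously written CSV and invokes the supplied callback once per recorded iteration with (iteration_number, iteration_time, iteration_value, values). A minimal sketch of that contract, assuming one CSV row per iteration with the flattened values at the end; the real class (and its dims handling) may differ.

import csv

# Hypothetical sketch of the replay contract assumed by the callbacks in these examples.
class IterationStatsSketch:
    def __init__(self, path):
        self.path = path

    def load_and_run_analysis(self, on_iteration):
        with open(self.path) as f:
            for row in csv.reader(f):
                iteration_number = int(row[0])
                iteration_time = float(row[1])
                iteration_value = float(row[2])
                values = [float(v) for v in row[3:]]  # V, Q, or policy entries
                on_iteration(iteration_number, iteration_time, iteration_value, values)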
Example #5
def get_vi_policy(world: GridWorld, stats: IterationStats, transitions,
                  iteration):
    policy = []

    def get_policy(iteration_number, iteration_time, iteration_value, values):
        if iteration_number == iteration:
            # Rebuild the Q table by backing up the stored V through each
            # action's transition matrix (the +1 matches the transitions' extra state).
            Q = numpy.empty((5, len(world.all_states) + 1))
            for aa in range(0, 5):
                Q[aa] = transitions[aa].dot(values)
            policy.append(Q.argmax(axis=0))

    stats.load_and_run_analysis(get_policy)

    return policy[0]
Example #6
def create_grid_world_state_graphs_value_iteration(world: GridWorld,
                                                   stats: IterationStats,
                                                   transitions,
                                                   name,
                                                   base_folder,
                                                   iterations=None):
    def get_policy(w: GridWorld, t, V):
        Q = numpy.empty((5, len(w.all_states) + 1))
        for aa in range(0, 5):
            Q[aa] = t[aa].dot(V)
        return Q.argmax(axis=0)

    on_iteration = functools.partial(__create_world_graph__, world,
                                     transitions, iterations, name,
                                     base_folder, get_policy, None)

    stats.load_and_run_analysis(on_iteration)
Example #7
def create_grid_world_state_graphs_policy_iteration(world: GridWorld,
                                                    stats: IterationStats,
                                                    transitions,
                                                    name,
                                                    base_folder,
                                                    iterations=None):
    def get_policy(w: GridWorld, t, p):
        return p

    def get_value(v, i):
        # Policy-iteration stats store the policy itself, so there is no state value to plot.
        return 0.0

    on_iteration = functools.partial(__create_world_graph__, world,
                                     transitions, iterations, name,
                                     base_folder, get_policy, get_value)

    stats.load_and_run_analysis(on_iteration)
Example #8
def first_equal_iteration(world: GridWorld, stats: IterationStats, transitions,
                          policy_to_compare):
    equal_iterations = []
    policy_to_compare = [int(p) for p in policy_to_compare]

    def compare_policy(iteration_number, iteration_time, iteration_value,
                       values):
        Q = numpy.empty((5, len(world.all_states) + 1))
        for aa in range(0, 5):
            Q[aa] = transitions[aa].dot(values)
        policy = Q.argmax(axis=0)
        diffs = numpy.sum(policy != policy_to_compare)
        if diffs == 0:
            equal_iterations.append(iteration_number)

    stats.load_and_run_analysis(compare_policy)
    return equal_iterations
Example #9
def get_graphs_and_time_stats_grid_world_mdp(world, transitions):
    stats = IterationStats('stats/vi_simple_grid_0-99.csv')
    analysis.create_grid_world_state_graphs_value_iteration(
        world,
        stats,
        transitions,
        'vi_graph',
        'grid_world_results/vi',
        iterations=[252])
    times = analysis.create_iteration_value_graph(
        stats, 'variation', 'Variation for Grid World Value Iteration',
        'grid_world_results/vi')
    print('vi total time: {} ms'.format(numpy.sum(times) * 1000.0))
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    times = analysis.create_iteration_value_graph(
        stats, '# of elements changed',
        'Changed Elements for Grid World Policy Iteration',
        'grid_world_results/pi')
    print('pi total time: {} ms'.format(numpy.sum(times) * 1000.0))
Example #10
def compare_different_gamma_policies(world, transitions, reward):
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    analysis.create_grid_world_state_graphs_policy_iteration(
        world,
        stats,
        transitions,
        'PI state graph',
        'grid_world_results/pi',
        iterations=[8])
    stats = IterationStats('stats/vi_simple_grid_0-99.csv')
    vi_policy = analysis.get_vi_policy(world, stats, transitions, 252)
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    pi_policy = analysis.get_pi_policy(world, stats, transitions, 26)
    analysis.compare_policies(world, vi_policy, pi_policy,
                              'grid_world_results', 'comparison_pi_vi')
    stats = IterationStats('stats/vi_simple_grid_0-8.csv')
    vi_policy = analysis.get_vi_policy(world, stats, transitions, 55)
    analysis.compare_policies(world, vi_policy, pi_policy,
                              'grid_world_results',
                              'comparison_pi_vi_low_gamma')
Example #11
def create_iteration_value_graph(stats: IterationStats, value_name, name,
                                 basefolder):
    iteration_values = []
    times = []

    def add_variation(iteration_number, iteration_time, iteration_value,
                      values):
        iteration_values.append(iteration_value)
        times.append(iteration_time)

    stats.load_and_run_analysis(add_variation)

    plot.figure(1, clear=True)
    plot.title(name)
    plot.ylabel(value_name)
    plot.xlabel('iteration')
    plot.plot(range(1, len(iteration_values) + 1), iteration_values)
    plot.pause(0.001)
    plot.savefig('{}/{}'.format(basefolder, name.replace(' ', '_').lower()))
    plot.close()
    return times
Example #12
def run_policy_iteration_grid_world(world, transitions, reward):
    def write_pi_stats(pi_iter, iteration, time, variation):
        stats.save_iteration(iteration, time, variation, pi_iter.policy)
        print('[{}]:\t{}'.format(iteration, variation))

    for gamma in numpy.linspace(0.8, 1.0, num=5):
        if gamma == 1.0:
            # Keep the run discounted: clamp the undiscounted endpoint to 0.99.
            gamma = 0.99
        stats = IterationStats('stats/pi_simple_grid_{}.csv'.format(
            str(gamma).replace('.', '-')))
        pi = cmdptoolbox.mdp.PolicyIteration(transitions,
                                             reward,
                                             gamma,
                                             max_iter=1000,
                                             skip_check=True)

        print("set up before run")
        pi.setVerbose()
        pi.setPrint(write_pi_stats)
        stats.start_writing()
        pi.run()
        stats.done_writing()
        print('found in {} iterations'.format(pi.iter))
        print('took {}'.format(pi.time))
        world.print_policy(print, pi.policy)
        last_policy = pi.policy
    return last_policy
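setVerbose and setPrint are hooks on the locally patched toolbox (cmdptoolbox); the stock pymdptoolbox solvers expose setVerbose but not a per-iteration print callback. A minimal sketch of how such a hook might be wired, assuming the patched solver calls the registered function once per sweep as (solver, iteration, elapsed_time, variation); the class and method names below are hypothetical.

# Hypothetical sketch of the per-iteration hook assumed by write_pi_stats / write_vi_stats.
class HookedSolverSketch:
    def __init__(self):
        self._print = None

    def setPrint(self, fn):
        self._print = fn

    def _report(self, iteration, elapsed, variation):
        # Mirrors the callback signature used above: (solver, iteration, time, variation).
        if self._print is not None:
            self._print(self, iteration, elapsed, variation)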
Example #13
def run_value_iteration_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    transitions, reward = mdptoolbox.example.forest(S, r1, r2)
    for gamma in numpy.linspace(0.5, 1.0, num=11):
        if gamma == 1.0:
            gamma = 0.99
        stats = IterationStats('stats/vi_forest_{}.csv'.format(
            '{:.2f}'.format(gamma).replace('.', '-')))

        def write_vi_stats(vi_iter, iteration, time, variation):
            stats.save_iteration(iteration, time, variation, vi_iter.V)
            print('[{}]:\t{}'.format(iteration, variation))

        print('gamma={}'.format(gamma))
        vi = cmdptoolbox.mdp.ValueIteration(transitions,
                                            reward,
                                            gamma,
                                            epsilon=0.0001,
                                            max_iter=10000,
                                            skip_check=True)
        vi.setVerbose()
        vi.setPrint(write_vi_stats)
        stats.start_writing()
        vi.run()
        stats.done_writing()
        print('found in {} iterations'.format(vi.iter))
        print('took {}'.format(vi.time))
        forest.print_policy(print, vi.policy)
Example #14
def run_value_iteration_grid_world(world, transitions, reward):
    def write_vi_stats(vi_iter, iteration, time, variation):
        stats.save_iteration(iteration, time, variation, vi_iter.V)
        print('[{}]:\t{}'.format(iteration, variation))

    for gamma in numpy.linspace(0.8, 0.99, num=20):
        stats = IterationStats('stats/vi_simple_grid_{}.csv'.format(
            str(gamma).replace('.', '-')))

        vi = cmdptoolbox.mdp.ValueIteration(transitions,
                                            reward,
                                            gamma,
                                            epsilon=0.0001,
                                            max_iter=10000,
                                            skip_check=True)
        vi.setVerbose()
        vi.setPrint(write_vi_stats)
        stats.start_writing()
        vi.run()
        stats.done_writing()
        print('found in {} iterations'.format(vi.iter))
        print('took {}'.format(vi.time))
        world.print_policy(print, vi.policy)
        last_policy = vi.policy
    return last_policy
Example #15
def run_q_learning_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    n_episodes = 10000
    how_often = n_episodes // 100

    stats = IterationStats('stats/ql_forest.csv', dims=2)

    def on_episode(episode, time, q_learner, q):
        forest.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)

    def is_done(state, action, next_state):
        # The forest resets to state 0 after a cut or a fire, which ends the episode.
        return next_state.state_num == 0

    gamma = 0.99
    start = time.time()
    numpy.random.seed(5263228)
    q_l = QLearning(forest,
                    0.5,
                    0.2,
                    gamma,
                    on_episode=on_episode,
                    start_at_0=True,
                    alpha=0.1,
                    is_done=is_done,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    forest.print_policy(print, q_l.get_policy())
    print('took {} s'.format(time.time() - start))

    stats = IterationStats('stats/ql_forest.csv', dims=2)
    analysis.create_iteration_value_graph(
        stats, 'average Q',
        'Average Q for each iteration on Forest Q Learning', 'forest_results')
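The QLearning class is project code; its two unnamed positional arguments (0.5 and 0.2 here) are not explained by the snippet and presumably control exploration. For reference, the tabular backup it is expected to apply is the standard one-step Q-learning update; a minimal sketch, with every name below hypothetical.

import numpy

def q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    # Standard one-step Q-learning backup: move Q[a, s] toward r + gamma * max_a' Q[a', s'].
    target = r + gamma * numpy.nanmax(Q[:, s_next])
    Q[a, s] += alpha * (target - Q[a, s])
    return Q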
Example #16
def run_policy_iteration_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    transitions, reward = mdptoolbox.example.forest(S, r1, r2)

    stats = IterationStats('stats/pi_forest.csv')

    def write_pi_stats(pi_iter, iteration, time, variation):
        stats.save_iteration(iteration, time, variation, pi_iter.policy)
        print('[{}]:\t{}'.format(iteration, variation))

    pi = cmdptoolbox.mdp.PolicyIteration(transitions,
                                         reward,
                                         0.9,
                                         max_iter=1000)

    pi.setVerbose()
    pi.setPrint(write_pi_stats)
    stats.start_writing()
    pi.run()
    stats.done_writing()
    print('found in {} iterations'.format(pi.iter))
    print('took {}'.format(pi.time))
    forest.print_policy(print, pi.policy)
Example #17
def get_graph_q_learning():
    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)
    analysis.create_iteration_value_graph(
        stats, 'average Q',
        'Average Q for each iteration on Grid World Q Learning',
        'grid_world_results/ql')
Example #18
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes // 500

    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        #print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y))
        pass

    def on_episode(episode, time, q_learner, q):
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)
        #time.sleep(1)

    # Locate the goal tile so the initializer below can bias actions toward it.
    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    # Seed each state's action values: the heuristically best action (grabbing
    # treasure if available, otherwise the move toward the goal) starts at 0.1,
    # every other action at -0.1.
    def initialize_toward_goal(state: GridWorldTile):
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            if diff_x > 0:
                best_action = 'move east'
            else:
                best_action = 'move west'
        else:
            if diff_y < 0:
                best_action = 'move north'
            else:
                best_action = 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world,
                    0.5,
                    0.05,
                    gamma,
                    on_update=on_update,
                    on_episode=on_episode,
                    initializer=initialize_toward_goal,
                    start_at_0=True,
                    alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())