# Project-local imports below (analysis, IterationStats, GridWorld, GridWorldTile,
# WrappedForest, QLearning, and the custom cmdptoolbox fork); exact module paths
# are assumed to match the rest of the repo. __create_world_graph__ is defined
# elsewhere in the project.
import functools
import time

import numpy
import matplotlib.pyplot as plot
import mdptoolbox.example


def create_grid_world_state_graphs_q_learning(world: GridWorld, stats: IterationStats,
                                              name, base_folder, iterations=None):
    def get_policy(w: GridWorld, t, V):
        # For Q-learning stats, `V` is the saved Q table (actions x states);
        # the greedy policy picks the best non-NaN action per state.
        policy = []
        Q = numpy.array(V)
        for i, state in enumerate(w.get_states()):
            if len(state.get_actions()) > 0:
                policy.append(numpy.nanargmax(Q[:, i]))
            else:
                policy.append(0)
        return policy

    def get_value(Q, i):
        # Max Q over actions for state i, or 0.0 when every action is NaN.
        q_values = [Q[a][i] for a in range(len(Q))]
        if numpy.sum(numpy.isnan(q_values)) == len(q_values):
            return 0.0
        return numpy.nanmax(q_values)

    on_iteration = functools.partial(__create_world_graph__, world, None, iterations,
                                     name, base_folder, get_policy, get_value)
    stats.load_and_run_analysis(on_iteration)
def find_converged_policy(world, transitions):
    # Find the first VI iteration whose greedy policy matches the converged PI policy.
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    pi_policy = analysis.get_pi_policy(world, stats, transitions, 26)
    stats = IterationStats('stats/vi_simple_grid_0-99.csv')
    equal_iters = analysis.first_equal_iteration(world, stats, transitions, pi_policy)
    print(equal_iters)
def get_graphs_and_time_stats_forest_mdp():
    stats = IterationStats('stats/vi_forest_0-90.csv')
    time_taken = analysis.create_iteration_value_graph(
        stats, 'variation', 'Variation for Forest Value Iteration', 'forest_results/vi')
    print(sum(time_taken) * 1000)  # total VI time in ms
    stats = IterationStats('stats/pi_forest.csv')
    time_taken = analysis.create_iteration_value_graph(
        stats, 'variation', 'Changed Elements for Forest Policy Iteration', 'forest_results')
    print(sum(time_taken) * 1000)  # total PI time in ms
def get_pi_policy(world: GridWorld, stats: IterationStats, transitions, iteration):
    policy = []

    def get_policy(iteration_number, iteration_time, iteration_value, values):
        if iteration_number == iteration:
            policy.append(values)

    stats.load_and_run_analysis(get_policy)
    return policy[0]
def get_vi_policy(world: GridWorld, stats: IterationStats, transitions, iteration):
    # VI stats store V, so recover the greedy policy via a one-step lookahead:
    # Q[a] = T[a] . V, then argmax over the 5 actions.
    policy = []

    def get_policy(iteration_number, iteration_time, iteration_value, values):
        if iteration_number == iteration:
            Q = numpy.empty((5, len(world.all_states) + 1))
            for aa in range(0, 5):
                Q[aa] = transitions[aa].dot(values)
            policy.append(Q.argmax(axis=0))

    stats.load_and_run_analysis(get_policy)
    return policy[0]
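# A self-contained sketch (not project code) of the greedy extraction used by
# get_vi_policy and first_equal_iteration: Q[a] = T[a] . V, then argmax over
# actions. The tiny 2-state, 2-action MDP below is made up for illustration.
def __demo_greedy_extraction__():
    T = numpy.array([[[0.9, 0.1], [0.1, 0.9]],   # action 0 transition matrix
                     [[0.5, 0.5], [0.5, 0.5]]])  # action 1 transition matrix
    V = numpy.array([1.0, 2.0])                  # some value estimate
    Q = numpy.array([T[a].dot(V) for a in range(2)])
    return Q.argmax(axis=0)                      # greedy action per state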
def create_grid_world_state_graphs_value_iteration(world: GridWorld, stats: IterationStats,
                                                   transitions, name, base_folder,
                                                   iterations=None):
    def get_policy(w: GridWorld, t, V):
        Q = numpy.empty((5, len(w.all_states) + 1))
        for aa in range(0, 5):
            Q[aa] = t[aa].dot(V)
        return Q.argmax(axis=0)

    on_iteration = functools.partial(__create_world_graph__, world, transitions, iterations,
                                     name, base_folder, get_policy, None)
    stats.load_and_run_analysis(on_iteration)
def create_grid_world_state_graphs_policy_iteration(world: GridWorld, stats: IterationStats,
                                                    transitions, name, base_folder,
                                                    iterations=None):
    def get_policy(w: GridWorld, t, p):
        return p

    def get_value(v, i):
        return 0.0

    on_iteration = functools.partial(__create_world_graph__, world, transitions, iterations,
                                     name, base_folder, get_policy, get_value)
    stats.load_and_run_analysis(on_iteration)
def first_equal_iteration(world: GridWorld, stats: IterationStats, transitions,
                          policy_to_compare):
    # Collect every iteration whose greedy policy exactly matches
    # policy_to_compare; the first entry is the earliest such iteration.
    equal_iterations = []
    policy_to_compare = [int(p) for p in policy_to_compare]

    def compare_policy(iteration_number, iteration_time, iteration_value, values):
        Q = numpy.empty((5, len(world.all_states) + 1))
        for aa in range(0, 5):
            Q[aa] = transitions[aa].dot(values)
        policy = Q.argmax(axis=0)
        diffs = numpy.sum(policy != policy_to_compare)
        if diffs == 0:
            equal_iterations.append(iteration_number)

    stats.load_and_run_analysis(compare_policy)
    return equal_iterations
def get_graphs_and_time_stats_grid_world_mdp(world, transitions):
    stats = IterationStats('stats/vi_simple_grid_0-99.csv')
    analysis.create_grid_world_state_graphs_value_iteration(
        world, stats, transitions, 'vi_graph', 'grid_world_results/vi', iterations=[252])
    times = analysis.create_iteration_value_graph(
        stats, 'variation', 'Variation for Grid World Value Iteration',
        'grid_world_results/vi')
    print('vi total time: {} ms'.format(numpy.sum(times) * 1000.0))
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    times = analysis.create_iteration_value_graph(
        stats, '# of elements changed', 'Changed Elements for Grid World Policy Iteration',
        'grid_world_results/pi')
    print('pi total time: {} ms'.format(numpy.sum(times) * 1000.0))
def compare_different_gamma_policies(world, transitions, reward):
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    analysis.create_grid_world_state_graphs_policy_iteration(
        world, stats, transitions, 'PI state graph', 'grid_world_results/pi', iterations=[8])
    stats = IterationStats('stats/vi_simple_grid_0-99.csv')
    vi_policy = analysis.get_vi_policy(world, stats, transitions, 252)
    stats = IterationStats('stats/pi_simple_grid_0-99.csv')
    pi_policy = analysis.get_pi_policy(world, stats, transitions, 26)
    analysis.compare_policies(world, vi_policy, pi_policy, 'grid_world_results',
                              'comparison_pi_vi')
    stats = IterationStats('stats/vi_simple_grid_0-8.csv')
    vi_policy = analysis.get_vi_policy(world, stats, transitions, 55)
    analysis.compare_policies(world, vi_policy, pi_policy, 'grid_world_results',
                              'comparison pi vi low gamma')
def create_iteration_value_graph(stats: IterationStats, value_name, name, basefolder):
    # Plot one recorded stat (variation, average Q, ...) per iteration and
    # return the per-iteration times so callers can report total runtime.
    iteration_values = []
    times = []

    def add_variation(iteration_number, iteration_time, iteration_value, values):
        iteration_values.append(iteration_value)
        times.append(iteration_time)

    stats.load_and_run_analysis(add_variation)
    plot.figure(1, clear=True)
    plot.title(name)
    plot.ylabel(value_name)
    plot.xlabel('iteration')
    plot.plot(range(1, len(iteration_values) + 1), iteration_values)
    plot.pause(0.001)
    plot.savefig('{}/{}'.format(basefolder, name.replace(' ', '_').lower()))
    plot.close()
    return times
def run_policy_iteration_grid_world(world, transitions, reward):
    def write_pi_stats(pi_iter, iteration, time, variation):
        # `stats` is rebound on each pass of the gamma loop below.
        stats.save_iteration(iteration, time, variation, pi_iter.policy)
        print('[{}]:\t{}'.format(iteration, variation))

    for gamma in numpy.linspace(0.8, 1.0, num=5):
        if gamma == 1.0:
            gamma = 0.99  # clamp: gamma = 1 does not converge
        stats = IterationStats('stats/pi_simple_grid_{}.csv'.format(
            str(gamma).replace('.', '-')))
        pi = cmdptoolbox.mdp.PolicyIteration(transitions, reward, gamma,
                                             max_iter=1000, skip_check=True)
        print("set up before run")
        pi.setVerbose()
        pi.setPrint(write_pi_stats)
        stats.start_writing()
        pi.run()
        stats.done_writing()
        print('found in {} iterations'.format(pi.iter))
        print('took {}'.format(pi.time))
        world.print_policy(print, pi.policy)
        last_policy = pi.policy
    return last_policy
def run_value_iteration_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    transitions, reward = mdptoolbox.example.forest(S, r1, r2)
    for gamma in numpy.linspace(0.5, 1.0, num=11):
        if gamma == 1.0:
            gamma = 0.99  # clamp: gamma = 1 does not converge

        stats = IterationStats('stats/vi_forest_{}.csv'.format(
            '{:.2f}'.format(gamma).replace('.', '-')))

        def write_vi_stats(vi_iter, iteration, time, variation):
            stats.save_iteration(iteration, time, variation, vi_iter.V)
            print('[{}]:\t{}'.format(iteration, variation))

        print('gamma={}'.format(gamma))
        vi = cmdptoolbox.mdp.ValueIteration(transitions, reward, gamma, epsilon=0.0001,
                                            max_iter=10000, skip_check=True)
        vi.setVerbose()
        vi.setPrint(write_vi_stats)
        stats.start_writing()
        vi.run()
        stats.done_writing()
        print('found in {} iterations'.format(vi.iter))
        print('took {}'.format(vi.time))
        forest.print_policy(print, vi.policy)
def run_value_iteration_grid_world(world, transitions, reward):
    def write_vi_stats(vi_iter, iteration, time, variation):
        stats.save_iteration(iteration, time, variation, vi_iter.V)
        print('[{}]:\t{}'.format(iteration, variation))

    for gamma in numpy.linspace(0.8, 0.99, num=20):
        stats = IterationStats('stats/vi_simple_grid_{}.csv'.format(
            str(gamma).replace('.', '-')))
        vi = cmdptoolbox.mdp.ValueIteration(transitions, reward, gamma, epsilon=0.0001,
                                            max_iter=10000, skip_check=True)
        vi.setVerbose()
        vi.setPrint(write_vi_stats)
        stats.start_writing()
        vi.run()
        stats.done_writing()
        print('found in {} iterations'.format(vi.iter))
        print('took {}'.format(vi.time))
        world.print_policy(print, vi.policy)
        last_policy = vi.policy
    return last_policy
def run_q_learning_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    n_episodes = 10000
    how_often = n_episodes // 100  # log every 100 episodes

    stats = IterationStats('stats/ql_forest.csv', dims=2)

    def on_episode(episode, elapsed_time, q_learner, q):
        # parameter renamed from `time` so it does not shadow the time module
        forest.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, elapsed_time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)

    def is_done(state, action, next_state):
        # an episode ends when the forest is back in state 0 (burned or cut)
        return next_state.state_num == 0

    gamma = 0.99
    start = time.time()
    numpy.random.seed(5263228)
    q_l = QLearning(forest, 0.5, 0.2, gamma, on_episode=on_episode, start_at_0=True,
                    alpha=0.1, is_done=is_done, every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    forest.print_policy(print, q_l.get_policy())
    print('took {} s'.format(time.time() - start))

    stats = IterationStats('stats/ql_forest.csv', dims=2)
    analysis.create_iteration_value_graph(
        stats, 'average Q', 'Average Q for each iteration on Forest Q Learning',
        'forest_results')
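# The QLearning class itself lives elsewhere in the project; as a reference,
# here is a minimal, generic sketch of the tabular backup it presumably runs.
# The names and the action-major Q layout are assumptions, not project API.
def __demo_q_update__(Q, a, s, r, s_next, alpha=0.1, gamma=0.99):
    # standard one-step Q-learning backup: move Q[a, s] toward the TD target
    td_target = r + gamma * numpy.nanmax(Q[:, s_next])
    Q[a, s] += alpha * (td_target - Q[a, s])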
def run_policy_iteration_forest(S, r1, r2):
    forest = WrappedForest(S, r1, r2)
    transitions, reward = mdptoolbox.example.forest(S, r1, r2)
    stats = IterationStats('stats/pi_forest.csv')

    def write_pi_stats(pi_iter, iteration, time, variation):
        stats.save_iteration(iteration, time, variation, pi_iter.policy)
        print('[{}]:\t{}'.format(iteration, variation))

    pi = cmdptoolbox.mdp.PolicyIteration(transitions, reward, 0.9, max_iter=1000)
    pi.setVerbose()
    pi.setPrint(write_pi_stats)
    stats.start_writing()
    pi.run()
    stats.done_writing()
    print('found in {} iterations'.format(pi.iter))
    print('took {}'.format(pi.time))
    forest.print_policy(print, pi.policy)
def get_graph_q_learning():
    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)
    analysis.create_iteration_value_graph(
        stats, 'average Q', 'Average Q for each iteration on Grid World Q Learning',
        'grid_world_results/ql')
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes // 500  # log every 1000 episodes
    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        # per-step hook, kept as a no-op; handy for debug printing
        pass

    def on_episode(episode, elapsed_time, q_learner, q):
        # parameter renamed from `time` so it does not shadow the time module
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, elapsed_time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)

    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    def initialize_toward_goal(state: GridWorldTile):
        # Optimistic initialization: the action pointing toward the goal (or
        # grabbing treasure, when available) starts slightly positive, the
        # rest slightly negative, to bias early exploration toward the goal.
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            best_action = 'move east' if diff_x > 0 else 'move west'
        else:
            best_action = 'move north' if diff_y < 0 else 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world, 0.5, 0.05, gamma, on_update=on_update, on_episode=on_episode,
                    initializer=initialize_toward_goal, start_at_0=True, alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())
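# A minimal driver sketch showing how the experiments above might be launched.
# `build_grid_world_mdp` is hypothetical -- the project presumably has its own
# helper for turning a GridWorld into (transitions, reward) arrays -- and the
# forest parameters (S, r1, r2) are illustrative values, not necessarily the
# ones used to generate the saved stats files.
if __name__ == '__main__':
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    transitions, reward = build_grid_world_mdp(world)  # hypothetical helper
    run_value_iteration_grid_world(world, transitions, reward)
    run_policy_iteration_grid_world(world, transitions, reward)
    run_q_learning_grid_world()
    run_value_iteration_forest(1000, 40, 2)
    run_policy_iteration_forest(1000, 40, 2)
    run_q_learning_forest(1000, 40, 2)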