def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--grid_size', nargs='?', const=1, type=int, default=8)
	parser.add_argument('--macro_block', nargs='?', const=1, type=int, default=2)
	parser.add_argument('--gamma', nargs='?', const=1, type=float, default=0.90)
	args = parser.parse_args()

	start_time = time.time()
	env = GridWorld(args.grid_size, args.macro_block, args.gamma)
	env_time = time.time() - start_time
	print ("Time to create environment: {}".format(env_time))

	start_time = time.time()
	policy, v = value_iteration_sparse(env)
	value_iteration_time = time.time() - start_time
	print ("Time to find optimal policy: {}".format(value_iteration_time))

	
	start_time = time.time()
	weights_inv, policy_inv = irl(env, policy)
	irl_time = time.time() - start_time
	print ("Time to solve IRL problem: {}".format(irl_time))
	

	print ("Displaying comparison between optimal policy and IRL policy:")
	env.draw(policy, policy_inv, weights_inv)
Example 2
def Q_learning(env: GridWorld,
               epsilon: float,
               lr: float,
               initQ: np.ndarray,
               converge=False) -> (np.ndarray, float):
    """
    Performs Q-learning for a single episode in the environment.
    :param env: GridWorld subclass
    :param epsilon: Exploitation rate (probability of taking the greedy action)
    :param lr: Learning rate
    :param initQ: Q table to update
    :param converge: Flag indicating whether the maximum change in any Q-value
    should be tracked so the caller can check convergence within a set bound.
    :return: Updated Q table after a single episode of training and the maximum
    change in any Q-value
    """

    # keep track of maximum state-action value change
    delta = 0.0

    state = env.reset()

    done = False

    while not done:

        # explore action space
        if np.random.uniform(0, 1) > epsilon:
            action = env.sample()

        # exploit
        else:
            action = np.argmax(initQ[state])

        # take step in env
        obs, r, done = env.step(action)

        # update Q table
        prev_value = initQ[state, action]

        new_value = prev_value + lr * (r + env.gamma * np.max(initQ[obs]) -
                                       prev_value)

        initQ[state, action] = new_value

        # update state
        state = obs

        if converge:
            delta = max(delta, np.abs(new_value - prev_value))

    return initQ, delta
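A minimal driver sketch for the function above, assuming the environment also exposes n_states and n_actions attributes (hypothetical names) on top of the reset/sample/step/gamma interface used in the loop:

import numpy as np

def train(env: GridWorld, episodes: int = 500, epsilon: float = 0.9,
          lr: float = 0.1, tol: float = 1e-4) -> np.ndarray:
    # start from an all-zero Q table sized for the environment
    Q = np.zeros((env.n_states, env.n_actions))
    for _ in range(episodes):
        # run one episode and keep the largest Q-value change it produced
        Q, delta = Q_learning(env, epsilon, lr, Q, converge=True)
        if delta < tol:
            break
    # derive the greedy policy from the learned Q table
    return np.argmax(Q, axis=1)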
Example 3
 def control(self):
     map_names = ['map{}'.format(i) for i in range(1, 21)]
     train_combos = list(product(range(10), range(10)))
     test_combos = train_combos
     env = PORGBEnv(
         ComboEnv(
             GridWorld(map_names,
                       num_obj_types=5,
                       train_combos=train_combos,
                       test_combos=test_combos,
                       window=1,
                       seed=0)))
     control(env)
     env.render()
Example 4
    def testNeighbors(self):
        """
        #Check that 1, 5, and 9 have the correct neighbors
                 B B B B B
        1 2 3    B 1 2 3 B
        4 5 6 => B 4 5 6 B
        7 8 9    B 7 8 9 B
                 B B B B B
        """
        gridworld = GridWorld()
        
        #Check 1, index is 0
        grid_1 = gridworld.grids[gridworld.map_states(0)]
        grid_1_neighbors = list(grid_1.neighbors.values())
        grid_1_neighbors_ids = [grid.id for grid in grid_1_neighbors] 
        results = list(set(grid_1_neighbors_ids) - set([None]))
        results.sort()
        target = [2,4]
        self.assertEqual(results,target)
        
        #Check 5, index is 4
        grid_5 = gridworld.grids[gridworld.map_states(4)]
        grid_5_neighbors = list(grid_5.neighbors.values())
        grid_5_neighbors_ids = [grid.id for grid in grid_5_neighbors] 
        results = list(set(grid_5_neighbors_ids) - set([None]))
        results.sort()
        target = [2,4,6,8]
        self.assertEqual(results,target)

        #Check 9, index is 8
        grid_9 = gridworld.grids[gridworld.map_states(8)]
        grid_9_neighbors = list(grid_9.neighbors.values())
        grid_9_neighbors_ids = [grid.id for grid in grid_9_neighbors] 
        results = list(set(grid_9_neighbors_ids) - set([None]))
        results.sort()
        target = [6,8]
        self.assertEqual(results,target)
Example 5
def main(arguments):
    parser = create_argparser({
        "alpha": {
            "default": 0.1
        },
        "--use_ep_func": {
            "dest": "use_ep_func",
            "action": "store_true",
            "default": True
        }
    })
    args = parser.parse_args(arguments)

    grid_world = GridWorld(default_grid, args.p1, args.p2)

    default_args = {"epsilon": 0.1, "discount_factor": 0.9}

    for arg in default_args:
        if arg not in args:
            setattr(args, arg, default_args[arg])

    run_dict = {}

    num_episodes = args.num_episodes

    # inject parsed args into the module's global scope
    globals()['args'] = args

    num_runs = 3 if args.AVERAGE_RUNS else 1

    for i in range(num_runs):
        start_time = time.time()
        q_s_a, q_s_a2 = initialize(grid_world)
        if not args.use_ep_func:
            _, _, ep_length_log, time_log, avg_ep_length_log, avg_time_log = double_q(
                grid_world,
                q_s_a,
                q_s_a2,
                args.epsilon,
                num_episodes=num_episodes)
        else:
            _, _, ep_length_log, time_log, avg_ep_length_log, avg_time_log = double_q(
                grid_world,
                q_s_a,
                q_s_a2,
                epsilon_func,
                num_episodes=num_episodes)

        total_time = time.time() - start_time

        run_dict[i] = {
            "Episode Length": ep_length_log,
            "Time Per Episode": time_log,
            "Total Time": total_time,
            "Average Time Log": avg_time_log,
            "Average Ep Length": avg_ep_length_log
        }

        print("\nTook {}s to finish {} episodes".format(
            total_time, num_episodes))

    average_ep_lengths = np.average(np.array(
        [run_dict[key]["Episode Length"] for key in run_dict]),
                                    axis=0)
    average_ep_time = np.average(np.array(
        [run_dict[key]["Time Per Episode"] for key in run_dict]),
                                 axis=0)
    average_time = np.average(np.array(
        [run_dict[key]["Total Time"] for key in run_dict]),
                              axis=0)
    average_avg_time_log = np.average(np.array(
        [run_dict[key]["Average Time Log"] for key in run_dict]),
                                      axis=0)
    average_avg_ep_length = np.average(np.array(
        [run_dict[key]["Average Ep Length"] for key in run_dict]),
                                       axis=0)

    output_deterministic_policy(q_s_a, q_s_a2, grid_world)

    return average_ep_lengths, average_ep_time, average_time, average_avg_time_log, average_avg_ep_length
Example 6
def main():
    grid_size = 10
    grid_world = GridWorld(grid_size,
                           num_obstacles=20,
                           stochastic_cell_ratio=0.1)

    params = {}
    params['type'] = 'value_iteration'
    params['grid_size'] = grid_size
    params['rewards'] = grid_world.rewards
    params['transition_matrix'] = grid_world.transition_matrix
    params['step_func'] = GridWorld.deterministic_step
    params['discount'] = 0.9
    agent = AgentFactory.create_agent(params)

    episode_ended = False
    while True:
        grid_world.get_user_input()
        grid_world.draw_with_state_values(
            agent.v, policy=agent.pi if grid_world.render_policy else None)

        if not grid_world.pause:
            if episode_ended:
                grid_world.restart_episode()
                grid_world.draw_black_screen()
                episode_ended = False
            else:
                agent.do_job()
                if agent.ready_to_play():
                    action = agent.get_action(grid_world.pos)
                    episode_ended, _, _ = grid_world.step(action)

        grid_world.tick_tock()
Example 7
from helper import create_argparser
from env import GridWorld
import random
import math
from copy import copy
import time

#Creating command line parser
parser = create_argparser()
args = parser.parse_args()

grid_world = GridWorld(args.p1,
                       args.p2,
                       args.r_up,
                       args.r_down,
                       args.r_left,
                       args.r_right,
                       grid_world_size=4,
                       starting_state=8)
'''
Policy Evaluation
'''


def policy_evaluation(v_s, pi_s, grid_world):
    #Policy evaluation implementation
    delta = math.inf
    while delta > args.theta:
        delta = 0
        for i, s in enumerate(v_s):
            if i == grid_world.terminal_state:
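A self-contained sketch of how such an iterative policy-evaluation sweep can be completed, assuming hypothetical grid_world.actions and grid_world.transition(s, a) -> (next_state, reward) helpers and a tabular policy pi_s[s][a]:

def policy_evaluation_sketch(v_s, pi_s, grid_world, theta=1e-6, gamma=0.9):
    # Iterative policy evaluation: sweep every state until no value changes by more than theta.
    delta = math.inf
    while delta > theta:
        delta = 0
        for s in range(len(v_s)):
            if s == grid_world.terminal_state:
                continue
            v_old = v_s[s]
            total = 0.0
            for a in grid_world.actions:             # hypothetical attribute
                s2, r = grid_world.transition(s, a)  # hypothetical helper
                total += pi_s[s][a] * (r + gamma * v_s[s2])
            v_s[s] = total
            delta = max(delta, abs(v_old - v_s[s]))
    return v_s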
Example 8
        print(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gamma", type=float, default=1.0)
    parser.add_argument("--alpha", type=float, default=0.0001)
    parser.add_argument("--map_size", type=int, default=4)
    parser.add_argument("--num_ep", type=int, default=50000)
    parser.add_argument("--method", type=to_str, default='mc')

    args = parser.parse_args()

    # Set hyper-parameters
    gamma = args.gamma
    alpha = args.alpha
    map_size = args.map_size
    num_ep = args.num_ep  # number of episodes to run
    method = args.method

    env = GridWorld()
    agent = Agent()

    data = np.zeros((map_size, map_size))

    if method == 'mc':
        mc(data, gamma, alpha, num_ep)
    elif method == 'td':
        print(data, gamma, alpha, num_ep, method)
        td(data, gamma, alpha, num_ep)
Example 9
def main(arguments):
    parser = create_argparser()
    args = parser.parse_args(arguments)

    grid_world = GridWorld(default_grid, args.p1, args.p2)

    default_args = {"epsilon": 0.1, "discount_factor": 0.9}

    # fill in any defaults that were not provided by the parser
    for arg in default_args:
        if arg not in args:
            setattr(args, arg, default_args[arg])

    num_episodes = args.num_episodes

    run_dict = {}

    # inject parsed args into the module's global scope
    globals()['args'] = args

    num_runs = 3 if args.AVERAGE_RUNS else 1

    for i in range(num_runs):
        start_time = time.time()
        pi, q_s_a, returns = initialize(grid_world)
        _, _, ep_length_log, time_log, avg_ep_length_log, avg_time_log = gpi(
            grid_world, pi, q_s_a, returns, num_episodes=num_episodes)
        total_time = time.time() - start_time

        run_dict[i] = {
            "Episode Length": ep_length_log,
            "Time Per Episode": time_log,
            "Total Time": total_time,
            "Average Time Log": avg_time_log,
            "Average Ep Length": avg_ep_length_log
        }

        print("\nTook {}s to finish {} episodes".format(
            total_time, num_episodes))

    average_ep_lengths = np.average(np.array(
        [run_dict[key]["Episode Length"] for key in run_dict]),
                                    axis=0)
    average_ep_time = np.average(np.array(
        [run_dict[key]["Time Per Episode"] for key in run_dict]),
                                 axis=0)
    average_time = np.average(np.array(
        [run_dict[key]["Total Time"] for key in run_dict]),
                              axis=0)
    average_avg_time_log = np.average(np.array(
        [run_dict[key]["Average Time Log"] for key in run_dict]),
                                      axis=0)
    average_avg_ep_length = np.average(np.array(
        [run_dict[key]["Average Ep Length"] for key in run_dict]),
                                       axis=0)

    res = [
        average_ep_lengths, average_ep_time, average_time,
        average_avg_time_log, average_avg_ep_length
    ]

    graph_names = [
        "Episode Length", "Time Per Episode", "Total Time in Seconds",
        "Time Per Episode (Moving Average 10 ep)",
        "Episode Length (Moving Average 10 ep)"
    ]
    y_axis_names = [
        "Episode Length in Steps", "Time Per Episode in Seconds",
        "Total Time in Seconds", "Time Per Episode in Seconds",
        "Episode Length in Steps"
    ]

    #outputting policy
    output_deterministic_policy(pi, grid_world)

    for i in [0, 1]:
        t = np.linspace(1, num_episodes, num=num_episodes)[0::10]
        plt.plot(t, res[i][0::10], label="mc")
        plt.title(graph_names[i])
        plt.xlabel("Episode Number")
        plt.ylabel(y_axis_names[i])

        plt.legend()
        plt.savefig(graph_names[i] + "_mc" + ".jpg")
        plt.close()

    for i in [-2, -1]:
        t = np.linspace(1, num_episodes, num=num_episodes // 10)  # num must be an int
        plt.plot(t, res[i], label="mc")
        plt.title(graph_names[i])
        plt.xlabel("Episode Number")
        plt.ylabel(y_axis_names[i])

        plt.legend()
        plt.savefig(graph_names[i] + "_mc" + ".jpg")
        plt.close()

    return average_ep_lengths, average_ep_time, average_time, average_avg_time_log, average_avg_ep_length
Example 10
 def testCreate(self):
     gridworld = GridWorld()
     results = gridworld.get_pretty()
     target = "bbbbb\nbaaab\nbaaab\nbaaab\nbbbbb\n"
     self.assertEqual(results,target)
Example 11
 def testMap_states(self):
     gridworld = GridWorld(size=(3, 4))
     start = [0,1,2,3,4,5,6,7,8,9,10,11]
     target = [6,7,8,11,12,13,16,17,18,21,22,23]
     results = list(map(gridworld.map_states, start))
     self.assertEqual(results,target)
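The expected values are consistent with embedding the interior grid inside a one-cell border, so interior index i maps to (i // width + 1) * (width + 2) + (i % width + 1); a quick check of that assumed formula (padded_index is a hypothetical helper, not part of GridWorld), reading size=(3, 4) as a width-3 grid:

def padded_index(i, width):
    # assumed reconstruction: interior cell i placed inside a one-cell border
    return (i // width + 1) * (width + 2) + (i % width + 1)

assert [padded_index(i, 3) for i in range(12)] == [6, 7, 8, 11, 12, 13, 16, 17, 18, 21, 22, 23]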