Example #1
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)],
                       slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
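    # Three CherryQAgent instances that differ only in their value-function model:
    # tabular, linear, and a small MLP.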
    tabular_agent = CherryQAgent(mdp,
                                 model=lambda *x: ActionValueFunction(*x, init=1.0),
                                 name='Tabular',
                                 lr=0.7)
    linear_agent = CherryQAgent(mdp,
                                model=lambda *x: nn.Linear(*x),
                                name='Linear',
                                lr=0.1)
    mlp_agent = CherryQAgent(mdp,
                             model=lambda *x: MLP(*x),
                             name='MLP',
                             lr=0.07)

    # Run experiment and make plot.
    agents = [rand_agent, ql_agent, tabular_agent, linear_agent, mlp_agent]
    run_agents_on_mdp(agents,
                      mdp,
                      instances=10,
                      episodes=50,
                      steps=50,
                      open_plot=open_plot)
Example #2
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    from simple_rl.agents import RandomAgent, QLearningAgent

    random_agent = RandomAgent(actions)
    qlearner_agent = QLearningAgent(actions, gamma=gamma, explore="uniform")
    agents = [qlearner_agent, random_agent]

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {
            qlearner_agent.name: qlearner_agent,
            random_agent.name: random_agent
        }
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
Example #3
def main(open_plot=True):
    #     gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1,1), goal_locs=[(10,10)])
    #     num_feats = gym_mdp.get_num_state_feats()
    #     lin_agent = QLearnerAgent(gym_mdp.actions, alpha=0.4, epsilon=0.4)
    #     rand_agent = RandomAgent(gym_mdp.actions)
    #     run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot)

    #     gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1,1), goal_locs=[(10,10)])
    #     num_feats = gym_mdp.get_num_state_feats()
    #     lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=False,rbf=True)
    #     rand_agent = RandomAgent(gym_mdp.actions)
    #     run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot,verbose=True)

    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()
    lin_agent = LinearQLearnerAgent(gym_mdp.actions,
                                    num_features=num_feats,
                                    alpha=0.4,
                                    epsilon=0.4,
                                    anneal=False,
                                    rbf=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent],
                      gym_mdp,
                      instances=5,
                      episodes=1000,
                      steps=100,
                      open_plot=open_plot)
Example #4
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {
        "x": 1,
        "y": 1,
        "dx": 1,
        "dy": 0,
        "dest_x": size,
        "dest_y": size,
        "has_block": 0
    }
    blocks = [{"x": size, "y": 1}]
    trench_row = (size + 1) // 2  # Integer division keeps the lava row an int.
    lavas = [{"x": x + 1, "y": trench_row} for x in range(size)]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=30,
                      episodes=250,
                      steps=250)
Example #5
def main(open_plot=True):

    # Setup MDP.
    mdp = GridWorldMDP(width=8,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(8, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)],
                       slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=20,
                      episodes=300,
                      steps=20,
                      open_plot=open_plot,
                      track_success=True,
                      success_reward=1)
Example #6
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4,
                    height=4,
                    agent=agent,
                    walls=walls,
                    passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent],
                          mdp,
                          instances=10,
                          episodes=1,
                          steps=500,
                          reset_at_terminal=True,
                          open_plot=open_plot)
Example #7
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {
        "x": 1,
        "y": 1,
        "dx": 1,
        "dy": 0,
        "dest_x": size,
        "dest_y": size,
        "has_block": 0
    }
    blocks = [{"x": size, "y": 1}]
    trench_row = (size + 1) // 2  # Integer division keeps the lava row an int.
    lavas = [{"x": x + 1, "y": trench_row} for x in range(size)]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    # run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)

    vi = ValueIteration(mdp, delta=0.0001, max_iterations=5000)
    iters, val = vi.run_vi()
    print " done."
    states = vi.get_states()
    num_states = len(states)
    print num_states, states
Example #8
def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent], mdp_distr, task_samples=50, episodes=1, steps=1500, reset_at_terminal=True, open_plot=open_plot)
Example #9
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=10, episodes=30, steps=10000, open_plot=open_plot)
Example #10
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent, agent_params={"epsilon":0.9}, actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)
Example #11
def main():

    # Setup MDP.

    actual_args = {
        "width": 10,
        "height": 10,
        "init_loc": (1, 1),
        "goal_locs": [(10, 10)],
        "lava_locs": [(1, 10), (3, 10), (5, 10), (7, 10), (9, 10)],
        "gamma": 0.9,
        "walls": [
            (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9),
            (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
            (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
            (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9)
        ],
        "slip_prob": 0.01,
        "lava_cost": 1.0,
        "step_cost": 0.1
    }

    mdp = GridWorldMDP(**actual_args)

    # Initialize a custom Q-function for a Q-learning agent. This is equivalent to
    # potential-based shaping and should help the agent learn more quickly.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
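    # custom_q[state][action] gives the initial Q-value; unspecified pairs default to 0.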
    custom_q[GridWorldState(5, 1)]['right'] = 1.0
    custom_q[GridWorldState(2, 1)]['right'] = 1.0

    # Make a normal q-learning agent and another initialized with the custom_q above.
    # Finally, make a random agent to compare against.
    ql_agent = QLearningAgent(actions=mdp.get_actions(),
                              epsilon=0.2,
                              alpha=0.4)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(),
                                  epsilon=0.2,
                                  alpha=0.4,
                                  custom_q_init=custom_q,
                                  name="PotQ")
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, ql_agent_pot, rand_agent],
                      mdp,
                      instances=2,
                      episodes=60,
                      steps=200,
                      open_plot=True,
                      verbose=True)
Example #12
def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearnerAgent(actions=mdp.get_actions()) 
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500, steps=40, open_plot=open_plot) 
Example #13
def collect_dataset(mdp, samples=10000, learning_agent=None):
    '''
    Args:
        mdp (simple_rl.MDP)
        samples (int)
        learning_agent (simple_rl.Agent): If None, a random agent is used.
            Otherwise collects data based on its learning.

    Returns:
        (set)
    '''
    if learning_agent is None:
        learning_agent = RandomAgent(mdp.get_actions())

    cur_state = mdp.get_init_state()
    reward = 0
    visited_states = set([cur_state])

    # Set initial state params.
    init_state_params = {}
    last_x = 0 + np.random.randn(1)[0]
    init_state_params["x"] = last_x
    init_state_params["x_dot"] = 0
    init_state_params["theta"] = 0
    init_state_params["theta_dot"] = 0

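    # Roll out the agent for `samples` steps; whenever a terminal state is reached,
    # reset the MDP with a re-sampled initial x and start a new episode.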
    for i in range(samples):
        action = learning_agent.act(cur_state, reward)
        reward, next_state = mdp.execute_agent_action(action)

        visited_states.add(next_state)
        if next_state.is_terminal():
            init_state_params["x"] = np.random.randn(1)[0]
            mdp.reset(init_state_params)
            learning_agent.end_of_episode()
            cur_state = mdp.get_init_state()
            reward = 0
        else:
            cur_state = next_state

    return visited_states
Example #14
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions = solar_mdp.get_actions()
    gamma = solar_mdp.get_gamma()
    panel_step = solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker,
                                 panel_step=panel_step,
                                 dual_axis=True)
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(),
                                           name="grena-tracker")

    # Setup RL agents
    alpha, epsilon = 0.3, 0.3
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(actions, name="lin-ucb",
                                alpha=0.3)  #, alpha=0.2)
    ql_lin_approx_agent_g0 = LinearQLearnerAgent(actions,
                                                 num_features=num_features,
                                                 name="ql-lin-g0",
                                                 alpha=alpha,
                                                 epsilon=epsilon,
                                                 gamma=0,
                                                 rbf=True,
                                                 anneal=True)
    ql_lin_approx_agent = LinearQLearnerAgent(actions,
                                              num_features=num_features,
                                              name="ql-lin",
                                              alpha=alpha,
                                              epsilon=epsilon,
                                              gamma=gamma,
                                              rbf=True,
                                              anneal=True)
    # sarsa_lin_rbf_agent = LinearApproxSarsaAgent(actions, name="sarsa-lin", alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=True, anneal=False)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    agents = [
        ql_lin_approx_agent, lin_ucb_agent, grena_tracker_agent, static_agent
    ]

    return agents
Example #15
def main():
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid

    mdp_class, is_goal_terminal, samples = parse_args()

    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print "done."  #, iters, value
    sys.stdout.flush()

    # Agents.
    print "Making agents...",
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy,
                                              name="$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(
        mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print "done."

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=False,
                          track_disc_reward=False,
                          cumulative_plot=True)
Example #16
def main():
    # create mdp using own definition
    mdp = tfeMDP()

    # Three different agents to compare how each do against each other
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # Function that actually runs everything and generates the appropriate
    # graphs and statistics defining how each agent did
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp,
                      instances=200, episodes=100, steps=1000)
Example #17
def main():
    # Make MDP distribution, agents.
    mdp_distr = make_mdp_distr(mdp_class="grid")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent],
                          mdp_distr,
                          task_samples=30,
                          episodes=100,
                          steps=50,
                          reset_at_terminal=True,
                          include_optimal=True)
Example #18
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = BanditMDP()

    lin_agent = LinUCBAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, lin_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=1,
                      steps=500,
                      open_plot=open_plot)
Example #19
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent],
                      gym_mdp,
                      instances=5,
                      episodes=50000,
                      steps=200,
                      open_plot=open_plot,
                      verbose=False)
Example #20
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10,
                       height=10,
                       init_loc=(1, 1),
                       goal_locs=[(10, 10)])
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=150,
                      open_plot=open_plot)
Example #21
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions = solar_mdp.get_actions()
    gamma = solar_mdp.get_gamma()
    panel_step = solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker,
                                 panel_step=panel_step,
                                 dual_axis=solar_mdp.dual_axis,
                                 actions=solar_mdp.get_bandit_actions())
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(),
                                           name="grena-tracker")

    # Setup RL agents
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(),
                                context_size=num_features,
                                name="lin-ucb",
                                rand_init=rand_init,
                                alpha=2.0)
    # sarsa_agent_g0 = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin-g0", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=0, rbf=False, anneal=True)
    # sarsa_agent = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=False, anneal=True)
    ql_agent = QLearningAgent(actions,
                              alpha=alpha,
                              epsilon=epsilon,
                              gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    # agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0, grena_tracker_agent, static_agent]
    agents = [grena_tracker_agent, static_agent]

    return agents
Example #22
def main():

    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "four_room"
    gamma = 1.0
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, step_cost=0.01, grid_dim=15, gamma=gamma)
    actions = environment.get_actions()


    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=2)

    # Debug.
    print "\n" + ("=" * 30) + "\n== Done making abstraction. ==\n" + ("=" * 30) + "\n"
    sa_stack.print_state_space_sizes()
    aa_stack.print_action_spaces_sizes()

    # ===================
    # === Make Agents ===
    # ===================
    # baseline_agent = QLearnerAgent(actions)
    agent_class = QLearnerAgent
    baseline_agent = agent_class(actions, gamma=gamma)
    rand_agent = RandomAgent(actions)
    # hierarch_r_max = HRMaxAgent(actions, sa_stack=sa_stack, aa_stack=aa_stack)
    l0_hierarch_agent = HierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print "\n" + ("=" * 26)
    print "== Running experiments. =="
    print "=" * 26 + "\n"

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [dynamic_hierarch_agent, baseline_agent]

    run_agents_multi_task(agents, environment, task_samples=10, steps=20000, episodes=1, reset_at_terminal=True)
Example #23
def main():

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(),
                              epsilon=args.epsilon,
                              alpha=args.alpha,
                              explore=args.explore,
                              anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent],
                          mdp,
                          open_plot=True,
                          episodes=60,
                          steps=200,
                          instances=5,
                          success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent,
                               delay=0.005,
                               num_ep=500,
                               num_steps=200)
Example #24
def main():

    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "hrooms"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    # Debug.
    print("\n" + ("=" * 30))
    print("== Done making abstraction. ==")
    print("=" * 30 + "\n")
    sa_stack.print_state_space_sizes()
    print("Num Action Abstractions:", len(aa_stack.get_aa_list()))

    # ===================
    # === Make Agents ===
    # ===================
    baseline_agent = QLearningAgent(actions)
    rmax_agent = RMaxAgent(actions)
    rand_agent = RandomAgent(actions)
    l0_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [l1_hierarch_agent, dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents, environment, task_samples=10, steps=1500, episodes=1, reset_at_terminal=True)
Example #25
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy":lambda_abstr_policy, "name":"$\\pi_\\phi$"}, name_ext="")
    
    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)


    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]
    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print "\nState Spaces Sizes:"
    print "\t|S| =", demo_vi.get_num_states()
    print "\tH(S_\\phi) =", entropy(pmf_s_phi)
    print "\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states()
    print "\tdelta_min =", min(non_zero_abstr_states)
    print "\tnum non zero states =", len(non_zero_abstr_states)
    print
Example #26
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       gamma=0.95,
                       walls=[(2, 2)])

    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=1,
                      steps=20,
                      open_plot=open_plot)
Example #27
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w,
                    height=h,
                    init_loc=(1, 1),
                    goal_locs=[(6, 6)],
                    slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute number of samples for R-MAX to achieve epsilon optimal behavior with high probability (1 - delta)
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
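        # m_r appears to follow a Hoeffding bound on the reward estimate, and m_t an
        # L1 concentration bound on the transition estimate (per state-action pair).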
        m_r = np.log(2. / delta) / (2. * epsilon**2)
        m_t = 2. * (np.log(2**(float(w * h)) - 2.) - np.log(delta)) / (epsilon**2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(),
                                     gamma=.9,
                                     horizon=3,
                                     s_a_threshold=n_samples,
                                     name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(),
                      gamma=.9,
                      count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=20,
                      reset_at_terminal=True,
                      verbose=False)
Example #28
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(),
                                      options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent,
                                      actions=mdp_distr.get_actions(),
                                      action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent],
                        mdp_distr,
                        samples=10,
                        episodes=100,
                        steps=150,
                        open_plot=open_plot)
Example #29
def main(open_plot=True):

    # Setup MDP.

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))

    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(
                    QLearningAgent(actions=mdp.get_actions(),
                                   custom_q_init=custom_q,
                                   name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents,
                          mdp,
                          instances=1,
                          episodes=100,
                          steps=100,
                          open_plot=open_plot,
                          verbose=True)
Example #30
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)])

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=5,
                      episodes=50,
                      steps=25,
                      open_plot=open_plot,
                      track_disc_reward=False)