Example #1
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)],
                       slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    tabular_agent = CherryQAgent(mdp,
                                 model=lambda *x: ActionValueFunction(*x, init=1.0),
                                 name='Tabular',
                                 lr=0.7)
    linear_agent = CherryQAgent(mdp,
                                model=lambda *x: nn.Linear(*x),
                                name='Linear',
                                lr=0.1)
    mlp_agent = CherryQAgent(mdp,
                             model=lambda *x: MLP(*x),
                             name='MLP',
                             lr=0.07)

    # Run experiment and make plot.
    agents = [rand_agent, ql_agent, tabular_agent, linear_agent, mlp_agent]
    run_agents_on_mdp(agents,
                      mdp,
                      instances=10,
                      episodes=50,
                      steps=50,
                      open_plot=open_plot)
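
The snippets on this page omit their imports and entry points. Below is a minimal, self-contained sketch in the same style, assuming the usual simple_rl modules (the CherryQAgent, ActionValueFunction, and MLP pieces above come from the example's own project and are left out):

from simple_rl.agents import QLearningAgent, RandomAgent
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

def main(open_plot=True):
    # Small grid world: start at (1, 1), goal at (4, 3).
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], gamma=0.95)

    # One learner plus a random baseline.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Average 5 independent runs of 50 episodes (25 steps each) and plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=50, steps=25, open_plot=open_plot)

if __name__ == "__main__":
    main()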
Example #2
def save(args):

    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    agent = DiaynAgent(sess=None,
                       obs_dim=obs_dim,
                       num_actions=num_actions,
                       num_options=args.noptions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       batch_size=32,
                       update_freq=32,
                       alpha=1.0)

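    # Pretrain with the DIAYN diversity objective enabled before saving the learned skills.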
    agent.set_diversity(True)

    run_agents_on_mdp([agent],
                      mdp,
                      episodes=args.snepisodes,
                      steps=args.snsteps,
                      instances=1,
                      cumulative_plot=True)

    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    agent.save(directory=prefix + '/vis' + '/' + str(args.task) + 'option' +
               str(args.noptions) + 'diayn',
               name='diayn-pretrain')
Example #3
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {
        "x": 1,
        "y": 1,
        "dx": 1,
        "dy": 0,
        "dest_x": size,
        "dest_y": size,
        "has_block": 0
    }
    blocks = [{"x": size, "y": 1}]
    # Lava fills the middle row; integer division keeps the coordinates ints.
    lavas = [{"x": x, "y": y}
             for x, y in map(lambda z: (z + 1, (size + 1) // 2), range(size))]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=30,
                      episodes=250,
                      steps=250)
Example #4
def main():

    # Setup experiment parameters, agents, mdp.
    num_days = 3
    per_hour = False
    time_per_step = 20.0  # in minutes.
    loc, steps = "nola", int(24 * (60 / time_per_step) * num_days)
    panel_step = 1.0  # Angle movement per action.

    # If per hour is true, plots every hour long reward chunk, otherwise every day.
    rew_step_count = (steps / num_days) / 24 if per_hour else (steps / num_days)
    sun_agents, sun_solar_mdp = setup_experiment("sun_percept",
                                                 loc=loc,
                                                 panel_step=panel_step,
                                                 time_per_step=time_per_step)

    # Run experiments.
    run_agents_on_mdp(sun_agents,
                      sun_solar_mdp,
                      instances=5,
                      episodes=1,
                      steps=steps,
                      clear_old_results=True,
                      rew_step_count=rew_step_count,
                      verbose=False)
Example #5
def main(open_plot=True):

    # Setup MDP.
    mdp = GridWorldMDP(width=8,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(8, 3)],
                       lava_locs=[(4, 2)],
                       gamma=0.95,
                       walls=[(2, 2)],
                       slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=20,
                      episodes=300,
                      steps=20,
                      open_plot=open_plot,
                      track_success=True,
                      success_reward=1)
Example #6
def main():

    # Grab experiment params.
    mdp = BadChainMDP(gamma=0.95, kappa=0.001)
    actions = mdp.get_actions()

    # =======================
    # == Make Abstractions ==
    # =======================
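    # Abstraction that groups states whose optimal Q-values agree to within epsilon.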
    sa_q_eps = get_sa(mdp,
                      indic_func=indicator_funcs._q_eps_approx_indicator,
                      epsilon=0.1)

    # RMax Agents.
    rmax_agent = RMaxAgent(actions)
    abstr_rmax_agent = AbstractionWrapper(RMaxAgent,
                                          state_abstr=sa_q_eps,
                                          agent_params={"actions": actions},
                                          name_ext="-$\\phi_{Q_\\epsilon^*}$")

    # Delayed Q Agents.
    del_q_agent = DelayedQAgent(actions)
    abstr_del_q_agent = AbstractionWrapper(DelayedQAgent,
                                           state_abstr=sa_q_eps,
                                           agent_params={"actions": actions},
                                           name_ext="-$\\phi_{Q_\\epsilon^*}$")

    run_agents_on_mdp(
        [rmax_agent, abstr_rmax_agent, del_q_agent, abstr_del_q_agent],
        mdp,
        instances=50,
        steps=250,
        episodes=1)
Example #7
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4,
                    height=4,
                    agent=agent,
                    walls=walls,
                    passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent],
                          mdp,
                          instances=10,
                          episodes=1,
                          steps=500,
                          reset_at_terminal=True,
                          open_plot=open_plot)
Example #8
def main(open_plot=True):
    #     gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1,1), goal_locs=[(10,10)])
    #     num_feats = gym_mdp.get_num_state_feats()
    #     lin_agent = QLearnerAgent(gym_mdp.actions, alpha=0.4, epsilon=0.4)
    #     rand_agent = RandomAgent(gym_mdp.actions)
    #     run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot)

    #     gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1,1), goal_locs=[(10,10)])
    #     num_feats = gym_mdp.get_num_state_feats()
    #     lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=False,rbf=True)
    #     rand_agent = RandomAgent(gym_mdp.actions)
    #     run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot,verbose=True)

    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()
    lin_agent = LinearQLearnerAgent(gym_mdp.actions,
                                    num_features=num_feats,
                                    alpha=0.4,
                                    epsilon=0.4,
                                    anneal=False,
                                    rbf=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent],
                      gym_mdp,
                      instances=5,
                      episodes=1000,
                      steps=100,
                      open_plot=open_plot)
Example #9
def main():

    # Set Params.
    mdp_class, task_samples, episodes, steps, grid_dim, AgentClass = get_params(
        set_manually=False)
    experiment_type = "sa"
    lifelong = True
    resample_at_terminal = False
    reset_at_terminal = False
    gamma = 0.95

    # ======================
    # == Make Environment ==
    # ======================
    if lifelong:
        environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim)
    else:
        environment = make_mdp.make_mdp(mdp_class=mdp_class, grid_dim=grid_dim)
    environment.set_gamma(gamma)

    # =================
    # == Make Agents ==
    # =================
    agents = []
    if experiment_type == "sa":
        # SA experiment.
        agents = get_sa_experiment_agents(environment, AgentClass)
    elif experiment_type == "combo":
        # AA experiment.
        agents = get_combo_experiment_agents(environment)
    elif experiment_type == "exact_v_approx":
        agents = get_exact_vs_approx_agents(environment,
                                            incl_opt=(not lifelong))
    elif experiment_type == "opt":
        agents = get_optimal_policies(environment)
    else:
        print "Experiment Error: experiment type unknown (" + experiment_type + "). Must be one of {sa, combo, exact_v_approx}."
        quit()

    # Run!
    if lifelong:
        run_agents_lifelong(agents,
                            environment,
                            samples=task_samples,
                            steps=steps,
                            episodes=episodes,
                            reset_at_terminal=reset_at_terminal,
                            resample_at_terminal=resample_at_terminal,
                            cumulative_plot=True,
                            clear_old_results=True)
    else:
        run_agents_on_mdp(agents,
                          environment,
                          instances=task_samples,
                          steps=steps,
                          episodes=episodes,
                          reset_at_terminal=reset_at_terminal,
                          track_disc_reward=False)
Example #10
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=10, episodes=30, steps=10000, open_plot=open_plot)
Example #11
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent], gym_mdp, instances=5, episodes=50000, steps=200, open_plot=open_plot, verbose=False)
Example #12
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent, agent_params={"epsilon":0.9}, actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)
Example #13
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent, agent_params={"epsilon":0.9, "actions":mdp.get_actions()})

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)
Example #14
def main():

    # Setup MDP.

    actual_args = {
        "width": 10,
        "height": 10,
        "init_loc": (1, 1),
        "goal_locs": [(10, 10)],
        "lava_locs": [(1, 10), (3, 10), (5, 10), (7, 10), (9, 10)],
        "gamma": 0.9,
        "walls": [
            (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9),
            (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
            (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
            (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9)
        ],
        "slip_prob": 0.01,
        "lava_cost": 1.0,
        "step_cost": 0.1
    }

    mdp = GridWorldMDP(**actual_args)

    # Initialize the custom Q function for a q-learning agent. This should be equivalent to potential shaping.
    # This should cause the Q agent to learn more quickly.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    custom_q[GridWorldState(5, 1)]['right'] = 1.0
    custom_q[GridWorldState(2, 1)]['right'] = 1.0

    # Make a normal q-learning agent and another initialized with the custom_q above.
    # Finally, make a random agent to compare against.
    ql_agent = QLearningAgent(actions=mdp.get_actions(),
                              epsilon=0.2,
                              alpha=0.4)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(),
                                  epsilon=0.2,
                                  alpha=0.4,
                                  custom_q_init=custom_q,
                                  name="PotQ")
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, ql_agent_pot, rand_agent],
                      mdp,
                      instances=2,
                      episodes=60,
                      steps=200,
                      open_plot=True,
                      verbose=True)
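
As the comments in the example above note, seeding the Q-table this way is intended to act like potential-based shaping. A small sketch of generating such a table from an arbitrary potential function, assuming the same defaultdict-of-defaultdicts layout that custom_q_init expects (the potential and states arguments here are placeholders):

from collections import defaultdict

def q_init_from_potential(states, actions, potential):
    # Q(s, a) starts at potential(s) for every action; once updates bootstrap from
    # successor states, actions toward high-potential states look better, which is
    # the same bias potential-based shaping provides.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0.0))
    for s in states:
        for a in actions:
            custom_q[s][a] = potential(s)
    return custom_q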
Example #15
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = BanditMDP()

    lin_agent = LinUCBAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, lin_agent, rand_agent], mdp, instances=10, episodes=1, steps=500, open_plot=open_plot)
Example #16
def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500, steps=40, open_plot=open_plot)
Example #17
def main(open_plot=True):

    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=50, steps=10, open_plot=open_plot)
Example #18
def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearnerAgent(actions=mdp.get_actions()) 
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500, steps=40, open_plot=open_plot) 
Example #19
def test_utility(args, mdp):
    # Compare performance as the number of options varies.
    # TODO: Compare the utility of point options vs. subgoal options?
    now_ts = str(datetime.now().timestamp())
    origMatrix, intToS = GetAdjacencyMatrix(mdp)
    known_region = list(intToS.values())  # Known region is a set of MDPStates.

    n_ops_list = [2, 4, 8, 16, 32]

    agents = []
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    agents.append(ql_agent)

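    # Option-discovery method; 'fiedler' presumably derives options from graph-Laplacian (Fiedler) eigenvectors.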
    method = 'fiedler'

    for n_ops in n_ops_list:
        _, foptions, _, fvectors = GetOption(mdp,
                                             n_ops,
                                             matrix=origMatrix,
                                             intToS=intToS,
                                             option_type=args.optiontype,
                                             method=method)
        print('#options=', n_ops)
        print(foptions)

        if args.optiontype == 'subgoal':
            known_region = list(intToS.values())  # Known region is a set of MDPStates.
            eigenoption_agent = build_subgoal_option_agent(
                mdp,
                foptions,
                known_region,
                vectors=fvectors,
                name='-' + method + '-' + args.optiontype + '-' + str(n_ops))
        else:
            eigenoption_agent = build_point_option_agent(
                mdp,
                foptions,
                agent=QLearningAgent,
                policy='vi',
                name='-' + method + '-' + args.optiontype + '-' + str(n_ops))

        agents.append(eigenoption_agent)

    run_agents_on_mdp(agents,
                      mdp,
                      instances=args.ninstances,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      open_plot=True,
                      track_disc_reward=True,
                      cumulative_plot=True,
                      dir_for_plot="results/")
Example #20
def main():
    # create mdp using own definition
    mdp = tfeMDP()

    # Three different agents to compare how each do against each other
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # Function that actually runs everything and generates the appropriate
    # graphs and statistics defining how each agent did
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp,
                      instances=200, episodes=100, steps=1000)
Example #21
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": size, "dest_y": size, "has_block": 0}
    blocks = [{"x": size, "y": 1}]
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (size + 1) / 2), range(size))]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)
Example #22
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=True)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    q_learning_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([q_learning_agent],
                      gym_mdp,
                      instances=1,
                      episodes=400,
                      steps=210,
                      open_plot=open_plot,
                      verbose=True)
Example #23
def restore(args):

    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    rst = DiaynAgent(sess=None,
                     obs_dim=obs_dim,
                     num_actions=num_actions,
                     action_dim=action_dim,
                     action_bound=action_bound,
                     num_options=args.noptions,
                     batch_size=1,
                     update_freq=1,
                     alpha=1.0)
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    rst.restore(directory=prefix + '/vis' + '/' + str(args.task) + 'option' +
                str(args.noptions) + 'diayn',
                name='diayn-pretrain')

    rst.set_diversity(False)

    oagent = OptionAgent(sess=None,
                         obs_dim=obs_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='diayn' + str(args.noptions))

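    # Wrap each pretrained DIAYN skill as an option and register it with the option agent.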
    for i in range(args.noptions):
        op = DiaynOption(rst, i, args.termprob)
        oagent.add_option(op)

    run_agents_on_mdp([oagent],
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)
Example #24
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = BanditMDP()

    lin_agent = LinUCBAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, lin_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=1,
                      steps=500,
                      open_plot=open_plot)
Example #25
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent],
                      gym_mdp,
                      instances=5,
                      episodes=50000,
                      steps=200,
                      open_plot=open_plot,
                      verbose=False)
Example #26
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10,
                       height=10,
                       init_loc=(1, 1),
                       goal_locs=[(10, 10)])
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=150,
                      open_plot=open_plot)
Example #27
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for the given beta and compares Q-Learning with and without the abstraction.
    '''
    # Run info_sa.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001, is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        #ground state to abstract state
        dict_of_phi_pmfs[beta] = crisp_s_phi
        print("crisp_s_phi:" )
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above:" + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))
        print("ground states:")
        for ground_states in crisp_s_phi.get_ground_states():
            print(str(type(ground_states)))
        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        beta_phi = dict_of_phi_pmfs[beta]
        ql_abstr_agent = AbstractionWrapper(QLearningAgent, state_abstr=dict_of_phi_pmfs[beta], agent_params={"actions":mdp.get_actions(), "anneal":True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print num abstract states.
    for beta in dict_of_phi_pmfs.keys():
        print("beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_ground_states())
    print()
Example #28
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005
    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name='LunarLander-v2', render=True, render_every_n_episodes=20)
    test_mdp = env  # The test MDP is the same environment.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    num_iterations = 300
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess,params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features, alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent, agent_params={"alpha":params['rl_learning_rate'],"actions":test_mdp.get_actions(), "anneal":True}, state_abstr=nn_sa, name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent], test_mdp, instances=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=True, track_success=True, success_reward=100)
Example #29
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ============================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ============================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
        multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(
        QLearningAgent,
        agent_params={"actions": test_mdp.get_actions()},
        state_abstr=nn_sa,
        name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)
Example #30
def main():

    # Experiments from IAAI paper:
    # num_days = 1
    # time_per_step = 10 for single axis, 20 for dual.
    # panel_step = 5, dual: 20
    # reflective = 0.55
    # instances = 10
    # episodes = 50, dual: 100

    # Setup experiment parameters, agents, mdp.
    num_days = 200
    per_hour = True
    loc, percept_type, dual_axis = parse_args()
    time_per_step = 10.0 if not dual_axis else 20.0  # in minutes.
    steps = int(24 * (60 / time_per_step) * num_days)
    panel_step = 10 if not dual_axis else 20
    reflective_index = 0.55

    # Set experiment # episodes and # instances.
    episodes = 1 if not dual_axis else 100
    episodes = 1 if num_days == 365 else episodes
    instances = 50

    # If per hour is true, plots every hour long reward chunk, otherwise every day.
    rew_step_count = (steps / num_days) / 24 if per_hour else (steps / num_days)
    sun_agents, sun_solar_mdp = setup_experiment(
        percept_type=percept_type,
        loc=loc,
        dual_axis=dual_axis,
        panel_step=panel_step,
        time_per_step=time_per_step,
        reflective_index=reflective_index,
        instances=instances)

    # Run experiments.
    run_agents_on_mdp(sun_agents,
                      sun_solar_mdp,
                      instances=instances,
                      episodes=episodes,
                      steps=steps,
                      clear_old_results=True,
                      rew_step_count=rew_step_count,
                      verbose=True)
Example #31
def main(open_plot=True):
    # Setup MDP.
    mdp = PuddleMDP()

    # Make feature mappers.
    tile_coder = TileCoding(ranges=[[0, 1.0], [0, 1.0]], num_tiles=[4, 5], num_tilings=4)
    bucket_coder = BucketCoding(feature_max_vals=[1.0, 1.0], num_buckets=5)
    rbf_coder = RBFCoding()
    
    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Tabular agent w/ features.
    tile_coding_agent = FeatureWrapper(QLearningAgent, feature_mapper=tile_coder, agent_params={"actions":mdp.get_actions()})
    bucket_coding_agent = FeatureWrapper(QLearningAgent, feature_mapper=bucket_coder, agent_params={"actions":mdp.get_actions()})

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, bucket_coding_agent], mdp, instances=10, episodes=100, steps=150, open_plot=open_plot)
Example #32
def main():

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(),
                              epsilon=args.epsilon,
                              alpha=args.alpha,
                              explore=args.explore,
                              anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent],
                          mdp,
                          open_plot=True,
                          episodes=60,
                          steps=200,
                          instances=5,
                          success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent,
                               delay=0.005,
                               num_ep=500,
                               num_steps=200)
Example #33
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
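    # Collapse the probabilistic abstraction (phi_pmf) into a deterministic (crisp) one.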
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy":lambda_abstr_policy, "name":"$\\pi_\\phi$"}, name_ext="")
    
    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)


    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]
    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print "\nState Spaces Sizes:"
    print "\t|S| =", demo_vi.get_num_states()
    print "\tH(S_\\phi) =", entropy(pmf_s_phi)
    print "\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states()
    print "\tdelta_min =", min(non_zero_abstr_states)
    print "\tnum non zero states =", len(non_zero_abstr_states)
    print
Example #34
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w,
                    height=h,
                    init_loc=(1, 1),
                    goal_locs=[(6, 6)],
                    slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute number of samples for R-MAX to achieve epsilon optimal behavior with high probability (1 - delta)
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
        m_r = np.log(2. / delta) / (2. * epsilon**2)
        m_t = 2. * (np.log(2**(float(w * h)) - 2.) - np.log(delta)) / (epsilon**2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(),
                                     gamma=.9,
                                     horizon=3,
                                     s_a_threshold=n_samples,
                                     name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(),
                      gamma=.9,
                      count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=20,
                      reset_at_terminal=True,
                      verbose=False)
Example #35
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4,
                       height=3,
                       init_loc=(1, 1),
                       goal_locs=[(4, 3)],
                       gamma=0.95,
                       walls=[(2, 2)])

    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=10,
                      episodes=1,
                      steps=20,
                      open_plot=open_plot)
Example #36
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions()) 
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1, steps=500, reset_at_terminal=True, open_plot=open_plot)
Example #37
def main():

    # Paper experiments:
    # num_days = 1
    # time_per_step = 10
    # panel_step = 5, dual: 20
    # reflective = 0.55
    # instances = 10
    # episodes = 50, dual: 100

    # Setup experiment parameters, agents, mdp.
    num_days = 1
    per_hour = True
    time_per_step = 10.0  # in minutes.
    loc, percept_type, dual_axis = parse_args()
    steps = int(24 * (60 / time_per_step) * num_days)
    panel_step = 5
    reflective_index = 0.55

    energy_breakdown_experiment = False

    # If per hour is true, plots every hour long reward chunk, otherwise every day.
    rew_step_count = (steps / num_days) / 24 if per_hour else (steps / num_days)
    sun_agents, sun_solar_mdp = setup_experiment(
        percept_type=percept_type,
        loc=loc,
        dual_axis=dual_axis,
        panel_step=panel_step,
        time_per_step=time_per_step,
        reflective_index=reflective_index,
        energy_breakdown_experiment=energy_breakdown_experiment)

    # Run experiments.
    run_agents_on_mdp(sun_agents,
                      sun_solar_mdp,
                      instances=10,
                      episodes=50,
                      steps=steps,
                      clear_old_results=True,
                      rew_step_count=rew_step_count,
                      verbose=False)
Example #38
def main(open_plot=True):

    # Setup MDP.

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))

    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(
                    QLearningAgent(actions=mdp.get_actions(),
                                   custom_q_init=custom_q,
                                   name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents,
                          mdp,
                          instances=1,
                          episodes=100,
                          steps=100,
                          open_plot=open_plot,
                          verbose=True)