Example 1
def get_optimal_policies(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''

    # Make State Abstraction
    approx_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_eps_approx_indicator,
                             epsilon=0.05)

    # True Optimal
    true_opt_vi = ValueIteration(environment)
    true_opt_vi.run_vi()
    opt_agent = FixedPolicyAgent(true_opt_vi.policy, "$\pi^*$")

    # Optimal Abstraction
    opt_det_vi = AbstractValueIteration(environment,
                                        state_abstr=approx_qds_test,
                                        sample_rate=30)
    opt_det_vi.run_vi()
    opt_det_agent = FixedPolicyAgent(opt_det_vi.policy, name="$\pi_{\phi}^*$")

    stoch_policy_obj = StochasticSAPolicy(approx_qds_test, environment)
    stoch_agent = FixedPolicyAgent(stoch_policy_obj.policy,
                                   "$\pi(a \mid s_\phi )$")

    ql_agents = [opt_agent, stoch_agent, opt_det_agent]

    return ql_agents
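
# --- Hedged usage sketch (not part of the original example) ---
# Agents built by a helper like get_optimal_policies() are typically handed to
# one of simple_rl's experiment runners.  The FourRoomMDP below and every
# parameter value are illustrative assumptions, not taken from the source file.
from simple_rl.agents import FixedPolicyAgent
from simple_rl.planning import ValueIteration
from simple_rl.run_experiments import run_agents_on_mdp
from simple_rl.tasks import FourRoomMDP

def _sketch_run_fixed_policy_agent():
    # Build a small MDP and solve it exactly.
    mdp = FourRoomMDP(width=5, height=5, init_loc=(1, 1), goal_locs=[(5, 5)])
    vi = ValueIteration(mdp)
    vi.run_vi()

    # Wrap the resulting deterministic policy and run it as a fixed-policy agent.
    opt_agent = FixedPolicyAgent(vi.policy, name="$\\pi^*$")
    run_agents_on_mdp([opt_agent], mdp, instances=3, episodes=20, steps=50)
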
Example 2
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions = solar_mdp.get_actions()
    gamma = solar_mdp.get_gamma()
    panel_step = solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker,
                                 panel_step=panel_step,
                                 dual_axis=True)
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(),
                                           name="grena-tracker")

    # Setup RL agents
    alpha, epsilon = 0.3, 0.3
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(actions, name="lin-ucb",
                                alpha=0.3)  #, alpha=0.2)
    ql_lin_approx_agent_g0 = LinearQLearnerAgent(actions,
                                                 num_features=num_features,
                                                 name="ql-lin-g0",
                                                 alpha=alpha,
                                                 epsilon=epsilon,
                                                 gamma=0,
                                                 rbf=True,
                                                 anneal=True)
    ql_lin_approx_agent = LinearQLearnerAgent(actions,
                                              num_features=num_features,
                                              name="ql-lin",
                                              alpha=alpha,
                                              epsilon=epsilon,
                                              gamma=gamma,
                                              rbf=True,
                                              anneal=True)
    # sarsa_lin_rbf_agent = LinearApproxSarsaAgent(actions, name="sarsa-lin", alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=True, anneal=False)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    agents = [
        ql_lin_approx_agent, lin_ucb_agent, grena_tracker_agent, static_agent
    ]

    return agents
Example 3
def main():
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid

    mdp_class, is_goal_terminal, samples = parse_args()

    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print "done."  #, iters, value
    sys.stdout.flush()

    # Agents.
    print "Making agents...",
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy,
                                              name="$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(
        mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print "done."

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=False,
                          track_disc_reward=False,
                          cumulative_plot=True)
Example 4
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions = solar_mdp.get_actions()
    gamma = solar_mdp.get_gamma()
    panel_step = solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker,
                                 panel_step=panel_step,
                                 dual_axis=solar_mdp.dual_axis,
                                 actions=solar_mdp.get_bandit_actions())
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(),
                                           name="grena-tracker")

    # Setup RL agents
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(),
                                context_size=num_features,
                                name="lin-ucb",
                                rand_init=rand_init,
                                alpha=2.0)
    # sarsa_agent_g0 = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin-g0", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=0, rbf=False, anneal=True)
    # sarsa_agent = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=False, anneal=True)
    ql_agent = QLearningAgent(actions,
                              alpha=alpha,
                              epsilon=epsilon,
                              gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    # agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0, grena_tracker_agent, static_agent]
    agents = [grena_tracker_agent, static_agent]

    return agents
Example 5
def main(open_plot=True):
	# Setup MDP, Agents.
	markov_game = GatheringMDP()
	ql_agent = QLearnerAgent(actions=markov_game.get_actions())
	fixed_action = random.choice(markov_game.get_actions())
	fixed_agent = FixedPolicyAgent(policy=lambda s:fixed_action)

	# Run experiment and make plot.
	play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40, open_plot=open_plot)
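
# Hedged note (not from the original file): this snippet presumably relies on
# module-level imports along the lines of the Rock-Paper-Scissors script in
# Example 16, e.g. `import random`, the markov-game task class, and:
#     from simple_rl.agents import QLearnerAgent, FixedPolicyAgent
#     from simple_rl.run_experiments import play_markov_game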
Example 6
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for the given beta and compares Q-Learning with and without the abstraction.
    '''
    # Run info_sa.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001, is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        #ground state to abstract state
        dict_of_phi_pmfs[beta] = crisp_s_phi
        print("crisp_s_phi:" )
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above:" + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))
        print("ground states:")
        for ground_states in crisp_s_phi.get_ground_states():
            print(str(type(ground_states)))
        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        beta_phi = dict_of_phi_pmfs[beta]
        ql_abstr_agent = AbstractionWrapper(QLearningAgent, state_abstr=dict_of_phi_pmfs[beta], agent_params={"actions":mdp.get_actions(), "anneal":True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print num abstract states.
    for beta in dict_of_phi_pmfs.keys():
        print "beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_ground_states()
    print
Example 7
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ============================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ============================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
        multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(
        QLearningAgent,
        agent_params={"actions": test_mdp.get_actions()},
        state_abstr=nn_sa,
        name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)
Example 8
def generate_agent(mdp_class, data_loc, mdp_parameters, visualize=False):
    try:
        with open('models/' + data_loc + '/vi_agent.pickle', 'rb') as f:
            mdp_agent, vi_agent = pickle.load(f)
    except:
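        # Bare except: any failure to load the cached pickle (e.g. a missing file)
        # falls through to rebuilding and re-caching the VI agent below.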
        mdp_agent = make_mdp.make_custom_mdp(mdp_class, mdp_parameters)
        vi_agent = ValueIteration(mdp_agent, sample_rate=1)
        vi_agent.run_vi()

        with open('models/' + data_loc + '/vi_agent.pickle', 'wb') as f:
            pickle.dump((mdp_agent, vi_agent), f)

    # Visualize agent
    if visualize:
        fixed_agent = FixedPolicyAgent(vi_agent.policy)
        mdp_agent.visualize_agent(fixed_agent)
        mdp_agent.reset()  # reset the current state to the initial state
        mdp_agent.visualize_interaction()
Example 9
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy":lambda_abstr_policy, "name":"$\\pi_\\phi$"}, name_ext="")
    
    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)


    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]
    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print "\nState Spaces Sizes:"
    print "\t|S| =", demo_vi.get_num_states()
    print "\tH(S_\\phi) =", entropy(pmf_s_phi)
    print "\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states()
    print "\tdelta_min =", min(non_zero_abstr_states)
    print "\tnum non zero states =", len(non_zero_abstr_states)
    print
Example 10
def get_all_fixed_policy_agents(mdp):
    '''
    Args:
        mdp (MDP)

    Returns:
        (list of Agent)
    '''
    states = mdp.get_states()
    actions = mdp.get_actions()

    all_policies = make_all_fixed_policies(states, actions)
    fixed_agents = []
    for i, p in enumerate(all_policies):
        policy = make_policy_from_action_str(p, actions, states)

        next_agent = FixedPolicyAgent(policy, name="rand-fixed-policy-" + str(i))

        fixed_agents.append(next_agent)

    return fixed_agents
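
# --- Hedged usage sketch (not part of the original example) ---
# Exhaustive enumeration like this only scales to tiny MDPs; a natural follow-up
# is to evaluate each fixed-policy agent and keep the best one.  The helper name
# below is hypothetical; evaluate_agent is assumed importable from
# simple_rl.run_experiments, matching its uses elsewhere in these examples.
from simple_rl.run_experiments import evaluate_agent

def _sketch_best_fixed_policy_agent(mdp):
    best_agent, best_val = None, float("-inf")
    for agent in get_all_fixed_policy_agents(mdp):
        val = evaluate_agent(agent, mdp, instances=10)
        if val > best_val:
            best_agent, best_val = agent, val
    return best_agent
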
Example 11
def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int)
        max_grid_size (int)
        beta (float): Hyperparameter for InfoSA.

    Summary:
        Writes num iterations and time (seconds) for planning with and without abstractions.
    '''
    vanilla_file = "vi.csv"
    sa_file = "vi-$\\phi$.csv"
    file_prefix = os.path.join("results", "planning-four_room")
    
    clear_files(dir_name=file_prefix)

    for grid_dim in xrange(min_grid_size, max_grid_size + 1):
        # ======================
        # == Make Environment ==
        # ======================
        mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)], gamma=0.9)
        
        # Get demo policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.2))

        # =======================
        # == Make Abstractions ==
        # =======================
        pmf_s_phi, phi_pmf, abstr_policy = run_info_sa(mdp, demo_policy, iters=500, beta=beta, convergence_threshold=0.00001)
        lambda_abstr_policy = get_lambda_policy(abstr_policy)
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        sa_vi = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi, delta=0.0001, vi_sample_rate=25, amdp_sample_rate=25)

        # ==========
        # == Plan ==
        # ==========
        print "Running VIs."
        start_time = time.clock()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.clock() - start_time, 2)

        mdp.reset()
        start_time = time.clock()
        sa_iters, sa_abs_val = sa_vi.run_vi()
        sa_time = round(time.clock() - start_time, 2)
        sa_val = evaluate_agent(FixedPolicyAgent(sa_vi.policy), mdp, instances=25)

        print "\n" + "*"*20
        print "Vanilla", "\n\t Iters:", vanilla_iters, "\n\t Value:", round(vanilla_val, 4), "\n\t Time:", vanilla_time
        print 
        print "Phi:", "\n\t Iters:", sa_iters, "\n\t Value:", round(sa_val, 4), "\n\t Time:", sa_time
        print "*"*20 + "\n\n"

        write_datum(os.path.join(file_prefix, "iters", vanilla_file), vanilla_iters)
        write_datum(os.path.join(file_prefix, "iters", sa_file), sa_iters)

        write_datum(os.path.join(file_prefix, "times", vanilla_file), vanilla_time)
        write_datum(os.path.join(file_prefix, "times", sa_file), sa_time)
Example 12
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=False)
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # Make a NN for each sampling param.
    agents = {}
    sess = tf.Session()
    sampling_params = [0.0, 0.5, 1.0]

    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False,
                                         sample_type="demo")
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(
                QLearningAgent,
                agent_params={
                    "actions":
                    test_mdp.get_actions(),
                    "name":
                    "$D \\sim \\rho_E^\\epsilon, \\epsilon=" + str(epsilon) +
                    "$"
                },
                state_abstr=nn_sa,
                name_ext="")
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={
                                               "actions":
                                               test_mdp.get_actions(),
                                               "name": "$D \\sim U(S)$"
                                           },
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
Example 13
def branching_factor_experiment(min_options=0,
                                max_options=20,
                                increment=2,
                                instances=5,
                                epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)

    Summary:
        Runs an experiment contrasting learning performance for different # options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size,
                      height=grid_size,
                      goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp,
                                           four_rooms_predicate_9x9,
                                           level=1,
                                           states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)  #[] #[0] * len(x_axis)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print "steps, episodes", steps, episodes

        # Evaluate.
        for i, instance in enumerate(range(instances)):
            print "\tInstance", instance + 1, "of", str(instances) + "."

            # Make initial Options.
            for num_options in x_axis:

                options, _ = make_near_optimal_phi_relative_options(
                    mdp,
                    state_abstr,
                    'eps-greedy',
                    num_rand_opts=num_options - 1,
                    eps=epsilon)
                action_abstr = ActionAbstraction(
                    options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent
                sa_aa_agent = AbstractionWrapper(
                    AgentClass,
                    agent_params={"actions": mdp.get_actions()},
                    state_abstr=state_abstr,
                    action_abstr=action_abstr,
                    name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(
                    sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(
                    value_per_episode[-1])

    ############
    # Other types

    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
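    # Note: AgentClass here is whatever was last bound inside the loop above
    # (RMaxAgent); it is re-bound to QLearningAgent for the random-options run below.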
    sa_agent = AbstractionWrapper(AgentClass,
                                  agent_params={"actions": mdp.get_actions()},
                                  state_abstr=state_abstr,
                                  action_abstr=None,
                                  name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    num_options_performance[(d_var_range[-1][0],
                             d_var_range[-1][1])]["phi"].append(
                                 value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options,
                                     prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(
        AgentClass,
        agent_params={"actions": mdp.get_actions()},
        state_abstr=state_abstr,
        action_abstr=action_abstr,
        name_ext="-$\\phi,O_{\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    num_options_performance[(d_var_range[-1][0],
                             d_var_range[-1][1])]["random"].append(
                                 value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]
    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(
                sum(num_options_performance[dependent_var]
                    [num_options])) / instances
            std_deviation = np.std(
                num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(
                len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\hat{\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis),
             marker="+",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[-1],
             label=label)

    # Add random options.
    ep_val_del_q = y_axis["random"]
    label = "$O_{random}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis),
             marker="x",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[0])  #, label=label)

    # Add optimal.
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis),
             linestyle="-",
             linewidth=1.0,
             color=PLOT_COLORS[1])  #, label="$\\pi^*$")

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(
            str(total_steps).count("0")) + "$"
        plt.plot(x_axis,
                 y_axis[dependent_var],
                 marker="x",
                 color=PLOT_COLORS[i + 2],
                 linewidth=1.5,
                 label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis,
                         top,
                         bot,
                         alpha=0.25,
                         color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
Example 14
def num_training_data_experiment():
    '''
    Summary:
        Runs an experiment that compares the performance of different
        Agent-SA combinations, where each SA is trained with a different
        number of training samples.
    '''
    # Params.
    instances = 10
    init, increment, maximum = 1, 500, 5001
    training_samples = range(init, maximum, increment)

    # Run experiments.
    if not os.path.exists(os.path.join("results", "puddle_per_sample")):
        os.makedirs(os.path.join("results", "puddle_per_sample"))
    data_dir = os.path.join("results", "puddle_per_sample")
    with open(os.path.join(data_dir, "results.csv"), "w+") as results_file:

        # Repeat the experiment @instances # times.
        for i in range(instances):
            print "\nInstances", i + 1, "of", str(instances)
            for sample_num in training_samples:
                print "\tSamples:", sample_num

                # Make State Abstraction.
                params = get_params(default_params={
                    "num_samples_from_demonstrator": sample_num
                })
                mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
                    multitask=params['multitask'])
                expert_puddle_policy = ppd.get_demo_policy_given_goal(
                    test_mdp.get_goal_locs()[0])
                demo_agent = FixedPolicyAgent(expert_puddle_policy)
                tf.reset_default_graph()
                sess = tf.Session()
                abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                             sess,
                                             params,
                                             verbose=False)
                nn_sa = NNStateAbstr(abstraction_net)

                # Test Performance with given param.
                sa_agent = AbstractionWrapper(
                    QLearningAgent,
                    agent_params={"actions": test_mdp.get_actions()},
                    state_abstr=nn_sa,
                    name_ext="$-\\phi$")
                val = evaluate_agent(sa_agent,
                                     test_mdp,
                                     steps=params['steps'],
                                     episodes=params['episodes'])
                results_file.write(str(val) + ",")
                results_file.flush()
                sess.close()

            results_file.write("\n")

    cu.EVERY_OTHER_X = True
    cu.CUSTOM_TITLE = "Effect of $|D_{train, \\phi}|$ on RL Performance"
    cu.X_AXIS_LABEL = "$|D_{train, \\phi}|$"
    cu.Y_AXIS_LABEL = "Avg. Reward in Last Episode"
    cu.X_AXIS_START_VAL = init
    cu.X_AXIS_INCREMENT = increment
    cu.COLOR_SHIFT = 3
    cu.format_and_make_plot(data_dir=data_dir, avg_plot=True, add_legend=False)
Example 15
def extract_constraints(wt_vi_traj_candidates,
                        weights,
                        step_cost_flag,
                        BEC_depth=1,
                        trajectories=None,
                        print_flag=False):
    '''
    :param wt_vi_traj_candidates: Nested list of [weight, value iteration object, trajectory]
    :param weights (numpy array): Ground truth reward weights used by agent to derive its optimal policy
    :param step_cost_flag (bool): Indicates that the last weight element is a known step cost
    :param BEC_depth (int): number of suboptimal actions to take before following the optimal policy to obtain the
                            suboptimal trajectory (and the corresponding suboptimal expected feature counts)
    :return: min_subset_constraints: List of constraints

    Summary: Obtain the minimum BEC constraints for each environment
    '''
    min_subset_constraints_record = []  # minimum BEC constraints conveyed by a trajectory
    env_record = []
    # BEC constraints that define a policy (i.e. constraints arising from one action
    # deviations from every possible starting state and the corresponding optimal trajectories)
    policy_constraints = []
    traj_record = []
    processed_envs = []

    # go through each environment and corresponding optimal trajectory, and extract the behavior equivalence class (BEC) constraints
    for env_idx, wt_vi_traj_candidate in enumerate(wt_vi_traj_candidates):
        if print_flag:
            print("Extracting constraints from environment {}".format(env_idx))
        mdp = wt_vi_traj_candidate[0][1].mdp
        agent = FixedPolicyAgent(wt_vi_traj_candidate[0][1].policy)

        if trajectories is not None:
            constraints = []
            # a) demonstration-driven BEC
            # BEC constraints are obtained by ensuring that the optimal actions accumulate at least as much reward as
            # all other possible actions along a trajectory
            action_seq_list = list(
                itertools.product(mdp.actions, repeat=BEC_depth))

            traj_opt = trajectories[env_idx]
            for sas_idx in range(len(traj_opt)):
                # reward features of optimal action
                mu_sa = mdp.accumulate_reward_features(traj_opt[sas_idx:],
                                                       discount=True)

                sas = traj_opt[sas_idx]
                cur_state = sas[0]

                # currently assumes that all actions are executable from all states
                for action_seq in action_seq_list:
                    traj_hyp = mdp_helpers.rollout_policy(
                        mdp, agent, cur_state, action_seq)
                    mu_sb = mdp.accumulate_reward_features(traj_hyp,
                                                           discount=True)

                    constraints.append(mu_sa - mu_sb)

            # store the BEC constraints for each environment, along with the associated demo and environment number
            min_subset_constraints = BEC_helpers.clean_up_constraints(
                constraints, weights, step_cost_flag)
            min_subset_constraints_record.append(min_subset_constraints)
            traj_record.append(traj_opt)
            env_record.append(env_idx)
            # slightly abusing the term 'policy' here since I'm only considering a subset of possible trajectories (i.e.
            # demos) that the policy can generate in these environments
            policy_constraints.append(min_subset_constraints)
        else:
            # b) policy-driven BEC
            # wt_vi_traj_candidates can contain MDPs with the same environment but different initial states (to
            # accommodate demo BEC). by considering all reachable states of two identical MDPs with different initial
            # states, you will obtain duplicate test environments so only go through each MDP once for policy BEC.
            if mdp.env_code not in processed_envs:
                agent = FixedPolicyAgent(wt_vi_traj_candidate[0][1].policy)

                for state in mdp.states:
                    constraints = []

                    traj_opt = mdp_helpers.rollout_policy(mdp,
                                                          agent,
                                                          cur_state=state)

                    for sas_idx in range(len(traj_opt)):
                        # reward features of optimal action
                        mu_sa = mdp.accumulate_reward_features(
                            traj_opt[sas_idx:], discount=True)

                        sas = traj_opt[sas_idx]
                        cur_state = sas[0]

                        # currently assumes that all actions are executable from all states. only considering
                        # action depth of 1 currently
                        for action in mdp.actions:
                            if action != sas[1]:
                                traj_hyp = mdp_helpers.rollout_policy(
                                    mdp,
                                    agent,
                                    cur_state=cur_state,
                                    action_seq=[action])
                                mu_sb = mdp.accumulate_reward_features(
                                    traj_hyp, discount=True)

                                constraints.append(mu_sa - mu_sb)

                        # if considering only suboptimal actions of the first sas, put the corresponding constraints
                        # toward the BEC of the policy (per definition)
                        if sas_idx == 0:
                            policy_constraints.append(
                                BEC_helpers.clean_up_constraints(
                                    constraints, weights, step_cost_flag))

                    # also store the BEC constraints for optimal trajectory in each state, along with the associated
                    # demo and environment number
                    min_subset_constraints_record.append(
                        BEC_helpers.clean_up_constraints(
                            constraints, weights, step_cost_flag))
                    traj_record.append(traj_opt)
                    env_record.append(env_idx)

                processed_envs.append(mdp.env_code)

    return policy_constraints, min_subset_constraints_record, env_record, traj_record
Example 16
#!/usr/bin/env python

# Python imports.
import random

# Other imports.
import srl_example_setup
from simple_rl.agents import QLearnerAgent, FixedPolicyAgent
from simple_rl.tasks import RockPaperScissorsMDP
from simple_rl.run_experiments import play_markov_game

# Setup MDP, Agents.
markov_game = RockPaperScissorsMDP()
ql_agent = QLearnerAgent(actions=markov_game.get_actions())
fixed_action = random.choice(markov_game.get_actions())
fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

# Run experiment and make plot.
play_markov_game([ql_agent, fixed_agent],
                 markov_game,
                 instances=15,
                 episodes=1,
                 steps=40)
Example 17
def main(eps=0.1, open_plot=True):

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu

    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy,
                                            name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions,
                                          epsilon=eps,
                                          default_q=qmax,
                                          name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq,
            transfer_ql_agent_avgq
        ]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions,
                                                      name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions,
                                                     opt_q_func,
                                                     name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(
            actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(
            actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent
        ]
    else:
        print "Unknown type of agents:", alg
        print "(q, rmax, delayed-q)"
        assert (False)

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
Example 18
def run_agents_multi_task(agents,
                            mdp_distr,
                            task_samples=5,
                            episodes=1,
                            steps=100,
                            clear_old_results=True,
                            open_plot=True,
                            verbose=False,
                            is_rec_disc_reward=False,
                            reset_at_terminal=False,
                            include_optimal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples
        episodes
        steps

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and #steps is ignored.
    '''

    # Set number of steps if the horizon is given.
    if mdp_distr.get_horizon() > 0:
        mdp_distr.set_gamma(1.0)
        steps = mdp_distr.get_horizon()

    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples":task_samples, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agents,
                mdp=mdp_distr,
                params=exp_params,
                is_episodic=episodes > 1,
                is_multi_task=True,
                clear_old_results=clear_old_results,
                is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()

    times = defaultdict(float)

    if include_optimal:
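        # Placeholder policy; it is replaced with each sampled MDP's optimal
        # policy via agent.set_policy(vi.policy) in the loop below.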
        fixed_policy_agent = FixedPolicyAgent(policy=lambda s: "", name="optimal")
        agents += [fixed_policy_agent]

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print "  Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            if include_optimal and agent.name == "optimal":
                vi = ValueIteration(mdp)
                vi.run_vi()
                agent.set_policy(vi.policy)

            # Run the agent.
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, is_rec_disc_reward, reset_at_terminal)

            # Reset the agent.
            agent.reset()

            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
Example 19
def make_info_sa_val_and_size_plots(mdp,
                                    demo_policy_lambda,
                                    beta_range,
                                    results_dir="info_sa_results",
                                    instances=3,
                                    include_stoch=False,
                                    is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta_range (list)
        results_dir (str)
        instances (int)
        include_stoch (bool): If True, also runs IB.
        is_agent_in_control (bool): If True, runs the agent_in_control.py variant of DIB-SA.

    Summary:
        Main plotting function for info_sa experiments.
    '''
    # Clear old results.
    all_policies = ["demo_val", "dibs_val", "dibs_states", "etad_states"]
    if include_stoch:
        all_policies += ["ib_val", "ib_states"]
    for policy in all_policies:
        if os.path.exists(os.path.join(results_dir, str(policy)) + ".csv"):
            os.remove(os.path.join(results_dir, str(policy)) + ".csv")

    # Set relevant params.
    param_dict = {
        "mdp": mdp,
        "iters": 500,
        "convergence_threshold": 0.0001,
        "demo_policy_lambda": demo_policy_lambda,
        "is_agent_in_control": is_agent_in_control
    }

    # Record value of demo policy and size of ground state space.
    demo_agent = FixedPolicyAgent(demo_policy_lambda)
    demo_val = evaluate_agent(demo_agent, mdp, instances=100)
    vi = ValueIteration(mdp)
    num_ground_states = vi.get_num_states()
    for beta in beta_range:
        write_datum_to_file(file_name="demo_val",
                            datum=demo_val,
                            extra_dir=results_dir)
        write_datum_to_file(file_name="ground_states",
                            datum=num_ground_states,
                            extra_dir=results_dir)

    # Run core algorithm for DIB and IB.
    for instance in range(instances):
        print "\nInstance", instance + 1, "of", str(instances) + "."
        random.jumpahead(1)

        # For each beta.
        for beta in beta_range:

            # Run DIB.
            dibs_val, dibs_states = _info_sa_val_and_size_plot_wrapper(
                beta=beta,
                param_dict=dict(param_dict.items() + {
                    "is_deterministic_ib": True,
                    "use_crisp_policy": False
                }.items()))
            write_datum_to_file(file_name="dibs_val",
                                datum=dibs_val,
                                extra_dir=results_dir)
            write_datum_to_file(file_name="dibs_states",
                                datum=dibs_states,
                                extra_dir=results_dir)

            if include_stoch:
                ib_val, ib_states = _info_sa_val_and_size_plot_wrapper(
                    beta=beta,
                    param_dict=dict(param_dict.items() + {
                        "is_deterministic_ib": False,
                        "use_crisp_policy": False
                    }.items()))
                write_datum_to_file(file_name="ib_val",
                                    datum=ib_val,
                                    extra_dir=results_dir)
                write_datum_to_file(file_name="ib_states",
                                    datum=ib_states,
                                    extra_dir=results_dir)

        # End instances.
        end_of_instance("dibs_val", extra_dir=results_dir)
        end_of_instance("dibs_states", extra_dir=results_dir)
        if include_stoch:
            end_of_instance("ib_val", extra_dir=results_dir)
            end_of_instance("ib_states", extra_dir=results_dir)

    beta_range_file = file(os.path.join(results_dir, "beta_range.csv"), "w")
    for beta in beta_range:
        beta_range_file.write(str(beta))
        beta_range_file.write(",")

    beta_range_file.close()

    make_beta_val_plot([p for p in all_policies if "val" in p],
                       results_dir,
                       is_agent_in_control=is_agent_in_control)
Example 20
def run_agents_on_mdp(agents,
                        mdp,
                        instances=5,
                        episodes=100,
                        steps=200,
                        clear_old_results=True,
                        rew_step_count=1,
                        is_rec_disc_reward=False,
                        open_plot=True,
                        verbose=False,
                        reset_at_terminal=False,
                        include_optimal=False):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
        clear_old_results (bool) [opt]: If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        is_rec_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.
        include_optimal (bool): If true also plots optimal behavior.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically
        generates a plot and opens it.
    '''

    # Experiment (for reproducibility, plotting).
    exp_params = {"instances":instances, "episodes":episodes, "steps":steps}
    experiment = Experiment(agents=agents,
                            mdp=mdp,
                            params=exp_params,
                            is_episodic= episodes > 1,
                            clear_old_results=clear_old_results,
                            is_rec_disc_reward=is_rec_disc_reward,
                            count_r_per_n_timestep=rew_step_count)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    time_dict = defaultdict(float)

    if include_optimal:
        vi = ValueIteration(mdp)
        vi.run_vi()
        fixed_policy_agent = FixedPolicyAgent(vi.policy, name="optimal")
        agents += [fixed_policy_agent]

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."

        start = time.clock()

        # For each instance.
        for instance in xrange(1, instances + 1):
            print "  Instance " + str(instance) + " of " + str(instances) + "."
            sys.stdout.flush()
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, is_rec_disc_reward, reset_at_terminal=reset_at_terminal)
            
            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in time_dict.keys():
        print str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds."
    print "-------------\n"

    # if not isinstance(mdp, GymMDP):
    experiment.make_plots(open_plot=open_plot)
Example 21
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''

    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment,
                            indic_func=ind_funcs._q_eps_approx_indicator,
                            epsilon=0.0)
    approx_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_eps_approx_indicator,
                             epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=exact_qds_test,
                                         name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent,
                                          agent_params={"actions": actions},
                                          state_abstr=approx_qds_test,
                                          name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name="$\pi^*$")

        sa_vi = AbstractValueIteration(
            environment,
            sample_rate=50,
            max_iterations=3000,
            delta=0.0001,
            state_abstr=approx_qds_test,
            action_abstr=ActionAbstraction(
                options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name="$\pi_\phi^*$")

        dql_agents += [opt_agent, approx_opt_agent]
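
    # Note: only the Q-learning agent list is returned below; the DoubleQ and RMax
    # agents built above (and the optimal agents appended to dql_agents) go unused here.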

    return ql_agents
Example 22
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi (CartPole variant).
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]

    test_mdp = CartPoleMDP()  #
    agents = {"demo": demo_agent}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print "epsilon", epsilon
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={
                                              "actions":
                                              env.get_actions(),
                                              "name":
                                              "$QL_\\phi-\\epsilon=" +
                                              str(epsilon) + "$"
                                          },
                                          state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={
                                               "actions": env.get_actions(),
                                               "name": "$D \\sim U(S)$"
                                           },
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()