Exemple #1
0
def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int): Smallest grid side length to plan on.
        max_grid_size (int): Largest grid side length to plan on (inclusive).
        beta (float): Hyperparameter for InfoSA.

    Summary:
        For every grid size in the range, plans on a FourRoomMDP once with
        ValueIteration and once with AbstractValueIteration over an InfoSA
        state abstraction, prints a comparison, and records the number of
        iterations and the time (seconds) of both planners via write_datum.
    '''
    results_dir = os.path.join("results", "planning-four_room")
    ground_csv = "vi.csv"
    abstr_csv = "vi-$\\phi$.csv"

    clear_files(dir_name=results_dir)

    for side in xrange(min_grid_size, max_grid_size + 1):
        # --- Environment: square grid, start at (1, 1), goal at (side, side). ---
        mdp = FourRoomMDP(width=side, height=side, init_loc=(1, 1), goal_locs=[(side, side)], gamma=0.9)

        # --- Demonstrator: epsilon-greedy variant of the policy found by VI. ---
        demo_vi = ValueIteration(mdp)
        demo_vi.run_vi()
        noisy_policy = make_det_policy_eps_greedy(demo_vi.policy, demo_vi.get_states(), mdp.get_actions(), epsilon=0.2)
        demo_policy = get_lambda_policy(noisy_policy)

        # --- Abstraction: run InfoSA, then harden the probabilistic phi. ---
        _pmf_s_phi, phi_pmf, abstr_policy = run_info_sa(mdp, demo_policy, iters=500, beta=beta, convergence_threshold=0.00001)
        # Result is not used below; the call itself is retained unchanged.
        _lambda_abstr_policy = get_lambda_policy(abstr_policy)
        crisp_s_phi = convert_prob_sa_to_sa(ProbStateAbstraction(phi_pmf))

        # --- Planners: ground VI versus VI on the abstract MDP. ---
        ground_planner = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        abstr_planner = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi, delta=0.0001, vi_sample_rate=25, amdp_sample_rate=25)

        # --- Timed planning runs. ---
        print("Running VIs.")
        tic = time.clock()
        ground_iters, ground_val = ground_planner.run_vi()
        ground_secs = round(time.clock() - tic, 2)

        mdp.reset()
        tic = time.clock()
        abstr_iters, _abstr_plan_val = abstr_planner.run_vi()
        abstr_secs = round(time.clock() - tic, 2)
        # The abstract planner is scored by rolling its policy out on the ground MDP.
        abstr_val = evaluate_agent(FixedPolicyAgent(abstr_planner.policy), mdp, instances=25)

        # --- Report. Each print takes one pre-joined string so the emitted
        # text matches the space-separated output of a multi-item print. ---
        banner = "*"*20
        ground_report = ("Vanilla", "\n\t Iters:", ground_iters, "\n\t Value:", round(ground_val, 4), "\n\t Time:", ground_secs)
        abstr_report = ("Phi:", "\n\t Iters:", abstr_iters, "\n\t Value:", round(abstr_val, 4), "\n\t Time:", abstr_secs)
        print("\n" + banner)
        print(" ".join(str(piece) for piece in ground_report))
        print("")
        print(" ".join(str(piece) for piece in abstr_report))
        print(banner + "\n\n")

        # --- Persist: one sub-directory per metric, one file per planner. ---
        recorded = (
            ("iters", ground_iters, abstr_iters),
            ("times", ground_secs, abstr_secs),
        )
        for metric, ground_datum, abstr_datum in recorded:
            write_datum(os.path.join(results_dir, metric, ground_csv), ground_datum)
            write_datum(os.path.join(results_dir, metric, abstr_csv), abstr_datum)
Exemple #2
0
def branching_factor_experiment(min_options=0,
                                max_options=20,
                                increment=2,
                                instances=5,
                                epsilon=0.05):
    '''
    Args:
        min_options (int): Smallest option count on the x-axis.
        max_options (int): Largest option count on the x-axis (inclusive).
        increment (int): Step between consecutive option counts.
        instances (int): Number of repeated runs per option count and budget.
        epsilon (float): Forwarded as eps to make_near_optimal_phi_relative_options.

    Summary:
        Runs an experiment contrasting learning performance for different # options
        and saves the resulting plot to "branching_factor_results.pdf".

    Raises:
        ValueError: If instances is smaller than 1 (no samples to average).
    '''
    # Fail before any learning is run: the averages below need >= 1 sample.
    if instances < 1:
        raise ValueError("instances must be at least 1, got %s" % (instances,))

    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size,
                      height=grid_size,
                      goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp,
                                           four_rooms_predicate_9x9,
                                           level=1,
                                           states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    # Each (steps, episodes) pair becomes one plotted curve.
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    # Learner for the option agents and the state-abstraction-only baseline.
    # Bound before the loops so the baseline still works if they never run.
    AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent

    for steps, episodes in d_var_range:
        print("steps, episodes %s %s" % (steps, episodes))

        # Evaluate.
        for instance in range(instances):
            print("\tInstance %s of %s." % (instance + 1, instances))

            for num_options in x_axis:
                # Make initial Options.
                # NOTE(review): num_rand_opts is one less than the x-axis
                # value; presumably one option is always supplied by the
                # helper itself -- confirm against its definition.
                options, _ = make_near_optimal_phi_relative_options(
                    mdp,
                    state_abstr,
                    'eps-greedy',
                    num_rand_opts=num_options - 1,
                    eps=epsilon)
                action_abstr = ActionAbstraction(
                    options=options, prim_actions=mdp.get_actions())

                # Make agent.
                sa_aa_agent = AbstractionWrapper(
                    AgentClass,
                    agent_params={"actions": mdp.get_actions()},
                    state_abstr=state_abstr,
                    action_abstr=action_abstr,
                    name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(
                    sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                # Only the value of the final episode is kept as the score.
                num_options_performance[(steps, episodes)][num_options].append(
                    value_per_episode[-1])

    ############
    # Other types: single-run baselines, all using the largest budget.
    steps, episodes = d_var_range[-1]

    # Just state abstraction.
    sa_agent = AbstractionWrapper(AgentClass,
                                  agent_params={"actions": mdp.get_actions()},
                                  state_abstr=state_abstr,
                                  action_abstr=None,
                                  name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    phi_value = value_per_episode[-1]

    # Run random options (this baseline uses Q-learning, not AgentClass).
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options,
                                     prim_actions=mdp.get_actions())
    # "\\text" must be escaped: a bare "\t" here would be a TAB character.
    rand_opt_agent = AbstractionWrapper(
        QLearningAgent,
        agent_params={"actions": mdp.get_actions()},
        state_abstr=state_abstr,
        action_abstr=action_abstr,
        name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    random_value = value_per_episode[-1]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    optimal_value = value_per_episode[-1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            samples = num_options_performance[dependent_var][num_options]
            # Compute mean and the half-width of a 95% normal confidence
            # interval (1.96 standard errors of the mean).
            avg_for_n = float(sum(samples)) / len(samples)
            half_width = 1.96 * (np.std(samples) / math.sqrt(len(samples)))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(half_width)

    num_points = len(x_axis)

    plt.xlabel("$|O_\\phi|$")
    if num_points > 0:
        # Upper limit is the largest x *value*, not the number of points,
        # so nothing is clipped when increment != 1.
        plt.xlim([1, x_axis[-1]])
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction (horizontal reference line).
    plt.plot(x_axis, [phi_value] * num_points,
             marker="+",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[-1],
             label="$O_{\\phi}$")

    # Add random options (drawn without a legend entry).
    plt.plot(x_axis, [random_value] * num_points,
             marker="x",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[0])

    # Add optimal (drawn without a legend entry).
    plt.plot(x_axis, [optimal_value] * num_points,
             linestyle="-",
             linewidth=1.0,
             color=PLOT_COLORS[1])

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        # Counting zeros yields the exponent only because every budget in
        # d_var_range multiplies out to an exact power of ten.
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(
            str(total_steps).count("0")) + "$"
        plt.plot(x_axis,
                 y_axis[dependent_var],
                 marker="x",
                 color=PLOT_COLORS[i + 2],
                 linewidth=1.5,
                 label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis,
                         top,
                         bot,
                         alpha=0.25,
                         color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()