def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5)
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()
    viz = "learning"  # Note: this overrides the parsed choice; remove it to respect the CLI flag.

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type.
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
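# The viz demos above (and the older one further down) rely on a parse_args()
# helper that is not shown. Below is a minimal sketch, assuming an
# argparse-based "-v" flag; the real script's option names may differ.
import argparse

def parse_args():
    # Hypothetical helper: read the visualization type from a -v flag.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", type=str, default="value",
                        help="One of {value, policy, agent, learning, interactive}.")
    return parser.parse_args().v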
def main(open_plot=True):
    # Setup MDP, Agents.
    # mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], gamma=0.95, walls=[(2, 2)])
    mdp = FourRoomMDP(width=11, height=11, init_loc=(1, 1), goal_locs=[(9, 3)], is_goal_terminal=True, slip_prob=0.2)
    # mdp = ComboLockMDP(combo=[3, 1, 2], num_actions=3, num_states=3)

    dq_agent = DoubleQAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=1, steps=10000, open_plot=open_plot)
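# A typical entry point for the experiment above. The trailing "no_plot"
# convention is an assumption modeled on other simple_rl example scripts.
import sys

if __name__ == "__main__":
    # Suppress the plot window when invoked with a trailing "no_plot" argument.
    main(open_plot=not sys.argv[-1] == "no_plot")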
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())
    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
def main():
    # Make MDP.
    grid_dim = 11
    mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), slip_prob=0.05, goal_locs=[(grid_dim, grid_dim)], gamma=0.99)

    # Experiment Type.
    exp_type = "learn_w_abstr"

    # For comparing policies and visualizing.
    beta = 1
    is_deterministic_ib = True
    is_agent_in_control = True

    # For main plotting experiment.
    beta_range = list(chart_utils.drange(0.0, 4.0, 1.0))
    instances = 1

    # Get demo policy.
    vi = ValueIteration(mdp)
    _, val = vi.run_vi()
    # Epsilon-greedy policy.
    demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.1))

    if exp_type == "plot_info_sa_val_and_num_states":
        # Makes the main two plots.
        make_info_sa_val_and_size_plots(mdp, demo_policy, beta_range, instances=instances, is_agent_in_control=is_agent_in_control)
    elif exp_type == "compare_policies":
        # Makes a plot comparing the value of the pi-phi combo from info_sa with \pi_d.
        info_sa_compare_policies(mdp, demo_policy, beta=beta, is_deterministic_ib=is_deterministic_ib, is_agent_in_control=is_agent_in_control)
    elif exp_type == "visualize_info_sa_abstr":
        # Visualize the state abstraction found by info_sa.
        info_sa_visualize_abstr(mdp, demo_policy, beta=beta, is_deterministic_ib=is_deterministic_ib, is_agent_in_control=is_agent_in_control)
    elif exp_type == "learn_w_abstr":
        # Run learning experiments for different settings of \beta.
        learn_w_abstr(mdp, demo_policy, is_deterministic_ib=is_deterministic_ib)
    elif exp_type == "planning":
        info_sa_planning_experiment()
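# The demo policy above softens value iteration's deterministic policy with
# epsilon-greedy action selection. make_det_policy_eps_greedy is not shown;
# this is a hypothetical sketch that returns a state -> action-pmf dictionary,
# which get_lambda_policy is then assumed to wrap into a callable policy.
def make_det_policy_eps_greedy(det_policy, states, actions, epsilon=0.1):
    policy_pmf = {}
    for s in states:
        greedy_action = det_policy(s)
        # Probability epsilon / |A| on every action...
        policy_pmf[s] = {a: epsilon / len(actions) for a in actions}
        # ...plus the remaining 1 - epsilon on the greedy action.
        policy_pmf[s][greedy_action] += 1.0 - epsilon
    return policy_pmf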
def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int)
        max_grid_size (int)
        beta (float): Hyperparameter for InfoSA.

    Summary:
        Writes num iterations and time (seconds) for planning with and without abstractions.
    '''
    vanilla_file = "vi.csv"
    sa_file = "vi-$\\phi$.csv"
    file_prefix = os.path.join("results", "planning-four_room")
    clear_files(dir_name=file_prefix)

    for grid_dim in range(min_grid_size, max_grid_size + 1):
        # ======================
        # == Make Environment ==
        # ======================
        mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)], gamma=0.9)

        # Get demo policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.2))

        # =======================
        # == Make Abstractions ==
        # =======================
        pmf_s_phi, phi_pmf, abstr_policy = run_info_sa(mdp, demo_policy, iters=500, beta=beta, convergence_threshold=0.00001)
        lambda_abstr_policy = get_lambda_policy(abstr_policy)
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        sa_vi = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi, delta=0.0001, vi_sample_rate=25, amdp_sample_rate=25)

        # ==========
        # == Plan ==
        # ==========
        print("Running VIs.")
        # time.clock() was removed in Python 3.8; time.time() measures wall-clock time.
        start_time = time.time()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.time() - start_time, 2)

        mdp.reset()

        start_time = time.time()
        sa_iters, sa_abs_val = sa_vi.run_vi()
        sa_time = round(time.time() - start_time, 2)
        sa_val = evaluate_agent(FixedPolicyAgent(sa_vi.policy), mdp, instances=25)

        print("\n" + "*" * 20)
        print("Vanilla", "\n\t Iters:", vanilla_iters, "\n\t Value:", round(vanilla_val, 4), "\n\t Time:", vanilla_time)
        print()
        print("Phi:", "\n\t Iters:", sa_iters, "\n\t Value:", round(sa_val, 4), "\n\t Time:", sa_time)
        print("*" * 20 + "\n\n")

        write_datum(os.path.join(file_prefix, "iters", vanilla_file), vanilla_iters)
        write_datum(os.path.join(file_prefix, "iters", sa_file), sa_iters)
        write_datum(os.path.join(file_prefix, "times", vanilla_file), vanilla_time)
        write_datum(os.path.join(file_prefix, "times", sa_file), sa_time)
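# clear_files and write_datum are small I/O helpers not shown here. A sketch of
# what write_datum plausibly does, assuming each CSV accumulates one
# comma-separated datum per grid size.
import os

def write_datum(file_name, datum):
    # Hypothetical helper: append a single datum, creating directories as needed.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, "a") as f:
        f.write(str(datum) + ",")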
def branching_factor_experiment(min_options=0, max_options=20, increment=2, instances=5, epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)

    Summary:
        Runs an experiment contrasting learning performance for different numbers of options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size, height=grid_size, goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp, four_rooms_predicate_9x9, level=1, states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either # steps per episode or # episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print("steps, episodes", steps, episodes)

        # Evaluate.
        for instance in range(instances):
            print("\tInstance", instance + 1, "of", str(instances) + ".")

            # Make initial Options.
            for num_options in x_axis:
                options, _ = make_near_optimal_phi_relative_options(mdp, state_abstr, 'eps-greedy', num_rand_opts=num_options - 1, eps=epsilon)
                action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # Alternatives: DoubleQAgent, QLearningAgent, SarsaAgent.
                sa_aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=action_abstr, name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(value_per_episode[-1])

    # =================
    # == Other types ==
    # =================

    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
    sa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=None, name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(d_var_range[-1][0], d_var_range[-1][1])]["phi"].append(value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=action_abstr, name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(d_var_range[-1][0], d_var_range[-1][1])]["random"].append(value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent, mdp, episodes=episodes, steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]

    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(sum(num_options_performance[dependent_var][num_options])) / instances
            std_deviation = np.std(num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"  # " N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis), marker="+", linestyle="--", linewidth=1.0, color=PLOT_COLORS[-1], label=label)

    # Add random options.
    ep_val_del_q = y_axis["random"]
    label = "$O_{random}$"  # " N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis), marker="x", linestyle="--", linewidth=1.0, color=PLOT_COLORS[0])  # , label=label

    # Add optimal.
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis), linestyle="-", linewidth=1.0, color=PLOT_COLORS[1])  # , label="$\\pi^*$"

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(str(total_steps).count("0")) + "$"
        plt.plot(x_axis, y_axis[dependent_var], marker="x", color=PLOT_COLORS[i + 2], linewidth=1.5, label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis, top, bot, alpha=0.25, color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
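# The shaded bands above span 1.96 standard errors, i.e. 95% confidence
# intervals under a normal approximation. Running the sweep with its defaults:
if __name__ == "__main__":
    # Sweep 0 to 20 added options in steps of 2, averaging over 5 instances.
    branching_factor_experiment(min_options=0, max_options=20, increment=2, instances=5)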
def run_learning_experiment():
    """
    Summary:
        Builds different sets of options and contrasts how RL algorithms perform when learning with them.
    """
    # Define MDP.
    width, height = 11, 11
    mdp = FourRoomMDP(width=width, height=height, goal_locs=[(width, height)], slip_prob=0.05)
    actions = mdp.get_actions()

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    if isinstance(mdp, FourRoomMDP):
        predicate = four_rooms_predicate_11x11
    else:
        predicate = reachable_in_n_steps_predicate
    state_abstr = core.compute_phi_given_m(mdp, predicate, level=1, states=states)

    # Make initial Options.
    num_rand_opts_to_add = 2
    options, _ = make_near_optimal_phi_relative_options(mdp, state_abstr, 'eps-greedy', num_rand_opts=num_rand_opts_to_add, eps=0.05)
    action_abstr = ActionAbstraction(options=options, prim_actions=actions)
    action_abstr_w_prims = ActionAbstraction(options=options, prim_actions=actions, incl_primitives=True)

    # Find eigen options.
    # num_eigen_options = max(1, num_rand_opts_to_add - 1)
    # eigen_options_init_all = find_eigenoptions(mdp, num_options=num_eigen_options, init_everywhere=True)
    # eigen_options_w_prims = find_eigenoptions(mdp, num_options=num_eigen_options, init_everywhere=False)
    # eigen_aa_init_all = ActionAbstraction(options=eigen_options_init_all, prim_actions=actions, incl_primitives=False)
    # eigen_aa_w_prims = ActionAbstraction(options=eigen_options_w_prims, prim_actions=actions, incl_primitives=True)

    # Make agents.
    AgentClass = QLearningAgent  # Alternatives: DoubleQAgent, DelayedQAgent.
    ql_agent = AgentClass(mdp.get_actions())
    sa_aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=state_abstr, action_abstr=action_abstr_w_prims, name_ext="-$\\phi,O$")
    aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=action_abstr_w_prims, name_ext="-$O$")
    # aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=action_abstr_w_prims, name_ext="-$\\phi$")

    # Eigen agents.
    # eigen_agent_init_all = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=eigen_aa_init_all, name_ext="-eigen_all")
    # eigen_agent_w_prims = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=eigen_aa_w_prims, name_ext="-eigen_w_prims")

    agents = [ql_agent, aa_agent, sa_aa_agent]  # + [eigen_agent_init_all, eigen_agent_w_prims]

    # Run.
    if isinstance(mdp, FourRoomMDP):
        run_agents_on_mdp(agents, mdp, instances=10, episodes=500, steps=50)
    else:
        run_agents_on_mdp(agents, mdp, instances=10, episodes=100, steps=10)
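# Entry point for the learning experiment above.
if __name__ == "__main__":
    run_learning_experiment()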