def get_optimal_policies(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    # Make State Abstraction.
    approx_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.05)

    # True Optimal.
    true_opt_vi = ValueIteration(environment)
    true_opt_vi.run_vi()
    opt_agent = FixedPolicyAgent(true_opt_vi.policy, "$\pi^*$")

    # Optimal Abstraction.
    opt_det_vi = AbstractValueIteration(environment, state_abstr=approx_qds_test, sample_rate=30)
    opt_det_vi.run_vi()
    opt_det_agent = FixedPolicyAgent(opt_det_vi.policy, name="$\pi_{\phi}^*$")

    stoch_policy_obj = StochasticSAPolicy(approx_qds_test, environment)
    stoch_agent = FixedPolicyAgent(stoch_policy_obj.policy, "$\pi(a \mid s_\phi )$")

    ql_agents = [opt_agent, stoch_agent, opt_det_agent]

    return ql_agents
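# A minimal, hypothetical usage sketch for get_optimal_policies, reusing make_mdp_distr and
# run_agents_multi_task as the multitask snippets below do. The mdp_class string, gamma, and the
# sample/step counts are placeholder values, not settings taken from the original experiments.
def _example_get_optimal_policies():
    mdp_distr = make_mdp_distr(mdp_class="four_room", is_goal_terminal=False)
    mdp_distr.set_gamma(0.99)
    agents = get_optimal_policies(mdp_distr)
    run_agents_multi_task(agents, mdp_distr, task_samples=10, episodes=1, steps=100)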
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions, gamma, panel_step = solar_mdp.get_actions(), solar_mdp.get_gamma(), solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker, panel_step=panel_step, dual_axis=True)
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(), name="grena-tracker")

    # Setup RL agents.
    alpha, epsilon = 0.3, 0.3
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(actions, name="lin-ucb", alpha=0.3)  #, alpha=0.2)
    ql_lin_approx_agent_g0 = LinearQLearnerAgent(actions, num_features=num_features, name="ql-lin-g0", alpha=alpha, epsilon=epsilon, gamma=0, rbf=True, anneal=True)
    ql_lin_approx_agent = LinearQLearnerAgent(actions, num_features=num_features, name="ql-lin", alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=True, anneal=True)
    # sarsa_lin_rbf_agent = LinearApproxSarsaAgent(actions, name="sarsa-lin", alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=True, anneal=False)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    agents = [ql_lin_approx_agent, lin_ucb_agent, grena_tracker_agent, static_agent]

    return agents
def main():
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid
    mdp_class, is_goal_terminal, samples = parse_args()
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print "done."  #, iters, value
    sys.stdout.flush()

    # Agents.
    print "Making agents...",
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy, name="$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print "done."

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100, reset_at_terminal=False, track_disc_reward=False, cumulative_plot=True)
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions, gamma, panel_step = solar_mdp.get_actions(), solar_mdp.get_gamma(), solar_mdp.get_panel_step()

    # Setup fixed agent.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single axis and double axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker, panel_step=panel_step, dual_axis=solar_mdp.dual_axis, actions=solar_mdp.get_bandit_actions())
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(), name="grena-tracker")

    # Setup RL agents.
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(), context_size=num_features, name="lin-ucb", rand_init=rand_init, alpha=2.0)
    # sarsa_agent_g0 = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin-g0", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=0, rbf=False, anneal=True)
    # sarsa_agent = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=False, anneal=True)
    ql_agent = QLearningAgent(actions, alpha=alpha, epsilon=epsilon, gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    # agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0, grena_tracker_agent, static_agent]
    agents = [grena_tracker_agent, static_agent]

    return agents
def main(open_plot=True):
    # Setup MDP, Agents.
    markov_game = GatheringMDP()
    ql_agent = QLearnerAgent(actions=markov_game.get_actions())
    fixed_action = random.choice(markov_game.get_actions())
    fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

    # Run experiment and make plot.
    play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40, open_plot=open_plot)
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for each beta in @beta_list and compares Q-Learning with and without the abstraction.
    '''
    # Run info_sa.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001, is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)  # Ground state to abstract state.
        dict_of_phi_pmfs[beta] = crisp_s_phi

        print("crisp_s_phi:")
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above:" + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))

        print("ground states:")
        for ground_state in crisp_s_phi.get_ground_states():
            print(str(type(ground_state)))

        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        beta_phi = dict_of_phi_pmfs[beta]
        ql_abstr_agent = AbstractionWrapper(QLearningAgent, state_abstr=dict_of_phi_pmfs[beta], agent_params={"actions": mdp.get_actions(), "anneal": True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print num abstract states.
    for beta in dict_of_phi_pmfs.keys():
        print "beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_ground_states()
    print
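# A minimal, hypothetical usage sketch for learn_w_abstr: the demo policy comes from value iteration on
# the target MDP, mirroring how demonstrators are built elsewhere in these snippets. The FourRoomMDP
# arguments follow the pattern used in info_sa_planning_experiment below; the beta values are placeholders.
def _example_learn_w_abstr():
    mdp = FourRoomMDP(width=9, height=9, init_loc=(1, 1), goal_locs=[(9, 9)], gamma=0.9)
    vi = ValueIteration(mdp)
    vi.run_vi()
    learn_w_abstr(mdp, vi.policy, beta_list=[10, 20], is_deterministic_ib=True)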
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ====================================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ====================================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": test_mdp.get_actions()}, state_abstr=nn_sa, name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent], test_mdp, instances=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=False)
def generate_agent(mdp_class, data_loc, mdp_parameters, visualize=False):
    try:
        with open('models/' + data_loc + '/vi_agent.pickle', 'rb') as f:
            mdp_agent, vi_agent = pickle.load(f)
    except:
        mdp_agent = make_mdp.make_custom_mdp(mdp_class, mdp_parameters)
        vi_agent = ValueIteration(mdp_agent, sample_rate=1)
        vi_agent.run_vi()

        with open('models/' + data_loc + '/vi_agent.pickle', 'wb') as f:
            pickle.dump((mdp_agent, vi_agent), f)

    # Visualize agent.
    if visualize:
        fixed_agent = FixedPolicyAgent(vi_agent.policy)
        mdp_agent.visualize_agent(fixed_agent)
        mdp_agent.reset()  # Reset the current state to the initial state.
        mdp_agent.visualize_interaction()
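# A minimal, hypothetical usage sketch for generate_agent. It caches the (MDP, ValueIteration) pair under
# models/<data_loc>/vi_agent.pickle, so that directory should exist before the first call. The mdp_class
# string and parameter dict are illustrative placeholders, not values from the original experiments.
def _example_generate_agent():
    import os  # Assumed available; only needed to pre-create the cache directory.
    data_loc = 'demo_env'
    if not os.path.exists(os.path.join('models', data_loc)):
        os.makedirs(os.path.join('models', data_loc))
    generate_agent('grid_world', data_loc, {'gamma': 0.95}, visualize=False)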
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy": lambda_abstr_policy, "name": "$\\pi_\\phi$"}, name_ext="")

    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)

    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]

    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print "\nState Spaces Sizes:"
    print "\t|S| =", demo_vi.get_num_states()
    print "\tH(S_\\phi) =", entropy(pmf_s_phi)
    print "\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states()
    print "\tdelta_min =", min(non_zero_abstr_states)
    print "\tnum non zero states =", len(non_zero_abstr_states)
    print
def get_all_fixed_policy_agents(mdp):
    '''
    Args:
        mdp (MDP)

    Returns:
        (list of Agent)
    '''
    states = mdp.get_states()
    actions = mdp.get_actions()
    all_policies = make_all_fixed_policies(states, actions)
    fixed_agents = []

    for i, p in enumerate(all_policies):
        policy = make_policy_from_action_str(p, actions, states)
        next_agent = FixedPolicyAgent(policy, name="rand-fixed-policy-" + str(i))
        fixed_agents.append(next_agent)

    return fixed_agents
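# A hypothetical sketch of scoring the exhaustive policy set returned above, using evaluate_agent as
# other experiments in this file do. Enumerating all fixed policies is |A|^|S|, so this is only feasible
# for very small MDPs; the instance count is a placeholder.
def _example_best_fixed_policy(mdp):
    best_val, best_agent = float("-inf"), None
    for agent in get_all_fixed_policy_agents(mdp):
        val = evaluate_agent(agent, mdp, instances=10)
        if val > best_val:
            best_val, best_agent = val, agent
    return best_agent, best_val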
def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int)
        max_grid_size (int)
        beta (float): Hyperparameter for InfoSA.

    Summary:
        Writes num iterations and time (seconds) for planning with and without abstractions.
    '''
    vanilla_file = "vi.csv"
    sa_file = "vi-$\\phi$.csv"
    file_prefix = os.path.join("results", "planning-four_room")
    clear_files(dir_name=file_prefix)

    for grid_dim in xrange(min_grid_size, max_grid_size + 1):
        # ======================
        # == Make Environment ==
        # ======================
        mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)], gamma=0.9)

        # Get demo policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.2))

        # =======================
        # == Make Abstractions ==
        # =======================
        pmf_s_phi, phi_pmf, abstr_policy = run_info_sa(mdp, demo_policy, iters=500, beta=beta, convergence_threshold=0.00001)
        lambda_abstr_policy = get_lambda_policy(abstr_policy)
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        sa_vi = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi, delta=0.0001, vi_sample_rate=25, amdp_sample_rate=25)

        # ==========
        # == Plan ==
        # ==========
        print "Running VIs."
        start_time = time.clock()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.clock() - start_time, 2)

        mdp.reset()

        start_time = time.clock()
        sa_iters, sa_abs_val = sa_vi.run_vi()
        sa_time = round(time.clock() - start_time, 2)
        sa_val = evaluate_agent(FixedPolicyAgent(sa_vi.policy), mdp, instances=25)

        print "\n" + "*" * 20
        print "Vanilla", "\n\t Iters:", vanilla_iters, "\n\t Value:", round(vanilla_val, 4), "\n\t Time:", vanilla_time
        print
        print "Phi:", "\n\t Iters:", sa_iters, "\n\t Value:", round(sa_val, 4), "\n\t Time:", sa_time
        print "*" * 20 + "\n\n"

        write_datum(os.path.join(file_prefix, "iters", vanilla_file), vanilla_iters)
        write_datum(os.path.join(file_prefix, "iters", sa_file), sa_iters)
        write_datum(os.path.join(file_prefix, "times", vanilla_file), vanilla_time)
        write_datum(os.path.join(file_prefix, "times", sa_file), sa_time)
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=False)
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # Make a NN for each sampling param.
    agents = {}
    sess = tf.Session()
    sampling_params = [0.0, 0.5, 1.0]
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="demo")
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": test_mdp.get_actions(), "name": "$D \\sim \\rho_E^\\epsilon, \\epsilon=" + str(epsilon) + "$"}, state_abstr=nn_sa, name_ext="")
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent, agent_params={"actions": test_mdp.get_actions(), "name": "$D \\sim U(S)$"}, state_abstr=nn_sa_rand, name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(), test_mdp, instances=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=False)

    sess.close()
def branching_factor_experiment(min_options=0, max_options=20, increment=2, instances=5, epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)

    Summary:
        Runs an experiment contrasting learning performance for different # options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size, height=grid_size, goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp, four_rooms_predicate_9x9, level=1, states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print "steps, episodes", steps, episodes

        # Evaluate.
        for i, instance in enumerate(range(instances)):
            print "\tInstance", instance + 1, "of", str(instances) + "."

            # Make initial Options.
            for num_options in x_axis:
                options, _ = make_near_optimal_phi_relative_options(mdp, state_abstr, 'eps-greedy', num_rand_opts=num_options - 1, eps=epsilon)
                action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent
                sa_aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=action_abstr, name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(value_per_episode[-1])

    ############
    # Other types

    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
    sa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=None, name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(d_var_range[-1][0], d_var_range[-1][1])]["phi"].append(value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=action_abstr, name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(d_var_range[-1][0], d_var_range[-1][1])]["random"].append(value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent, mdp, episodes=episodes, steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]

    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(sum(num_options_performance[dependent_var][num_options])) / instances
            std_deviation = np.std(num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis), marker="+", linestyle="--", linewidth=1.0, color=PLOT_COLORS[-1], label=label)

    # Add random options.
    ep_val_del_q = y_axis["random"]
    label = "$O_{random}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis), marker="x", linestyle="--", linewidth=1.0, color=PLOT_COLORS[0])  #, label=label)

    # Add optimal.
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis), linestyle="-", linewidth=1.0, color=PLOT_COLORS[1])  #, label="$\\pi^*$")

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(str(total_steps).count("0")) + "$"
        plt.plot(x_axis, y_axis[dependent_var], marker="x", color=PLOT_COLORS[i + 2], linewidth=1.5, label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis, top, bot, alpha=0.25, color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
def num_training_data_experiment():
    '''
    Summary:
        Runs an experiment that compares the performance of different Agent-SA combinations, where each SA is trained with a different number of training samples.
    '''
    # Params.
    instances = 10
    init, increment, maximum = 1, 500, 5001
    training_samples = range(init, maximum, increment)

    # Run experiment.
    if not os.path.exists(os.path.join("results", "puddle_per_sample")):
        os.makedirs(os.path.join("results", "puddle_per_sample"))
    data_dir = os.path.join("results", "puddle_per_sample")

    with open(os.path.join(data_dir, "results.csv"), "w+") as results_file:
        # Repeat the experiment @instances times.
        for i in range(instances):
            print "\nInstance", i + 1, "of", str(instances)

            for sample_num in training_samples:
                print "\tSamples:", sample_num

                # Make State Abstraction.
                params = get_params(default_params={"num_samples_from_demonstrator": sample_num})
                mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
                expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
                demo_agent = FixedPolicyAgent(expert_puddle_policy)
                tf.reset_default_graph()
                sess = tf.Session()
                abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
                nn_sa = NNStateAbstr(abstraction_net)

                # Test performance with the given param.
                sa_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": test_mdp.get_actions()}, state_abstr=nn_sa, name_ext="$-\\phi$")
                val = evaluate_agent(sa_agent, test_mdp, steps=params['steps'], episodes=params['episodes'])
                results_file.write(str(val) + ",")
                results_file.flush()
                sess.close()

            results_file.write("\n")

    cu.EVERY_OTHER_X = True
    cu.CUSTOM_TITLE = "Effect of $|D_{train, \\phi}|$ on RL Performance"
    cu.X_AXIS_LABEL = "$|D_{train, \\phi}|$"
    cu.Y_AXIS_LABEL = "Avg. Reward in Last Episode"
    cu.X_AXIS_START_VAL = init
    cu.X_AXIS_INCREMENT = increment
    cu.COLOR_SHIFT = 3
    cu.format_and_make_plot(data_dir=data_dir, avg_plot=True, add_legend=False)
def extract_constraints(wt_vi_traj_candidates, weights, step_cost_flag, BEC_depth=1, trajectories=None, print_flag=False):
    '''
    :param wt_vi_traj_candidates: Nested list of [weight, value iteration object, trajectory]
    :param weights (numpy array): Ground truth reward weights used by agent to derive its optimal policy
    :param step_cost_flag (bool): Indicates that the last weight element is a known step cost
    :param BEC_depth (int): Number of suboptimal actions to take before following the optimal policy to obtain the
                            suboptimal trajectory (and the corresponding suboptimal expected feature counts)
    :return: min_subset_constraints: List of constraints

    Summary: Obtain the minimum BEC constraints for each environment.
    '''
    min_subset_constraints_record = []  # minimum BEC constraints conveyed by a trajectory
    env_record = []
    policy_constraints = []             # BEC constraints that define a policy (i.e. constraints arising from one action
                                        # deviations from every possible starting state and the corresponding optimal trajectories)
    traj_record = []
    processed_envs = []

    # go through each environment and corresponding optimal trajectory, and extract the behavior equivalence class (BEC) constraints
    for env_idx, wt_vi_traj_candidate in enumerate(wt_vi_traj_candidates):
        if print_flag:
            print("Extracting constraints from environment {}".format(env_idx))
        mdp = wt_vi_traj_candidate[0][1].mdp
        agent = FixedPolicyAgent(wt_vi_traj_candidate[0][1].policy)

        if trajectories is not None:
            constraints = []
            # a) demonstration-driven BEC
            # BEC constraints are obtained by ensuring that the optimal actions accumulate at least as much reward as
            # all other possible actions along a trajectory
            action_seq_list = list(itertools.product(mdp.actions, repeat=BEC_depth))

            traj_opt = trajectories[env_idx]
            for sas_idx in range(len(traj_opt)):
                # reward features of optimal action
                mu_sa = mdp.accumulate_reward_features(traj_opt[sas_idx:], discount=True)

                sas = traj_opt[sas_idx]
                cur_state = sas[0]

                # currently assumes that all actions are executable from all states
                for action_seq in action_seq_list:
                    traj_hyp = mdp_helpers.rollout_policy(mdp, agent, cur_state, action_seq)
                    mu_sb = mdp.accumulate_reward_features(traj_hyp, discount=True)

                    constraints.append(mu_sa - mu_sb)

            # store the BEC constraints for each environment, along with the associated demo and environment number
            min_subset_constraints = BEC_helpers.clean_up_constraints(constraints, weights, step_cost_flag)
            min_subset_constraints_record.append(min_subset_constraints)
            traj_record.append(traj_opt)
            env_record.append(env_idx)
            # slightly abusing the term 'policy' here since I'm only considering a subset of possible trajectories (i.e.
            # demos) that the policy can generate in these environments
            policy_constraints.append(min_subset_constraints)
        else:
            # b) policy-driven BEC
            # wt_vi_traj_candidates can contain MDPs with the same environment but different initial states (to
            # accommodate demo BEC). by considering all reachable states of two identical MDPs with different initial
            # states, you will obtain duplicate test environments so only go through each MDP once for policy BEC.
            if mdp.env_code not in processed_envs:
                agent = FixedPolicyAgent(wt_vi_traj_candidate[0][1].policy)

                for state in mdp.states:
                    constraints = []

                    traj_opt = mdp_helpers.rollout_policy(mdp, agent, cur_state=state)

                    for sas_idx in range(len(traj_opt)):
                        # reward features of optimal action
                        mu_sa = mdp.accumulate_reward_features(traj_opt[sas_idx:], discount=True)

                        sas = traj_opt[sas_idx]
                        cur_state = sas[0]

                        # currently assumes that all actions are executable from all states. only considering
                        # action depth of 1 currently
                        for action in mdp.actions:
                            if action != sas[1]:
                                traj_hyp = mdp_helpers.rollout_policy(mdp, agent, cur_state=cur_state, action_seq=[action])
                                mu_sb = mdp.accumulate_reward_features(traj_hyp, discount=True)

                                constraints.append(mu_sa - mu_sb)

                        # if considering only suboptimal actions of the first sas, put the corresponding constraints
                        # toward the BEC of the policy (per definition)
                        if sas_idx == 0:
                            policy_constraints.append(BEC_helpers.clean_up_constraints(constraints, weights, step_cost_flag))

                    # also store the BEC constraints for the optimal trajectory in each state, along with the associated
                    # demo and environment number
                    min_subset_constraints_record.append(BEC_helpers.clean_up_constraints(constraints, weights, step_cost_flag))
                    traj_record.append(traj_opt)
                    env_record.append(env_idx)

                processed_envs.append(mdp.env_code)

    return policy_constraints, min_subset_constraints_record, env_record, traj_record
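# A hypothetical call sketch for extract_constraints. The indexing wt_vi_traj_candidate[0][1] above suggests
# each element of wt_vi_traj_candidates is itself a list whose entries look like
# [weight, ValueIteration_object, trajectory]; the helper below only shows a plausible call under that
# assumed shape, with true_weights standing in for the ground-truth reward weight vector.
def _example_extract_constraints(wt_vi_traj_candidates, true_weights):
    policy_constraints, min_constraints, env_record, traj_record = extract_constraints(
        wt_vi_traj_candidates, true_weights, step_cost_flag=True, BEC_depth=1, print_flag=True)
    return min_constraints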
#!/usr/bin/env python

# Python imports.
import random

# Other imports.
import srl_example_setup
from simple_rl.agents import QLearnerAgent, FixedPolicyAgent
from simple_rl.tasks import RockPaperScissorsMDP
from simple_rl.run_experiments import play_markov_game

# Setup MDP, Agents.
markov_game = RockPaperScissorsMDP()
ql_agent = QLearnerAgent(actions=markov_game.get_actions())
fixed_action = random.choice(markov_game.get_actions())
fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

# Run experiment and make plot.
play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40)
def main(eps=0.1, open_plot=True):
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu
    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions, epsilon=eps, default_q=qmax, name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq, transfer_ql_agent_avgq]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)

        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-trans-max")

        agents = [pure_delayed_ql_agent, updating_delayed_ql_agent, trans_delayed_ql_agent]
    else:
        print "Unknown type of agents:", alg
        print "(q, rmax, delayed-q)"
        assert False

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100, reset_at_terminal=is_goal_terminal, is_rec_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
def run_agents_multi_task(agents, mdp_distr, task_samples=5, episodes=1, steps=100, clear_old_results=True, open_plot=True, verbose=False, is_rec_disc_reward=False, reset_at_terminal=False, include_optimal=False):
    '''
    Args:
        agents (list)
        mdp_distr (MDPDistribution)
        task_samples
        episodes
        steps

    Summary:
        Runs each agent on the MDP distribution according to the given parameters.
        If @mdp_distr has a non-zero horizon, then gamma is set to 1 and #steps is ignored.
    '''
    # Set number of steps if the horizon is given.
    if mdp_distr.get_horizon() > 0:
        mdp_distr.set_gamma(1.0)
        steps = mdp_distr.get_horizon()

    # Experiment (for reproducibility, plotting).
    exp_params = {"task_samples": task_samples, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents, mdp=mdp_distr, params=exp_params, is_episodic=episodes > 1, is_multi_task=True, clear_old_results=clear_old_results, is_rec_disc_reward=is_rec_disc_reward)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    start = time.clock()
    times = defaultdict(float)

    if include_optimal:
        fixed_policy_agent = FixedPolicyAgent(policy=lambda s: "", name="optimal")
        agents += [fixed_policy_agent]

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # --- SAMPLE NEW MDP ---
        for new_task in xrange(task_samples):
            print "  Sample " + str(new_task + 1) + " of " + str(task_samples) + "."

            # Sample the MDP.
            mdp = mdp_distr.sample()

            if include_optimal and agent.name == "optimal":
                vi = ValueIteration(mdp)
                vi.run_vi()
                agent.set_policy(vi.policy)

            # Run the agent.
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, is_rec_disc_reward, reset_at_terminal)

            # Reset the agent.
            agent.reset()
            if "rmax" in agent.name:
                agent._reset_reward()

        # Track how much time this agent took.
        end = time.clock()
        times[agent] = round(end - start, 3)

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in times.keys():
        print str(agent) + " agent took " + str(round(times[agent], 2)) + " seconds."
    print "-------------\n"

    experiment.make_plots(open_plot=open_plot)
def make_info_sa_val_and_size_plots(mdp, demo_policy_lambda, beta_range, results_dir="info_sa_results", instances=3, include_stoch=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta_range (list)
        results_dir (str)
        instances (int)
        include_stoch (bool): If True, also runs IB.
        is_agent_in_control (bool): If True, runs the agent_in_control.py variant of DIB-SA.

    Summary:
        Main plotting function for info_sa experiments.
    '''
    # Clear old results.
    all_policies = ["demo_val", "dibs_val", "dibs_states", "etad_states"]
    if include_stoch:
        all_policies += ["ib_val", "ib_states"]
    for policy in all_policies:
        if os.path.exists(os.path.join(results_dir, str(policy)) + ".csv"):
            os.remove(os.path.join(results_dir, str(policy)) + ".csv")

    # Set relevant params.
    param_dict = {"mdp": mdp, "iters": 500, "convergence_threshold": 0.0001, "demo_policy_lambda": demo_policy_lambda, "is_agent_in_control": is_agent_in_control}

    # Record value of demo policy and size of ground state space.
    demo_agent = FixedPolicyAgent(demo_policy_lambda)
    demo_val = evaluate_agent(demo_agent, mdp, instances=100)
    vi = ValueIteration(mdp)
    num_ground_states = vi.get_num_states()
    for beta in beta_range:
        write_datum_to_file(file_name="demo_val", datum=demo_val, extra_dir=results_dir)
        write_datum_to_file(file_name="ground_states", datum=num_ground_states, extra_dir=results_dir)

    # Run core algorithm for DIB and IB.
    for instance in range(instances):
        print "\nInstance", instance + 1, "of", str(instances) + "."
        random.jumpahead(1)

        # For each beta.
        for beta in beta_range:

            # Run DIB.
            dibs_val, dibs_states = _info_sa_val_and_size_plot_wrapper(beta=beta, param_dict=dict(param_dict.items() + {"is_deterministic_ib": True, "use_crisp_policy": False}.items()))
            write_datum_to_file(file_name="dibs_val", datum=dibs_val, extra_dir=results_dir)
            write_datum_to_file(file_name="dibs_states", datum=dibs_states, extra_dir=results_dir)

            if include_stoch:
                ib_val, ib_states = _info_sa_val_and_size_plot_wrapper(beta=beta, param_dict=dict(param_dict.items() + {"is_deterministic_ib": False, "use_crisp_policy": False}.items()))
                write_datum_to_file(file_name="ib_val", datum=ib_val, extra_dir=results_dir)
                write_datum_to_file(file_name="ib_states", datum=ib_states, extra_dir=results_dir)

        # End instances.
        end_of_instance("dibs_val", extra_dir=results_dir)
        end_of_instance("dibs_states", extra_dir=results_dir)
        if include_stoch:
            end_of_instance("ib_val", extra_dir=results_dir)
            end_of_instance("ib_states", extra_dir=results_dir)

    beta_range_file = file(os.path.join(results_dir, "beta_range.csv"), "w")
    for beta in beta_range:
        beta_range_file.write(str(beta))
        beta_range_file.write(",")
    beta_range_file.close()

    make_beta_val_plot([p for p in all_policies if "val" in p], results_dir, is_agent_in_control=is_agent_in_control)
def run_agents_on_mdp(agents, mdp, instances=5, episodes=100, steps=200, clear_old_results=True, rew_step_count=1, is_rec_disc_reward=False, open_plot=True, verbose=False, reset_at_terminal=False, include_optimal=False):
    '''
    Args:
        agents (list of Agents): See agents/AgentClass.py (and friends).
        mdp (MDP): See mdp/MDPClass.py for the abstract class. Specific MDPs in tasks/*.
        instances (int) [opt]: Number of times to run each agent (for confidence intervals).
        episodes (int) [opt]: Number of episodes for each learning instance.
        steps (int) [opt]: Number of steps per episode.
        clear_old_results (bool) [opt]: If true, removes all results files in the relevant results dir.
        rew_step_count (int): Number of steps before recording reward.
        is_rec_disc_reward (bool): If true, track (and plot) discounted reward.
        open_plot (bool): If true opens the plot at the end.
        verbose (bool): If true, prints status bars per episode/instance.
        reset_at_terminal (bool): If true sends the agent to the start state after terminal.
        include_optimal (bool): If true also plots optimal behavior.

    Summary:
        Runs each agent on the given mdp according to the given parameters.
        Stores results in results/<agent_name>.csv and automatically generates a plot and opens it.
    '''
    # Experiment (for reproducibility, plotting).
    exp_params = {"instances": instances, "episodes": episodes, "steps": steps}
    experiment = Experiment(agents=agents, mdp=mdp, params=exp_params, is_episodic=episodes > 1, clear_old_results=clear_old_results, is_rec_disc_reward=is_rec_disc_reward, count_r_per_n_timestep=rew_step_count)

    # Record how long each agent spends learning.
    print "Running experiment: \n" + str(experiment)
    time_dict = defaultdict(float)

    if include_optimal:
        vi = ValueIteration(mdp)
        vi.run_vi()
        fixed_policy_agent = FixedPolicyAgent(vi.policy, name="optimal")
        agents += [fixed_policy_agent]

    # Learn.
    for agent in agents:
        print str(agent) + " is learning."
        start = time.clock()

        # For each instance.
        for instance in xrange(1, instances + 1):
            print "  Instance " + str(instance) + " of " + str(instances) + "."
            sys.stdout.flush()
            run_single_agent_on_mdp(agent, mdp, episodes, steps, experiment, verbose, is_rec_disc_reward, reset_at_terminal=reset_at_terminal)

            # Reset the agent.
            agent.reset()

        # Track how much time this agent took.
        end = time.clock()
        time_dict[agent] = round(end - start, 3)
        print

    # Time stuff.
    print "\n--- TIMES ---"
    for agent in time_dict.keys():
        print str(agent) + " agent took " + str(round(time_dict[agent], 2)) + " seconds."
    print "-------------\n"

    # if not isinstance(mdp, GymMDP):
    experiment.make_plots(open_plot=open_plot)
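# A minimal, hypothetical usage sketch for run_agents_on_mdp with include_optimal=True, assuming
# simple_rl's GridWorldMDP, QLearningAgent, and RandomAgent are importable as in the other snippets here;
# the grid size and run parameters are placeholders.
def _example_run_agents_on_mdp():
    mdp = GridWorldMDP(width=5, height=5, init_loc=(1, 1), goal_locs=[(5, 5)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=3, episodes=30, steps=50, include_optimal=True)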
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0)
    approx_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": actions}, state_abstr=exact_qds_test, name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": actions}, state_abstr=approx_qds_test, name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent, agent_params={"actions": actions}, state_abstr=exact_qds_test, name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent, agent_params={"actions": actions}, state_abstr=approx_qds_test, name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent, agent_params={"actions": actions}, state_abstr=exact_qds_test, name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent, agent_params={"actions": actions}, state_abstr=approx_qds_test, name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name="$\pi^*$")

        sa_vi = AbstractValueIteration(environment, sample_rate=50, max_iterations=3000, delta=0.0001, state_abstr=approx_qds_test, action_abstr=ActionAbstraction(options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name="$\pi_\phi^*$")

        dql_agents += [opt_agent, approx_opt_agent]

    return ql_agents
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi (CartPole variant).
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]
    test_mdp = CartPoleMDP()
    # agents = {"demo": demo_agent}
    agents = {}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print "epsilon", epsilon
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": env.get_actions(), "name": "$QL_\\phi-\\epsilon=" + str(epsilon) + "$"}, state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent, agent_params={"actions": env.get_actions(), "name": "$D \\sim U(S)$"}, state_abstr=nn_sa_rand, name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(), test_mdp, instances=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=False)

    sess.close()