def main():
    # Set Params.
    mdp_class, task_samples, episodes, steps, grid_dim, AgentClass = get_params(set_manually=False)
    experiment_type = "sa"
    lifelong = True
    resample_at_terminal = False
    reset_at_terminal = False
    gamma = 0.95

    # ======================
    # == Make Environment ==
    # ======================
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim) if lifelong \
        else make_mdp.make_mdp(mdp_class=mdp_class, grid_dim=grid_dim)
    environment.set_gamma(gamma)

    # =================
    # == Make Agents ==
    # =================
    agents = []
    if experiment_type == "sa":
        # SA experiment.
        agents = get_sa_experiment_agents(environment, AgentClass)
    elif experiment_type == "combo":
        # AA experiment.
        agents = get_combo_experiment_agents(environment)
    elif experiment_type == "exact_v_approx":
        agents = get_exact_vs_approx_agents(environment, incl_opt=(not lifelong))
    elif experiment_type == "opt":
        agents = get_optimal_policies(environment)
    else:
        print("Experiment Error: experiment type unknown (" + experiment_type + "). Must be one of {sa, combo, exact_v_approx, opt}.")
        quit()

    # Run!
    if lifelong:
        run_agents_lifelong(agents, environment, samples=task_samples, steps=steps, episodes=episodes,
                            reset_at_terminal=reset_at_terminal, resample_at_terminal=resample_at_terminal,
                            cumulative_plot=True, clear_old_results=True)
    else:
        run_agents_on_mdp(agents, environment, instances=task_samples, steps=steps, episodes=episodes,
                          reset_at_terminal=reset_at_terminal, track_disc_reward=False)
def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent], mdp_distr, samples=10, episodes=50, steps=100,
                        reset_at_terminal=True, open_plot=open_plot)
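# A minimal sketch of how an example script like main() above is typically invoked.
# The "no_plot" command-line convention below is an assumption, not part of the excerpt.
import sys

if __name__ == "__main__":
    # Hypothetical flag: pass "no_plot" as the last argument to suppress plotting.
    main(open_plot=(len(sys.argv) < 2 or sys.argv[-1] != "no_plot"))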
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent,
                                      agent_params={"actions": mdp_distr.get_actions()},
                                      action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent], mdp_distr,
                        samples=10, episodes=100, steps=150, open_plot=open_plot)
def main():
    # Make MDP Distribution.
    mdp_distr = make_mdp_distr(mdp_class="four_room", grid_dim=11, slip_prob=0.05, gamma=0.99)

    # Make SAs for a range of beta values.
    multitask_sa_beta_1 = make_multitask_sa_info_sa(mdp_distr, beta=1.0, is_deterministic_ib=True)
    multitask_sa_beta_10 = make_multitask_sa_info_sa(mdp_distr, beta=10.0, is_deterministic_ib=True)
    multitask_sa_beta_100 = make_multitask_sa_info_sa(mdp_distr, beta=100.0, is_deterministic_ib=True)
    multitask_sa_beta_1000 = make_multitask_sa_info_sa(mdp_distr, beta=1000.0, is_deterministic_ib=True)

    # Make agents.
    ql_agent = QLearningAgent(mdp_distr.get_actions())
    abstr_ql_b1 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_1,
                                     agent_params={"actions": mdp_distr.get_actions()},
                                     name_ext="-$\\phi_{\\beta = 1}$")
    abstr_ql_b10 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_10,
                                      agent_params={"actions": mdp_distr.get_actions()},
                                      name_ext="-$\\phi_{\\beta = 10}$")
    abstr_ql_b100 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_100,
                                       agent_params={"actions": mdp_distr.get_actions()},
                                       name_ext="-$\\phi_{\\beta = 100}$")
    abstr_ql_b1000 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_1000,
                                        agent_params={"actions": mdp_distr.get_actions()},
                                        name_ext="-$\\phi_{\\beta = 1000}$")

    # Run lifelong experiment.
    run_agents_lifelong([abstr_ql_b1, abstr_ql_b10, abstr_ql_b100, abstr_ql_b1000, ql_agent],
                        mdp_distr, steps=200, samples=50, episodes=200)
def main():
    from agents import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid
    mdp_class, is_goal_terminal, samples = parse_args()
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end='')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print("done.")  # , iters, value
    sys.stdout.flush()

    # Agents.
    print("Making agents...", end='')
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)

    # Add additional agents:
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy, name="$\\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")
    ql_agent = QLearningAgent(actions)
    print("done.")

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_lifelong(agents, mdp_distr, samples=samples, episodes=1, steps=100,
                        reset_at_terminal=False, track_disc_reward=False, cumulative_plot=True)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent, actions=mdp_distr.get_actions(),
                                      action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent], mdp_distr,
                        samples=10, episodes=100, steps=150, open_plot=open_plot)
def main(open_plot=True):
    episodes = 100
    steps = 100
    gamma = 0.95

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal, gamma=gamma)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end='')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = get_q_func(avg_mdp_vi)

    # Compute Vmax: the maximum possible value an agent can get in the environment.
    best_v = -100
    for x in opt_q_func:
        for y in opt_q_func[x]:
            best_v = max(best_v, opt_q_func[x][y])
    print("Vmax =", best_v)
    vmax = best_v
    vmax_func = defaultdict(lambda: defaultdict(lambda: vmax))

    if alg == "q":
        eps = 0.1
        lrate = 0.1
        pure_ql_agent = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, name="Q-0")
        pure_ql_agent_opt = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, default_q=vmax, name="Q-Vmax")
        ql_agent_upd_maxq = UpdatingQLearnerAgent(actions, alpha=lrate, epsilon=eps, gamma=gamma, default_q=vmax, name="Q-MaxQInit")
        transfer_ql_agent_optq = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, name="Q-UO")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearningAgent(actions, gamma=gamma, alpha=lrate, epsilon=eps, name="Q-AverageQInit")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [transfer_ql_agent_optq, ql_agent_upd_maxq, transfer_ql_agent_avgq, pure_ql_agent_opt, pure_ql_agent]
    elif alg == "rmax":
        # Note that R-Max is a model-based algorithm and is very slow compared to
        # model-free algorithms like Q-learning and delayed Q-learning.
        known_threshold = 10
        min_experience = 5
        pure_rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=known_threshold, s_a_threshold=min_experience, name="RMAX-Vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, gamma=gamma, horizon=known_threshold, s_a_threshold=min_experience, name="RMAX-MaxQInit")
        trans_rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=known_threshold, s_a_threshold=min_experience, name="RMAX-UO")
        trans_rmax_agent.set_init_q_function(opt_q_func)

        agents = [trans_rmax_agent, updating_trans_rmax_agent, pure_rmax_agent, rand_agent]
    elif alg == "delayed-q":
        tolerance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions, gamma=gamma, m=min_experience, epsilon1=tolerance, name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_q_function(vmax_func)
        updating_delayed_ql_agent = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=tolerance, name="DelayedQ-MaxQInit")
        updating_delayed_ql_agent.set_q_function(vmax_func)
        trans_delayed_ql_agent = DelayedQAgent(actions, gamma=gamma, m=min_experience, epsilon1=tolerance, name="DelayedQ-UO")
        trans_delayed_ql_agent.set_q_function(opt_q_func)

        agents = [pure_delayed_ql_agent, updating_delayed_ql_agent, trans_delayed_ql_agent, rand_agent]
        # agents = [updating_delayed_ql_agent, trans_delayed_ql_agent, rand_agent]
    elif alg == "sample-effect":
        # Compares MaxQInit with different numbers of MDP samples used to compute the initial
        # Q-function. The performance on the sampled MDPs is ignored for this experiment.
        # Reproduces Figure 4 of "Policy and Value Transfer in Lifelong Reinforcement Learning".
        tolerance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions, opt_q_func, m=min_experience, epsilon1=tolerance, name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_vmax()
        dql_60samples = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=tolerance, num_sample_tasks=60, name="$DelayedQ-MaxQInit60$")
        dql_40samples = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=tolerance, num_sample_tasks=40, name="$DelayedQ-MaxQInit40$")
        dql_20samples = UpdatingDelayedQLearningAgent(actions, default_q=vmax, gamma=gamma, m=min_experience, epsilon1=tolerance, num_sample_tasks=20, name="$DelayedQ-MaxQInit20$")

        # Sample MDPs. The performance on the sampled MDPs is ignored and not included in the
        # average in the final plot.
        run_agents_lifelong([dql_20samples], mdp_distr, samples=int(samples * 1 / 5.0), episodes=episodes, steps=steps,
                            reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_40samples], mdp_distr, samples=int(samples * 2 / 5.0), episodes=episodes, steps=steps,
                            reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_60samples], mdp_distr, samples=int(samples * 3 / 5.0), episodes=episodes, steps=steps,
                            reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
        # mdp_distr.reset_tasks()

        # agents = [pure_delayed_ql_agent]
        agents = [dql_60samples, dql_40samples, dql_20samples, pure_delayed_ql_agent]
    else:
        msg = "Unknown agent type: " + alg + ". Use -agent_type (q, rmax, delayed-q, sample-effect)."
        assert False, msg

    # Run task.
    run_agents_lifelong(agents, mdp_distr, samples=samples, episodes=episodes, steps=steps,
                        reset_at_terminal=is_goal_terminal, track_disc_reward=False, cumulative_plot=True, open_plot=open_plot)
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution over CartPole tasks with varying gravity.
        mdp_dist_dict = {CartPoleMDP(gravity=gravity): 1.0 / num_test_mdps
                         for gravity in [5.0, 6.0, 8.0, 12.0][:num_test_mdps]}
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features, alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent], test_mdp, samples=params['num_instances'],
                            episodes=params['episodes'], steps=params['steps'], verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent], test_mdp, instances=params['num_instances'],
                          episodes=params['episodes'], steps=params['steps'], verbose=False)
actions = list(mdp_distr.keys())[0].actions
gamma = list(mdp_distr.keys())[0].gamma
ql_agent = QLearningAgent(actions, gamma=gamma)

pblocks_aa = get_policy_blocks_aa(mdp_distr, num_options=5, task_samples=20, incl_prim_actions=True)
regular_sa = get_sa(mdp_distr, default=True)
pblocks_ql_agent = AbstractionWrapper(QLearningAgent, actions,
                                      state_abs=regular_sa,
                                      action_abs=pblocks_aa,
                                      name_ext="aa")
agents = [pblocks_ql_agent, ql_agent]

mdp_distr = MDPDistribution(mdp_distr)
run_agents_lifelong(agents, mdp_distr, task_samples=100, episodes=1, steps=10000)

from visualize_abstractions import visualize_options_grid

visualize_options_grid(mdp_distr.sample(1), regular_sa.get_ground_states(), pblocks_aa)
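# The fragment above assumes mdp_distr starts as a plain dict mapping MDP instances to
# sampling probabilities (it is only wrapped in MDPDistribution at the end). Below is a
# minimal, hypothetical sketch of building such a dict with simple_rl's GridWorldMDP;
# the grid size and goal locations are illustrative assumptions, not from the original.
from simple_rl.tasks import GridWorldMDP

goal_locs = [(5, 5), (1, 5), (5, 1)]
mdp_distr = {GridWorldMDP(width=5, height=5, goal_locs=[g]): 1.0 / len(goal_locs)
             for g in goal_locs}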