def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005

    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name=env_name, render=True, render_every_n_episodes=20)
    test_mdp = env  # Test MDP is the same as the training MDP.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    num_iterations = 300
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "actions": test_mdp.get_actions(),
                                                "anneal": True},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=True,
                      track_success=True,
                      success_reward=100)
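# lpd.expert_lunar_policy is imported from elsewhere in this repository. The
# stub below is purely hypothetical and only illustrates the interface that
# mdp_demo_policy_dict appears to expect: a callable mapping a state to one of
# the MDP's actions. It is not the repository's actual demonstrator.
import random

def placeholder_lunar_policy(state):
    # LunarLander-v2 has four discrete actions (noop, fire left engine, fire
    # main engine, fire right engine); a real demonstrator would choose one
    # based on the 8-dimensional observation in `state`.
    return random.choice(range(4))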
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ============================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ============================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)
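# get_params() is defined elsewhere in the repository; the sketch below is a
# guess at its shape, assuming it mirrors the keys set explicitly in the
# LunarLander experiment above and accepts the default_params override used in
# num_training_data_experiment(). The values are illustrative, not the
# repository's actual defaults.
def get_params(default_params=None):
    params = {
        'multitask': False,
        'obs_size': 8,
        'num_iterations_for_abstraction_learning': 500,
        'learning_rate_for_abstraction_learning': 0.005,
        'abstraction_network_hidden_layers': 2,
        'abstraction_network_hidden_nodes': 200,
        'num_samples_from_demonstrator': 10000,
        'episodes': 200,
        'steps': 1000,
        'num_instances': 5,
        'rl_learning_rate': 0.005,
    }
    # Caller-supplied values override the defaults above.
    if default_params is not None:
        params.update(default_params)
    return params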
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = rlec.get_cartpole_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    params['num_iterations_for_abstraction_learning'] = 500
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # ====================================
    # == Visualize Abstract State Space ==
    # ====================================
    # Collect dataset based on learner.
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")
    # visited_states = vu.collect_dataset(test_mdp, samples=2000)  # , learning_agent=sa_agent)
    visited_states = collect_samples_from_demo_policy_random_s0_cartpole(mdp_demo_policy_dict, num_samples=2000)

    # Get feature indices.
    features = get_feature_dicts()

    # Visualize.
    vu.visualize_state_abstrs3D(visited_states, features, nn_sa)
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=False)
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # Make a NN for each sampling param.
    agents = {}
    sess = tf.Session()
    sampling_params = [0.0, 0.5, 1.0]
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="demo")
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={"actions": test_mdp.get_actions(),
                                                        "name": "$D \\sim \\rho_E^\\epsilon, \\epsilon=" + str(epsilon) + "$"},
                                          state_abstr=nn_sa,
                                          name_ext="")
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={"actions": test_mdp.get_actions(),
                                                         "name": "$D \\sim U(S)$"},
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(list(agents.values()),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
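# Why the tf.variable_scope wrappers above: each call to make_nn_sa builds a
# fresh abstraction network in the same session, so giving each network its
# own scope keeps the TensorFlow variable names from colliding. The standalone
# TF1 snippet below (not from this repository) illustrates the effect.
import tensorflow as tf

with tf.variable_scope('nn_sa0.5'):
    w = tf.get_variable("w", shape=[4, 32])
with tf.variable_scope('demo'):
    w_rand = tf.get_variable("w", shape=[4, 32])

print(w.name)       # nn_sa0.5/w:0
print(w_rand.name)  # demo/w:0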
def num_training_data_experiment():
    '''
    Summary:
        Runs an experiment that compares the performance of different
        Agent-SA combinations, where each SA is trained with a different
        number of training samples.
    '''
    # Params.
    instances = 10
    init, increment, maximum = 1, 500, 5001
    training_samples = range(init, maximum, increment)

    # Run experiments.
    data_dir = os.path.join("results", "puddle_per_sample")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    with open(os.path.join(data_dir, "results.csv"), "w+") as results_file:
        # Repeat the experiment @instances times.
        for i in range(instances):
            print("\nInstance {0} of {1}".format(i + 1, instances))
            for sample_num in training_samples:
                print("\tSamples: {0}".format(sample_num))

                # Make State Abstraction.
                params = get_params(default_params={"num_samples_from_demonstrator": sample_num})
                mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
                expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
                demo_agent = FixedPolicyAgent(expert_puddle_policy)
                tf.reset_default_graph()
                sess = tf.Session()
                abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
                nn_sa = NNStateAbstr(abstraction_net)

                # Test performance with the given param.
                sa_agent = AbstractionWrapper(QLearningAgent,
                                              agent_params={"actions": test_mdp.get_actions()},
                                              state_abstr=nn_sa,
                                              name_ext="$-\\phi$")
                val = evaluate_agent(sa_agent, test_mdp, steps=params['steps'], episodes=params['episodes'])
                results_file.write(str(val) + ",")
                results_file.flush()
                sess.close()

            results_file.write("\n")

    # Plot the results.
    cu.EVERY_OTHER_X = True
    cu.CUSTOM_TITLE = "Effect of $|D_{train, \\phi}|$ on RL Performance"
    cu.X_AXIS_LABEL = "$|D_{train, \\phi}|$"
    cu.Y_AXIS_LABEL = "Avg. Reward in Last Episode"
    cu.X_AXIS_START_VAL = init
    cu.X_AXIS_INCREMENT = increment
    cu.COLOR_SHIFT = 3
    cu.format_and_make_plot(data_dir=data_dir, avg_plot=True, add_legend=False)
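# The results.csv written above contains one row per instance and one
# comma-separated value per training-sample size, with a trailing comma on
# each row. A minimal sketch for loading it outside of chart_utils, assuming
# numpy is available:
import numpy as np

data = np.genfromtxt("results/puddle_per_sample/results.csv", delimiter=",")
data = data[:, :-1]  # Drop the empty column created by the trailing comma.
mean_reward_per_sample_size = data.mean(axis=0)  # One entry per sample_num.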
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi on CartPole.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]
    test_mdp = CartPoleMDP()
    # agents = {"demo": demo_agent}
    agents = {}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print("epsilon", epsilon)
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={"actions": env.get_actions(),
                                                        "name": "$QL_\\phi-\\epsilon=" + str(epsilon) + "$"},
                                          state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={"actions": env.get_actions(),
                                                         "name": "$D \\sim U(S)$"},
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(list(agents.values()),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution.
        mdp_dist_dict = {CartPoleMDP(gravity=gravity): 1.0 / num_test_mdps
                         for gravity in [5.0, 6.0, 8.0, 12.0][:num_test_mdps]}
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
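# These experiment scripts are presumably run directly; a standard entry-point
# guard for the file defining the CartPole main() above would be:
if __name__ == "__main__":
    main()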