def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005
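    # Map each training environment to its demonstrator policy (used to generate data for learning the abstraction).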
    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name='LunarLander-v2', render=True, render_every_n_episodes=20)
    test_mdp = env #test mdp is the same
    mdp_demo_policy_dict[env]=lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    num_iterations = 300
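    # Train the abstraction network phi from demonstrator samples and wrap it as a state abstraction.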
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
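    # Compare a linear Q-learner over raw features with a Q-learner acting over the learned abstract states.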
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "actions": test_mdp.get_actions(),
                                      "anneal": True
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=True,
                      track_success=True,
                      success_reward=100)
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ============================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ============================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
        multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(
        QLearningAgent,
        agent_params={"actions": test_mdp.get_actions()},
        state_abstr=nn_sa,
        name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = rlec.get_cartpole_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    params['num_iterations_for_abstraction_learning'] = 500
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # ====================================
    # == Visualize Abstract State Space ==
    # ====================================

    # Collect dataset based on learner.
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")
    #visited_states = vu.collect_dataset(test_mdp, samples=2000) #, learning_agent=sa_agent)
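    # Sample states under the demonstrator policy from random CartPole start states.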
    visited_states = collect_samples_from_demo_policy_random_s0_cartpole(
        mdp_demo_policy_dict, num_samples=2000)

    # Get feature indices.
    features = get_feature_dicts()

    # Visualize.
    vu.visualize_state_abstrs3D(visited_states, features, nn_sa)
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=False)
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # Make a NN for each sampling param.
    agents = {}
    sess = tf.Session()
    sampling_params = [0.0, 0.5, 1.0]

    for epsilon in sampling_params:
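        # epsilon parameterizes the sampling distribution rho_E^epsilon used to draw training states from the demonstrator.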
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False,
                                         sample_type="demo")
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(
                QLearningAgent,
                agent_params={
                    "actions": test_mdp.get_actions(),
                    "name": "$D \\sim \\rho_E^\\epsilon, \\epsilon=" + str(epsilon) + "$"
                },
                state_abstr=nn_sa,
                name_ext="")
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
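        # Baseline: abstraction trained on states sampled uniformly at random (D ~ U(S)).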
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(
            QLearningAgent,
            agent_params={
                "actions": test_mdp.get_actions(),
                "name": "$D \\sim U(S)$"
            },
            state_abstr=nn_sa_rand,
            name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def num_training_data_experiment():
    '''
    Summary:
        Runs an experiment that compares the performance of different
        Agent-SA combinations, where each SA is trained with a different
        number of training samples.
    '''
    # Params.
    instances = 10
    init, increment, maximum = 1, 500, 5001
    training_samples = range(init, maximum, increment)

    # Run experiment.
    if not os.path.exists(os.path.join("results", "puddle_per_sample")):
        os.makedirs(os.path.join("results", "puddle_per_sample"))
    data_dir = os.path.join("results", "puddle_per_sample")
    with open(os.path.join(data_dir, "results.csv"), "w+") as results_file:

        # Repeat the experiment 'instances' times.
        for i in range(instances):
            print "\nInstances", i + 1, "of", str(instances)
            for sample_num in training_samples:
                print "\tSamples:", sample_num

                # Make State Abstraction.
                params = get_params(default_params={
                    "num_samples_from_demonstrator": sample_num
                })
                mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
                    multitask=params['multitask'])
                expert_puddle_policy = ppd.get_demo_policy_given_goal(
                    test_mdp.get_goal_locs()[0])
                demo_agent = FixedPolicyAgent(expert_puddle_policy)
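                # Reset the TF graph so each run builds a fresh abstraction network.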
                tf.reset_default_graph()
                sess = tf.Session()
                abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                             sess,
                                             params,
                                             verbose=False)
                nn_sa = NNStateAbstr(abstraction_net)

                # Test Performance with given param.
                sa_agent = AbstractionWrapper(
                    QLearningAgent,
                    agent_params={"actions": test_mdp.get_actions()},
                    state_abstr=nn_sa,
                    name_ext="$-\\phi$")
                val = evaluate_agent(sa_agent,
                                     test_mdp,
                                     steps=params['steps'],
                                     episodes=params['episodes'])
                results_file.write(str(val) + ",")
                results_file.flush()
                sess.close()

            results_file.write("\n")

    cu.EVERY_OTHER_X = True
    cu.CUSTOM_TITLE = "Effect of $|D_{train, \\phi}|$ on RL Performance"
    cu.X_AXIS_LABEL = "$|D_{train, \\phi}|$"
    cu.Y_AXIS_LABEL = "Avg. Reward in Last Episode"
    cu.X_AXIS_START_VAL = init
    cu.X_AXIS_INCREMENT = increment
    cu.COLOR_SHIFT = 3
    cu.format_and_make_plot(data_dir=data_dir, avg_plot=True, add_legend=False)
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi (CartPole).
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]

    test_mdp = CartPoleMDP()
    agents = {"demo": demo_agent}
    sess = tf.Session()
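    # Train one abstraction per epsilon value of the demonstrator sampling distribution.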
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print "epsilon", epsilon
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(
                QLearningAgent,
                agent_params={
                    "actions": env.get_actions(),
                    "name": "$QL_\\phi-\\epsilon=" + str(epsilon) + "$"
                },
                state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
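        # Baseline: abstraction trained on uniformly sampled states (D ~ U(S)).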
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(
            QLearningAgent,
            agent_params={
                "actions": env.get_actions(),
                "name": "$D \\sim U(S)$"
            },
            state_abstr=nn_sa_rand,
            name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # Cap on how many gravity variants to include.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution.
        gravities = [5.0, 6.0, 8.0, 12.0][:num_test_mdps]
        mdp_dist_dict = {
            CartPoleMDP(gravity=gravity): 1.0 / len(gravities)
            for gravity in gravities
        }
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================

    if params['multitask']:
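        # Multitask: evaluate over the distribution of CartPole tasks.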
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)