Example #1
def main(open_plot=True):
    #     gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1,1), goal_locs=[(10,10)])
    #     num_feats = gym_mdp.get_num_state_feats()
    #     lin_agent = QLearnerAgent(gym_mdp.actions, alpha=0.4, epsilon=0.4)
    #     rand_agent = RandomAgent(gym_mdp.actions)
    #     run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot)

    #     gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1,1), goal_locs=[(10,10)])
    #     num_feats = gym_mdp.get_num_state_feats()
    #     lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=False,rbf=True)
    #     rand_agent = RandomAgent(gym_mdp.actions)
    #     run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot,verbose=True)

    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()
    lin_agent = LinearQLearnerAgent(gym_mdp.actions,
                                    num_features=num_feats,
                                    alpha=0.4,
                                    epsilon=0.4,
                                    anneal=False,
                                    rbf=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent],
                      gym_mdp,
                      instances=5,
                      episodes=1000,
                      steps=100,
                      open_plot=open_plot)
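
Note: the function-level snippets in these examples omit their imports. A minimal sketch of the imports they assume, mirroring the complete script in Example #12 below (exact module paths may vary between simple_rl versions):

# Assumed imports for the function-level snippets (see Example #12 for a full script).
from simple_rl.agents import LinearQLearnerAgent, RandomAgent
from simple_rl.tasks import GymMDP
from simple_rl.run_experiments import run_agents_on_mdp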
Example #2
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent], gym_mdp, instances=5, episodes=50000, steps=200, open_plot=open_plot, verbose=False)
Example #3
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=10, episodes=30, steps=10000, open_plot=open_plot)
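
When run as standalone scripts, these main(open_plot=...) functions also need an entry point; a minimal, assumed guard (not shown in the snippets themselves):

# Hypothetical entry point for the function-style examples above and below.
if __name__ == "__main__":
    main(open_plot=True)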
Example #4
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=True)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    q_learning_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([q_learning_agent],
                      gym_mdp,
                      instances=1,
                      episodes=400,
                      steps=210,
                      open_plot=open_plot,
                      verbose=True)
Example #5
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent],
                      gym_mdp,
                      instances=5,
                      episodes=50000,
                      steps=200,
                      open_plot=open_plot,
                      verbose=False)
Example #6
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = rlec.get_cartpole_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    params['num_iterations_for_abstraction_learning'] = 500
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # ====================================
    # == Visualize Abstract State Space ==
    # ====================================

    # Collect dataset based on learner.
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")
    #visited_states = vu.collect_dataset(test_mdp, samples=2000) #, learning_agent=sa_agent)
    visited_states = collect_samples_from_demo_policy_random_s0_cartpole(
        mdp_demo_policy_dict, num_samples=2000)

    # Get feature indices.
    features = get_feature_dicts()

    # Visualize.
    vu.visualize_state_abstrs3D(visited_states, features, nn_sa)
Example #7
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005
    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name='LunarLander-v2', render=True, render_every_n_episodes=20)
    test_mdp = env  # The test MDP is the same as the training MDP.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    num_iterations = 300
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features, alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent, agent_params={"alpha":params['rl_learning_rate'],"actions":test_mdp.get_actions(), "anneal":True}, state_abstr=nn_sa, name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent], test_mdp, instances=params['num_instances'], episodes=params['episodes'], steps=params['steps'], verbose=True, track_success=True, success_reward=100)
Example #8
def get_mdp_params(args):
    state_dim = None
    state_bound = None
    num_actions = None
    action_dim = None
    action_bound = None

    # TODO: it is very hard to have a script which contains all
    #       discrete/continuous state/actions.
    #       Should we separate out the tasks, or refactor?
    
    if args.tasktype == 'pinball' or args.tasktype == 'p':
        # TODO: Add parameter for Configuration files by --task argument
        mdp = PinballMDP(cfg=args.task, render=args.render)
        state_dim = 4
        num_actions = len(mdp.get_actions())
        # assert(args.ffunction !=  'fourier')
    elif args.tasktype == 'atari' or args.tasktype == 'atariram':
        grayscale = False
        downscale = True
        # downscale = args.tasktype == 'atari'
        mdp = GymMDP(env_name=args.task, grayscale=grayscale, downscale=downscale, render=args.render)
        # mdp = GymMDP(env_name=args.task, grayscale=True, render=args.render)
        mdp.env.seed(1234)
        state_dims = mdp.env.observation_space.shape
        # print('observation_space=', state_dims)
        if args.tasktype == 'atari':
            state_dim = 1
            for d in state_dims:
                state_dim *= d
            # state_dim = 33600
            # state_dim = 40000 # ?
            if grayscale:
                state_dim = int(state_dim / 3)
            if downscale:
                # state_dim = int(state_dim / 4)
                state_dim = 105 * 80 * 3
        else:
            state_dim = 128
        print('state_dim=', state_dim)
        num_actions = mdp.env.action_space.n

        # TODO: methods are fixed to dqn/ddpg/nn right now.
        print('args.highmethod is overwritten by dqn')
        print('args.lowmethod is overwritten by dqn')
        args.highmethod = 'dqn'
        args.lowmethod = 'dqn'
        # args.ffunction = 'nn'
        assert(args.highmethod == 'dqn')
        assert(args.lowmethod == 'dqn')
        # assert(args.ffunction == 'nn')
    elif args.tasktype == 'mujoco':
        mdp = GymMDP(env_name=args.task, render=args.render)
        mdp.env.seed(1234)
        state_dims = mdp.env.observation_space.shape
        state_dim = 1
        for d in state_dims:
            state_dim *= d
        print('state_dim=', state_dim)

        action_dim = int(mdp.env.action_space.shape[0])
        action_bound = mdp.action_bounds()

        # print(action_dim)
        # Fourier does not work for high dim space.

        # TODO: methods are fixed to dqn/ddpg/nn right now.
        print('args.highmethod is overwritten by dqn')
        print('args.lowmethod is overwritten by ddpg')
        args.highmethod = 'dqn'
        args.lowmethod = 'ddpg'
        # args.ffunction = 'nn'
        assert(args.highmethod == 'dqn')
        assert(args.lowmethod == 'ddpg')
        # assert(args.ffunction == 'nn')
    elif args.tasktype == 'grid':
        fname = '../tasks/' + args.task
        mdp = make_grid_world_from_file(fname)
        state_dim = 2
        num_actions = 4
    else:
        print('Unknown task type: ', args.tasktype)
        assert False

    state_bound = mdp.bounds()

    return mdp, state_dim, state_bound, num_actions, action_dim, action_bound
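
For reference, a hypothetical call site for get_mdp_params; the args object and its fields (tasktype, task, render, highmethod, lowmethod) come from an argument parser that is not part of this snippet:

# Hypothetical usage of the helper above (args is assumed to come from argparse).
mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)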
Example #9
                        help='number of steps for incidence matrix sampling')

    args = parser.parse_args()

    dom, task = args.task.split('_')

    if dom == 'grid':
        mdp = make_grid_world_from_file('../tasks/' + task + '.txt')
    elif dom == 'taxi':
        width = 4
        height = 4
        agent = {"x": 1, "y": 1, "has_passenger": 0}
        passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
        mdp = TaxiOOMDP(width, height, agent, walls=[], passengers=passengers)
    elif dom == 'gym':
        mdp = GymMDP(env_name=task, render=False)
    elif dom == 'hanoi':
        mdp = HanoiMDP(num_pegs=3, num_discs=4)
    elif dom == 'track':
        mdp = make_race_track_from_file('../tasks/' + task + '.txt')
    else:
        print('Unknown task name: ', task)
        assert (False)

    mdp.set_gamma(0.99)

    if args.experiment == 'online':
        print('test_online_agent')
        test_online_agent(args, mdp)
    elif args.experiment == 'offline':
        print('test_offline_agent')
Example #10
def diff_sampling_distr_experiment():
    '''
    Summary:
        Trains NN state abstractions from the CartPole demo policy under different
        sampling parameters (epsilon), plus a uniformly sampled baseline, then
        compares the resulting abstraction-wrapped Q-learning agents on CartPole.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]

    test_mdp = CartPoleMDP()
    agents = {"demo": demo_agent}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print "epsilon", epsilon
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={
                                              "actions":
                                              env.get_actions(),
                                              "name":
                                              "$QL_\\phi-\\epsilon=" +
                                              str(epsilon) + "$"
                                          },
                                          state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={
                                               "actions": env.get_actions(),
                                               "name": "$D \\sim U(S)$"
                                           },
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
Example #11
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution.
        mdp_dist_dict = {
            CartPoleMDP(gravity=gravity): 1.0 / num_test_mdps
            for gravity in [5.0, 6.0, 8.0, 12.0][:num_test_mdps]
        }
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================

    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
Example #12
#!/usr/bin/env python

# Other imports.
import srl_example_setup
from simple_rl.agents import LinearQLearnerAgent, RandomAgent
from simple_rl.tasks import GymMDP
from simple_rl.run_experiments import run_agents_on_mdp

# Gym MDP
gym_mdp = GymMDP(env_name='CartPole-v0', render=False)

num_feats = gym_mdp.get_num_state_feats()

# Setup agents and run.
lin_agent = LinearQLearnerAgent(gym_mdp.actions,
                                num_features=num_feats,
                                alpha=0.4,
                                epsilon=0.4,
                                anneal=True)
rand_agent = RandomAgent(gym_mdp.actions)

run_agents_on_mdp([lin_agent, rand_agent],
                  gym_mdp,
                  instances=10,
                  episodes=30,
                  steps=10000)
Example #13
def main(open_plot=True):
    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(1234)

    parser = argparse.ArgumentParser()

    # pinball files = pinball_box.cfg  pinball_empty.cfg  pinball_hard_single.cfg  pinball_medium.cfg  pinball_simple_single.cfg

    # Parameters for the task
    parser.add_argument('--tasktype', type=str, default='pinball')
    parser.add_argument('--task', type=str, default='pinball_empty.cfg')
    parser.add_argument('--base', action='store_true')

    parser.add_argument('--nepisodes', type=int, default=10)
    parser.add_argument('--nsteps', type=int, default=100)
    parser.add_argument('--ninstances', type=int, default=1)

    # Parameters for the Agent
    parser.add_argument('--highmethod', type=str, default='linear')
    parser.add_argument('--lowmethod', type=str, default='linear')
    parser.add_argument('--ffunction', type=str, default='fourier')
    parser.add_argument(
        '--noptions', type=int,
        default=5)  # (5 = 1 for primitive actions and 4 covering options).

    # Visualization
    parser.add_argument('--render', action='store_true')

    args = parser.parse_args()

    print('tasktype=', args.tasktype)

    if args.tasktype == 'pinball' or args.tasktype == 'p':
        # TODO: Add parameter for Configuration files by --task argument
        gym_mdp = PinballMDP(cfg=args.task, render=args.render)
        state_dim = 4
    elif args.tasktype == 'atari' or args.tasktype == 'mujoco':
        # Gym MDP
        gym_mdp = GymMDP(env_name=args.task, render=args.render)
        gym_mdp.env.seed(1234)

        state_dims = gym_mdp.env.observation_space.shape
        state_dim = 1
        for d in state_dims:
            state_dim *= d
        print('state_dim=', state_dim)
    else:
        assert (False)

    # TODO: What should we compare against?
    agents = []

    if args.tasktype == 'mujoco':
        action_dim = gym_mdp.env.action_space.shape[0]
        action_bound = gym_mdp.env.action_space.high
        op_agent = OptionAgent(sess=None,
                               obs_dim=state_dim,
                               action_dim=action_dim,
                               action_bound=action_bound,
                               num_options=args.noptions,
                               name='OptionAgent')
        base_agent = DDPGAgent(sess=None,
                               obs_dim=state_dim,
                               action_dim=action_dim,
                               action_bound=action_bound,
                               name='Baseline')
    elif args.tasktype == 'atari':
        num_actions = gym_mdp.env.action_space.n
        print('num_actions=', num_actions)
        op_agent = OptionAgent(sess=None,
                               obs_dim=state_dim,
                               num_actions=num_actions,
                               num_options=args.noptions,
                               name='OptionAgent')
        base_agent = DQNAgent(sess=None,
                              obs_dim=state_dim,
                              num_actions=num_actions,
                              name='Baseline')
    elif args.tasktype == 'pinball' or args.tasktype == 'p':
        num_actions = 5
        low_bound, up_bound = gym_mdp.bounds()
        feature = Fourier(state_dim=state_dim,
                          state_up_bound=up_bound,
                          state_low_bound=low_bound,
                          order=4)
        base_agent = LinearQAgent(actions=gym_mdp.get_actions(),
                                  feature=feature,
                                  sarsa=False,
                                  name='baseline')
        op_agent = OptionAgent(sess=None,
                               obs_dim=state_dim,
                               num_actions=num_actions,
                               num_options=args.noptions,
                               high_method='linear',
                               low_method='linear',
                               name='OptionAgent')
        # base_agent = DQNAgent(sess=None, obs_dim=state_dim, num_actions=num_actions, name='Baseline')
    else:
        assert (False)

    if args.base:
        agents.append(base_agent)
    else:
        agents.append(op_agent)

    run_agents_on_mdp(agents,
                      gym_mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      verbose=True,
                      instances=args.ninstances,
                      cumulative_plot=False,
                      open_plot=False)