def main(open_plot=True):
    # gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    # num_feats = gym_mdp.get_num_state_feats()
    # lin_agent = QLearnerAgent(gym_mdp.actions, alpha=0.4, epsilon=0.4)
    # rand_agent = RandomAgent(gym_mdp.actions)
    # run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot)

    # gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    # num_feats = gym_mdp.get_num_state_feats()
    # lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=False, rbf=True)
    # rand_agent = RandomAgent(gym_mdp.actions)
    # run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot, verbose=True)

    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=False, rbf=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=5, episodes=1000, steps=100, open_plot=open_plot)
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent], gym_mdp, instances=5, episodes=50000, steps=200, open_plot=open_plot, verbose=False)
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=10, episodes=30, steps=10000, open_plot=open_plot)
def main(open_plot=True):
    # Gym MDP
    gym_mdp = GymMDP(env_name='CartPole-v0', render=True)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    q_learning_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([q_learning_agent], gym_mdp, instances=1, episodes=400, steps=210, open_plot=open_plot, verbose=True)
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = rlec.get_cartpole_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    params['num_iterations_for_abstraction_learning'] = 500
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # ====================================
    # == Visualize Abstract State Space ==
    # ====================================
    # Collect dataset based on learner.
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")
    # visited_states = vu.collect_dataset(test_mdp, samples=2000)  # , learning_agent=sa_agent)
    visited_states = collect_samples_from_demo_policy_random_s0_cartpole(mdp_demo_policy_dict, num_samples=2000)

    # Get feature indices.
    features = get_feature_dicts()

    # Visualize.
    vu.visualize_state_abstrs3D(visited_states, features, nn_sa)
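# For reference, `cpd.expert_cartpole_policy` above is a hand-coded
# demonstrator imported from elsewhere in this repo. Below is a minimal
# sketch of what such a policy could look like; this is an illustrative
# assumption, not the repo's actual implementation (it assumes simple_rl's
# State.features() returns the CartPole observation vector and that the
# discrete actions are the usual 0/1):
def sketch_expert_cartpole_policy(state):
    # CartPole observations: [cart_pos, cart_vel, pole_angle, pole_tip_vel].
    _, _, pole_angle, pole_vel = state.features()
    # Push the cart toward the side the pole is falling: 0 = left, 1 = right.
    return 1 if pole_angle + pole_vel > 0 else 0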
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005

    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name='LunarLander-v2', render=True, render_every_n_episodes=20)
    test_mdp = env  # The test MDP is the same environment.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features, alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "actions": test_mdp.get_actions(),
                                                "anneal": True},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=True,
                      track_success=True,
                      success_reward=100)
def get_mdp_params(args):
    state_dim = None
    state_bound = None
    num_actions = None
    action_dim = None
    action_bound = None

    # TODO: It is very hard to have one script which covers all
    #       discrete/continuous state/action combinations.
    #       Should we separate out the tasks, or refactor?
    if args.tasktype == 'pinball' or args.tasktype == 'p':
        # TODO: Add parameter for configuration files via the --task argument.
        mdp = PinballMDP(cfg=args.task, render=args.render)
        state_dim = 4
        num_actions = len(mdp.get_actions())
        # assert(args.ffunction != 'fourier')
    elif args.tasktype == 'atari' or args.tasktype == 'atariram':
        grayscale = False
        downscale = True
        # downscale = args.tasktype == 'atari'
        mdp = GymMDP(env_name=args.task, grayscale=grayscale, downscale=downscale, render=args.render)
        # mdp = GymMDP(env_name=args.task, grayscale=True, render=args.render)
        mdp.env.seed(1234)
        state_dims = mdp.env.observation_space.shape
        # print('observation_space=', state_dims)
        if args.tasktype == 'atari':
            state_dim = 1
            for d in state_dims:
                state_dim *= d
            if grayscale:
                state_dim = int(state_dim / 3)
            if downscale:
                # state_dim = int(state_dim / 4)
                state_dim = 105 * 80 * 3
        else:
            state_dim = 128
        print('state_dim=', state_dim)
        num_actions = mdp.env.action_space.n

        # TODO: Methods are fixed to dqn/ddpg/nn right now.
        print('args.highmethod is overwritten by dqn')
        print('args.lowmethod is overwritten by dqn')
        args.highmethod = 'dqn'
        args.lowmethod = 'dqn'
        # args.ffunction = 'nn'
        assert(args.highmethod == 'dqn')
        assert(args.lowmethod == 'dqn')
        # assert(args.ffunction == 'nn')
    elif args.tasktype == 'mujoco':
        mdp = GymMDP(env_name=args.task, render=args.render)
        mdp.env.seed(1234)
        state_dims = mdp.env.observation_space.shape
        state_dim = 1
        for d in state_dims:
            state_dim *= d
        print('state_dim=', state_dim)
        action_dim = int(mdp.env.action_space.shape[0])
        action_bound = mdp.action_bounds()

        # Fourier features do not work for high-dimensional spaces.
        # TODO: Methods are fixed to dqn/ddpg/nn right now.
        print('args.highmethod is overwritten by dqn')
        print('args.lowmethod is overwritten by ddpg')
        args.highmethod = 'dqn'
        args.lowmethod = 'ddpg'
        # args.ffunction = 'nn'
        assert(args.highmethod == 'dqn')
        assert(args.lowmethod == 'ddpg')
        # assert(args.ffunction == 'nn')
    elif args.tasktype == 'grid':
        fname = '../tasks/' + args.task
        mdp = make_grid_world_from_file(fname)
        state_dim = 2
        num_actions = 4
    else:
        assert(False)

    state_bound = mdp.bounds()

    return mdp, state_dim, state_bound, num_actions, action_dim, action_bound
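# Usage sketch for get_mdp_params (hypothetical argument values; in this repo
# the real values come from an argparse parser):
#
#     import argparse
#     args = argparse.Namespace(tasktype='grid', task='mygrid.txt', render=False,
#                               highmethod='linear', lowmethod='linear',
#                               ffunction='fourier')
#     mdp, state_dim, state_bound, num_actions, action_dim, action_bound = \
#         get_mdp_params(args)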
                    help='number of steps for incidence matrix sampling')
args = parser.parse_args()

dom, task = args.task.split('_')

if dom == 'grid':
    mdp = make_grid_world_from_file('../tasks/' + task + '.txt')
elif dom == 'taxi':
    width = 4
    height = 4
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    mdp = TaxiOOMDP(width, height, agent, walls=[], passengers=passengers)
elif dom == 'gym':
    mdp = GymMDP(env_name=task, render=False)
elif dom == 'hanoi':
    mdp = HanoiMDP(num_pegs=3, num_discs=4)
elif dom == 'track':
    mdp = make_race_track_from_file('../tasks/' + task + '.txt')
else:
    print('Unknown task name: ', task)
    assert(False)

mdp.set_gamma(0.99)

if args.experiment == 'online':
    print('test_online_agent')
    test_online_agent(args, mdp)
elif args.experiment == 'offline':
    print('test_offline_agent')
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares Q-learning over state abstractions trained on demonstrations
        gathered with different sampling parameters (epsilons) against an
        abstraction trained on uniformly sampled states.
    '''
    # Make MDP and demo policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]
    test_mdp = CartPoleMDP()
    agents = {"demo": demo_agent}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print("epsilon", epsilon)
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={"actions": env.get_actions(),
                                                        "name": "$QL_\\phi-\\epsilon=" + str(epsilon) + "$"},
                                          state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={"actions": env.get_actions(),
                                                         "name": "$D \\sim U(S)$"},
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    if params['multitask']:
        # Make distribution.
        mdp_dist_dict = {CartPoleMDP(gravity=gravity): 1.0 / num_test_mdps
                         for gravity in [5.0, 6.0, 8.0, 12.0][:num_test_mdps]}
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features, alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
#!/usr/bin/env python

# Other imports.
import srl_example_setup
from simple_rl.agents import LinearQLearnerAgent, RandomAgent
from simple_rl.tasks import GymMDP
from simple_rl.run_experiments import run_agents_on_mdp

# Gym MDP
gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
num_feats = gym_mdp.get_num_state_feats()

# Setup agents and run.
lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=True)
rand_agent = RandomAgent(gym_mdp.actions)
run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=10, episodes=30, steps=10000)
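# Note: `get_num_state_feats()` on a GymMDP gives the dimensionality of the
# wrapped gym environment's observation space, roughly equivalent to the
# following (CartPole-v0 observations are 4-dimensional):
#
#     import gym
#     env = gym.make('CartPole-v0')
#     num_feats = len(env.observation_space.high)  # == 4 for CartPole-v0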
def main(open_plot=True):
    # Random seeds.
    np.random.seed(1234)
    tf.set_random_seed(1234)

    parser = argparse.ArgumentParser()

    # Pinball configuration files: pinball_box.cfg, pinball_empty.cfg,
    # pinball_hard_single.cfg, pinball_medium.cfg, pinball_simple_single.cfg.

    # Parameters for the task.
    parser.add_argument('--tasktype', type=str, default='pinball')
    parser.add_argument('--task', type=str, default='pinball_empty.cfg')
    parser.add_argument('--base', action='store_true')
    parser.add_argument('--nepisodes', type=int, default=10)
    parser.add_argument('--nsteps', type=int, default=100)
    parser.add_argument('--ninstances', type=int, default=1)

    # Parameters for the agent.
    parser.add_argument('--highmethod', type=str, default='linear')
    parser.add_argument('--lowmethod', type=str, default='linear')
    parser.add_argument('--ffunction', type=str, default='fourier')
    parser.add_argument('--noptions', type=int, default=5)  # 5 = 1 for primitive actions and 4 covering options.

    # Visualization.
    parser.add_argument('--render', action='store_true')

    args = parser.parse_args()

    print('tasktype=', args.tasktype)

    if args.tasktype == 'pinball' or args.tasktype == 'p':
        # TODO: Add parameter for configuration files via the --task argument.
        gym_mdp = PinballMDP(cfg=args.task, render=args.render)
        state_dim = 4
    elif args.tasktype == 'atari' or args.tasktype == 'mujoco':
        # Gym MDP
        gym_mdp = GymMDP(env_name=args.task, render=args.render)
        gym_mdp.env.seed(1234)
        state_dims = gym_mdp.env.observation_space.shape
        state_dim = 1
        for d in state_dims:
            state_dim *= d
        print('state_dim=', state_dim)
    else:
        assert(False)

    # TODO: What should we compare against?
    agents = []

    if args.tasktype == 'mujoco':
        action_dim = gym_mdp.env.action_space.shape[0]
        action_bound = gym_mdp.env.action_space.high
        op_agent = OptionAgent(sess=None, obs_dim=state_dim, action_dim=action_dim, action_bound=action_bound, num_options=args.noptions, name='OptionAgent')
        base_agent = DDPGAgent(sess=None, obs_dim=state_dim, action_dim=action_dim, action_bound=action_bound, name='Baseline')
    elif args.tasktype == 'atari':
        num_actions = gym_mdp.env.action_space.n
        print('num_actions=', num_actions)
        op_agent = OptionAgent(sess=None, obs_dim=state_dim, num_actions=num_actions, num_options=args.noptions, name='OptionAgent')
        base_agent = DQNAgent(sess=None, obs_dim=state_dim, num_actions=num_actions, name='Baseline')
    elif args.tasktype == 'pinball' or args.tasktype == 'p':
        num_actions = 5
        low_bound, up_bound = gym_mdp.bounds()
        feature = Fourier(state_dim=state_dim, state_up_bound=up_bound, state_low_bound=low_bound, order=4)
        base_agent = LinearQAgent(actions=gym_mdp.get_actions(), feature=feature, sarsa=False, name='baseline')
        op_agent = OptionAgent(sess=None, obs_dim=state_dim, num_actions=num_actions, num_options=args.noptions, high_method='linear', low_method='linear', name='OptionAgent')
        # base_agent = DQNAgent(sess=None, obs_dim=state_dim, num_actions=num_actions, name='Baseline')
    else:
        assert(False)

    if args.base:
        agents.append(base_agent)
    else:
        agents.append(op_agent)

    run_agents_on_mdp(agents,
                      gym_mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      verbose=True,
                      instances=args.ninstances,
                      cumulative_plot=False,
                      open_plot=False)