def __init__(self, actions, num_features, rand_init=True, name="ql-linear", alpha=0.2, gamma=0.99, epsilon=0.2, explore="uniform", rbf=False, anneal=True):
    name = name + "-rbf" if rbf else name
    QLearnerAgent.__init__(self, actions=list(actions), name=name, alpha=alpha, gamma=gamma, epsilon=epsilon, explore=explore, anneal=anneal)
    self.num_features = num_features  # Add a basis feature.

    # Initialize one block of num_features weights per action.
    if rand_init:
        self.weights = np.random.random(self.num_features * len(self.actions))
    else:
        self.weights = np.zeros(self.num_features * len(self.actions))

    self.rbf = rbf

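# Hedged sketch (not part of the original class): with the weights laid out as one
# block of num_features values per action, a Q-value can be read off as the dot
# product of the state's feature vector with that action's weight block. The helper
# name and its arguments below are illustrative assumptions, not library API.
import numpy as np

def _linear_q_value(weights, features, action_idx, num_features):
    # Slice out the weight block for the given action and dot it with the features.
    start = action_idx * num_features
    return np.dot(weights[start:start + num_features], features)
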
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": size, "dest_y": size, "has_block": 0}
    blocks = [{"x": size, "y": 1}]
    # With size=5 and Python 2 integer division, (size + 1) / 2 == 3, so the lava
    # tiles run from (1, 3) to (5, 3): a trench across the middle row.
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (size + 1) / 2), xrange(size))]
    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    # run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)
    vi = ValueIteration(mdp, delta=0.0001, max_iterations=5000)
    iters, val = vi.run_vi()
    print " done."
    states = vi.get_states()
    num_states = len(states)
    print num_states, states

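# Hedged follow-up sketch (not from the original script): once run_vi() converges,
# the planner's values can be inspected per state. This assumes simple_rl's
# ValueIteration exposes a get_value(state) accessor; if your version differs,
# adapt the call accordingly.
def print_state_values(vi, states):
    # Print the converged value estimate for every enumerated state.
    for s in states:
        print s, vi.get_value(s)
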
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    from simple_rl.agents import RandomAgent, RMaxAgent, QLearnerAgent, LinearQLearnerAgent
    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=4, s_a_threshold=2)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma, explore="uniform")
    lqlearner_agent = LinearQLearnerAgent(actions, gamma=gamma, explore="uniform")
    agents = [qlearner_agent, random_agent]

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {qlearner_agent.name: qlearner_agent, random_agent.name: random_agent}
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)

def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

    # Agents.
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=50, episodes=1, steps=2000, reset_at_terminal=True, open_plot=open_plot)

def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent], mdp_distr, task_samples=50, episodes=1, steps=1500, reset_at_terminal=True, open_plot=open_plot)

def main(open_plot=True):
    # Setup MDP, Agents.
    markov_game = GatheringMDP()
    ql_agent = QLearnerAgent(actions=markov_game.get_actions())
    fixed_action = random.choice(markov_game.get_actions())
    fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

    # Run experiment and make plot.
    play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40, open_plot=open_plot)

def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500, steps=40, open_plot=open_plot)

def main():
    # Make MDP distribution, agents.
    mdp_distr = make_mdp_distr(mdp_class="grid")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent], mdp_distr, task_samples=30, episodes=100, steps=50, reset_at_terminal=True, include_optimal=True)

def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)

def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearnerAgent, actions=mdp_distr.get_actions(), action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent, option_agent], mdp_distr, task_samples=10, episodes=100, steps=150, open_plot=open_plot)

def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())
    viz = parse_args()

    if viz == "value":
        # Visualize the value function.
        mdp.visualize_value()
    elif viz == "policy":
        # Visualize the policy found by value iteration.
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print "\n", str(ql_agent), "interacting with", str(mdp)
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)

def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma)
    lin_approx_agent = LinearApproxQLearnerAgent(actions, gamma=gamma)
    grad_boost_agent = GradientBoostingAgent(actions, gamma=gamma, explore="softmax")

    # Choose agents.
    agents = [lin_approx_agent, random_agent]

    # Run experiments.
    run_agents_on_mdp(agents, mdp)

def main(eps=0.1, open_plot=True):
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu
    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        # V_max = R_max / (1 - gamma); with R_max = 1 and gamma = 0.99 this is 100,
        # giving the agent an optimistic default Q-value.
        qmax = 1.0 / (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions, epsilon=eps, default_q=qmax, name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)
        agents = [pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq, transfer_ql_agent_avgq]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [pure_delayed_ql_agent, updating_delayed_ql_agent, trans_delayed_ql_agent]
    else:
        print "Unknown type of agents:", alg
        print "(q, rmax, delayed-q)"
        assert False

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100, reset_at_terminal=is_goal_terminal, is_rec_disc_reward=False, cumulative_plot=True, open_plot=open_plot)

#!/usr/bin/env python

# Python imports.
import random

# Other imports.
import srl_example_setup
from simple_rl.agents import QLearnerAgent, FixedPolicyAgent
from simple_rl.tasks import RockPaperScissorsMDP
from simple_rl.run_experiments import play_markov_game

# Setup MDP, Agents.
markov_game = RockPaperScissorsMDP()
ql_agent = QLearnerAgent(actions=markov_game.get_actions())
fixed_action = random.choice(markov_game.get_actions())
fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

# Run experiment and make plot.
play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40)

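# Hedged side note (not from the original example): FixedPolicyAgent expects a
# callable mapping a state to an action; the lambda above ignores the state and
# always returns the single action sampled at setup. Any callable with the same
# signature works, e.g. a named policy (always_first_action is hypothetical):
def always_first_action(state):
    # Ignore the state and always play the first action in the game's action set.
    return markov_game.get_actions()[0]

# fixed_agent = FixedPolicyAgent(policy=always_first_action, name="fixed-first")
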
def reset(self):
    # Zero the linear weights, then reset the parent Q-learner's bookkeeping.
    self.weights = np.zeros(self.num_features * len(self.actions))
    QLearnerAgent.reset(self)

def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions, gamma, panel_step = solar_mdp.get_actions(), solar_mdp.get_gamma(), solar_mdp.get_panel_step()

    # Setup fixed agents.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single-axis and dual-axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker, panel_step=panel_step, dual_axis=solar_mdp.dual_axis)
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(), name="grena-tracker")

    # Setup RL agents.
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(), name="lin-ucb", rand_init=rand_init, alpha=2.0)
    sarsa_agent_g0 = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin-g0", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=0, rbf=False, anneal=True)
    sarsa_agent = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=False, anneal=True)
    ql_agent = QLearnerAgent(actions, alpha=alpha, epsilon=epsilon, gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0]  #, grena_tracker_agent, static_agent] #, optimal_agent]
    # agents = [lin_ucb_agent, grena_tracker_agent]  #, sarsa_agent, sarsa_agent_g0, grena_tracker_agent, static_agent] #, optimal_agent]
    # agents = [grena_tracker_agent, static_agent]  #, optimal_agent]

    return agents

def main():
    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP.
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q-learning.
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy, actions=actions, name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy, actions=actions, name="AvgMDP-QLearning")

    # RMax.
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(shaping_policy=opt_stoch_policy, state_space=avg_mdp_vi.get_states(), actions=actions, name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(shaping_policy=avg_mdp_vi.policy, state_space=avg_mdp_vi.get_states(), actions=actions, name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp, prune_rmax_agent]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=200, is_rec_disc_reward=False, verbose=True)

import srl_example_setup
from simple_rl.agents import QLearnerAgent, RandomAgent
from simple_rl.tasks import TaxiOOMDP, BlockDudeOOMDP
from simple_rl.run_experiments import run_agents_on_mdp, run_single_agent_on_mdp

# Taxi initial state attributes.
agent = {"x": 1, "y": 1, "has_passenger": 0}
passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
walls = []
mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

# Agents.
ql_agent = QLearnerAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())

viz = False
if viz:
    # Visualize Taxi.
    run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
    mdp.visualize_agent(ql_agent)
else:
    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=100, steps=150, reset_at_terminal=True)
