def __init__(self, actions, rand_init=True, name="Linear-Q", alpha=0.001, gamma=0.99, epsilon=0.2, explore="uniform", feature=None, anneal=True, sarsa=False):
    QLearningAgent.__init__(self, actions=list(actions), name=name, alpha=alpha, gamma=gamma, epsilon=epsilon, explore=explore, anneal=anneal)
    self.sarsa = sarsa
    self.feature = feature  # Function (state, action) -> features (numpy vector); must not be None.
    self.num_features = self.feature.num_features()
    self.rand_init = rand_init
    # Initialize one weight block per action, flattened into a single vector.
    if rand_init:
        self.weights = np.random.random(self.num_features * len(self.actions))
    else:
        self.weights = np.zeros(self.num_features * len(self.actions))
    self.max_weight = 0.0
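# A minimal sketch (not the library's implementation) of how a flat weight
# vector like the one initialized above is typically scored: the weights for
# each action occupy one contiguous block of length num_features, and Q(s, a)
# is the dot product of that block with the feature vector. The `feature`
# callable is assumed to follow the (state, action) -> numpy vector interface
# documented in the constructor; `q_value_sketch` is an illustrative name.
import numpy as np

def q_value_sketch(weights, feature, num_features, actions, state, action):
    feats = feature(state, action)                        # shape: (num_features,)
    i = actions.index(action)                             # which per-action block
    block = weights[i * num_features:(i + 1) * num_features]
    return float(np.dot(block, feats))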
def __init__(self, actions, num_features, rand_init=True, name="Linear-Q", alpha=0.2, gamma=0.99, epsilon=0.2, explore="uniform", rbf=False, anneal=True): name = name + "-rbf" if rbf else name QLearningAgent.__init__(self, actions=list(actions), name=name, alpha=alpha, gamma=gamma, epsilon=epsilon, explore=explore, anneal=anneal) self.num_features = num_features self.rand_init = rand_init # Add a basis feature. if rand_init: self.weights = np.random.random(self.num_features * len(self.actions)) else: self.weights = np.zeros(self.num_features * len(self.actions)) self.rbf = rbf
def main():
    # Setup MDP.
    actual_args = {
        "width": 10,
        "height": 10,
        "init_loc": (1, 1),
        "goal_locs": [(10, 10)],
        "lava_locs": [(1, 10), (3, 10), (5, 10), (7, 10), (9, 10)],
        "gamma": 0.9,
        "walls": [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9),
                  (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
                  (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
                  (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9)],
        "slip_prob": 0.01,
        "lava_cost": 1.0,
        "step_cost": 0.1
    }
    mdp = GridWorldMDP(**actual_args)

    # Initialize a custom Q-function for one Q-learning agent. This is
    # equivalent to potential-based shaping and should speed up learning.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    custom_q[GridWorldState(5, 1)]['right'] = 1.0
    custom_q[GridWorldState(2, 1)]['right'] = 1.0

    # Make a standard Q-learning agent, another initialized with custom_q,
    # and a random agent to compare against.
    ql_agent = QLearningAgent(actions=mdp.get_actions(), epsilon=0.2, alpha=0.4)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(), epsilon=0.2, alpha=0.4, custom_q_init=custom_q, name="PotQ")
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, ql_agent_pot, rand_agent], mdp, instances=2, episodes=60, steps=200, open_plot=True, verbose=True)
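# A hedged sketch of building custom_q from a potential function Phi instead
# of hand-picking entries; initializing Q(s, a) to Phi(s) is known to be
# equivalent to potential-based reward shaping (Wiewiora, 2003).
# `make_potential_q` and `phi` are illustrative names, not part of simple_rl;
# GridWorldState is the same class used in the example above.
from collections import defaultdict

def make_potential_q(width, height, actions, phi):
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    for x in range(1, width + 1):
        for y in range(1, height + 1):
            for a in actions:
                custom_q[GridWorldState(x, y)][a] = phi(x, y)
    return custom_q

# Example: bias values toward the (10, 10) goal with a Manhattan-distance potential.
# custom_q = make_potential_q(10, 10, mdp.get_actions(), lambda x, y: -0.01 * ((10 - x) + (10 - y)))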
def __init__(self, actions, num_features, rand_init=True, name="Linear-Q", alpha=0.2, gamma=0.99, epsilon=0.2, explore="uniform", anneal=True): QLearningAgent.__init__(self, actions=list(actions), name=name, alpha=alpha, gamma=gamma, epsilon=epsilon, explore=explore, anneal=anneal) self.num_features = num_features self.rand_init = rand_init # Add a basis feature. if rand_init: self.weights = np.random.random(self.num_features*len(self.actions)) else: self.weights = np.zeros(self.num_features*len(self.actions))
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5)
    rm_agent = RMaxAgent(mdp.get_actions())

    viz = parse_args()
    viz = "learning"  # Hard-coded override of the parsed choice.

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": size, "dest_y": size, "has_block": 0}
    blocks = [{"x": size, "y": 1}]
    # A trench of lava across the middle row (integer division keeps the coordinates ints).
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (size + 1) // 2), range(size))]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=8, height=3, init_loc=(1, 1), goal_locs=[(8, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=20, episodes=300, steps=20, open_plot=open_plot, track_success=True, success_reward=1)
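# Typical entry point for example scripts like the one above; simple_rl's
# bundled examples use a trailing "no_plot" command-line argument to suppress
# plot windows. This guard is a sketch of that convention, not required by
# the library.
if __name__ == "__main__":
    import sys
    main(open_plot=not sys.argv[-1] == "no_plot")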
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1, steps=500, reset_at_terminal=True, open_plot=open_plot)
def __init__(self, balancer_node):
    super().__init__(balancer_node)
    self.data = {}
    self.agent = QLearningAgent(
        ['NONE', 'UP', 'DOWN'],
        epsilon=0.3,
        anneal=True,
        gamma=0.3,
        alpha=0.2,
        # explore='softmax'
    )
    self._set_q()
    self.max_node_num = len(self.balancer_node.get_nodes())
    self._impossible = False
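# A hedged usage sketch of the control loop this wrapper implies: each step,
# the learner is asked for an action given the current state and the reward
# for the previous step. QLearningAgent.act(state, reward) is simple_rl's
# standard agent interface; `encode_state` and `compute_reward` are
# hypothetical helpers, not part of this class.
# state = encode_state(self.balancer_node)
# reward = compute_reward(self.balancer_node)
# action = self.agent.act(state, reward)   # one of 'NONE', 'UP', 'DOWN'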
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type (hard-coded override of the parsed choice).
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.2)
    viz = parse_args()

    # Choose viz type (hard-coded override of the parsed choice).
    viz = "value"

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem, then visualize its resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show the agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    from simple_rl.agents import RandomAgent, QLearningAgent
    random_agent = RandomAgent(actions)
    qlearner_agent = QLearningAgent(actions, gamma=gamma, explore="uniform")
    agents = [qlearner_agent, random_agent]

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {qlearner_agent.name: qlearner_agent, random_agent.name: random_agent}
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    tabular_agent = CherryQAgent(mdp, model=lambda *x: ActionValueFunction(*x, init=1.0), name='Tabular', lr=0.7)
    linear_agent = CherryQAgent(mdp, model=lambda *x: nn.Linear(*x), name='Linear', lr=0.1)
    mlp_agent = CherryQAgent(mdp, model=lambda *x: MLP(*x), name='MLP', lr=0.07)

    # Run experiment and make plot.
    agents = [rand_agent, ql_agent, tabular_agent, linear_agent, mlp_agent]
    run_agents_on_mdp(agents, mdp, instances=10, episodes=50, steps=50, open_plot=open_plot)
def plot_parameters(pars, md):
    cur_cell_rewards = [pars["white"][0], pars["yellow"][0], pars["red"][0], pars["green"][0], pars["purple"][0], -500]
    # cur_cell_rewards = pars
    print(cur_cell_rewards)
    md.mdp = NavigationWorldMDP(width=md.side, height=md.side,
                                nav_cell_types=md.nav_cell_types,
                                nav_cell_rewards=cur_cell_rewards,
                                nav_cell_p_or_locs=md.nav_cell_p_or_locs,
                                goal_cell_types=md.goal_cell_types,
                                goal_cell_rewards=md.goal_rew,
                                goal_cell_locs=md.goal_cell_loc,
                                init_loc=md.start_loc, rand_init=False,
                                gamma=0.95, slip_prob=0, step_cost=0)
    md.agent = QLearningAgent(md.mdp.get_actions(), epsilon=md.eps)
    run_single_agent_on_mdp(md.agent, md.mdp, episodes=md.episodes, steps=md.steps)

    # Evaluate the learned policy greedily and deterministically.
    md.agent.epsilon = 0
    md.mdp.slip_prob = 0
    _, steps_taken, reward, states = md.run_experiment(md.agent, md.mdp)
    print([md.count_turns(states), steps_taken, reward])
    md.mdp.visualize_grid(trajectories=[states], plot=False)
    return [md.count_turns(states), steps_taken, reward]
def func(self, *params, n_obs=100, batch_size=1, random_state=None):
    """Generate a sequence of samples by running Q-learning on the navigation MDP.

    Parameters
    ----------
    params : array-like
        Candidate cell-reward parameters, one column per batch element.
    random_state : RandomState, optional
    """
    # Fix locations instead of probabilities! Fixed map, multiple init_locs!
    rewards = []
    params = np.array(params).reshape(self.param_dim, -1)
    batches = params.shape[1]
    for i in range(batches):
        cur_cell_rewards = [x for x in params[:, i]]
        # The reward for black cells is fixed.
        cur_cell_rewards.append(-500)
        if self.prev_cell_rewards != cur_cell_rewards:
            self.mdp = NavigationWorldMDP(width=self.side, height=self.side,
                                          nav_cell_types=self.nav_cell_types,
                                          nav_cell_rewards=cur_cell_rewards,
                                          nav_cell_p_or_locs=self.nav_cell_p_or_locs,
                                          goal_cell_types=self.goal_cell_types,
                                          goal_cell_rewards=self.goal_rew,
                                          goal_cell_locs=self.goal_cell_loc,
                                          init_loc=self.start_loc,
                                          rand_init=False, slip_prob=0)
            self.agent = QLearningAgent(self.mdp.get_actions(), epsilon=self.eps)
            run_single_agent_on_mdp(self.agent, self.mdp, episodes=self.episodes, steps=self.steps)

        # Evaluate greedily, with the environment's true slip probability.
        self.agent.epsilon = 0
        self.mdp.slip_prob = self.slip
        finished, steps_taken, reward, states = self.run_experiment(self.agent, self.mdp)
        turns = self.count_turns(states)
        ep_reward = [turns, steps_taken, reward]
        rewards.append(ep_reward)
        self.prev_cell_rewards = cur_cell_rewards
    return rewards
def main(open_plot=True):
    # Setup MDP, Agents.
    markov_game = RockPaperScissorsMDP()
    ql_agent = QLearningAgent(actions=markov_game.get_actions())
    fixed_action = random.choice(markov_game.get_actions())
    fixed_agent = FixedPolicyAgent(policy=lambda s: fixed_action)

    # Run experiment and make plot.
    play_markov_game([ql_agent, fixed_agent], markov_game, instances=15, episodes=1, steps=40, open_plot=open_plot)
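# For contrast, a hedged variant against a uniformly random opponent: in
# rock-paper-scissors the uniform mixture is the Nash equilibrium, so the
# Q-learner should at best break even against it, whereas it can fully exploit
# the fixed-action opponent above. RandomAgent is the same class used
# throughout these examples.
# rand_opponent = RandomAgent(actions=markov_game.get_actions())
# play_markov_game([ql_agent, rand_opponent], markov_game, instances=15, episodes=1, steps=40)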
def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent], mdp_distr, samples=10, episodes=50, steps=100, reset_at_terminal=True, open_plot=open_plot)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent, agent_params={"epsilon": 0.9}, actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)
def main(open_plot=True):
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500, steps=40, open_plot=open_plot)
def test_utility(args, mdp):
    # Compare how the number of options affects performance.
    # TODO: Compare the utility of point options vs. subgoal options?
    now_ts = str(datetime.now().timestamp())

    origMatrix, intToS = GetAdjacencyMatrix(mdp)
    known_region = list(intToS.values())  # The known region is a set of MDPStates.

    n_ops_list = [2, 4, 8, 16, 32]
    agents = []
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    agents.append(ql_agent)

    method = 'fiedler'
    for n_ops in n_ops_list:
        _, foptions, _, fvectors = GetOption(mdp, n_ops, matrix=origMatrix, intToS=intToS, option_type=args.optiontype, method=method)
        print('#options=', n_ops)
        print(foptions)
        if args.optiontype == 'subgoal':
            known_region = list(intToS.values())  # The known region is a set of MDPStates.
            eigenoption_agent = build_subgoal_option_agent(mdp, foptions, known_region, vectors=fvectors, name='-' + method + '-' + args.optiontype + '-' + str(n_ops))
        else:
            eigenoption_agent = build_point_option_agent(mdp, foptions, agent=QLearningAgent, policy='vi', name='-' + method + '-' + args.optiontype + '-' + str(n_ops))
        agents.append(eigenoption_agent)

    run_agents_on_mdp(agents, mdp, instances=args.ninstances, episodes=args.nepisodes, steps=args.nsteps, open_plot=True, track_disc_reward=True, cumulative_plot=True, dir_for_plot="results/")
def main(open_plot=True):
    # Setup MDP.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values((lambda state: value_iter.policy(state)), (lambda state: value_iter.value_func[state]))
    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)
        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(QLearningAgent(actions=mdp.get_actions(), custom_q_init=custom_q, name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents, mdp, instances=1, episodes=100, steps=100, open_plot=open_plot, verbose=True)
def main():
    import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid
    mdp_class, is_goal_terminal, samples = parse_args()
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end=" ")
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print("done.")  # iters, value
    sys.stdout.flush()

    # Agents.
    print("Making agents...", end=" ")
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy, name=r"$\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name=r"$\pi_{avg}$")
    rand_agent = RandomAgent(actions, name=r"$\pi^u$")
    ql_agent = QLearningAgent(actions)
    print("done.")

    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100, reset_at_terminal=False, track_disc_reward=False, cumulative_plot=True)
def main():
    # Create the MDP using our own definition.
    mdp = tfeMDP()

    # Three different agents, to compare how each does against the others.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # Run everything and generate the plots and statistics summarizing how
    # each agent did.
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp, instances=200, episodes=100, steps=1000)
def get_combo_experiment_agents(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    sa, aa = get_directed_option_sa_pair(environment, indic_func=ind_funcs._q_disc_approx_indicator, max_options=100)
    sa_qds_test = get_sa(environment, indic_func=ind_funcs._q_disc_approx_indicator, epsilon=0.05)
    sa_qs_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.1)

    # QLearner.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    rmax_agent = RMaxAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)

    # Combos.
    ql_sa_qds_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": actions}, state_abstr=sa_qds_test, name_ext=r"$\phi_{Q_d^*}$")
    ql_sa_qs_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": actions}, state_abstr=sa_qs_test, name_ext=r"$\phi_{Q_\epsilon^*}$")
    # sa_agent = AbstractionWrapper(QLearningAgent, actions, str(environment), state_abstr=sa, name_ext="sa")
    aa_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": actions}, action_abstr=aa, name_ext="aa")
    sa_aa_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": actions}, state_abstr=sa, action_abstr=aa, name_ext=r"$\phi_{Q_d^*}+aa$")

    agents = [ql_agent, ql_sa_qds_agent, ql_sa_qs_agent, aa_agent, sa_aa_agent]

    return agents
def main():
    # Make MDP Distribution.
    mdp_distr = make_mdp_distr(mdp_class="four_room", grid_dim=11, slip_prob=0.05, gamma=0.99)

    # Make state abstractions for several values of beta.
    multitask_sa_beta_1 = make_multitask_sa_info_sa(mdp_distr, beta=1.0, is_deterministic_ib=True)
    multitask_sa_beta_10 = make_multitask_sa_info_sa(mdp_distr, beta=10.0, is_deterministic_ib=True)
    multitask_sa_beta_100 = make_multitask_sa_info_sa(mdp_distr, beta=100.0, is_deterministic_ib=True)
    multitask_sa_beta_1000 = make_multitask_sa_info_sa(mdp_distr, beta=1000.0, is_deterministic_ib=True)

    # Make agents.
    ql_agent = QLearningAgent(mdp_distr.get_actions())
    abstr_ql_b1 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_1, agent_params={"actions": mdp_distr.get_actions()}, name_ext="-$\\phi_{\\beta = 1}$")
    abstr_ql_b10 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_10, agent_params={"actions": mdp_distr.get_actions()}, name_ext="-$\\phi_{\\beta = 10}$")
    abstr_ql_b100 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_100, agent_params={"actions": mdp_distr.get_actions()}, name_ext="-$\\phi_{\\beta = 100}$")
    abstr_ql_b1000 = AbstractionWrapper(QLearningAgent, state_abstr=multitask_sa_beta_1000, agent_params={"actions": mdp_distr.get_actions()}, name_ext="-$\\phi_{\\beta = 1000}$")

    run_agents_lifelong([abstr_ql_b1, abstr_ql_b10, abstr_ql_b100, abstr_ql_b1000, ql_agent], mdp_distr, steps=200, samples=50, episodes=200)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = BanditMDP()
    lin_agent = LinUCBAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, lin_agent, rand_agent], mdp, instances=10, episodes=1, steps=500, open_plot=open_plot)
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for each given beta and compares
        Q-Learning with and without the abstraction.
    '''
    # Run info_sa.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001, is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions: ground state --> abstract state.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        dict_of_phi_pmfs[beta] = crisp_s_phi

        print("crisp_s_phi:")
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above:" + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))
        print("ground states:")
        for ground_states in crisp_s_phi.get_ground_states():
            print(str(type(ground_states)))
        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        ql_abstr_agent = AbstractionWrapper(QLearningAgent, state_abstr=dict_of_phi_pmfs[beta], agent_params={"actions": mdp.get_actions(), "anneal": True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print the number of abstract states per beta.
    for beta in dict_of_phi_pmfs.keys():
        print("beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_ground_states())
    print()
def _setup_agents(solar_mdp):
    '''
    Args:
        solar_mdp (SolarOOMDP)

    Returns:
        (list): of Agents
    '''
    # Get relevant MDP params.
    actions, gamma, panel_step = solar_mdp.get_actions(), solar_mdp.get_gamma(), solar_mdp.get_panel_step()

    # Setup fixed agents.
    static_agent = FixedPolicyAgent(tb.static_policy, name="fixed-panel")
    optimal_agent = FixedPolicyAgent(tb.optimal_policy, name="optimal")

    # Grena single-axis and dual-axis trackers from time/loc.
    grena_tracker = SolarTracker(tb.grena_tracker, panel_step=panel_step, dual_axis=solar_mdp.dual_axis, actions=solar_mdp.get_bandit_actions())
    grena_tracker_agent = FixedPolicyAgent(grena_tracker.get_policy(), name="grena-tracker")

    # Setup RL agents.
    alpha, epsilon = 0.1, 0.05
    rand_init = True
    num_features = solar_mdp.get_num_state_feats()
    lin_ucb_agent = LinUCBAgent(solar_mdp.get_bandit_actions(), context_size=num_features, name="lin-ucb", rand_init=rand_init, alpha=2.0)
    # sarsa_agent_g0 = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin-g0", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=0, rbf=False, anneal=True)
    # sarsa_agent = LinearSarsaAgent(actions, num_features=num_features, name="sarsa-lin", rand_init=rand_init, alpha=alpha, epsilon=epsilon, gamma=gamma, rbf=False, anneal=True)
    ql_agent = QLearningAgent(actions, alpha=alpha, epsilon=epsilon, gamma=gamma)
    random_agent = RandomAgent(actions)

    # Regular experiments.
    # agents = [lin_ucb_agent, sarsa_agent, sarsa_agent_g0, grena_tracker_agent, static_agent]
    agents = [grena_tracker_agent, static_agent]

    return agents
def main():
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=args.epsilon, alpha=args.alpha, explore=args.explore, anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values((lambda state: value_iter.policy(state)), (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem, then visualize its resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent], mdp, open_plot=True, episodes=60, steps=200, instances=5, success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show the agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
def main():
    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "hrooms"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    # Debug.
    print("\n" + ("=" * 30))
    print("== Done making abstraction. ==")
    print("=" * 30 + "\n")
    sa_stack.print_state_space_sizes()
    print("Num Action Abstractions:", len(aa_stack.get_aa_list()))

    # ===================
    # === Make Agents ===
    # ===================
    baseline_agent = QLearningAgent(actions)
    rmax_agent = RMaxAgent(actions)
    rand_agent = RandomAgent(actions)
    l0_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [l1_hierarch_agent, dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents, environment, task_samples=10, steps=1500, episodes=1, reset_at_terminal=True)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], gamma=0.95, walls=[(2, 2)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1, steps=20, open_plot=open_plot)
def reset(self):
    self.weights = np.zeros(self.num_features * len(self.actions))
    QLearningAgent.reset(self)
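# Note that the reset above zeroes the weights even for agents constructed
# with rand_init=True. A hedged alternative sketch that mirrors the
# constructor's initialization instead (assuming the attributes set in the
# __init__ methods above, and np as imported in those snippets):
def reset(self):
    if self.rand_init:
        self.weights = np.random.random(self.num_features * len(self.actions))
    else:
        self.weights = np.zeros(self.num_features * len(self.actions))
    QLearningAgent.reset(self)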