class AbstractionWrapper(Agent):

    def __init__(self,
                 SubAgentClass,
                 actions,
                 agent_params=None,
                 state_abstr=None,
                 action_abstr=None,
                 name_ext="abstr"):
        '''
        Args:
            SubAgentClass (simple_rl.AgentClass)
            actions (list of str)
            agent_params (dict): A dictionary with key=param_name, val=param_value,
                to be given to the constructor for the instance of @SubAgentClass.
            state_abstr (StateAbstraction)
            action_abstr (ActionAbstraction)
            name_ext (str)
        '''
        # Avoid the shared-mutable-default-argument pitfall.
        agent_params = {} if agent_params is None else agent_params

        # Set up the abstracted agent.
        self.agent = SubAgentClass(actions=actions, **agent_params)
        self.action_abstr = ActionAbstraction(prim_actions=self.agent.actions) \
            if action_abstr is None else action_abstr
        self.state_abstr = StateAbstraction({}) if state_abstr is None else state_abstr

        Agent.__init__(self,
                       name=self.agent.name + "-" + name_ext,
                       actions=self.action_abstr.get_actions())

    def act(self, ground_state, reward):
        '''
        Args:
            ground_state (State)
            reward (float)

        Returns:
            (str)
        '''
        abstr_state = self.state_abstr.phi(ground_state)
        ground_action = self.action_abstr.act(self.agent, abstr_state, ground_state, reward)

        return ground_action

    def reset(self):
        self.agent.reset()
        self.action_abstr.reset()

    def end_of_episode(self):
        self.agent.end_of_episode()
        self.action_abstr.end_of_episode()
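# Usage sketch (illustrative, not part of the original module): wrap a tabular
# QLearningAgent in a state abstraction and run it on a small grid world. The
# argument `phi` is a stand-in for whatever prebuilt StateAbstraction the
# caller has; the MDP and run parameters are arbitrary.
def _example_wrapper_usage(phi):
    from simple_rl.tasks import GridWorldMDP
    from simple_rl.agents import QLearningAgent
    from simple_rl.run_experiments import run_agents_on_mdp

    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
    abstr_ql = AbstractionWrapper(QLearningAgent,
                                  actions=mdp.get_actions(),
                                  agent_params={"epsilon": 0.1},
                                  state_abstr=phi,
                                  name_ext="phi")
    # The wrapper translates each ground state via phi before the sub-agent acts.
    run_agents_on_mdp([abstr_ql], mdp, instances=5, episodes=100, steps=50)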
def compute_omega_given_m_phi(mdp, state_abstr):
    '''
    Args:
        mdp (simple_rl.MDP)
        state_abstr (simple_rl.abstraction.StateAbstraction)

    Returns:
        omega (simple_rl.abstraction.ActionAbstraction)
    '''
    # Grab relevant states.
    abs_states = state_abstr.get_abs_states()

    # Compute all directed options that transition between abstract states.
    options = []
    state_pairs = {}
    placeholder_policy = lambda s: random.choice(mdp.get_actions(s))

    # For each (s_{phi,1}, s_{phi,2}) pair.
    for s_a in abs_states:
        for s_a_prime in abs_states:
            if s_a != s_a_prime and (s_a, s_a_prime) not in state_pairs and (s_a_prime, s_a) not in state_pairs:
                # Make an option to transition between the two abstract states.
                init_predicate = InListPredicate(ls=state_abstr.get_ground_states_in_abs_state(s_a))
                term_predicate = InListPredicate(ls=state_abstr.get_ground_states_in_abs_state(s_a_prime))

                o = Option(init_predicate=init_predicate,
                           term_predicate=term_predicate,
                           policy=placeholder_policy)

                options.append(o)
                state_pairs[(s_a, s_a_prime)] = 1

    # Prune redundant options.
    pruned_option_set = ah._prune_redundant_options(options, state_pairs.keys(), state_abstr, mdp)

    return ActionAbstraction(options=pruned_option_set, on_failure="primitives")
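# Usage sketch (illustrative): build the option set omega induced by a state
# abstraction. Assumes `mdp` is a simple_rl MDP and `phi` a StateAbstraction
# whose abstract states cover the MDP's state space.
def _example_compute_omega(mdp, phi):
    omega = compute_omega_given_m_phi(mdp, phi)
    # One directed option per unordered pair of distinct abstract states
    # survives pruning; on option failure, primitive actions are available.
    print(len(omega.get_actions()), "abstract actions in omega")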
class AbstractValueIteration(ValueIteration):

    def __init__(self,
                 ground_mdp,
                 state_abstr=None,
                 action_abstr=None,
                 vi_sample_rate=5,
                 max_iterations=1000,
                 amdp_sample_rate=5,
                 delta=0.001):
        '''
        Args:
            ground_mdp (simple_rl.MDP)
            state_abstr (simple_rl.StateAbstraction)
            action_abstr (simple_rl.ActionAbstraction)
            vi_sample_rate (int): Num samples per transition for running VI.
            max_iterations (int): Usual VI iteration bound.
            amdp_sample_rate (int): Num samples per abstract transition to use
                for computing R_abstract, T_abstract.
            delta (float): Convergence threshold for VI.
        '''
        self.ground_mdp = ground_mdp

        # Grab the ground state space (use the passed VI parameters rather
        # than hardcoded values).
        vi = ValueIteration(self.ground_mdp,
                            delta=delta,
                            max_iterations=max_iterations,
                            sample_rate=vi_sample_rate)
        state_space = vi.get_states()

        # Make the abstract MDP.
        self.state_abstr = state_abstr if state_abstr is not None \
            else StateAbstraction(ground_state_space=state_space)
        self.action_abstr = action_abstr if action_abstr is not None \
            else ActionAbstraction(prim_actions=ground_mdp.get_actions())
        abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(ground_mdp,
                                                   self.state_abstr,
                                                   self.action_abstr,
                                                   step_cost=0.0,
                                                   sample_rate=amdp_sample_rate)

        # Create VI with the abstract MDP (keyword args, so the parameters
        # cannot be silently bound to the wrong positions).
        ValueIteration.__init__(self, abstr_mdp,
                                sample_rate=vi_sample_rate,
                                delta=delta,
                                max_iterations=max_iterations)
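# Usage sketch (illustrative): plan at the abstract level, then hand the
# resulting policy to a FixedPolicyAgent, as get_exact_vs_approx_agents does
# below. Assumes `mdp` is a simple_rl MDP and `phi` a StateAbstraction.
def _example_abstract_vi(mdp, phi):
    avi = AbstractValueIteration(mdp, state_abstr=phi, amdp_sample_rate=10)
    avi.run_vi()
    # avi.policy maps states (through phi) to actions.
    return FixedPolicyAgent(avi.policy, name="avi-policy")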
def main():
    # MDP setting.
    lifelong = True
    mdp_class = "four_room"
    grid_dim = 11

    # Make the MDP distribution.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim)
    actions = mdp_distr.get_actions()
    experiment_type = "aa"

    # Make goal-based options and the induced action abstraction.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=actions, options=goal_based_options)

    # Visualize the action abstraction.
    visualize_options_grid(mdp_distr, goal_based_aa)

    input("Press any key to quit ")
    quit()
def make_abstr_mdp(mdp, state_abstr, action_abstr=None, step_cost=0.0, sample_rate=5, max_rollout=10):
    '''
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        step_cost (float): Cost for a step in the lower MDP.
        sample_rate (int): Sample rate for computing the abstract R and T.
        max_rollout (int): Maximum number of lower-level steps per option rollout.

    Returns:
        (MDP)
    '''
    if action_abstr is None:
        action_abstr = ActionAbstraction(prim_actions=mdp.get_actions())

    # Make abstract reward and transition functions.
    def abstr_reward_lambda(abstr_state, abstr_action):
        if abstr_state.is_terminal():
            return 0

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute the expected option reward, averaged uniformly over the
        # ground states in @abstr_state and over @sample_rate rollouts each.
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s,
                                                       lower_reward_func,
                                                       lower_trans_func,
                                                       max_rollout_depth=max_rollout,
                                                       step_cost=step_cost)
                total_reward += float(reward) / (len(lower_states) * sample_rate)

        return total_reward

    def abstr_transition_lambda(abstr_state, abstr_action):
        if abstr_state.is_terminal():
            return abstr_state

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute the next-state distribution empirically by rolling the
        # option out from each ground state in @abstr_state.
        s_prime_prob_dict = defaultdict(int)
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s,
                                                       lower_reward_func,
                                                       lower_trans_func,
                                                       max_rollout_depth=max_rollout)
                s_prime_prob_dict[s_prime] += 1.0 / (len(lower_states) * sample_rate)

        # Form the distribution and sample s_prime.
        next_state_sample_list = np.random.multinomial(1, list(s_prime_prob_dict.values())).tolist()
        end_ground_state = list(s_prime_prob_dict.keys())[next_state_sample_list.index(1)]
        end_abstr_state = state_abstr.phi(end_ground_state)

        return end_abstr_state

    # Make the components of the abstract MDP.
    abstr_init_state = state_abstr.phi(mdp.get_init_state())
    abstr_action_space = action_abstr.get_actions()
    abstr_state_space = state_abstr.get_abs_states()
    abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space, abstr_action_space)
    abstr_transition_func = TransitionFunc(abstr_transition_lambda,
                                           abstr_state_space,
                                           abstr_action_space,
                                           sample_rate=sample_rate)

    # Make the MDP.
    abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=mdp.get_gamma())

    return abstr_mdp
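# Usage sketch (illustrative): lift a ground MDP into its abstract counterpart.
# The abstract reward of (s_phi, o) above is the mean option-rollout return,
# uniform over ground states in s_phi and over @sample_rate rollouts. Assumes
# `mdp` is any simple_rl MDP and `phi` a StateAbstraction over its states.
def _example_make_abstr_mdp(mdp, phi):
    abstr_mdp = make_abstr_mdp(mdp, phi, sample_rate=10)
    # The result exposes the usual simple_rl MDP interface.
    print(abstr_mdp.get_init_state(), len(abstr_mdp.get_actions()))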
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0)
    approx_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent, actions=actions, state_abstr=exact_qds_test, name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent, actions=actions, state_abstr=approx_qds_test, name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent, actions=actions, state_abstr=exact_qds_test, name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent, actions=actions, state_abstr=approx_qds_test, name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent, actions=actions, state_abstr=exact_qds_test, name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent, actions=actions, state_abstr=approx_qds_test, name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name=r"$\pi^*$")

        sa_vi = AbstractValueIteration(environment,
                                       sample_rate=50,
                                       max_iterations=3000,
                                       delta=0.0001,
                                       state_abstr=approx_qds_test,
                                       action_abstr=ActionAbstraction(options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name=r"$\pi_\phi^*$")

        # Append to the list that is actually returned.
        ql_agents += [opt_agent, approx_opt_agent]

    # Swap the return value for @dql_agents or @rm_agents to compare those learners.
    return ql_agents
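# Usage sketch (illustrative): run the exact-vs-approximate comparison on a
# distribution of four-room MDPs, mirroring the main() above. make_mdp_distr
# and run_agents_lifelong are simple_rl helpers; the sample/episode/step
# counts here are arbitrary stand-ins.
def _example_exact_vs_approx():
    from simple_rl.run_experiments import run_agents_lifelong

    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room", grid_dim=11)
    agents = get_exact_vs_approx_agents(mdp_distr, incl_opt=True)
    run_agents_lifelong(agents, mdp_distr, samples=10, episodes=100, steps=50)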