Example #1
    def __init__(self,
                 SubAgentClass,
                 actions,
                 agent_params={},
                 state_abstr=None,
                 action_abstr=None,
                 name_ext="abstr"):
        '''
        Args:
            SubAgentClass (simple_rl.AgentClass)
            actions (list of str)
            agent_params (dict): A dictionary with key=param_name, val=param_value,
                to be given to the constructor for the instance of @SubAgentClass.
            state_abstr (StateAbstraction)
            action_abstr (ActionAbstraction)
            name_ext (str)
        '''

        # Setup the abstracted agent.
        self.agent = SubAgentClass(actions=actions, **agent_params)
        self.action_abstr = ActionAbstraction(
            prim_actions=self.agent.actions
        ) if action_abstr is None else action_abstr
        self.state_abstr = StateAbstraction(
            {}) if state_abstr is None else state_abstr

        Agent.__init__(self,
                       name=self.agent.name + "-" + name_ext,
                       actions=self.action_abstr.get_actions())
Example #2
class AbstractionWrapper(Agent):
    def __init__(self,
                 SubAgentClass,
                 actions,
                 agent_params={},
                 state_abstr=None,
                 action_abstr=None,
                 name_ext="abstr"):
        '''
        Args:
            SubAgentClass (simple_rl.AgentClass)
            actions (list of str)
            agent_params (dict): A dictionary with key=param_name, val=param_value,
                to be given to the constructor for the instance of @SubAgentClass.
            state_abstr (StateAbstraction)
            action_abstr (ActionAbstraction)
            name_ext (str)
        '''

        # Setup the abstracted agent.
        self.agent = SubAgentClass(actions=actions, **agent_params)
        self.action_abstr = ActionAbstraction(
            prim_actions=self.agent.actions
        ) if action_abstr is None else action_abstr
        self.state_abstr = StateAbstraction(
            {}) if state_abstr is None else state_abstr

        Agent.__init__(self,
                       name=self.agent.name + "-" + name_ext,
                       actions=self.action_abstr.get_actions())

    def act(self, ground_state, reward):
        '''
        Args:
            ground_state (State)
            reward (float)

        Returns:
            (str)
        '''
        abstr_state = self.state_abstr.phi(ground_state)
        ground_action = self.action_abstr.act(self.agent, abstr_state,
                                              ground_state, reward)

        return ground_action

    def reset(self):
        # Reset the sub-agent and the action abstraction.
        self.agent.reset()
        self.action_abstr.reset()

    def end_of_episode(self):
        self.agent.end_of_episode()
        self.action_abstr.end_of_episode()
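
The wrapper above is driven like any other simple_rl agent. Below is a minimal run-loop sketch (not from the example source); it assumes simple_rl's convention that mdp.execute_agent_action(action) returns a (reward, next_state) pair and that the default StateAbstraction({}) behaves as the identity on unseen states. QLearningAgent and GridWorldMDP are stand-ins.

# Hypothetical run loop for AbstractionWrapper; terminal-state handling omitted for brevity.
from simple_rl.agents import QLearningAgent
from simple_rl.tasks import GridWorldMDP

mdp = GridWorldMDP(width=5, height=3, goal_locs=[(5, 3)])
agent = AbstractionWrapper(QLearningAgent,
                           actions=mdp.get_actions(),
                           agent_params={"gamma": 0.95, "epsilon": 0.1})

for episode in range(10):
    state, reward = mdp.get_init_state(), 0
    for step in range(50):
        action = agent.act(state, reward)         # phi(state) -> sub-agent -> ground action
        reward, state = mdp.execute_agent_action(action)  # assumed (reward, next_state) order
    agent.end_of_episode()
    mdp.reset()
agent.reset()
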
Example #3
def compute_omega_given_m_phi(mdp, state_abstr):
    '''
    Args:
        mdp (simple_rl.MDP)
        state_abstr (simple_rl.abstraction.StateAbstraction)

    Returns:
        omega (simple_rl.abstraction.ActionAbstraction)
    '''
    # Grab relevant states.
    abs_states = state_abstr.get_abs_states()
    g_start_state = mdp.get_init_state()

    # Compute all directed options that transition between abstract states.
    options = []
    state_pairs = {}
    placeholder_policy = lambda s: random.choice(mdp.get_actions(s))

    # For each s_{phi,1} s_{phi,2} pair.
    for s_a in abs_states:
        for s_a_prime in abs_states:
            if not (s_a == s_a_prime) and (
                    s_a, s_a_prime) not in state_pairs.keys() and (
                        s_a_prime, s_a) not in state_pairs.keys():
                # Make an option to transition between the two states.
                init_predicate = InListPredicate(
                    ls=state_abstr.get_ground_states_in_abs_state(s_a))
                term_predicate = InListPredicate(
                    ls=state_abstr.get_ground_states_in_abs_state(s_a_prime))

                o = Option(init_predicate=init_predicate,
                           term_predicate=term_predicate,
                           policy=placeholder_policy)

                options.append(o)
                state_pairs[(s_a, s_a_prime)] = 1

    # Prune.
    pruned_option_set = ah._prune_redundant_options(options,
                                                    state_pairs.keys(),
                                                    state_abstr, mdp)

    return ActionAbstraction(options=pruned_option_set,
                             on_failure="primitives")
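
A hypothetical call site for the helper above; state_abstr is assumed to come from elsewhere (for instance the get_sa helper used in Example #8), and the AbstractionWrapper from Example #2 consumes the resulting option set:

# Hypothetical usage sketch: pair a given state abstraction with the derived
# directed-option action abstraction.
omega = compute_omega_given_m_phi(mdp, state_abstr)
agent = AbstractionWrapper(QLearningAgent,
                           actions=mdp.get_actions(),
                           agent_params={"gamma": mdp.get_gamma()},
                           state_abstr=state_abstr,
                           action_abstr=omega,
                           name_ext="phi-omega")
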
Example #4
class AbstractValueIteration(ValueIteration):
    def __init__(self,
                 ground_mdp,
                 state_abstr=None,
                 action_abstr=None,
                 vi_sample_rate=5,
                 max_iterations=1000,
                 amdp_sample_rate=5,
                 delta=0.001):
        '''
        Args:
            ground_mdp (simple_rl.MDP)
            state_abstr (simple_rl.StateAbstraction)
            action_abstr (simple_rl.ActionAbstraction)
            vi_sample_rate (int): Num samples per transition for running VI.
            max_iterations (int): Usual VI iteration bound.
            amdp_sample_rate (int): Num samples per abstract transition to use for computing R_abstract, T_abstract.
            delta (float): Convergence threshold for the abstract value iteration.
        '''
        self.ground_mdp = ground_mdp

        # Grab ground state space.
        vi = ValueIteration(self.ground_mdp,
                            delta=0.001,
                            max_iterations=1000,
                            sample_rate=5)
        state_space = vi.get_states()

        # Make the abstract MDP.
        self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction(
            ground_state_space=state_space)
        self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(
            prim_actions=ground_mdp.get_actions())
        abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(
            ground_mdp,
            self.state_abstr,
            self.action_abstr,
            step_cost=0.0,
            sample_rate=amdp_sample_rate)

        # Create VI with the abstract MDP.
        ValueIteration.__init__(self, abstr_mdp, vi_sample_rate, delta,
                                max_iterations)
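
A hypothetical way to drive this constructor end to end; run_vi() and the policy attribute are inherited from ValueIteration, matching how Example #8 uses AbstractValueIteration:

# Hypothetical usage sketch; state_abstr and action_abstr are assumed to be
# built elsewhere (e.g. via get_sa and compute_omega_given_m_phi above).
sa_vi = AbstractValueIteration(ground_mdp=mdp,
                               state_abstr=state_abstr,
                               action_abstr=action_abstr,
                               amdp_sample_rate=10)
sa_vi.run_vi()
abstr_opt_agent = FixedPolicyAgent(sa_vi.policy, name="abstr-vi")
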
Example #5
def main():

    # MDP Setting.
    lifelong = True
    mdp_class = "four_room"
    grid_dim = 11

    # Make MDP.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim)
    actions = mdp_distr.get_actions()
    experiment_type = "aa"

    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=actions,
                                      options=goal_based_options)

    # Visualize Action Abstractions.
    visualize_options_grid(mdp_distr, goal_based_aa)

    input("Press any key to quit ")
    quit()
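
The script above is presumably launched with the standard Python entry-point guard:

if __name__ == "__main__":
    main()
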
Example #6
def make_abstr_mdp(mdp,
                   state_abstr,
                   action_abstr=None,
                   step_cost=0.0,
                   sample_rate=5,
                   max_rollout=10):
    '''
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        step_cost (float): Cost for a step in the lower MDP.
        sample_rate (int): Sample rate for computing the abstract R and T.
        max_rollout (int): Max rollout depth per option when estimating the abstract R and T.

    Returns:
        (MDP)
    '''

    if action_abstr is None:
        action_abstr = ActionAbstraction(prim_actions=mdp.get_actions())

    # Make abstract reward and transition functions.
    def abstr_reward_lambda(abstr_state, abstr_action):
        if abstr_state.is_terminal():
            return 0

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute reward.
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(
                    ground_s,
                    lower_reward_func,
                    lower_trans_func,
                    max_rollout_depth=max_rollout,
                    step_cost=step_cost)
                total_reward += float(reward) / (
                    len(lower_states) * sample_rate)  # Add weighted reward.
        return total_reward

    def abstr_transition_lambda(abstr_state, abstr_action):
        is_ground_terminal = False
        for s_g in state_abstr.get_lower_states_in_abs_state(abstr_state):
            if s_g.is_terminal():
                is_ground_terminal = True
                break

        # Get relevant MDP components from the lower MDP.
        if abstr_state.is_terminal():
            return abstr_state

        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute next state distribution.
        s_prime_prob_dict = defaultdict(int)
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(
                    ground_s,
                    lower_reward_func,
                    lower_trans_func,
                    max_rollout_depth=max_rollout)
                s_prime_prob_dict[s_prime] += (
                    1.0 / (len(lower_states) * sample_rate)
                )  # Weighted average.

        # Form distribution and sample s_prime.
        next_state_sample_list = list(
            np.random.multinomial(1,
                                  list(s_prime_prob_dict.values())).tolist())
        end_ground_state = list(
            s_prime_prob_dict.keys())[next_state_sample_list.index(1)]
        end_abstr_state = state_abstr.phi(end_ground_state)
        return end_abstr_state

    # Make the components of the Abstract MDP.
    abstr_init_state = state_abstr.phi(mdp.get_init_state())
    abstr_action_space = action_abstr.get_actions()
    abstr_state_space = state_abstr.get_abs_states()
    abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space,
                                   abstr_action_space)
    abstr_transition_func = TransitionFunc(abstr_transition_lambda,
                                           abstr_state_space,
                                           abstr_action_space,
                                           sample_rate=sample_rate)

    # Make the MDP.
    abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=mdp.get_gamma())

    return abstr_mdp
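
A hypothetical end-to-end use of make_abstr_mdp: build the abstract MDP, then plan over it with the same ValueIteration class used elsewhere in these examples (the parameter values below are illustrative, and state_abstr is assumed to be built separately):

# Hypothetical usage sketch.
abstr_mdp = make_abstr_mdp(mdp,
                           state_abstr,
                           action_abstr=None,   # falls back to primitive actions
                           step_cost=0.0,
                           sample_rate=10)
abstr_vi = ValueIteration(abstr_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
abstr_vi.run_vi()
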
Example #7
def make_abstr_mdp(mdp, state_abstr, action_abstr=None, step_cost=0.0, sample_rate=5):
	'''
	Args:
		mdp (MDP)
		state_abstr (StateAbstraction)
		action_abstr (ActionAbstraction)
		step_cost (float): Cost for a step in the lower MDP.
		sample_rate (int): Sample rate for computing the abstract R and T.

	Returns:
		(MDP)
	'''

	if action_abstr is None:
		action_abstr = ActionAbstraction(prim_actions=mdp.get_actions())

	# Make abstract reward and transition functions.
	def abstr_reward_lambda(abstr_state, abstr_action):
		if abstr_state.is_terminal():
			return 0

		# Get relevant MDP components from the lower MDP.
		lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
		lower_reward_func = mdp.get_reward_func()
		lower_trans_func = mdp.get_transition_func()

		# Compute reward.
		total_reward = 0
		for ground_s in lower_states:
			for sample in range(sample_rate):
				s_prime, reward = abstr_action.rollout(ground_s, lower_reward_func, lower_trans_func, step_cost=step_cost)
				total_reward += float(reward) / (len(lower_states) * sample_rate) # Add weighted reward.

		return total_reward

	def abstr_transition_lambda(abstr_state, abstr_action):
		is_ground_terminal = False
		for s_g in state_abstr.get_lower_states_in_abs_state(abstr_state):
			if s_g.is_terminal():
				is_ground_terminal = True
				break

		# Get relevant MDP components from the lower MDP.
		if abstr_state.is_terminal():
			return abstr_state

		lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
		lower_reward_func = mdp.get_reward_func()
		lower_trans_func = mdp.get_transition_func()


		# Compute next state distribution.
		s_prime_prob_dict = defaultdict(int)
		total_reward = 0
		for ground_s in lower_states:
			for sample in range(sample_rate):
				s_prime, reward = abstr_action.rollout(ground_s, lower_reward_func, lower_trans_func)
				s_prime_prob_dict[s_prime] += (1.0 / (len(lower_states) * sample_rate)) # Weighted average.
		
		# Form distribution and sample s_prime.
		next_state_sample_list = list(np.random.multinomial(1, list(s_prime_prob_dict.values())).tolist())
		end_ground_state = list(s_prime_prob_dict.keys())[next_state_sample_list.index(1)]
		end_abstr_state = state_abstr.phi(end_ground_state)

		return end_abstr_state
	
	# Make the components of the Abstract MDP.
	abstr_init_state = state_abstr.phi(mdp.get_init_state())
	abstr_action_space = action_abstr.get_actions()
	abstr_state_space = state_abstr.get_abs_states()
	abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space, abstr_action_space)
	abstr_transition_func = TransitionFunc(abstr_transition_lambda, abstr_state_space, abstr_action_space, sample_rate=sample_rate)

	# Make the MDP.
	abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=mdp.get_gamma())

	return abstr_mdp
Example #8
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''

    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment,
                            indic_func=ind_funcs._q_eps_approx_indicator,
                            epsilon=0.0)
    approx_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_eps_approx_indicator,
                             epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=exact_qds_test,
                                         name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent,
                                          agent_params={"actions": actions},
                                          state_abstr=approx_qds_test,
                                          name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name=r"$\pi^*$")

        sa_vi = AbstractValueIteration(
            environment,
            sample_rate=50,
            max_iterations=3000,
            delta=0.0001,
            state_abstr=approx_qds_test,
            action_abstr=ActionAbstraction(
                options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name=r"$\pi_\phi^*$")

        ql_agents += [opt_agent, approx_opt_agent]

    return ql_agents
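
The returned list is presumably handed to one of simple_rl's experiment runners. A hedged sketch, assuming the run_agents_lifelong helper from simple_rl.run_experiments and an MDPDistribution environment:

# Hypothetical usage sketch.
from simple_rl.run_experiments import run_agents_lifelong

agents = get_exact_vs_approx_agents(environment, incl_opt=True)
run_agents_lifelong(agents, environment)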