Code Example #1
File: chain.py    Project: shadowkun/rl_abstraction
def __init__(self, gamma, kappa=0.001):
    # Initialize the underlying MDP with the chain's action set, its
    # transition and reward functions, and state 1 as the start state.
    MDP.__init__(self,
                 BadChainMDP.ACTIONS,
                 self._transition_func,
                 self._reward_func,
                 init_state=ChainState(1),
                 gamma=gamma)
    self.num_states = 4
    self.kappa = kappa
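A minimal usage sketch for this constructor, assuming BadChainMDP is the enclosing MDP subclass defined in chain.py (the gamma value below is an arbitrary example, not taken from the snippet):

# Hypothetical usage; BadChainMDP, MDP, and ChainState come from the
# surrounding chain.py / simple_rl code, not from this snippet alone.
mdp = BadChainMDP(gamma=0.95)

# The chain starts in ChainState(1), has 4 states, and stores kappa.
print(mdp.get_init_state(), mdp.num_states, mdp.kappa)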
Code Example #2
def make_option_policy(mdp, init_states, goal_states):
    '''
    Args:
        mdp (MDP)
        init_states (list of State): States in which the option can initiate.
        goal_states (list of State): States treated as terminal subgoals.

    Returns:
        (lambda): The option's policy.
    '''

    def goal_new_trans_func(s, a):
        # Temporarily mark goal states as terminal so transitions in the
        # mini MDP treat them as subgoals, then restore the original flag.
        original = s.is_terminal()
        s.set_terminal(s in goal_states) # or original)
        s_prime = mdp.get_transition_func()(s, a)
        s_prime.set_terminal(s_prime in goal_states)
        s.set_terminal(original)
        return s_prime

    # Build a mini MDP whose terminal states are the goal states and whose
    # reward is a uniform step cost of -1.
    mini_mdp = MDP(actions=mdp.get_actions(),
                   init_state=mdp.get_init_state(),
                   transition_func=goal_new_trans_func,
                   reward_func=lambda x, y, z: -1)

    o_policy, _ = _make_mini_mdp_option_policy(mini_mdp)

    return o_policy
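A hedged usage sketch: the mdp, init_states, and goal_states below are placeholders the caller must construct elsewhere, and the returned object is assumed to be a policy mapping ground states to primitive actions, as the "(lambda)" return annotation suggests:

# Hypothetical usage of make_option_policy; all inputs are placeholders.
option_policy = make_option_policy(mdp, init_states, goal_states)

# Assuming the returned lambda maps a state to a primitive action.
a = option_policy(mdp.get_init_state())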
Code Example #3
from collections import defaultdict

# (MDP and ValueIteration are imported from the surrounding project.)


def compute_avg_mdp(mdp_distr, sample_rate=5):
    '''
    Args:
        mdp_distr (MDPDistribution): Distribution over MDPs.
        sample_rate (int): Number of next-state samples drawn per (s, a) pair.

    Returns:
        (MDP): An MDP whose reward function is the probability-weighted
            average over the MDPs in the distribution.
    '''

    # Get shared components: the distribution exposes a single initial
    # state, action set, and discount factor for all of its MDPs.
    init_state = mdp_distr.get_init_state()
    actions = mdp_distr.get_actions()
    gamma = mdp_distr.get_gamma()
    # Borrow the transition function of the first MDP in the distribution.
    T = mdp_distr.get_all_mdps()[0].get_transition_func()

    # Compute avg reward.
    avg_rew = defaultdict(lambda: defaultdict(float))
    avg_trans_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(
        float)))  # Stores T_i(s,a,s') * Pr(M_i)
    for mdp in mdp_distr.get_mdps():
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a vi instance to compute state space.
        vi = ValueIteration(mdp,
                            delta=0.0001,
                            max_iterations=2000,
                            sample_rate=sample_rate)
        iters, value = vi.run_vi()
        states = vi.get_states()

        for s in states:
            for a in actions:
                r = mdp.reward_func(s, a)

                avg_rew[s][a] += prob_of_mdp * r

                for repeat in range(sample_rate):
                    s_prime = mdp.transition_func(s, a)
                    avg_trans_counts[s][a][s_prime] += prob_of_mdp

    avg_trans_probs = defaultdict(
        lambda: defaultdict(lambda: defaultdict(float)))
    for s in avg_trans_counts.keys():
        for a in actions:
            for s_prime in avg_trans_counts[s][a].keys():
                avg_trans_probs[s][a][s_prime] = avg_trans_counts[s][a][
                    s_prime] / sum(avg_trans_counts[s][a].values())

    def avg_rew_func(s, a):
        return avg_rew[s][a]

    # Note: avg_trans_probs is computed above but not used; the averaged
    # MDP reuses the transition function of the first MDP (T).
    avg_trans_func = T
    avg_mdp = MDP(actions, avg_trans_func, avg_rew_func, init_state, gamma)

    return avg_mdp
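A hedged sketch of how the averaged MDP might be consumed, assuming mdp_distr is a distribution object exposing the methods called above (the ValueIteration settings mirror the ones used inside compute_avg_mdp):

# Hypothetical usage: average the distribution, then plan in the result.
avg_mdp = compute_avg_mdp(mdp_distr, sample_rate=5)
vi = ValueIteration(avg_mdp, delta=0.0001, max_iterations=2000)
iters, value = vi.run_vi()

# The averaged MDP exposes the same interface as its component MDPs.
s = avg_mdp.get_init_state()
a = avg_mdp.get_actions()[0]
r = avg_mdp.reward_func(s, a)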
Code Example #4
from collections import defaultdict

import numpy as np

# (MDP, ActionAbstraction, RewardFunc, and TransitionFunc are imported
# from the surrounding project.)


def make_abstr_mdp(mdp,
                   state_abstr,
                   action_abstr=None,
                   step_cost=0.0,
                   sample_rate=5,
                   max_rollout=10):
    '''
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        step_cost (float): Cost for a step in the lower MDP.
        sample_rate (int): Sample rate for computing the abstract R and T.
        max_rollout (int): Maximum rollout depth for each abstract action.

    Returns:
        (MDP)
    '''

    if action_abstr is None:
        action_abstr = ActionAbstraction(prim_actions=mdp.get_actions())

    # Make abstract reward and transition functions.
    def abstr_reward_lambda(abstr_state, abstr_action):
        if abstr_state.is_terminal():
            return 0

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute reward.
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(
                    ground_s,
                    lower_reward_func,
                    lower_trans_func,
                    max_rollout_depth=max_rollout,
                    step_cost=step_cost)
                total_reward += float(reward) / (
                    len(lower_states) * sample_rate)  # Add weighted reward.
        return total_reward

    def abstr_transition_lambda(abstr_state, abstr_action):
        # Check whether any ground state in this abstract state is terminal.
        # (Note: is_ground_terminal is currently computed but not used.)
        is_ground_terminal = False
        for s_g in state_abstr.get_lower_states_in_abs_state(abstr_state):
            if s_g.is_terminal():
                is_ground_terminal = True
                break

        # Terminal abstract states transition to themselves.
        if abstr_state.is_terminal():
            return abstr_state

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute the next-state distribution as a weighted average of
        # sampled rollouts from each ground state in the abstract state.
        s_prime_prob_dict = defaultdict(float)
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(
                    ground_s,
                    lower_reward_func,
                    lower_trans_func,
                    max_rollout_depth=max_rollout)
                s_prime_prob_dict[s_prime] += (
                    1.0 / (len(lower_states) * sample_rate)
                )  # Weighted average.

        # Form distribution and sample s_prime.
        next_state_sample_list = list(
            np.random.multinomial(1,
                                  list(s_prime_prob_dict.values())).tolist())
        end_ground_state = list(
            s_prime_prob_dict.keys())[next_state_sample_list.index(1)]
        end_abstr_state = state_abstr.phi(end_ground_state)
        return end_abstr_state

    # Make the components of the Abstract MDP.
    abstr_init_state = state_abstr.phi(mdp.get_init_state())
    abstr_action_space = action_abstr.get_actions()
    abstr_state_space = state_abstr.get_abs_states()
    abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space,
                                   abstr_action_space)
    abstr_transition_func = TransitionFunc(abstr_transition_lambda,
                                           abstr_state_space,
                                           abstr_action_space,
                                           sample_rate=sample_rate)

    # Make the MDP.
    abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=mdp.get_gamma())

    return abstr_mdp
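A hedged usage sketch for this version, assuming state_abstr is a StateAbstraction built elsewhere; when action_abstr is omitted, the function falls back to an ActionAbstraction over the ground MDP's primitive actions:

# Hypothetical usage: lift a ground MDP through a state abstraction.
abstr_mdp = make_abstr_mdp(mdp, state_abstr, step_cost=0.0, sample_rate=5)

# The abstract MDP exposes the usual MDP interface used in this file.
s_a = abstr_mdp.get_init_state()
a_a = abstr_mdp.get_actions()[0]
s_a_prime = abstr_mdp.transition_func(s_a, a_a)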
Code Example #5
from collections import defaultdict

import numpy as np

# (MDP, ValueIteration, RewardFunc, and TransitionFunc are imported from
# the surrounding project.)


def make_abstr_mdp(mdp, state_abstr, action_abstr, sample_rate=25):
    '''
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        sample_rate (int): Sample rate for computing the abstract R and T.

    Returns:
        (MDP)
    '''

    # Grab the ground state space (computed but not used below).
    vi = ValueIteration(mdp)
    state_space = vi.get_states()

    # Make abstract reward and transition functions.
    def abstr_reward_lambda(abstr_state, abstr_action):
        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute reward.
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s,
                                                       lower_reward_func,
                                                       lower_trans_func)
                total_reward += float(reward) / (
                    len(lower_states) * sample_rate)  # Add weighted reward.

        # print "~"*20
        # print "R_A:", abstr_state, abstr_action, total_reward
        # print "~"*20

        return total_reward

    def abstr_transition_lambda(abstr_state, abstr_action):
        # print "Abstr Transition Func:"
        # print "\t abstr_state:", abstr_state
        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute the next-state distribution as a weighted average of
        # sampled rollouts from each ground state in the abstract state.
        s_prime_prob_dict = defaultdict(float)
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s,
                                                       lower_reward_func,
                                                       lower_trans_func)
                s_prime_prob_dict[s_prime] += (
                    1.0 / (len(lower_states) * sample_rate)
                )  # Weighted average.

        # Form distribution and sample s_prime.
        end_ground_state = list(s_prime_prob_dict.keys())[list(
            np.random.multinomial(
                1, list(s_prime_prob_dict.values())).tolist()).index(1)]
        end_abstr_state = state_abstr.phi(end_ground_state,
                                          level=abstr_state.get_level())

        return end_abstr_state

    # Make the components of the MDP.
    abstr_init_state = state_abstr.phi(mdp.get_init_state())
    abstr_action_space = action_abstr.get_actions()
    abstr_state_space = state_abstr.get_abs_states()
    abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space,
                                   abstr_action_space)
    abstr_transition_func = TransitionFunc(abstr_transition_lambda,
                                           abstr_state_space,
                                           abstr_action_space,
                                           sample_rate=sample_rate)

    # Make the MDP. Note that gamma is hardcoded to 0.5 here rather than
    # taken from the ground MDP.
    abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=0.5)

    return abstr_mdp
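Compared with Code Example #4, this variant requires an explicit action_abstr, samples more heavily by default, and fixes the discount factor at 0.5. A hedged usage sketch under the same placeholder assumptions as above:

# Hypothetical usage: both abstractions must be supplied in this variant.
abstr_mdp = make_abstr_mdp(mdp, state_abstr, action_abstr, sample_rate=25)

# Unlike Code Example #4, the resulting discount factor is the hardcoded
# 0.5 rather than mdp.get_gamma().
print(abstr_mdp.get_gamma())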