Example #1
def compute_sub_opt_func_for_mdp_distr(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (list): Contains the suboptimality function for each MDP in mdp_distr.
            subopt: V^*(s) - Q^*(s,a)
    '''
    actions = mdp_distr.get_actions()
    sub_opt_funcs = []

    i = 0
    for mdp in mdp_distr.get_mdps():
        print("\t mdp", i + 1, "of", mdp_distr.get_num_mdps())
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()

        new_sub_opt_func = defaultdict(float)
        for s in vi.get_states():
            max_q = float("-inf")
            for a in actions:
                next_q = vi.get_q_value(s, a)
                if next_q > max_q:
                    max_q = next_q

            for a in actions:
                new_sub_opt_func[(s, a)] = max_q - vi.get_q_value(s, a)

        sub_opt_funcs.append(new_sub_opt_func)
        i += 1

    return sub_opt_funcs
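A minimal usage sketch for the function above (not from the original source). The import paths, the GridWorldMDP keyword arguments, and the MDPDistribution constructor taking a dict of MDPs to probabilities are assumptions about simple_rl; accessor names such as get_mdps/get_num_mdps vary across versions, and the snippet above additionally relies on collections.defaultdict and ValueIteration being imported elsewhere.

# Hypothetical usage sketch; import paths and constructor signatures are assumptions.
from simple_rl.tasks import GridWorldMDP
from simple_rl.mdp import MDPDistribution

# Two grid worlds that differ only in their goal location.
mdp_a = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
mdp_b = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 1)])
mdp_distr = MDPDistribution({mdp_a: 0.5, mdp_b: 0.5})

sub_opt_funcs = compute_sub_opt_func_for_mdp_distr(mdp_distr)

# Inspect the suboptimality gap V*(s) - Q*(s, a) for a few (state, action) pairs of the first MDP.
for (s, a), gap in list(sub_opt_funcs[0].items())[:5]:
    print(s, a, round(gap, 3))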
Example #2
def make_goal_based_options(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (set): Contains Option instances.
    '''

    goal_list = set([])
    for mdp in mdp_distr.get_all_mdps():
        vi = ValueIteration(mdp)
        state_space = vi.get_states()
        for s in state_space:
            if s.is_terminal():
                goal_list.add(s)

    options = set([])
    for mdp in mdp_distr.get_all_mdps():

        init_predicate = Predicate(func=lambda x: True)
        term_predicate = InListPredicate(ls=goal_list)
        o = Option(init_predicate=init_predicate,
                    term_predicate=term_predicate,
                    policy=_make_mini_mdp_option_policy(mdp),
                    term_prob=0.0)
        options.add(o)

    return options
Example #3
def make_multitask_sa(mdp_distr,
                      state_class=State,
                      indic_func=ind_funcs._q_eps_approx_indicator,
                      epsilon=0.0,
                      aa_single_act=True,
                      track_act_opt_pr=False):
    '''
    Args:
        mdp_distr (MDPDistribution)
        state_class (Class)
        indic_func (S x S --> {0,1})
        epsilon (float)
        aa_single_act (bool): Whether to track optimal actions.

    Returns:
        (StateAbstraction)
    '''
    sa_list = []
    for mdp in mdp_distr.get_mdps():
        sa = make_singletask_sa(mdp,
                                indic_func,
                                state_class,
                                epsilon,
                                aa_single_act=aa_single_act,
                                prob_of_mdp=mdp_distr.get_prob_of_mdp(mdp),
                                track_act_opt_pr=track_act_opt_pr)
        sa_list += [sa]

    mdp = mdp_distr.get_all_mdps()[0]
    vi = ValueIteration(mdp)
    ground_states = vi.get_states()
    multitask_sa = merge_state_abstr(sa_list, ground_states)

    return multitask_sa
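A hedged usage sketch for make_multitask_sa (not from the original source); mdp_distr is assumed to be an existing MDPDistribution, and the two abstraction accessors used here appear in the make_singletask_sa examples further down.

# Sketch: merge the per-MDP abstractions into one multitask abstraction and report its size.
multitask_sa = make_multitask_sa(mdp_distr, epsilon=0.05)
print("Ground states:", len(multitask_sa.get_ground_states()))
print("Abstract states:", multitask_sa.get_num_abstr_states())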
Example #4
def make_goal_based_options(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (set): Contains Option instances.
    '''

    goal_list = set([])
    for mdp in mdp_distr.get_all_mdps():
        vi = ValueIteration(mdp)
        state_space = vi.get_states()
        for s in state_space:
            if s.is_terminal():
                goal_list.add(s)

    options = set([])
    for mdp in mdp_distr.get_all_mdps():

        init_predicate = Predicate(func=lambda x: True)
        term_predicate = InListPredicate(ls=goal_list)
        o = Option(init_predicate=init_predicate,
                   term_predicate=term_predicate,
                   policy=_make_mini_mdp_option_policy(mdp),
                   term_prob=0.0)
        options.add(o)

    return options
Example #5
def compute_avg_mdp(mdp_distr, sample_rate=5):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (MDP)
    '''

    # Get normal components.
    init_state = mdp_distr.get_init_state()
    actions = mdp_distr.get_actions()
    gamma = mdp_distr.get_gamma()
    T = mdp_distr.get_all_mdps()[0].get_transition_func()

    # Compute avg reward.
    avg_rew = defaultdict(lambda: defaultdict(float))
    avg_trans_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(
        float)))  # Stores T_i(s,a,s') * Pr(M_i)
    for mdp in mdp_distr.get_mdps():
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a vi instance to compute state space.
        vi = ValueIteration(mdp,
                            delta=0.0001,
                            max_iterations=2000,
                            sample_rate=sample_rate)
        iters, value = vi.run_vi()
        states = vi.get_states()

        for s in states:
            for a in actions:
                r = mdp.reward_func(s, a)

                avg_rew[s][a] += prob_of_mdp * r

                for repeat in range(sample_rate):
                    s_prime = mdp.transition_func(s, a)
                    avg_trans_counts[s][a][s_prime] += prob_of_mdp

    avg_trans_probs = defaultdict(
        lambda: defaultdict(lambda: defaultdict(float)))
    for s in avg_trans_counts.keys():
        for a in actions:
            for s_prime in avg_trans_counts[s][a].keys():
                avg_trans_probs[s][a][s_prime] = avg_trans_counts[s][a][
                    s_prime] / sum(avg_trans_counts[s][a].values())

    def avg_rew_func(s, a):
        return avg_rew[s][a]

    avg_trans_func = T
    avg_mdp = MDP(actions, avg_trans_func, avg_rew_func, init_state, gamma)

    return avg_mdp
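A hedged sketch of how the averaged MDP might be used downstream, reusing only calls that appear elsewhere in these examples (ValueIteration, run_vi, policy, get_init_state); mdp_distr is assumed to be an existing MDPDistribution.

# Sketch: plan in the average MDP and act greedily with respect to it.
avg_mdp = compute_avg_mdp(mdp_distr, sample_rate=5)
avg_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000)
iters, value = avg_vi.run_vi()
s0 = avg_mdp.get_init_state()
print("VI iterations:", iters)
print("Greedy action at the initial state:", avg_vi.policy(s0))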
Example #6
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp, delta=0.001, max_iterations=1000, sample_rate=10)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy, mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action
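A short hedged usage note (not from the original source): the returned callable maps states of the mini-MDP to its greedy action, so an Option built from it can be stepped like any policy; mini_mdp is assumed to be an existing MDP.

# Sketch: query the option policy at the mini-MDP's initial state.
option_policy = _make_mini_mdp_option_policy(mini_mdp)
print("Option action at s0:", option_policy(mini_mdp.get_init_state()))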
Example #7
def make_random_sa_stack(mdp_distr, cluster_size_ratio=0.5, max_num_levels=2):
    '''
    Args:
        mdp_distr (MDPDistribution)
        cluster_size_ratio (float): A float in (0,1) that determines the size of the abstract state space.
        max_num_levels (int): Determines the _total_ number of levels in the hierarchy (includes ground).

    Returns:
        (StateAbstraction)
    '''

    # Get ground state space.
    vi = ValueIteration(mdp_distr.get_all_mdps()[0],
                        delta=0.0001,
                        max_iterations=5000)
    ground_state_space = vi.get_states()
    sa_stack = StateAbstractionStack(list_of_phi=[])

    # Each loop adds a stack.
    for i in range(max_num_levels - 1):

        # Grab current state space (at level i).
        cur_state_space = _get_level_i_state_space(ground_state_space,
                                                   sa_stack, i)
        cur_state_space_size = len(cur_state_space)

        if int(cur_state_space_size * cluster_size_ratio) <= 1:
            # The abstraction is as small as it can get.
            break

        # Add the mapping.
        new_phi = {}
        for s in cur_state_space:
            new_phi[s] = HierarchyState(data=random.randint(
                1, max(int(cur_state_space_size * cluster_size_ratio), 1)),
                                        level=i + 1)

        if len(set(new_phi.values())) <= 1:
            # The abstraction is as small as it can get.
            break

        # Add the sa to the stack.
        sa_stack.add_phi(new_phi)

    return sa_stack
Example #8
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        (tuple): Policy (a state --> action function) and the ValueIteration instance used to compute it.
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp,
                                 delta=0.005,
                                 max_iterations=1000,
                                 sample_rate=30)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy,
                                          mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action, mini_mdp_vi
Example #9
    # NOTE: __init__ of a ValueIteration subclass; the class header is not part of this snippet.
    def __init__(self,
                 ground_mdp,
                 state_abstr=None,
                 action_abstr=None,
                 vi_sample_rate=5,
                 max_iterations=1000,
                 amdp_sample_rate=5,
                 delta=0.001):
        '''
        Args:
            ground_mdp (simple_rl.MDP)
            state_abstr (simple_rl.StateAbstraction)
            action_abstr (simple_rl.ActionAbstraction)
            vi_sample_rate (int): Num samples per transition for running VI.
            max_iterations (int): Usual VI # Iteration bound.
            amdp_sample_rate (int): Num samples per abstract transition to use for computing R_abstract, T_abstract.
        '''
        self.ground_mdp = ground_mdp

        # Grab ground state space.
        vi = ValueIteration(self.ground_mdp,
                            delta=0.001,
                            max_iterations=1000,
                            sample_rate=5)
        state_space = vi.get_states()

        # Make the abstract MDP.
        self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction(
            ground_state_space=state_space)
        self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(
            prim_actions=ground_mdp.get_actions())
        abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(
            ground_mdp,
            self.state_abstr,
            self.action_abstr,
            step_cost=0.0,
            sample_rate=amdp_sample_rate)

        # Create VI with the abstract MDP.
        ValueIteration.__init__(self, abstr_mdp, vi_sample_rate, delta,
                                max_iterations)
Example #10
def get_distance(mdp, epsilon=0.05):
    '''
    Args:
        mdp (MDP)
        epsilon (float)

    Returns:
        (tuple): A state-->index map, an index-->state map, and a pairwise state-distance matrix (numpy array).
    '''

    vi = ValueIteration(mdp)
    vi.run_vi()
    vstar = vi.value_func  # dictionary of state -> float

    states = vi.get_states()  # list of state

    distance = defaultdict(lambda: defaultdict(float))

    v_df = ValueIterationDist(mdp, vstar)
    v_df.run_vi()
    d_to_s = v_df.distance
    for t in states:
        for s in states:
            distance[t][s] = max(d_to_s[t] - 1, 0)

    for s in states:  # s: state
        vis = ValueIterationDist(mdp, vstar)
        vis.add_fixed_val(s, vstar[s])
        vis.run_vi()
        d_to_s = vis.distance
        for t in states:
            distance[t][s] = min(d_to_s[t], distance[t][s])

    sToInd = OrderedDict()
    indToS = OrderedDict()
    for i, s in enumerate(states):
        sToInd[s] = i
        indToS[i] = s

    d = np.zeros((len(states), len(states)), dtype=int)
    # print "type(d)=", type(d)
    # print "d.shape=", d.shape
    for s in states:
        for t in states:
            # print 's, t=', index[s], index[t]
            d[sToInd[s]][sToInd[t]] = distance[s][t]

    return sToInd, indToS, d
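A hedged usage sketch for get_distance (not from the original source); mdp is assumed to be an existing MDP, and the snippet above additionally relies on numpy (np), collections.OrderedDict, and ValueIterationDist being imported elsewhere.

# Sketch: build the distance matrix and read off the row for the initial state.
sToInd, indToS, d = get_distance(mdp, epsilon=0.05)
s0 = mdp.get_init_state()
print("Distance-matrix row for the initial state:", d[sToInd[s0]])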
Example #11
def compute_optimal_stoch_policy(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (lambda)
    '''

    # Key: state
    # Val: dict
    # Key: action
    # Val: probability
    policy_dict = defaultdict(lambda: defaultdict(float))

    # Compute optimal policy for each MDP.
    for mdp in mdp_distr.get_all_mdps():
        # Solve the MDP and get the optimal policy.
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()
        vi_policy = vi.policy
        states = vi.get_states()

        # Compute the probability each action is optimal in each state.
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)
        for s in states:
            a_star = vi_policy(s)
            policy_dict[s][a_star] += prob_of_mdp

    # Create the lambda.
    def policy_from_dict(state):
        action_id = np.random.multinomial(
            1, list(policy_dict[state].values())).tolist().index(1)
        action = list(policy_dict[state].keys())[action_id]

        return action

    return policy_from_dict
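A hedged usage sketch (not from the original source); mdp_distr is assumed to be an existing MDPDistribution, and get_init_state is the accessor used in compute_avg_mdp above.

# Sketch: sample an action from the optimal stochastic (mixture) policy.
prior_policy = compute_optimal_stoch_policy(mdp_distr)
s0 = mdp_distr.get_init_state()
print("Sampled prior action at s0:", prior_policy(s0))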
Example #12
    # NOTE: __init__ of a ValueIteration subclass; the class header is not part of this snippet.
    def __init__(self, ground_mdp, state_abstr=None, action_abstr=None, vi_sample_rate=5, max_iterations=1000, amdp_sample_rate=5, delta=0.001):
        '''
        Args:
            ground_mdp (simple_rl.MDP)
            state_abstr (simple_rl.StateAbstraction)
            action_abstr (simple_rl.ActionAbstraction)
            vi_sample_rate (int): Num samples per transition for running VI.
            max_iterations (int): Usual VI # Iteration bound.
            amdp_sample_rate (int): Num samples per abstract transition to use for computing R_abstract, T_abstract.
        '''
        self.ground_mdp = ground_mdp
    
        # Grab ground state space.
        vi = ValueIteration(self.ground_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
        state_space = vi.get_states()

        # Make the abstract MDP.
        self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction(ground_state_space=state_space)
        self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(prim_actions=ground_mdp.get_actions())
        abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr, step_cost=0.0, sample_rate=amdp_sample_rate)

        # Create VI with the abstract MDP.
        ValueIteration.__init__(self, abstr_mdp, vi_sample_rate, delta, max_iterations)
Example #13
def make_multitask_sa(mdp_distr, state_class=State, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0, aa_single_act=True, track_act_opt_pr=False):
    '''
    Args:
        mdp_distr (MDPDistribution)
        state_class (Class)
        indic_func (S x S --> {0,1})
        epsilon (float)
        aa_single_act (bool): Whether to track optimal actions.

    Returns:
        (StateAbstraction)
    '''
    sa_list = []
    for mdp in mdp_distr.get_mdps():
        sa = make_singletask_sa(mdp, indic_func, state_class, epsilon, aa_single_act=aa_single_act, prob_of_mdp=mdp_distr.get_prob_of_mdp(mdp), track_act_opt_pr=track_act_opt_pr)
        sa_list += [sa]

    mdp = mdp_distr.get_all_mdps()[0]
    vi = ValueIteration(mdp)
    ground_states = vi.get_states()
    multitask_sa = merge_state_abstr(sa_list, ground_states)

    return multitask_sa
Example #14
def main():

    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q Learning
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy,
                                         actions=actions,
                                         name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy,
                                          actions=actions,
                                          name="AvgMDP-QLearning")

    # RMax
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(
        shaping_policy=opt_stoch_policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(
        shaping_policy=avg_mdp_vi.policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [
            rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp,
            prune_rmax_agent
        ]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=200,
                          is_rec_disc_reward=False,
                          verbose=True)
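The snippet above defines main() but omits the entry point; the usual guard would be:

if __name__ == "__main__":
    main()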
Example #15
def make_singletask_sa(mdp,
                       indic_func,
                       state_class,
                       epsilon=0.0,
                       aa_single_act=False,
                       prob_of_mdp=1.0,
                       track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''

    print("\tRunning VI...", end="")
    sys.stdout.flush()
    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()
    print(" done.")

    print("\tMaking state abstraction...", end="")
    sys.stdout.flush()
    sa = StateAbstraction(phi={},
                          state_class=state_class,
                          track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()
    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(
                    state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...", end="")
    sys.stdout.flush()

    # Build SA.
    for i, state in enumerate(clusters.keys()):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)

    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
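A hedged usage sketch for make_singletask_sa (not from the original source); mdp, State, and ind_funcs are assumed to be available as in the make_multitask_sa examples above.

# Sketch: compress a single MDP with the Q*-epsilon indicator and report the result.
sa = make_singletask_sa(mdp,
                        indic_func=ind_funcs._q_eps_approx_indicator,
                        state_class=State,
                        epsilon=0.05)
print("Compression:", len(sa.get_ground_states()), "->", sa.get_num_abstr_states())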
Example #16
class BeliefUpdater(object):
    ''' Wrapper class for different methods for belief state updates in POMDPs. '''

    def __init__(self, mdp, transition_func, reward_func, observation_func, updater_type='discrete'):
        '''
        Args:
            mdp (POMDP)
            transition_func: T(s, a) --> s'
            reward_func: R(s, a) --> float
            observation_func: O(s, a) --> z
            updater_type (str)
        '''
        self.reward_func = reward_func
        self.updater_type = updater_type

        # We use the ValueIteration class to construct the transition and observation probabilities
        self.vi = ValueIteration(mdp, sample_rate=500)

        self.transition_probs = self.construct_transition_matrix(transition_func)
        self.observation_probs = self.construct_observation_matrix(observation_func, transition_func)

        if updater_type == 'discrete':
            self.updater = self.discrete_filter_updater
        elif updater_type == 'kalman':
            self.updater = self.kalman_filter_updater
        elif updater_type == 'particle':
            self.updater = self.particle_filter_updater
        else:
            raise AttributeError('updater_type {} did not conform to expected type'.format(updater_type))

    def discrete_filter_updater(self, belief, action, observation):
        def _compute_normalization_factor(bel):
            return sum(bel.values())

        def _update_belief_for_state(b, sp, T, O, a, z):
            return O[sp][z] * sum([T[s][a][sp] * b[s] for s in b])

        new_belief = defaultdict()
        for sprime in belief:
            new_belief[sprime] = _update_belief_for_state(belief, sprime, self.transition_probs, self.observation_probs, action, observation)

        normalization = _compute_normalization_factor(new_belief)

        for sprime in belief:
            if normalization > 0: new_belief[sprime] /= normalization

        return new_belief

    def kalman_filter_updater(self, belief, action, observation):
        pass

    def particle_filter_updater(self, belief, action, observation):
        pass

    def construct_transition_matrix(self, transition_func):
        '''
        Create an MLE of the transition probabilities by sampling from the transition_func
        multiple times.
        Args:
            transition_func: T(s, a) -> s'

        Returns:
            transition_probabilities (defaultdict): T(s, a, s') --> float
        '''
        self.vi._compute_matrix_from_trans_func()
        return self.vi.trans_dict

    def construct_observation_matrix(self, observation_func, transition_func):
        '''
        Create an MLE of the observation probabilities by sampling from the observation_func
        multiple times.
        Args:
            observation_func: O(s, a) -> z
            transition_func: T(s, a) -> s'

        Returns:
            observation_probabilities (defaultdict): O(s, z) --> float
        '''
        def normalize_probabilities(odict):
            norm_factor = sum(odict.values())
            for obs in odict:
                odict[obs] /= norm_factor
            return odict

        obs_dict = defaultdict(lambda:defaultdict(float))
        for state in self.vi.get_states():
            for action in self.vi.mdp.actions:
                for sample in range(self.vi.sample_rate):
                    observation = observation_func(state, action)
                    next_state = transition_func(state, action)
                    obs_dict[next_state][observation] += 1. / self.vi.sample_rate
        for state in self.vi.get_states():
            obs_dict[state] = normalize_probabilities(obs_dict[state])
        return obs_dict
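The discrete_filter_updater above is a standard discrete Bayes filter: b'(s') is proportional to O(s', z) * sum_s T(s, a, s') * b(s), followed by normalization. A tiny self-contained numerical sketch of the same update (hypothetical two-state example, not from the original source):

# Two states, one action 'a', one observation 'z'.
T = {'s0': {'a': {'s0': 0.8, 's1': 0.2}}, 's1': {'a': {'s0': 0.1, 's1': 0.9}}}
O = {'s0': {'z': 0.6}, 's1': {'z': 0.3}}
b = {'s0': 0.5, 's1': 0.5}

# Same update as discrete_filter_updater: predict with T, weight by O, normalize.
new_b = {sp: O[sp]['z'] * sum(T[s]['a'][sp] * b[s] for s in b) for sp in b}
norm = sum(new_b.values())
new_b = {sp: (v / norm if norm > 0 else v) for sp, v in new_b.items()}
print(new_b)  # roughly {'s0': 0.62, 's1': 0.38}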
Example #17
class BeliefUpdater(object):
    ''' Wrapper class for different methods for belief state updates in POMDPs. '''
    def __init__(self,
                 mdp,
                 transition_func,
                 reward_func,
                 observation_func,
                 updater_type='discrete'):
        '''
        Args:
            mdp (POMDP)
            transition_func: T(s, a) --> s'
            reward_func: R(s, a) --> float
            observation_func: O(s, a) --> z
            updater_type (str)
        '''
        self.reward_func = reward_func
        self.updater_type = updater_type

        # We use the ValueIteration class to construct the transition and observation probabilities
        self.vi = ValueIteration(mdp, sample_rate=500)

        self.transition_probs = self.construct_transition_matrix(
            transition_func)
        self.observation_probs = self.construct_observation_matrix(
            observation_func, transition_func)

        if updater_type == 'discrete':
            self.updater = self.discrete_filter_updater
        elif updater_type == 'kalman':
            self.updater = self.kalman_filter_updater
        elif updater_type == 'particle':
            self.updater = self.particle_filter_updater
        else:
            raise AttributeError(
                'updater_type {} did not conform to expected type'.format(
                    updater_type))

    def discrete_filter_updater(self, belief, action, observation):
        def _compute_normalization_factor(bel):
            return sum(bel.values())

        def _update_belief_for_state(b, sp, T, O, a, z):
            return O[sp][z] * sum([T[s][a][sp] * b[s] for s in b])

        new_belief = defaultdict()
        for sprime in belief:
            new_belief[sprime] = _update_belief_for_state(
                belief, sprime, self.transition_probs, self.observation_probs,
                action, observation)

        normalization = _compute_normalization_factor(new_belief)

        for sprime in belief:
            if normalization > 0: new_belief[sprime] /= normalization

        return new_belief

    def kalman_filter_updater(self, belief, action, observation):
        pass

    def particle_filter_updater(self, belief, action, observation):
        pass

    def construct_transition_matrix(self, transition_func):
        '''
        Create an MLE of the transition probabilities by sampling from the transition_func
        multiple times.
        Args:
            transition_func: T(s, a) -> s'

        Returns:
            transition_probabilities (defaultdict): T(s, a, s') --> float
        '''
        self.vi._compute_matrix_from_trans_func()
        return self.vi.trans_dict

    def construct_observation_matrix(self, observation_func, transition_func):
        '''
        Create an MLE of the observation probabilities by sampling from the observation_func
        multiple times.
        Args:
            observation_func: O(s, a) -> z
            transition_func: T(s, a) -> s'

        Returns:
            observation_probabilities (defaultdict): O(s, z) --> float
        '''
        def normalize_probabilities(odict):
            norm_factor = sum(odict.values())
            for obs in odict:
                odict[obs] /= norm_factor
            return odict

        obs_dict = defaultdict(lambda: defaultdict(float))
        for state in self.vi.get_states():
            for action in self.vi.mdp.actions:
                for sample in range(self.vi.sample_rate):
                    observation = observation_func(state, action)
                    next_state = transition_func(state, action)
                    obs_dict[next_state][
                        observation] += 1. / self.vi.sample_rate
        for state in self.vi.get_states():
            obs_dict[state] = normalize_probabilities(obs_dict[state])
        return obs_dict
Example #18
def make_singletask_sa(mdp,
                       indic_func,
                       state_class,
                       epsilon=0.0,
                       aa_single_act=False,
                       prob_of_mdp=1.0):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''

    print("\tRunning VI...", end="")
    sys.stdout.flush()
    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()
    print(" done.")

    print("\tMaking state abstraction...", end="")
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class)
    clusters = defaultdict(set)
    num_states = len(vi.get_states())
    actions = mdp.get_actions()

    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x].add(state_x)

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(
                    state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].add(state_y)
                clusters[state_y].add(state_x)

    print("making clusters...", end="")
    sys.stdout.flush()

    # Build SA.
    for i, state in enumerate(clusters.keys()):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
Example #19
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0, track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''

    print("\tRunning VI...", end="")
    sys.stdout.flush()
    # Run VI
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()
    print(" done.")

    print("\tMaking state abstraction...", end="")
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class, track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()
    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...", end="")
    sys.stdout.flush()
    
    # Build SA.
    for i, state in enumerate(clusters.keys()):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)
    
    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa