def __init__(self, mdp, rew_func_list, trans_func_list, sa_stack, aa_stack, name="hierarch_value_iter", delta=0.001, max_iterations=200, sample_rate=3):
    '''
    Args:
        mdp (MDP)
        rew_func_list (list): Reward functions, one per abstraction level.
        trans_func_list (list): Transition functions, one per abstraction level.
        sa_stack: State-abstraction stack.
        aa_stack: Action-abstraction stack (must provide get_aa_list()).
        name (str)
        delta (float): After an iteration of VI, if no change more than @\delta has occurred, terminates.
        max_iterations (int): Hard limit for number of iterations.
        sample_rate (int): Determines how many samples from @mdp to take to estimate T(s' | s, a).
    '''
    self.rew_func_list = rew_func_list
    self.trans_func_list = trans_func_list
    self.sa_stack = sa_stack
    self.aa_stack = aa_stack

    # The planner's action set is the ground MDP's actions plus every
    # abstract action from each level of the action-abstraction stack.
    abstr_actions = [abstr_a for aa in self.aa_stack.get_aa_list() for abstr_a in aa.get_actions()]
    self.actions = mdp.get_actions() + abstr_actions

    ValueIteration.__init__(self, mdp, name=name, delta=delta, max_iterations=max_iterations, sample_rate=sample_rate)
def __init__(self, ground_mdp, state_abstr=None, action_abstr=None, sample_rate=10, delta=0.001, max_iterations=1000):
    '''
    Args:
        ground_mdp (simple_rl.MDP)
        state_abstr (simple_rl.StateAbstraction)
        action_abstr (simple_rl.ActionAbstraction)
        sample_rate (int): Num samples per transition for running VI.
        delta (float): Convergence threshold for VI.
        max_iterations (int): Hard limit for number of VI iterations.
    '''
    self.ground_mdp = ground_mdp

    # If None is given for either, set the sa/aa to defaults.
    self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction()
    self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(prim_actions=ground_mdp.get_actions())

    mdp = make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr, step_cost=0.0)

    # Bug fix: ValueIteration.__init__'s parameter order is
    # (mdp, name, delta, max_iterations, sample_rate), so the previous
    # positional call (mdp, sample_rate, delta, max_iterations) bound
    # sample_rate to the `name` parameter and left the real sample_rate
    # at its default. Pass by keyword instead.
    ValueIteration.__init__(self, mdp, delta=delta, max_iterations=max_iterations, sample_rate=sample_rate)
def __init__(self, mdp, name="value_iter", delta=0.0001, max_iterations=500, sample_rate=3):
    '''
    Args:
        mdp (MDP)
        name (str)
        delta (float): Convergence threshold for VI.
        max_iterations (int): Hard limit for number of VI iterations.
        sample_rate (int): Num samples per transition estimate.
    '''
    ValueIteration.__init__(self, mdp, name=name, delta=delta, max_iterations=max_iterations, sample_rate=sample_rate)

    # Set explicitly for clarity: this planner retrieves actions from its
    # MDP instance, not from the self.actions variable in the Planner class.
    self.actions = None
def __init__(self, ground_mdp, state_abstr=None, action_abstr=None, vi_sample_rate=5, max_iterations=1000, amdp_sample_rate=5, delta=0.001):
    '''
    Args:
        ground_mdp (simple_rl.MDP)
        state_abstr (simple_rl.StateAbstraction)
        action_abstr (simple_rl.ActionAbstraction)
        vi_sample_rate (int): Num samples per transition for running VI.
        max_iterations (int): Usual VI # Iteration bound.
        amdp_sample_rate (int): Num samples per abstract transition to use for computing R_abstract, T_abstract.
        delta (float): Convergence threshold for VI.
    '''
    self.ground_mdp = ground_mdp

    # Grab ground state space.
    # NOTE(review): the ground VI uses fixed delta/max_iterations/sample_rate
    # rather than the constructor arguments — presumably intentional (it is
    # only used to enumerate reachable states); confirm.
    vi = ValueIteration(self.ground_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    state_space = vi.get_states()

    # Make the abstract MDP, defaulting sa/aa when not supplied.
    self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction(ground_state_space=state_space)
    self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(prim_actions=ground_mdp.get_actions())
    abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr, step_cost=0.0, sample_rate=amdp_sample_rate)

    # Bug fix: ValueIteration.__init__'s parameter order is
    # (mdp, name, delta, max_iterations, sample_rate), so the previous
    # positional call (abstr_mdp, vi_sample_rate, delta, max_iterations)
    # bound vi_sample_rate to the `name` parameter and never passed the
    # intended sample rate. Pass by keyword instead.
    ValueIteration.__init__(self, abstr_mdp, delta=delta, max_iterations=max_iterations, sample_rate=vi_sample_rate)
def __init__(self, ground_mdp, state_abstr=None, action_abstr=None, vi_sample_rate=5, max_iterations=1000, amdp_sample_rate=5, delta=0.001):
    '''
    Args:
        ground_mdp (simple_rl.MDP)
        state_abstr (simple_rl.StateAbstraction)
        action_abstr (simple_rl.ActionAbstraction)
        vi_sample_rate (int): Num samples per transition for running VI.
        max_iterations (int): Usual VI # Iteration bound.
        amdp_sample_rate (int): Num samples per abstract transition to use for computing R_abstract, T_abstract.
        delta (float): Convergence threshold for VI.
    '''
    self.ground_mdp = ground_mdp

    # Grab ground state space.
    # NOTE(review): the ground VI deliberately uses fixed
    # delta/max_iterations/sample_rate (state enumeration only) — confirm.
    vi = ValueIteration(self.ground_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    state_space = vi.get_states()

    # Make the abstract MDP, defaulting sa/aa when not supplied.
    self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction(ground_state_space=state_space)
    self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(prim_actions=ground_mdp.get_actions())
    abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr, step_cost=0.0, sample_rate=amdp_sample_rate)

    # Bug fix: ValueIteration.__init__'s parameter order is
    # (mdp, name, delta, max_iterations, sample_rate), so the previous
    # positional call (abstr_mdp, vi_sample_rate, delta, max_iterations)
    # bound vi_sample_rate to the `name` parameter and never passed the
    # intended sample rate. Pass by keyword instead.
    ValueIteration.__init__(self, abstr_mdp, delta=delta, max_iterations=max_iterations, sample_rate=vi_sample_rate)
def __init__(self, ground_mdp, state_abstr=None, action_abstr=None, sample_rate=10, delta=0.001, max_iterations=1000):
    '''
    Args:
        ground_mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        sample_rate (int): Num samples per transition for running VI.
        delta (float): Convergence threshold for VI.
        max_iterations (int): Hard limit for number of VI iterations.
    '''
    self.ground_mdp = ground_mdp

    # Fall back to the default abstractions when none (or an empty list)
    # is supplied. The membership test against [[], None] is preserved
    # from the original: it treats an empty list like None.
    self.state_abstr = state_abstr if state_abstr not in [[], None] else StateAbstraction()
    self.action_abstr = action_abstr if action_abstr not in [[], None] else ActionAbstraction(prim_actions=ground_mdp.get_actions())

    mdp = make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr)

    # Bug fix: ValueIteration.__init__'s parameter order is
    # (mdp, name, delta, max_iterations, sample_rate), so the previous
    # positional call (mdp, sample_rate, delta, max_iterations) bound
    # sample_rate to the `name` parameter and left the real sample_rate
    # at its default. Pass by keyword instead.
    # (A large block of commented-out legacy VI code that followed this
    # method was removed; see version control history for the old
    # reachability/rollout implementation.)
    ValueIteration.__init__(self, mdp, delta=delta, max_iterations=max_iterations, sample_rate=sample_rate)