def to_mrp(self, pi: Policy) -> MRP:
    # the goal here is to produce the input to the MRP constructor
    mrp_input = {}
    for state in self.s_a_s_.keys():
        output_states = set()
        output_reward = 0
        for action in pi.get_actions(state).keys():
            output_states = output_states.union(
                set(self.s_a_s_[state][action].keys()))
            output_reward += self.s_a_r_[state][action] * pi.get_prob(
                state, action)
        output_probs = {}
        for state2 in output_states:
            for action in pi.get_actions(state).keys():
                if state2 in self.s_a_s_[state][action].keys():
                    if state2 in output_probs.keys():
                        output_probs[state2] += self.s_a_s_[state][action][
                            state2] * pi.get_prob(state, action)
                    else:
                        output_probs[state2] = self.s_a_s_[state][action][
                            state2] * pi.get_prob(state, action)
        mrp_input[state] = (output_probs, output_reward)
    return MRP(mrp_input, self.gamma_)
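# Hypothetical usage sketch for to_mrp (the Policy literal below is made up
# for illustration; the MDP object is assumed to already exist). The point:
# each MRP state ends up with policy-weighted dynamics,
#   P(s' | s) = sum_a pi(a | s) * P(s' | s, a),
#   R(s) = sum_a pi(a | s) * R(s, a).
#
#   pol = Policy({1: {'a': 0.5, 'b': 0.5}, 2: {'a': 1.0}})
#   mrp = mdp_obj.to_mrp(pol)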
def get_soft_policy_from_qf_dict(qf_dict: SAf, softmax: bool,
                                 epsilon: float) -> Policy:
    if softmax:
        ret = Policy(
            {s: get_softmax_action_probs(v) for s, v in qf_dict.items()})
    else:
        ret = Policy({
            s: get_epsilon_action_probs(v, epsilon)
            for s, v in qf_dict.items()
        })
    return ret
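# Usage sketch: the boolean selects between two exploration schemes over the
# same Q-value dict (the qf literal is illustrative):
#
#   qf = {1: {'a': 2.0, 'b': 1.0}}
#   soft_pol = get_soft_policy_from_qf_dict(qf, softmax=True, epsilon=0.0)
#   eps_pol = get_soft_policy_from_qf_dict(qf, softmax=False, epsilon=0.1)
#   # epsilon is ignored on the softmax branch; see the two helpers further
#   # down for the respective probability formulas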
def get_value_func_dict(self, pol: Policy) -> VFType:
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0. for s in sa_dict.keys()}
    act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s))
                    for s in sa_dict.keys()}
    episodes = 0
    updates = 0
    while episodes < self.num_episodes:
        # eligibility traces, reset at the start of every episode
        et_dict = {s: 0. for s in sa_dict.keys()}
        state = self.mdp_rep.init_state_gen()
        steps = 0
        terminate = False
        while not terminate:
            action = act_gen_dict[state]()
            next_state, reward =\
                self.mdp_rep.state_reward_gen_dict[state][action]()
            # TD error for this transition
            delta = reward + self.mdp_rep.gamma * vf_dict[next_state] -\
                vf_dict[state]
            et_dict[state] += 1
            # decaying learning rate
            alpha = self.learning_rate * (updates / self.learning_rate_decay
                                          + 1) ** -0.5
            # spread the TD error across all states in proportion to their
            # eligibility traces, then decay every trace
            for s in sa_dict.keys():
                vf_dict[s] += alpha * delta * et_dict[s]
                et_dict[s] *= self.gamma_lambda
            updates += 1
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.mdp_rep.terminal_states
            state = next_state
        episodes += 1
    return vf_dict
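# Self-contained numeric illustration of the eligibility-trace update above
# (a standalone sketch with local names, not part of this class): one TD step
# on a two-state chain shows how the trace routes the TD error.
def demo_td_lambda_step() -> None:
    gamma, lam, alpha = 0.9, 0.8, 0.1
    vf = {'s1': 0.0, 's2': 1.0}
    et = {'s1': 0.0, 's2': 0.0}
    # observe transition s1 -> s2 with reward 1.0
    delta = 1.0 + gamma * vf['s2'] - vf['s1']   # TD error = 1.9
    et['s1'] += 1                               # bump trace of visited state
    for s in vf:
        vf[s] += alpha * delta * et[s]          # only s1 moves on this step
        et[s] *= gamma * lam                    # decay every trace
    print(vf)   # ~{'s1': 0.19, 's2': 1.0}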
def get_vf_dict_from_qf_dict_and_policy(
        qf_dict: SAf,
        pol: Policy) -> Mapping[S, float]:
    return {
        s: sum(pol.get_state_action_probability(s, a) * q
               for a, q in v.items())
        for s, v in qf_dict.items()
    }
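# Quick check of the Q-to-V reduction with plain dicts (illustrative numbers;
# the pi dict stands in for pol.get_state_action_probability):
#
#   qf = {1: {'a': 4.0, 'b': 2.0}}
#   pi = {1: {'a': 0.75, 'b': 0.25}}
#   v1 = sum(pi[1][a] * q for a, q in qf[1].items())   # 0.75*4 + 0.25*2 = 3.5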
def policy_improve(mdp: MDP, vf: VF) -> Policy:
    new_pol = {}
    # for each state
    for state in mdp.states_:
        # find maximizing action
        max_val = float('-inf')
        max_action = []
        # iterate across actions
        for action in mdp.s_a_s_[state].keys():
            action_val = 0
            # find the expected action value
            for state2 in mdp.s_a_s_[state][action].keys():
                action_val += mdp.s_a_s_[state][action][state2][0] * (
                    mdp.s_a_s_[state][action][state2][1] +
                    mdp.gamma_ * vf.value_dict_[state2])
            # update if new max
            if action_val > max_val:
                max_val = action_val
                max_action = [action]
            elif action_val == max_val:
                max_action.append(action)
        actions = {}
        for action in max_action:
            actions[action] = 1.0 / len(max_action)
        new_pol[state] = actions
    return Policy(new_pol)
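# Illustration of the tie handling above: if actions 'a' and 'b' both attain
# the maximal action value in some state, the improved policy splits the
# probability evenly, i.e. new_pol[state] == {'a': 0.5, 'b': 0.5}, rather
# than breaking the tie arbitrarily.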
def get_mc_path(
    self,
    pol: Policy,
    start_state: S,
    start_action: Optional[A] = None,
) -> Sequence[Tuple[S, A, float, bool]]:
    res = []
    next_state = start_state
    steps = 0
    terminate = False
    occ_states = set()
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in self.mdp_rep.state_action_dict.keys()
    }
    while not terminate:
        state = next_state
        # flag whether this is the first visit to the state in this episode
        first = state not in occ_states
        occ_states.add(state)
        action = act_gen_dict[state]()\
            if (steps > 0 or start_action is None) else start_action
        next_state, reward =\
            self.mdp_rep.state_reward_gen_dict[state][action]()
        res.append((state, action, reward, first))
        steps += 1
        terminate = steps >= self.max_steps or\
            state in self.mdp_rep.terminal_states
    return res
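# Standalone sketch of how such a path is typically consumed (names here are
# illustrative, not from this codebase): accumulate discounted returns
# backwards, recording a return only at each state's first visit, which is
# exactly what the boolean flag in the tuples marks.
def first_visit_returns(path, gamma: float):
    returns = {}
    g = 0.0
    for state, _, reward, first in reversed(path):
        g = reward + gamma * g      # return from this step onward
        if first:
            returns[state] = g      # only the earliest visit has first=True
    return returns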
def get_value_func_dict(self, pol: Policy) -> VFType:
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0.0 for s in sa_dict.keys()}
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in sa_dict.keys()
    }
    episodes = 0
    while episodes < self.num_episodes:
        state = self.mdp_rep.init_state_gen()
        steps = 0
        terminate = False
        while not terminate:
            action = act_gen_dict[state]()
            next_state, reward = \
                self.mdp_rep.state_reward_gen_dict[state][action]()
            # TD(0) update: move the estimate toward the bootstrapped target
            vf_dict[state] += self.alpha * \
                (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                 vf_dict[state])
            state = next_state
            steps += 1
            terminate = steps >= self.max_steps or \
                state in self.mdp_rep.terminal_states
        episodes += 1
    return vf_dict
def get_optimal_policy(self, tol=1e-4) -> DetPolicy:
    pol = Policy({s: {a: 1. / len(v) for a in v}
                  for s, v in self.state_action_dict.items()})
    vf = self.get_value_func_dict(pol)
    epsilon = tol * 1e4
    while epsilon >= tol:
        pol = self.get_improved_policy(pol)
        new_vf = self.get_value_func_dict(pol)
        epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
        vf = new_vf
    return pol
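# Note on the loop above: epsilon is seeded at tol * 1e4 only to force at
# least one improvement/evaluation sweep; afterwards it tracks the largest
# per-state change in the value function, so iteration stops once successive
# value functions agree to within tol everywhere. Hypothetical call:
#
#   opt_pol = tabular_obj.get_optimal_policy(tol=1e-5)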
def get_value_func(self, polf: PolicyType) -> Callable[[S], float]:
    pol = Policy({
        s: get_pdf_from_samples(
            polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION))
        for s, v in self.get_state_action_dict().items()
    })

    # noinspection PyShadowingNames
    def vf(state: S, pol=pol) -> float:
        return self.get_value_func_dict(pol)[state]

    return vf
def get_value_func(self, pol_func: Callable[[S], Callable[[A], float]])\
        -> Callable[[S], float]:
    pol = Policy({
        s: {a: pol_func(s)(a) for a in v}
        for s, v in self.get_state_action_dict().items()
    })

    # noinspection PyShadowingNames
    def vf(state: S, pol=pol) -> float:
        return self.get_value_func_dict(pol)[state]

    return vf
def get_act_value_func(self, pol_func: Callable[[S], Callable[[A], float]])\
        -> Callable[[S], Callable[[A], float]]:
    pol = Policy({
        s: {a: pol_func(s)(a) for a in v}
        for s, v in self.get_state_action_dict().items()
    })

    # noinspection PyShadowingNames
    def qvf(state: S, pol=pol) -> Callable[[A], float]:

        # noinspection PyShadowingNames
        def inner_f(action: A, pol=pol, state=state) -> float:
            return self.get_act_value_func_dict(pol)[state][action]

        return inner_f

    return qvf
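# Note on the pattern in the three functions above: pol (and state) are bound
# as default arguments so each closure captures the values current at
# definition time instead of late-binding to the enclosing variables; the
# noinspection comments merely silence the resulting name-shadowing warnings.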
def get_optimal_policy(self, tol=1e-4) -> DetPolicy:
    # Initialize a uniform-random policy
    pol = Policy({s: {a: 1. / len(v) for a in v}
                  for s, v in self.state_action_dict.items()})
    # Initialize the value function by evaluating the initial policy
    vf = self.get_value_func_dict(pol)
    # Convergence criterion
    epsilon = tol * 1e4
    while epsilon >= tol:
        # Policy improvement - maximize the value function with actions over
        # each state and use this to form the new improved policy
        pol = self.get_improved_policy(pol)
        # Evaluate the improved policy to get the new value function
        new_vf = self.get_value_func_dict(pol)
        # Check convergence
        epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
        vf = new_vf
    return pol
def policy_improve(mdp: MDP, vf: VF) -> Policy:
    new_pol = {}
    for state in mdp.states_:
        max_val = float('-inf')
        max_action = []
        for action in mdp.s_a_s_[state].keys():
            action_val = 0
            for state2 in mdp.s_a_s_[state][action].keys():
                action_val += mdp.s_a_s_[state][action][state2] * (
                    mdp.s_a_r_[state][action] +
                    mdp.gamma_ * vf.get_value(state2))
            if action_val > max_val:
                max_val = action_val
                max_action = [action]
            elif action_val == max_val:
                max_action.append(action)
        actions = {}
        for action in max_action:
            actions[action] = 1.0 / len(max_action)
        new_pol[state] = actions
    return Policy(new_pol)
            3: (0.4, -8.2)
        }
    },
    3: {
        'a': {3: (1.0, 0.0)},
        'b': {3: (1.0, 0.0)}
    }
}
mdp2_obj = MDPRefined(mdp_refined_data, 0.97)
policy_data = {
    1: {'a': 0.4, 'b': 0.6},
    2: {'a': 0.7, 'c': 0.3},
    3: {'b': 1.0}
}
pol_obj = Policy(policy_data)
mrp_refined_obj = mdp2_obj.get_mrp_refined(pol_obj)
print(mrp_refined_obj.transitions)
print(mrp_refined_obj.rewards_refined)
        raise ValueError
    return ret

def pf_as_policy_type(i: int) -> Callable[[int], Sequence[str]]:
    return get_sampling_func_from_prob_dict(policy_func(i))

this_qf = adp_pg_obj.get_act_value_func(pf_as_policy_type)
this_vf = adp_pg_obj.get_value_func(pf_as_policy_type)
print("Printing vf for a policy")
print(this_vf(1))
print(this_vf(2))
print(this_vf(3))

print("Printing DP vf for a policy")
from processes.policy import Policy
true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(
    Policy({s: policy_func(s) for s in {1, 2, 3}}))
print(true_vf_for_pol)

opt_det_polf = adp_pg_obj.get_optimal_det_policy_func()

# noinspection PyShadowingNames
def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
    return {opt_det_polf(s): 1.0}

print("Printing Opt Policy")
print(opt_polf(1))
print(opt_polf(2))
print(opt_polf(3))

opt_vf = adp_pg_obj.get_value_func(adp_pg_obj.get_policy_as_policy_type())
print("Printing Opt VF")
def get_epsilon_policy_from_qf(qf_dict: Mapping[S, Mapping[A, float]],
                               epsilon: float) -> Policy:
    return Policy(
        {s: get_epsilon_action_probs(v, epsilon) for s, v in qf_dict.items()})
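# Usage sketch (illustrative numbers; assumes get_epsilon_action_probs uses
# the standard epsilon-greedy split of 1 - epsilon + epsilon/|A| on an argmax
# action and epsilon/|A| on the rest):
#
#   pol = get_epsilon_policy_from_qf({1: {'a': 2.0, 'b': 1.0}}, epsilon=0.2)
#   # state 1: 'a' -> 1 - 0.2 + 0.2/2 = 0.9, 'b' -> 0.2/2 = 0.1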
def get_softmax_policy_from_qf(
        qf_dict: Mapping[S, Mapping[A, float]]) -> Policy:
    return Policy({s: get_softmax_action_probs(v) for s, v in qf_dict.items()})
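# Usage sketch for the softmax variant (assumes get_softmax_action_probs is
# a plain softmax exp(q)/sum(exp(q)), which is an assumption here):
#
#   pol = get_softmax_policy_from_qf({1: {'a': 1.0, 'b': 0.0}})
#   # state 1: 'a' -> e/(e+1) ~= 0.731, 'b' -> 1/(e+1) ~= 0.269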
    13: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
    14: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
    15: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
    0: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
}
policy = Policy(policy_data)
vf = policy_eval(mdp, policy, 0.001)
print(value_iter(mdp, vf))
        ret = {'b': 1.0}
    else:
        raise ValueError
    return ret

this_qf = adp_obj.get_act_value_func_fa(policy_func, True)
this_vf = adp_obj.get_value_func_fa(policy_func, True)
print("Printing vf for a policy")
print(this_vf(1))
print(this_vf(2))
print(this_vf(3))

print("Printing DP vf for a policy")
from processes.policy import Policy
true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(Policy(
    {s: policy_func(s) for s in {1, 2, 3}}
))
print(true_vf_for_pol)

opt_det_polf = adp_obj.get_optimal_policy_func_vi()

# noinspection PyShadowingNames
def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
    return {opt_det_polf(s): 1.0}

print("Printing Opt Policy")
print(opt_polf(1))
print(opt_polf(2))
print(opt_polf(3))

opt_vf = adp_obj.get_value_func_fa(opt_polf, False)
        'n': 0.25,
        's': 0.25,
        'w': 0.25,
        'e': 0.25
    },
    14: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
    15: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
    0: {'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25},
}
pol = Policy(policy_data)
# vf = policy_eval(mdp, pol, 0.001)
# new_pol = policy_improve(mdp, pol, vf)
# print(new_pol)
vf, pol = policy_iter(mdp, pol, 0.001)
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
    return Policy(
        {s: {a: 1. / len(v) for a in v} for s, v in state_action_dict.items()})
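# Usage sketch (dict literal is illustrative):
#
#   pol = get_uniform_policy({1: {'a', 'b'}, 2: {'c'}})
#   # state 1: {'a': 0.5, 'b': 0.5}; state 2: {'c': 1.0}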