def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    """Build a Policy from a per-state mapping of action values.

    Each value of ``qf_dict`` is converted into action probabilities with
    ``get_softmax_action_probs`` when ``softmax`` is truthy, and with
    ``get_epsilon_action_probs(..., epsilon)`` otherwise.  ``epsilon`` is
    not used on the softmax path.

    :param qf_dict: mapping whose ``items()`` yield (state, action values).
    :param softmax: selects the softmax conversion over the epsilon one.
    :param epsilon: forwarded to ``get_epsilon_action_probs``.
    :return: a ``Policy`` wrapping the state -> action-probabilities dict.
    """
    if softmax:
        probs_by_state = {
            state: get_softmax_action_probs(action_values)
            for state, action_values in qf_dict.items()
        }
        return Policy(probs_by_state)

    probs_by_state = {
        state: get_epsilon_action_probs(action_values, epsilon)
        for state, action_values in qf_dict.items()
    }
    return Policy(probs_by_state)
def policy_improve(mdp: MDP, vf: VF) -> Policy:
    """Return the policy that acts greedily with respect to ``vf``.

    For every state in ``mdp.states_`` each action is scored by summing,
    over the successor states listed in ``mdp.s_a_s_[state][action]``,
    ``entry[0] * (entry[1] + mdp.gamma_ * vf.value_dict_[successor])``
    (presumably ``entry`` is a (probability, reward) pair -- confirm
    against the MDP class).  All actions that share the best score get
    equal probability in the resulting policy.

    :param mdp: object exposing ``states_``, ``s_a_s_`` and ``gamma_``.
    :param vf: object exposing ``value_dict_`` keyed by state.
    :return: a ``Policy`` built from the state -> {action: prob} dict.
    """
    greedy = {}
    for state in mdp.states_:
        best_score = float('-inf')
        best_actions = []
        for action, successors in mdp.s_a_s_[state].items():
            # Accumulate explicitly (rather than with sum()) so the float
            # result, and therefore tie detection, matches term by term.
            score = 0
            for nxt, outcome in successors.items():
                score += outcome[0] * (
                    outcome[1] + mdp.gamma_ * vf.value_dict_[nxt]
                )
            if score > best_score:
                best_score, best_actions = score, [action]
            elif score == best_score:
                # Exact tie: keep every co-maximal action.
                best_actions.append(action)
        greedy[state] = {
            action: 1.0 / len(best_actions) for action in best_actions
        }
    return Policy(greedy)
def get_optimal_policy(self, tol=1e-4) -> DetPolicy:
    """Repeat policy improvement until the value function stops moving.

    Starts from a policy that is uniform over each state's actions in
    ``self.state_action_dict``, then alternates
    ``self.get_improved_policy`` and ``self.get_value_func_dict`` until
    the largest absolute per-state change in value falls below ``tol``.

    :param tol: convergence threshold on the max absolute value change.
    :return: the policy held when the loop exits.
        NOTE(review): annotated as DetPolicy; this relies on what
        ``get_improved_policy`` returns -- confirm.
    """
    uniform = {}
    for state, actions in self.state_action_dict.items():
        uniform[state] = {a: 1. / len(actions) for a in actions}
    pol = Policy(uniform)
    vf = self.get_value_func_dict(pol)

    # Seed the gap well above tol so a positive tol always enters the loop.
    gap = tol * 1e4
    while gap >= tol:
        pol = self.get_improved_policy(pol)
        updated = self.get_value_func_dict(pol)
        deltas = [abs(updated[s] - prev) for s, prev in vf.items()]
        gap = max(deltas)
        vf = updated
    return pol
def get_value_func(self, polf: PolicyType) -> Callable[[S], float]:
    """Return a state -> value function for the sampling policy ``polf``.

    For each state in ``self.get_state_action_dict()``, ``polf(state)`` is
    called with ``len(actions) * TabularBase.NUM_SAMPLES_PER_ACTION`` and
    the result is passed to ``get_pdf_from_samples`` to obtain that
    state's entry in a ``Policy``.

    :param polf: callable; ``polf(state)(n)`` is fed to
        ``get_pdf_from_samples`` (presumably it returns ``n`` sampled
        actions -- confirm against ``PolicyType``).
    :return: a function mapping a state to
        ``self.get_value_func_dict(pol)[state]``.
    """
    action_probs = {}
    for state, actions in self.get_state_action_dict().items():
        sampler = polf(state)
        num_samples = len(actions) * TabularBase.NUM_SAMPLES_PER_ACTION
        action_probs[state] = get_pdf_from_samples(sampler(num_samples))
    pol = Policy(action_probs)

    # noinspection PyShadowingNames
    def vf(state: S, pol=pol) -> float:
        # The value dict is recomputed on every call.
        value_dict = self.get_value_func_dict(pol)
        return value_dict[state]

    return vf
def get_value_func(self, pol_func: Callable[[S], Callable[[A], float]])\
        -> Callable[[S], float]:
    """Return a state -> value function for the policy ``pol_func``.

    A ``Policy`` is tabulated by evaluating ``pol_func(state)(action)`` for
    every state and each of its actions in
    ``self.get_state_action_dict()``.

    :param pol_func: ``pol_func(state)(action)`` gives the entry stored
        for that (state, action) pair in the tabulated policy.
    :return: a function mapping a state to
        ``self.get_value_func_dict(pol)[state]``.
    """
    # Iterate (state, actions) pairs via .items(): looping over the
    # mapping itself yields only its keys, so the two-name unpacking
    # would not receive the action collection.
    pol = Policy({
        s: {a: pol_func(s)(a) for a in v}
        for s, v in self.get_state_action_dict().items()
    })

    # noinspection PyShadowingNames
    def vf(state: S, pol=pol) -> float:
        # The value dict is recomputed on every call.
        return self.get_value_func_dict(pol)[state]

    return vf
def get_act_value_func(self, pol_func: Callable[[S], Callable[[A], float]])\
        -> Callable[[S], Callable[[A], float]]:
    """Return a curried state -> action -> value function for ``pol_func``.

    A ``Policy`` is tabulated by evaluating ``pol_func(state)(action)`` for
    every state and each of its actions in
    ``self.get_state_action_dict()``.

    :param pol_func: ``pol_func(state)(action)`` gives the entry stored
        for that (state, action) pair in the tabulated policy.
    :return: ``qvf`` such that ``qvf(state)(action)`` equals
        ``self.get_act_value_func_dict(pol)[state][action]``.
    """
    # Iterate (state, actions) pairs via .items(): looping over the
    # mapping itself yields only its keys, so the two-name unpacking
    # would not receive the action collection.
    pol = Policy({
        s: {a: pol_func(s)(a) for a in v}
        for s, v in self.get_state_action_dict().items()
    })

    # noinspection PyShadowingNames
    def qvf(state: S, pol=pol) -> Callable[[A], float]:

        # noinspection PyShadowingNames
        def inner_f(action: A, pol=pol, state=state) -> float:
            # The action-value dict is recomputed on every call.
            return self.get_act_value_func_dict(pol)[state][action]

        return inner_f

    return qvf
def get_optimal_policy(self, tol=1e-4) -> DetPolicy:
    """Alternate improvement and evaluation until values converge.

    :param tol: stop once the largest absolute per-state change in the
        value function between successive policies is below this.
    :return: the policy held when the loop exits.
        NOTE(review): annotated as DetPolicy; this relies on what
        ``get_improved_policy`` returns -- confirm.
    """
    # Start from the policy that is uniform over each state's actions.
    pol = Policy({
        state: {action: 1. / len(actions) for action in actions}
        for state, actions in self.state_action_dict.items()
    })
    # Value function of the starting policy.
    vf = self.get_value_func_dict(pol)

    # Seed the change well above tol so a positive tol enters the loop.
    max_change = tol * 1e4
    while tol <= max_change:
        # Improvement step: derive the next policy from the current one.
        pol = self.get_improved_policy(pol)
        # Evaluation step: value function of the improved policy.
        improved_vf = self.get_value_func_dict(pol)
        # Convergence measure: largest absolute change across states.
        max_change = max(
            abs(improved_vf[state] - old_value)
            for state, old_value in vf.items()
        )
        vf = improved_vf
    return pol
def policy_improve(mdp: MDP, vf: VF) -> Policy:
    """Return the policy that acts greedily with respect to ``vf``.

    Each action of each state in ``mdp.states_`` is scored by summing,
    over the successor states in ``mdp.s_a_s_[state][action]``,
    ``weight * (mdp.s_a_r_[state][action] + mdp.gamma_ * vf.get_value(successor))``
    where ``weight`` is the stored ``s_a_s_`` entry (presumably a
    transition probability -- confirm against the MDP class).  Actions
    tied for the best score share probability equally.

    :param mdp: object exposing ``states_``, ``s_a_s_``, ``s_a_r_`` and
        ``gamma_``.
    :param vf: object exposing ``get_value(state)``.
    :return: a ``Policy`` built from the state -> {action: prob} dict.
    """

    def score_action(state, action):
        # Plain running total (not sum()) to keep the float result exact.
        total = 0
        for successor, weight in mdp.s_a_s_[state][action].items():
            total += weight * (
                mdp.s_a_r_[state][action]
                + mdp.gamma_ * vf.get_value(successor)
            )
        return total

    improved = {}
    for state in mdp.states_:
        top_value = float('-inf')
        top_actions = []
        for action in mdp.s_a_s_[state]:
            value = score_action(state, action)
            if value > top_value:
                top_value = value
                top_actions = [action]
            elif value == top_value:
                top_actions.append(action)
        share = {}
        for action in top_actions:
            share[action] = 1.0 / len(top_actions)
        improved[state] = share
    return Policy(improved)
3: (0.4, -8.2) } }, 3: { 'a': { 3: (1.0, 0.0) }, 'b': { 3: (1.0, 0.0) } } } mdp2_obj = MDPRefined(mdp_refined_data, 0.97) policy_data = { 1: { 'a': 0.4, 'b': 0.6 }, 2: { 'a': 0.7, 'c': 0.3 }, 3: { 'b': 1.0 } } pol_obj = Policy(policy_data) mrp_refined_obj = mdp2_obj.get_mrp_refined(pol_obj) print(mrp_refined_obj.transitions) print(mrp_refined_obj.rewards_refined)
raise ValueError return ret def pf_as_policy_type(i: int) -> Callable[[int], Sequence[str]]: return get_sampling_func_from_prob_dict(policy_func(i)) this_qf = adp_pg_obj.get_act_value_func(pf_as_policy_type) this_vf = adp_pg_obj.get_value_func(pf_as_policy_type) print("Printing vf for a policy") print(this_vf(1)) print(this_vf(2)) print(this_vf(3)) print("Printing DP vf for a policy") from processes.policy import Policy true_vf_for_pol = mdp_ref_obj1.get_value_func_dict( Policy({s: policy_func(s) for s in {1, 2, 3}})) print(true_vf_for_pol) opt_det_polf = adp_pg_obj.get_optimal_det_policy_func() # noinspection PyShadowingNames def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]: return {opt_det_polf(s): 1.0} print("Printing Opt Policy") print(opt_polf(1)) print(opt_polf(2)) print(opt_polf(3)) opt_vf = adp_pg_obj.get_value_func(adp_pg_obj.get_policy_as_policy_type()) print("Printing Opt VF")
ret = {'b': 1.0} else: raise ValueError return ret this_qf = adp_obj.get_act_value_func_fa(policy_func, True) this_vf = adp_obj.get_value_func_fa(policy_func, True) print("Printing vf for a policy") print(this_vf(1)) print(this_vf(2)) print(this_vf(3)) print("Printing DP vf for a policy") from processes.policy import Policy true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(Policy( {s: policy_func(s) for s in {1, 2, 3}} )) print(true_vf_for_pol) opt_det_polf = adp_obj.get_optimal_policy_func_vi() # noinspection PyShadowingNames def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]: return {opt_det_polf(s): 1.0} print("Printing Opt Policy") print(opt_polf(1)) print(opt_polf(2)) print(opt_polf(3)) opt_vf = adp_obj.get_value_func_fa(opt_polf, False)
'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, 14: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, 15: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, 0: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, } pol = Policy(policy_data) #vf = policy_eval(mdp, pol, 0.001) #new_pol = policy_improve(mdp, pol, vf) #print(new_pol) vf, pol = policy_iter(mdp, pol, 0.001)
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
    """Return the Policy that weights each state's actions equally.

    :param state_action_dict: mapping from state to its set of actions.
    :return: a ``Policy`` in which every action of a state has
        probability ``1 / len(actions)``.
    """
    uniform = {}
    for state, actions in state_action_dict.items():
        uniform[state] = {action: 1. / len(actions) for action in actions}
    return Policy(uniform)
13: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, 14: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, 15: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, 0: { 'n': 0.25, 's': 0.25, 'w': 0.25, 'e': 0.25 }, } policy = Policy(policy_data) vf = policy_eval(mdp, policy, 0.001) print(value_iter(mdp, vf))
def get_softmax_policy_from_qf(
    qf_dict: Mapping[S, Mapping[A, float]]
) -> Policy:
    """Build a Policy by applying ``get_softmax_action_probs`` per state.

    :param qf_dict: mapping from state to a mapping of action -> value.
    :return: a ``Policy`` wrapping, for each state, the result of
        ``get_softmax_action_probs`` on that state's action values.
    """
    probs_by_state = {}
    for state, action_values in qf_dict.items():
        probs_by_state[state] = get_softmax_action_probs(action_values)
    return Policy(probs_by_state)
def get_epsilon_policy_from_qf(
    qf_dict: Mapping[S, Mapping[A, float]],
    epsilon: float
) -> Policy:
    """Build a Policy by applying ``get_epsilon_action_probs`` per state.

    :param qf_dict: mapping from state to a mapping of action -> value.
    :param epsilon: forwarded unchanged to ``get_epsilon_action_probs``.
    :return: a ``Policy`` wrapping, for each state, the result of
        ``get_epsilon_action_probs`` on that state's action values.
    """
    probs_by_state = {}
    for state, action_values in qf_dict.items():
        probs_by_state[state] = get_epsilon_action_probs(
            action_values, epsilon
        )
    return Policy(probs_by_state)