def get_optimal_policy_vi(self) -> Tuple[DetPolicy, Mapping[S, float]]:
    # Value iteration: start from an all-zeros value function and
    # repeatedly apply the Bellman optimality backup until the largest
    # per-state change falls below the tolerance.
    vf = {s: 0. for s in self.mdp_obj.all_states}
    epsilon = self.tol * 1e4
    mo = self.mdp_obj
    while epsilon >= self.tol:
        new_vf = {
            s: max(r + mo.gamma * sum(p * vf[s1] for s1, p in
                                      mo.transitions[s][a].items())
                   for a, r in v.items())
            for s, v in mo.rewards.items()
        }
        epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
        vf = new_vf
    # Extract the deterministic policy that is greedy with respect to
    # the converged value function.
    pol = DetPolicy({
        s: max(
            [(a, r + mo.gamma * sum(p * vf[s1] for s1, p in
                                    mo.transitions[s][a].items()))
             for a, r in v.items()],
            key=itemgetter(1)
        )[0]
        for s, v in mo.rewards.items()
    })
    return pol, vf
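# The nested comprehension in new_vf above computes, for each state s,
# the Bellman optimality backup
#     V(s) = max_a [ R(s, a) + gamma * sum_{s'} P(s' | s, a) * V(s') ].
# An equivalent per-state helper, written as a sketch that reuses the
# same mo (MDP object) and vf (value-function dict) names as above:
def bellman_backup(mo, vf, s):
    # Maximize over actions the one-step reward plus the discounted
    # expected value of the successor states.
    return max(
        r + mo.gamma * sum(p * vf[s1]
                           for s1, p in mo.transitions[s][a].items())
        for a, r in mo.rewards[s].items()
    )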
def print_policy(self, pol: DetPolicy) -> None:
    # Print the grid (y increasing upward) with the chosen action
    # character in each cell and 'X' marking blocked cells.
    display1 = "%%%dd" % 2   # expands to the format string "%2d"
    display2 = "%%%dd " % 2  # expands to the format string "%2d "
    blocks_dict = {s: 'X' for s in self.blocks}
    full_dict = {**pol.get_state_to_action_map(), **blocks_dict}
    print(" " + " ".join([display1 % j for j in range(0, self.x_len)]))
    for i in range(self.y_len - 1, -1, -1):
        print(display2 % i + " ".join(full_dict[(j, i)]
                                      for j in range(0, self.x_len)))
def get_improved_policy(self, pol: Policy) -> DetPolicy:
    # Get dict of q-values (state-action value function)
    # for the state-action pairs prescribed by the policy.
    q_dict = self.get_act_value_func_dict(pol)
    # Go back through each of the states, maximize over the actions
    # possible from that state, and return a new policy that assigns
    # the value-maximizing action to each state.
    return DetPolicy({s: max(v.items(), key=itemgetter(1))[0]
                      for s, v in q_dict.items()})
def get_optimal_policy_Robust_vi(self) -> Tuple[DetPolicy, Mapping[S, float]]:
    # Robust value iteration: identical to standard value iteration,
    # except that the expected next-state value is replaced by the
    # worst-case value over an uncertainty set of transition
    # probabilities, computed by self.worst_case.
    vf = {s: 0. for s in self.mdp_obj.all_states}
    epsilon = self.tol * 1e4
    mo = self.mdp_obj
    while epsilon >= self.tol:
        new_vf = {
            s: max(r + mo.gamma * self.worst_case(vf, mo.transitions[s][a])
                   for a, r in v.items())
            for s, v in mo.rewards.items()
        }
        epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
        vf = new_vf
    pol = DetPolicy({
        s: max(
            [(a, r + mo.gamma * self.worst_case(vf, mo.transitions[s][a]))
             for a, r in v.items()],
            key=itemgetter(1)
        )[0]
        for s, v in mo.rewards.items()
    })
    return pol, vf
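# self.worst_case is called above but not defined in this section. A
# minimal sketch of one possible implementation, assuming the
# uncertainty set is "nominal transition probabilities perturbed by at
# most a total-variation budget" (the `budget` parameter and the
# TV-ball model are assumptions for illustration, not taken from this
# code):
def worst_case_sketch(vf, transition, budget=0.1):
    # Greedily move probability mass (up to `budget`) from the
    # highest-value successor states to the lowest-value successor
    # state, which minimizes the expected value sum(p * vf[s1]).
    states = sorted(transition, key=lambda s1: vf[s1], reverse=True)
    probs = {s1: transition[s1] for s1 in states}
    worst_state = states[-1]
    remaining = budget
    for s1 in states[:-1]:
        if remaining <= 0.:
            break
        shift = min(probs[s1], remaining)
        probs[s1] -= shift
        probs[worst_state] += shift
        remaining -= shift
    return sum(p * vf[s1] for s1, p in probs.items())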
        tolerance=this_tolerance,
        first_visit_mc=this_first_visit_mc,
        num_samples=this_num_samples,
        softmax=this_softmax,
        epsilon=this_epsilon,
        epsilon_half_life=this_epsilon_half_life,
        learning_rate=this_learning_rate,
        learning_rate_decay=this_learning_rate_decay,
        lambd=this_lambd,
        num_episodes=this_num_episodes,
        max_steps=this_max_steps,
        tdl_fa_offline=this_td_offline,
        fa_spec=this_fa_spec)

    for name, algo in raa.get_all_algorithms().items():
        print(name)
        opt_pol_func = algo.get_optimal_det_policy_func()
        opt_pol = DetPolicy(
            {s: opt_pol_func(s) for s in mdp_ref_obj.all_states})
        opt_vf_func = algo.get_optimal_value_func()
        opt_vf_dict = {s: opt_vf_func(s) for s in mdp_ref_obj.all_states}
        wg.print_policy(opt_pol)
        chars_count = 5
        decimals_count = 2
        print()
        wg.print_vf(opt_vf_dict, chars_count, decimals_count)
        print()
        wg.print_wind_and_bumps(chars_count, decimals_count)
        print()
        print()
def get_improved_policy(self, pol: Policy) -> DetPolicy:
    q_dict = self.get_act_value_func_dict(pol)
    return DetPolicy({s: max(v.items(), key=itemgetter(1))[0]
                      for s, v in q_dict.items()})
def get_det_policy_from_qf_dict(qf_dict: SAf) -> DetPolicy:
    return DetPolicy(
        {s: max(v.items(), key=itemgetter(1))[0]
         for s, v in qf_dict.items()}
    )
def get_det_policy_from_qf(
    qf_dict: Mapping[S, Mapping[A, float]]
) -> DetPolicy:
    return DetPolicy(
        {s: max(v.items(), key=itemgetter(1))[0]
         for s, v in qf_dict.items()}
    )
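# A minimal usage sketch for the greedy-extraction helpers above. The
# states, actions, and q-values below are made up for illustration;
# DetPolicy is assumed to accept a state -> action mapping, consistent
# with the constructor calls above.
example_qf = {
    's1': {'left': 1.0, 'right': 2.5},
    's2': {'left': 0.3, 'right': -1.0},
}
greedy_pol = get_det_policy_from_qf(example_qf)
# greedy_pol assigns 's1' -> 'right' and 's2' -> 'left', i.e. each
# state gets its q-value-maximizing action.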