def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    """Build a ``Policy`` from per-key action values.

    Every value of ``qf_dict`` is converted by one of two helpers and the
    resulting dict (same keys as ``qf_dict``) is wrapped in ``Policy``.

    Args:
        qf_dict: Mapping whose values are handed, one per key, to the
            selected helper (presumably state -> {action: q-value};
            confirm against the definition of ``SAf``).
        softmax: When truthy, ``get_softmax_action_probs`` is used;
            otherwise ``get_epsilon_action_probs`` is used.
        epsilon: Forwarded to ``get_epsilon_action_probs``. Not used at
            all when ``softmax`` is truthy.

    Returns:
        ``Policy`` constructed from the converted per-key data.
    """
    if softmax:
        action_probs = {
            state: get_softmax_action_probs(action_values)
            for state, action_values in qf_dict.items()
        }
    else:
        action_probs = {
            state: get_epsilon_action_probs(action_values, epsilon)
            for state, action_values in qf_dict.items()
        }
    return Policy(action_probs)
def get_vf_dict_from_qf_dict_and_policy(
    qf_dict: SAf,
    pol: Policy
) -> Mapping[S, float]:
    """Collapse per-state action values into one value per state.

    For every state ``s`` in ``qf_dict`` the result holds the sum, over
    the actions ``a`` listed for that state, of
    ``pol.get_state_action_probability(s, a) * q`` where ``q`` is the
    value stored for ``(s, a)``.

    Args:
        qf_dict: Mapping of state -> mapping of action -> numeric value.
        pol: Object exposing ``get_state_action_probability(state,
            action)``; it is only queried for the actions present in
            ``qf_dict``.

    Returns:
        Dict keyed by the states of ``qf_dict`` (hence ``Mapping[S,
        float]``). A state with no actions maps to ``0``, the empty sum.
    """
    return {
        state: sum(
            pol.get_state_action_probability(state, action) * q_value
            for action, q_value in action_values.items()
        )
        for state, action_values in qf_dict.items()
    }
def mdp_rep_to_mrp_rep2(
    mdp_rep: SAf,
    policy_rep: SAf
) -> Mapping[S, float]:
    """Weight each state's per-action values by the policy's weights.

    For every state in ``mdp_rep`` the result is the sum of
    ``policy_rep[state][action] * mdp_rep[state][action]`` taken over the
    actions listed in ``policy_rep[state]``.

    Args:
        mdp_rep: Mapping of state -> mapping of action -> numeric value.
        policy_rep: Mapping of state -> mapping of action -> weight
            (presumably a probability). It must contain every state of
            ``mdp_rep``, and each of its actions must exist in
            ``mdp_rep[state]``.

    Returns:
        Dict keyed by the states of ``mdp_rep``.

    Raises:
        KeyError: If a state is missing from ``policy_rep`` or a policy
            action is missing from ``mdp_rep[state]``.
    """
    mrp_rep = {}
    for state, action_values in mdp_rep.items():
        action_weights = policy_rep[state]
        # Iteration is driven by the policy's actions, so actions that
        # appear only in mdp_rep contribute nothing to the total.
        mrp_rep[state] = sum(
            weight * action_values[action]
            for action, weight in action_weights.items()
        )
    return mrp_rep
def get_det_policy_from_qf_dict(qf_dict: SAf) -> DetPolicy:
    """Build a ``DetPolicy`` that picks the highest-valued action per state.

    Args:
        qf_dict: Mapping of state -> mapping of action -> comparable
            value.

    Returns:
        ``DetPolicy`` constructed from a dict of state -> the action
        whose value is largest. On ties, ``max`` keeps the first maximal
        entry in the inner mapping's iteration order.

    Raises:
        ValueError: From ``max`` if any state's action mapping is empty.
    """
    greedy_actions = {}
    for state, action_values in qf_dict.items():
        # Compare (action, value) pairs by value; keep only the action.
        best_action, _ = max(action_values.items(), key=itemgetter(1))
        greedy_actions[state] = best_action
    return DetPolicy(greedy_actions)
def verify_policy(policy_data: SAf) -> bool:
    """Check that every entry's values sum to approximately 1.0.

    Args:
        policy_data: Mapping of key -> mapping whose values are summed
            (presumably state -> {action: probability}).

    Returns:
        ``True`` if ``is_approx_eq(sum(values), 1.0)`` holds for every
        entry (vacuously ``True`` for an empty mapping), else ``False``.
    """
    for action_probs in policy_data.values():
        if not is_approx_eq(sum(action_probs.values()), 1.0):
            return False
    return True