Example no. 1
0
def get_soft_policy_from_qf_dict(qf_dict: SAf, softmax: bool,
                                 epsilon: float) -> Policy:
    """Build a stochastic policy from a state -> {action: Q-value} dict.

    When ``softmax`` is True the per-state action probabilities come from
    ``get_softmax_action_probs``; otherwise the epsilon-greedy distribution
    from ``get_epsilon_action_probs`` is used.
    """
    if softmax:
        probs = {state: get_softmax_action_probs(action_vals)
                 for state, action_vals in qf_dict.items()}
    else:
        probs = {state: get_epsilon_action_probs(action_vals, epsilon)
                 for state, action_vals in qf_dict.items()}
    return Policy(probs)
Example no. 2
0
def get_vf_dict_from_qf_dict_and_policy(qf_dict: SAf,
                                        pol: Policy) -> Mapping[S, float]:
    """Compute the state-value function V from Q-values and a policy.

    V(s) = sum_a pi(a|s) * Q(s, a).

    The result is keyed by states, so the return annotation is
    ``Mapping[S, float]`` — the original ``Mapping[A, float]`` was a typo
    (the keys ``s`` come from ``qf_dict``, a state-indexed mapping).
    """
    return {
        s:
        sum(pol.get_state_action_probability(s, a) * q for a, q in v.items())
        for s, v in qf_dict.items()
    }
Example no. 3
0
def mdp_rep_to_mrp_rep2(
    mdp_rep: SAf,
    policy_rep: SAf
) -> Mapping[S, float]:
    """Collapse a state->action mapping into a state mapping under a policy.

    For each state, returns the policy-probability-weighted sum of the
    per-action values in ``mdp_rep`` for that state.
    """
    collapsed = {}
    for state, action_vals in mdp_rep.items():
        collapsed[state] = sum(prob * action_vals[action]
                               for action, prob in policy_rep[state].items())
    return collapsed
Example no. 4
0
def get_det_policy_from_qf_dict(qf_dict: SAf) -> DetPolicy:
    """Extract the greedy deterministic policy from a Q-value dict.

    Each state maps to the action with the highest Q-value (first such
    action in iteration order on ties).
    """
    best_action = {}
    for state, action_vals in qf_dict.items():
        best_action[state] = max(action_vals.items(), key=itemgetter(1))[0]
    return DetPolicy(best_action)
Example no. 5
0
def verify_policy(policy_data: SAf) -> bool:
    """Return True iff every state's action probabilities sum to ~1.0.

    Only the per-state probability distributions are needed, so iterate
    ``.values()`` instead of ``.items()`` — the state key was unused in
    the original (ruff PERF102).
    """
    return all(is_approx_eq(sum(dist.values()), 1.0)
               for dist in policy_data.values())