def get_mrp_refined(self, pol: Policy) -> MRPRefined:
    """Collapse this MDP into a refined MRP under the given policy.

    Both the transition map and the refined-reward map are reduced by
    the policy's action probabilities; the result pairs each
    (state, successor) with a (probability, reward) tuple.

    :param pol: policy used to weight actions in each state.
    :return: the induced MRPRefined with this MDP's gamma.
    """
    probs = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data)
    rewards = mdp_rep_to_mrp_rep1(self.rewards_refined, pol.policy_data)
    refined_data = {}
    for state, succ_probs in probs.items():
        refined_data[state] = {
            succ: (p, rewards[state][succ])
            for succ, p in succ_probs.items()
        }
    return MRPRefined(refined_data, self.gamma)
def get_mrp_refined(self, pol: Policy) -> MRPRefined:
    """Collapse this MDP into a refined MRP under the given policy.

    Rewards here are conditional on the transition actually occurring:
    refined rewards are first weighted by transition probabilities
    (expected contribution), reduced by the policy, and then divided by
    the policy-reduced transition probability to recover the per-edge
    conditional reward.

    :param pol: policy used to weight actions in each state.
    :return: the induced MRPRefined with this MDP's gamma.
    """
    # Expected reward contribution per (s, a, s'): reward * probability.
    sasf_trans = flatten_sasf_dict(self.transitions)
    sasf_rew = flatten_sasf_dict(self.rewards_refined)
    exp_contrib = unflatten_sasf_dict(
        merge_dicts(sasf_rew, sasf_trans, lambda x, y: x * y)
    )

    # Reduce both maps over actions using the policy.
    probs = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data)
    exp_rew = mdp_rep_to_mrp_rep1(exp_contrib, pol.policy_data)

    # Divide back by the collapsed transition probability to get the
    # reward conditional on the (s, s') transition.
    cond_rew = unflatten_ssf_dict(
        merge_dicts(
            flatten_ssf_dict(exp_rew),
            flatten_ssf_dict(probs),
            lambda x, y: x / y
        )
    )

    refined_data = {
        state: {
            succ: (p, cond_rew[state][succ])
            for succ, p in succ_probs.items()
        }
        for state, succ_probs in probs.items()
    }
    return MRPRefined(refined_data, self.gamma)
def get_value_func_dict(self, pol: Policy) -> VFDictType:
    """Evaluate the given policy by iterative policy evaluation.

    Repeated Bellman-expectation sweeps over the MRP induced by the
    policy, stopping when the sup-norm change between successive value
    functions drops below self.tol.

    :param pol: policy to evaluate.
    :return: mapping from state to its estimated value under pol.
    """
    mdp = self.mdp_obj
    pd = pol.policy_data
    exp_rewards = mdp_rep_to_mrp_rep2(mdp.rewards, pd)
    trans_probs = mdp_rep_to_mrp_rep1(mdp.transitions, pd)

    values = {s: 0. for s in mdp.all_states}
    # Seed the convergence gap well above tol so the loop runs at least once.
    gap = self.tol * 1e4
    while gap >= self.tol:
        updated = {}
        for s in mdp.all_states:
            future = sum(p * values[s1] for s1, p in trans_probs[s].items())
            updated[s] = exp_rewards[s] + mdp.gamma * future
        gap = max(abs(updated[s] - v) for s, v in values.items())
        values = updated
    return values
def get_mrp(self, pol: Policy) -> MRP:
    """Collapse this MDP into an MRP under the given policy.

    Transition probabilities and expected rewards are both reduced over
    actions via the policy's action probabilities.

    :param pol: policy used to weight actions in each state.
    :return: the induced MRP with this MDP's gamma.
    """
    probs = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data)
    exp_rew = mdp_rep_to_mrp_rep2(self.rewards, pol.policy_data)
    mrp_data = {}
    for state, succ_probs in probs.items():
        mrp_data[state] = (succ_probs, exp_rew[state])
    return MRP(mrp_data, self.gamma)