コード例 #1
0
 def get_value_func_dict(self, pol: Policy) -> VFDictType:
     """Iteratively compute a state -> value mapping for ``pol``.

     Every state in ``self.mdp_obj.all_states`` starts at 0.0. Each sweep
     replaces a state's value with
     ``reward[s] + gamma * sum(p * value[s1])`` taken over the entries of
     that state's transition mapping, and sweeping stops once the largest
     absolute change across states drops below ``self.tol``.

     Args:
         pol: Policy whose ``policy_data`` is handed to
             ``mdp_rep_to_mrp_rep1`` / ``mdp_rep_to_mrp_rep2`` together
             with the MDP's transitions and rewards.
             NOTE(review): presumably these helpers fold the action
             dimension away under the policy -- confirm against their
             definitions.

     Returns:
         Dict keyed by state holding the value estimates from the final
         sweep.
     """
     values = {state: 0. for state in self.mdp_obj.all_states}
     # Seed the change measure well above the tolerance so that, for a
     # positive tolerance, at least one sweep is performed.
     max_change = self.tol * 1e4
     mdp = self.mdp_obj
     policy_data = pol.policy_data
     rewards = mdp_rep_to_mrp_rep2(mdp.rewards, policy_data)
     transitions = mdp_rep_to_mrp_rep1(mdp.transitions, policy_data)
     while max_change >= self.tol:
         updated = {}
         for state in mdp.all_states:
             # Every successor is read from the previous sweep's table, so
             # the update is synchronous rather than in-place.
             expected_next = sum(
                 p * values[nxt] for nxt, p in transitions[state].items()
             )
             updated[state] = rewards[state] + mdp.gamma * expected_next

         max_change = max(abs(updated[state] - values[state])
                          for state in values)
         values = updated
     return values
コード例 #2
0
 def get_mrp(self, pol: Policy) -> MRP:
     """Construct an ``MRP`` from this object's data and ``pol``.

     ``self.transitions`` and ``self.rewards`` are each combined with
     ``pol.policy_data`` through ``mdp_rep_to_mrp_rep1`` and
     ``mdp_rep_to_mrp_rep2`` respectively. Each state of the resulting
     transition mapping is then paired with its reward as a
     ``(transitions, reward)`` tuple.

     Args:
         pol: Policy supplying ``policy_data``.

     Returns:
         ``MRP`` built from the per-state tuples and ``self.gamma``.
     """
     policy_data = pol.policy_data
     state_transitions = mdp_rep_to_mrp_rep1(self.transitions, policy_data)
     state_rewards = mdp_rep_to_mrp_rep2(self.rewards, policy_data)
     mrp_info = {
         state: (successors, state_rewards[state])
         for state, successors in state_transitions.items()
     }
     return MRP(mrp_info, self.gamma)