Example #1
    def get_optimal_policy_vi(self) -> Tuple[DetPolicy, Mapping[S, float]]:
        # Value iteration: start from the all-zero value function and apply
        # the Bellman optimality backup until the largest per-state change
        # falls below self.tol.
        vf = {s: 0. for s in self.mdp_obj.all_states}
        epsilon = self.tol * 1e4   # any value >= tol triggers the first sweep
        mo = self.mdp_obj

        while epsilon >= self.tol:
            # Bellman optimality backup:
            # V(s) <- max_a [ r(s,a) + gamma * sum_{s'} P(s'|s,a) * V(s') ]
            new_vf = {
                s: max(r + mo.gamma * sum(p * vf[s1]
                                          for s1, p in mo.transitions[s][a].items())
                       for a, r in v.items())
                for s, v in mo.rewards.items()
            }
            epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
            vf = new_vf

        # Extract the greedy (deterministic) policy: for each state, pick
        # the action with the highest one-step lookahead value under vf.
        pol = DetPolicy({
            s: max([(a, r +
                     mo.gamma * sum(p * vf[s1]
                                    for s1, p in mo.transitions[s][a].items()))
                    for a, r in v.items()],
                   key=itemgetter(1))[0]
            for s, v in mo.rewards.items()
        })

        return pol, vf
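The method above depends on the surrounding MDP object; the following self-contained sketch runs the same value-iteration loop on a hypothetical two-state MDP, so the expected dict shapes are visible (rewards maps state -> action -> expected reward, transitions maps state -> action -> next state -> probability). All names and numbers are invented for illustration.

from operator import itemgetter

gamma = 0.9
tol = 1e-6
rewards = {                      # state -> {action: expected reward}
    'A': {'stay': 1.0, 'go': 0.0},
    'B': {'stay': 2.0, 'go': 0.5},
}
transitions = {                  # state -> {action: {next_state: prob}}
    'A': {'stay': {'A': 1.0}, 'go': {'B': 1.0}},
    'B': {'stay': {'B': 1.0}, 'go': {'A': 1.0}},
}

vf = {s: 0. for s in rewards}
epsilon = tol * 1e4
while epsilon >= tol:
    new_vf = {
        s: max(r + gamma * sum(p * vf[s1]
                               for s1, p in transitions[s][a].items())
               for a, r in v.items())
        for s, v in rewards.items()
    }
    epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
    vf = new_vf

pol = {
    s: max(((a, r + gamma * sum(p * vf[s1]
                                for s1, p in transitions[s][a].items()))
            for a, r in v.items()),
           key=itemgetter(1))[0]
    for s, v in rewards.items()
}
print(pol)   # {'A': 'go', 'B': 'stay'}
print(vf)    # roughly {'A': 18.0, 'B': 20.0}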
Example #2
    def print_policy(self, pol: DetPolicy) -> None:
        # "%%%dd" % 2 builds the format string "%2d" (width-2 integer);
        # display2 adds trailing spaces for the row-label column.
        display1 = "%%%dd" % 2
        display2 = "%%%dd  " % 2
        # Blocked grid cells are printed as 'X'; every other cell shows
        # the action the policy assigns to that (x, y) state.
        blocks_dict = {s: 'X' for s in self.blocks}
        full_dict = {**pol.get_state_to_action_map(), **blocks_dict}
        # Column header, then rows from the top (largest y) down.
        print("   " + " ".join([display1 % j for j in range(0, self.x_len)]))
        for i in range(self.y_len - 1, -1, -1):
            print(display2 % i + "  ".join(full_dict[(j, i)]
                                           for j in range(0, self.x_len)))
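The tripled percent signs in display1 and display2 are easy to misread; a tiny demonstration (the width variable is hypothetical) of what they expand to:

width = 2
fmt = "%%%dd" % width    # "%%" is a literal "%", so this builds the string "%2d"
print(fmt % 7)           # " 7" (integer right-aligned to width 2)
print(("%%%dd  " % width) % 3)   # " 3  ": padded to width 2, plus two trailing spaces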
Example #3
    def get_improved_policy(self, pol: Policy) -> DetPolicy:
        # Get the dict of q-values (state-action value function)
        # for the state-action pairs prescribed by the policy
        q_dict = self.get_act_value_func_dict(pol)
        # Then, for each state, maximize over the actions available from
        # that state and return a new policy that assigns the
        # value-maximizing action to each state
        return DetPolicy({s: max(v.items(), key=itemgetter(1))[0]
                          for s, v in q_dict.items()})
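A minimal usage sketch of the same argmax step with a hand-written Q-value table (states, actions, and values are all hypothetical):

from operator import itemgetter

q_dict = {
    's0': {'left': 0.4, 'right': 1.3},
    's1': {'left': 2.1, 'right': 0.7},
}
# max(..., key=itemgetter(1)) picks the (action, q) pair with the largest
# q-value; the trailing [0] keeps just the action.
greedy = {s: max(v.items(), key=itemgetter(1))[0] for s, v in q_dict.items()}
print(greedy)   # {'s0': 'right', 's1': 'left'}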
Example #4
    def get_optimal_policy_Robust_vi(self) -> Tuple[DetPolicy, Mapping[S, float]]:
        # Robust value iteration: the same loop as get_optimal_policy_vi,
        # except the expectation over successor states is replaced by
        # self.worst_case, a pessimistic value over an uncertainty set of
        # transition distributions (see the sketch after this example).
        vf = {s: 0. for s in self.mdp_obj.all_states}
        epsilon = self.tol * 1e4
        mo = self.mdp_obj

        while epsilon >= self.tol:
            new_vf = {
                s: max(r + mo.gamma * self.worst_case(vf, mo.transitions[s][a])
                       for a, r in v.items())
                for s, v in mo.rewards.items()
            }
            epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
            vf = new_vf
        # Greedy policy extraction under the same worst-case backup.
        pol = DetPolicy({
            s: max([(a, r + mo.gamma * self.worst_case(vf, mo.transitions[s][a]))
                    for a, r in v.items()],
                   key=itemgetter(1))[0]
            for s, v in mo.rewards.items()
        })
        return pol, vf
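self.worst_case is not shown in these examples. One common construction, purely an assumption here rather than the author's actual method, takes a pessimistic expectation over an L1-ball of radius delta around the nominal transition distribution: shift up to delta/2 probability mass from the highest-value successors onto the lowest-value one.

def worst_case(vf, trans, delta=0.1):
    # vf: {state: value}; trans: {next_state: prob}. delta is a
    # hypothetical robustness budget, not a parameter of the original code.
    states = sorted(trans, key=lambda s1: vf[s1])   # ascending by value
    probs = dict(trans)
    budget = delta / 2.
    worst = states[0]
    for s1 in reversed(states[1:]):   # drain mass from high-value states first
        moved = min(budget, probs[s1])
        probs[s1] -= moved
        probs[worst] += moved
        budget -= moved
        if budget <= 0.:
            break
    return sum(p * vf[s1] for s1, p in probs.items())

print(worst_case({'A': 1., 'B': 5.}, {'A': 0.5, 'B': 0.5}))   # 0.55*1 + 0.45*5 = 2.8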
Example #5
                           tolerance=this_tolerance,
                           first_visit_mc=this_first_visit_mc,
                           num_samples=this_num_samples,
                           softmax=this_softmax,
                           epsilon=this_epsilon,
                           epsilon_half_life=this_epsilon_half_life,
                           learning_rate=this_learning_rate,
                           learning_rate_decay=this_learning_rate_decay,
                           lambd=this_lambd,
                           num_episodes=this_num_episodes,
                           max_steps=this_max_steps,
                           tdl_fa_offline=this_td_offline,
                           fa_spec=this_fa_spec)
    for name, algo in raa.get_all_algorithms().items():
        print(name)
        # Each algorithm exposes its optimal policy and value function as
        # callables; tabulate both over all states before printing.
        opt_pol_func = algo.get_optimal_det_policy_func()
        opt_pol = DetPolicy(
            {s: opt_pol_func(s)
             for s in mdp_ref_obj.all_states})
        opt_vf_func = algo.get_optimal_value_func()
        opt_vf_dict = {s: opt_vf_func(s) for s in mdp_ref_obj.all_states}
        wg.print_policy(opt_pol)
        chars_count = 5
        decimals_count = 2
        print()
        wg.print_vf(opt_vf_dict, chars_count, decimals_count)
        print()
        wg.print_wind_and_bumps(chars_count, decimals_count)
        print()
        print()
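print_vf is called above but not listed among these examples; a sketch consistent with its call signature and with print_policy's grid convention from Example #2 (an assumption about the real method, not a copy of it) could look like:

    def print_vf(self, vf_dict, chars_count, decimals_count):
        # Sketch only: x across, rows printed from the largest y down,
        # each value formatted to the requested width and precision.
        val_fmt = "%%%d.%df" % (chars_count, decimals_count)   # e.g. "%5.2f"
        idx_fmt = "%%%dd" % chars_count
        print("    " + " ".join(idx_fmt % j for j in range(self.x_len)))
        for i in range(self.y_len - 1, -1, -1):
            print("%2d  " % i + " ".join(val_fmt % vf_dict[(j, i)]
                                         for j in range(self.x_len)))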
Example #6
    def get_improved_policy(self, pol: Policy) -> DetPolicy:
        # Same greedy improvement step as Example #3: evaluate the given
        # policy's Q-values, then pick the argmax action per state.
        q_dict = self.get_act_value_func_dict(pol)
        return DetPolicy({s: max(v.items(), key=itemgetter(1))[0]
                          for s, v in q_dict.items()})
Example #7
def get_det_policy_from_qf_dict(qf_dict: SAf) -> DetPolicy:
    # SAf is a state -> {action: value} mapping (tabular Q-values);
    # keep the argmax-Q action for every state.
    return DetPolicy(
        {s: max(v.items(), key=itemgetter(1))[0]
         for s, v in qf_dict.items()})
Example #8
def get_det_policy_from_qf(
        qf_dict: Mapping[S, Mapping[A, float]]) -> DetPolicy:
    # Explicitly typed variant of Example #7: pick the argmax-Q action
    # for each state.
    return DetPolicy(
        {s: max(v.items(), key=itemgetter(1))[0]
         for s, v in qf_dict.items()})