from typing import Callable, Mapping, Optional, Sequence, Set, Tuple


def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    if softmax:
        ret = Policy(
            {s: get_softmax_action_probs(v) for s, v in qf_dict.items()}
        )
    else:
        ret = Policy({
            s: get_epsilon_action_probs(v, epsilon)
            for s, v in qf_dict.items()
        })
    return ret
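# Usage sketch (not part of the original module): the Q-values and epsilon below
# are made up purely to illustrate the two branches of get_soft_policy_from_qf_dict,
# and it assumes Policy and the action-prob helpers referenced above are in scope.
_example_qf_dict = {
    1: {'a': 1.0, 'b': 2.0},
    2: {'a': 0.5, 'c': -0.5}
}
_softmax_pol = get_soft_policy_from_qf_dict(_example_qf_dict, softmax=True, epsilon=0.0)
_eps_greedy_pol = get_soft_policy_from_qf_dict(_example_qf_dict, softmax=False, epsilon=0.1)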
def get_mc_path(
    self,
    pol: Policy,
    start_state: S,
    start_action: Optional[A] = None,
) -> Sequence[Tuple[S, A, float, bool]]:
    """
    Generate a single Monte-Carlo episode by following pol from start_state
    (using start_action for the first step if one is given). Each element of
    the returned sequence is (state, action, reward, first_visit_flag), where
    first_visit_flag is True iff this is the first occurrence of the state in
    the episode.
    """
    res = []
    state = start_state
    steps = 0
    terminate = False
    occ_states = set()
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in self.mdp_rep.state_action_dict.keys()
    }
    while not terminate:
        first = state not in occ_states
        occ_states.add(state)
        action = (
            act_gen_dict[state]()
            if steps > 0 or start_action is None
            else start_action
        )
        next_state, reward = \
            self.mdp_rep.state_reward_gen_dict[state][action]()
        res.append((state, action, reward, first))
        steps += 1
        terminate = steps >= self.max_steps or \
            state in self.mdp_rep.terminal_states
        state = next_state
    return res
def get_vf_dict_from_qf_dict_and_policy(
    qf_dict: SAf,
    pol: Policy
) -> Mapping[S, float]:
    # V(s) = sum over actions a of pi(a|s) * Q(s, a)
    return {
        s: sum(pol.get_state_action_probability(s, a) * q for a, q in v.items())
        for s, v in qf_dict.items()
    }
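# Worked check of the weighted sum above with made-up numbers (not from the
# original module): for one state with Q(s, a) = 2.0, Q(s, b) = 4.0 and
# pi(a|s) = pi(b|s) = 0.5, the state value is 0.5 * 2.0 + 0.5 * 4.0 = 3.0.
_q_s = {'a': 2.0, 'b': 4.0}
_pi_s = {'a': 0.5, 'b': 0.5}
_v_s = sum(_pi_s[a] * q for a, q in _q_s.items())  # 3.0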
def get_value_func_dict(self, pol: Policy) -> VFDictType:
    # TD(0) prediction: estimate the value function of pol from sampled
    # transitions, with a step size that decays as updates accumulate.
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0.0 for s in sa_dict.keys()}
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in sa_dict.keys()
    }
    episodes = 0
    updates = 0

    while episodes < self.num_episodes:
        state = self.mdp_rep.init_state_gen()
        steps = 0
        terminate = False

        while not terminate:
            action = act_gen_dict[state]()
            next_state, reward = \
                self.mdp_rep.state_reward_gen_dict[state][action]()
            vf_dict[state] += self.learning_rate * \
                (updates / self.learning_rate_decay + 1) ** -0.5 * \
                (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                 vf_dict[state])
            updates += 1
            steps += 1
            terminate = steps >= self.max_steps or \
                state in self.mdp_rep.terminal_states
            state = next_state

        episodes += 1

    return vf_dict
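# The effective step size in the update above is
# alpha_t = learning_rate * (t / learning_rate_decay + 1) ** -0.5, with t the
# running update count. A quick sketch of the decay with hypothetical constants
# (not the class defaults):
_lr, _lr_decay = 0.1, 100.0
_alphas = [_lr * (t / _lr_decay + 1) ** -0.5 for t in (0, 100, 400, 900)]
# -> [0.1, ~0.0707, ~0.0447, ~0.0316]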
def get_value_func(self, polf: PolicyType) -> Callable[[S], float]:
    pol = Policy({
        s: get_pdf_from_samples(
            polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION)
        )
        for s, v in self.get_state_action_dict().items()
    })

    # noinspection PyShadowingNames
    def vf(state: S, pol=pol) -> float:
        return self.get_value_func_dict(pol)[state]

    return vf
if __name__ == '__main__':
    from mdp_dp_rl.processes.mdp import MDP

    policy_data = {
        1: {'a': 0.4, 'b': 0.6},
        2: {'a': 0.7, 'c': 0.3},
        3: {'b': 1.0}
    }
    pol_obj = Policy(policy_data)

    mdp_data = {
        1: {
            'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0),
            'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0),
            'c': ({1: 0.1, 2: 0.2,
        elif i == 3:
            ret = {'b': 1.0}
        else:
            raise ValueError
        return ret

    this_qf = adp_obj.get_act_value_func_fa(policy_func, True)
    this_vf = adp_obj.get_value_func_fa(policy_func, True)
    print("Printing vf for a policy")
    print(this_vf(1))
    print(this_vf(2))
    print(this_vf(3))

    print("Printing DP vf for a policy")
    from mdp_dp_rl.processes.policy import Policy
    true_vf_for_pol = mdp_ref_obj1.get_value_func_dict(
        Policy({s: policy_func(s) for s in {1, 2, 3}})
    )
    print(true_vf_for_pol)

    opt_det_polf = adp_obj.get_optimal_policy_func_vi()

    # noinspection PyShadowingNames
    def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]:
        return {opt_det_polf(s): 1.0}

    print("Printing Opt Policy")
    print(opt_polf(1))
    print(opt_polf(2))
    print(opt_polf(3))

    opt_vf = adp_obj.get_value_func_fa(opt_polf, False)
    print("Printing Opt VF")
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
    return Policy(
        {s: {a: 1. / len(v) for a in v} for s, v in state_action_dict.items()}
    )
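# Quick illustration of the uniform construction above with a made-up
# state-action map (assumes Policy is in scope): each of the two actions in
# state 1 gets probability 0.5, each of the three actions in state 2 gets 1/3.
_uniform_pol = get_uniform_policy({1: {'a', 'b'}, 2: {'a', 'b', 'c'}})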