# Generates a single Monte-Carlo episode (path) by following policy `pol`
# from `start_state`, optionally forcing `start_action` on the first step.
# Each element of the returned path is (state, action, reward, first_visit_flag),
# where first_visit_flag marks the first occurrence of the state within the
# episode (needed for first-visit MC). The types S, A, Policy and the helper
# get_rv_gen_func_single come from the surrounding module.
def get_mc_path(
    self,
    pol: Policy,
    start_state: S,
    start_action: Optional[A] = None,
) -> Sequence[Tuple[S, A, float, bool]]:
    res = []
    next_state = start_state
    steps = 0
    terminate = False
    occ_states = set()
    # Per-state samplers drawing an action from the policy's probabilities.
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in self.mdp_rep.state_action_dict.keys()
    }
    while not terminate:
        state = next_state
        first = state not in occ_states
        occ_states.add(state)
        # Use the forced start_action only on the very first step (if given).
        action = act_gen_dict[state]() \
            if (steps > 0 or start_action is None) else start_action
        # Sample the next state and reward from the MDP's simulator.
        next_state, reward = \
            self.mdp_rep.state_reward_gen_dict[state][action]()
        res.append((state, action, reward, first))
        steps += 1
        # Stop after the step taken from a terminal state (so the terminal
        # state itself appears in the path) or when the step budget runs out.
        terminate = steps >= self.max_steps or \
            state in self.mdp_rep.terminal_states
    return res
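# --- Hedged usage sketch (not part of the original module) ----------------
# The helper below shows one way the paths produced by get_mc_path could be
# consumed for first-visit Monte-Carlo prediction. Only the path layout
# (state, action, reward, first_visit_flag) and a discount factor are carried
# over from the code above; the function name and the example states are
# hypothetical.
from typing import Dict, Sequence, Tuple, TypeVar

S_ = TypeVar('S_')
A_ = TypeVar('A_')


def first_visit_returns(
    path: Sequence[Tuple[S_, A_, float, bool]],
    gamma: float,
) -> Dict[S_, float]:
    # Walk the episode backwards so that g always holds the return from the
    # current step onwards; record it only at each state's first visit.
    ret: Dict[S_, float] = {}
    g = 0.0
    for state, _, reward, first in reversed(path):
        g = reward + gamma * g
        if first:
            ret[state] = g
    return ret


# Example with two hypothetical states and gamma = 1.0:
# first_visit_returns([('a', 'x', 1.0, True), ('b', 'x', 2.0, True)], 1.0)
# returns {'a': 3.0, 'b': 2.0}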
# TD(0) prediction: estimates the value function of policy `pol` by sampling
# episodes and applying the one-step temporal-difference update with a
# constant step size self.alpha.
def get_value_func_dict(self, pol: Policy) -> VFType:
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0.0 for s in sa_dict.keys()}
    # Per-state samplers drawing an action from the policy's probabilities.
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in sa_dict.keys()
    }
    episodes = 0
    while episodes < self.num_episodes:
        state = self.mdp_rep.init_state_gen()
        steps = 0
        terminate = False
        while not terminate:
            action = act_gen_dict[state]()
            next_state, reward = \
                self.mdp_rep.state_reward_gen_dict[state][action]()
            # TD(0) update: V(s) += alpha * (r + gamma * V(s') - V(s))
            vf_dict[state] += self.alpha * \
                (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                 vf_dict[state])
            state = next_state
            steps += 1
            terminate = steps >= self.max_steps or \
                state in self.mdp_rep.terminal_states
        episodes += 1
    return vf_dict
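# --- Hedged illustration (not part of the original class) -----------------
# The TD(0) update used above, isolated as a standalone function and
# exercised on a toy 5-state random walk. All names here (td0_update, the
# chain layout, the reward scheme) are hypothetical and exist only to show
# the update rule in a self-contained, runnable form.
import random
from typing import Dict, Hashable


def td0_update(
    vf: Dict[Hashable, float],
    state: Hashable,
    reward: float,
    next_state: Hashable,
    gamma: float,
    alpha: float,
) -> None:
    # V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
    vf[state] += alpha * (reward + gamma * vf[next_state] - vf[state])


if __name__ == '__main__':
    # States 0..4; 0 and 4 are terminal; reward 1 only on reaching state 4.
    values = {s: 0.0 for s in range(5)}
    for _ in range(2000):
        s = 2
        while s not in (0, 4):
            s_next = s + random.choice([-1, 1])
            r = 1.0 if s_next == 4 else 0.0
            td0_update(values, s, r, s_next, gamma=1.0, alpha=0.1)
            s = s_next
    # Expected to approach {1: 0.25, 2: 0.5, 3: 0.75} for non-terminal states.
    print(values)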
# TD(lambda) prediction (backward view): estimates the value function of
# policy `pol` using accumulating eligibility traces and a step size that
# decays with the number of updates. self.gamma_lambda is the product
# gamma * lambda used to decay the traces.
def get_value_func_dict(self, pol: Policy) -> VFType:
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0. for s in sa_dict.keys()}
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in sa_dict.keys()
    }
    episodes = 0
    updates = 0
    while episodes < self.num_episodes:
        # Eligibility traces are reset at the start of every episode.
        et_dict = {s: 0. for s in sa_dict.keys()}
        state = self.mdp_rep.init_state_gen()
        steps = 0
        terminate = False
        while not terminate:
            action = act_gen_dict[state]()
            next_state, reward = \
                self.mdp_rep.state_reward_gen_dict[state][action]()
            # One-step TD error for the sampled transition.
            delta = reward + self.mdp_rep.gamma * vf_dict[next_state] - \
                vf_dict[state]
            # Accumulating trace for the visited state.
            et_dict[state] += 1
            # Step size decays as (updates / decay + 1) ** -0.5.
            alpha = self.learning_rate * \
                (updates / self.learning_rate_decay + 1) ** -0.5
            # Every state is nudged in proportion to its eligibility,
            # then all traces decay by gamma * lambda.
            for s in sa_dict.keys():
                vf_dict[s] += alpha * delta * et_dict[s]
                et_dict[s] *= self.gamma_lambda
            updates += 1
            steps += 1
            terminate = steps >= self.max_steps or \
                state in self.mdp_rep.terminal_states
            state = next_state
        episodes += 1
    return vf_dict
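# --- Hedged illustration (not part of the original class) -----------------
# One backward-view TD(lambda) step as performed inside the loop above,
# factored into a standalone function over plain dicts. It assumes, as in
# the code above, that the trace decay factor is the product gamma * lambda
# (self.gamma_lambda) and that the step size alpha has already been computed,
# e.g. as learning_rate * (updates / learning_rate_decay + 1) ** -0.5.
from typing import Dict, Hashable


def td_lambda_step(
    vf: Dict[Hashable, float],
    et: Dict[Hashable, float],
    state: Hashable,
    reward: float,
    next_state: Hashable,
    gamma: float,
    gamma_lambda: float,
    alpha: float,
) -> None:
    # One-step TD error for the sampled transition.
    delta = reward + gamma * vf[next_state] - vf[state]
    # Accumulating eligibility trace for the state just visited.
    et[state] += 1.0
    # Every state moves in proportion to its eligibility; traces then decay.
    for s in vf:
        vf[s] += alpha * delta * et[s]
        et[s] *= gamma_lambda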