def get_optimal_policy(self):
        """Overwrite each state's action distribution with a greedy one.

        For every state in ``self.policy.policy_table`` a value is computed
        for each of its listed actions with ``self._get_value_of_action``.
        The resulting ``{action: value}`` mapping is passed to a freshly
        created ``create_distribution_greedily()`` callable, and whatever
        that callable returns is stored back in the table under the same
        state key.

        Returns:
            ``self.policy``, whose ``policy_table`` has been updated in
            place.
        """
        for state, actions in self.policy.policy_table.items():
            action_values = {
                action: self._get_value_of_action(state, action)
                for action, _ in actions.items()
            }
            # Only the value of an already-present key is rebound, so the
            # table keeps its size while it is being iterated.
            self.policy.policy_table[state] = (
                create_distribution_greedily()(action_values))

        return self.policy
    def __init__(
            self,
            q_table,
            table_policy,
            env,
            steps,
            statistics,
            episodes,
            step_size=0.1,
            discount=1.0):
        """Store the collaborators and settings on the instance.

        Args:
            q_table: Kept as ``self.q_table``.
            table_policy: Kept as ``self.policy``; note the attribute name
                differs from the parameter name.
            env: Kept as ``self.env``.
            steps: Kept as ``self.steps``.
            statistics: Kept as ``self.statistics``.
            episodes: Kept as ``self.episodes``.
            step_size: Kept as ``self.step_size``. Defaults to 0.1.
            discount: Kept as ``self.discount``. Defaults to 1.0.
        """
        # Collaborating objects.
        self.q_table = q_table
        self.policy = table_policy
        self.env = env
        self.steps = steps
        self.statistics = statistics

        # Numeric settings.
        self.episodes = episodes
        self.step_size = step_size
        self.discount = discount

        # Result of calling the factory once, kept on the instance.
        self.create_distribution_greedily = create_distribution_greedily()
 def __init__(self, behavior_policy, target_policy, critic):
     """Keep references to both policies and the critic.

     Args:
         behavior_policy: Kept as ``self.behavior_policy``.
         target_policy: Kept as ``self.target_policy``.
         critic: Kept as ``self.critic``.
     """
     self.behavior_policy = behavior_policy
     self.target_policy = target_policy
     self.critic = critic
     # Result of calling the factory once, kept on the instance.
     self.create_distribution_greedily = create_distribution_greedily()
 def __init__(self, policy, critic):
     """Keep references to the policy and the critic.

     Args:
         policy: Kept as ``self.policy``.
         critic: Kept as ``self.critic``.
     """
     self.policy, self.critic = policy, critic
     # Result of calling the factory once, kept on the instance.
     self.create_distribution_greedily = create_distribution_greedily()
 def __init__(self, policy, critic, epsilon=0.1):
     """Keep the policy and critic and build both distribution helpers.

     Args:
         policy: Kept as ``self.policy``.
         critic: Kept as ``self.critic``.
         epsilon: Passed to ``create_distribution_epsilon_greedily``.
             Defaults to 0.1.
     """
     self.policy, self.critic = policy, critic
     # The epsilon-greedy helper is built first, then the greedy one.
     self.create_distribution_epsilon_greedily = (
         create_distribution_epsilon_greedily(epsilon))
     self.create_distribution_greedily = create_distribution_greedily()