def run(self,
            num_episodes,
            discount_factor=1,
            epsilon=0.1,
            learning_rate=0.5):
        """Run ``num_episodes`` epsilon-greedy episodes using ``QAction`` values.

        A new ``Env`` wrapping ``self.gym_env`` is created with ``QAction`` as
        its action type and stored on ``self.env``. On every step an
        action-state is picked with ``EGreedyPolicy(epsilon)``, the
        environment is stepped with its gym action, and the action-state is
        updated immediately from the reward and the next state's actions.

        Args:
            num_episodes: Number of episodes to run.
            discount_factor: Forwarded to ``Env``.
            epsilon: Forwarded to ``Env`` and used for the per-step
                ``EGreedyPolicy``.
            learning_rate: Forwarded to ``Env``.

        Returns:
            A ``plotting.EpisodeStats`` holding, for each episode, the summed
            reward and the zero-based index of the last step taken.
        """
        self.env = Env(self.gym_env,
                       discount_factor,
                       epsilon,
                       action_type=QAction,
                       learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state = self.env.reset()
            for t in itertools.count():
                action_state = state.get_next_action_state(
                    EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                stats.episode_rewards[i_episode] += reward
                # Overwritten every step, so the final value is the index of
                # the last step (a one-step episode is recorded as 0).
                stats.episode_lengths[i_episode] = t
                # One-step update: uses the reward just received and the
                # actions available from the state that was reached.
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state

        return stats
class NStepSarsa(Algorithm):
    """Episode runner that updates each action-state ``n`` steps after it is taken.

    Rewards observed in the meantime are pushed into the pending action-states
    through ``cache_reward``; the actual update is triggered with
    ``update(..., time_step=...)`` once the delay has elapsed.
    """

    def __init__(self, gym_env: gym.Env, n):
        """Pass ``gym_env`` to ``Algorithm`` and remember the step count ``n``."""
        super().__init__(gym_env)
        self.n = n

    def run(self,
            num_episodes,
            discount_factor=1,
            epsilon=0.1,
            learning_rate=0.5):
        """Run ``num_episodes`` epsilon-greedy episodes with delayed updates.

        A new ``Env`` wrapping ``self.gym_env`` is created with ``NStepAction``
        as its action type and stored on ``self.env``.

        Args:
            num_episodes: Number of episodes to run.
            discount_factor: Forwarded to ``Env``.
            epsilon: Forwarded to ``Env`` and used for the per-step
                ``EGreedyPolicy``.
            learning_rate: Forwarded to ``Env``.

        Returns:
            A ``plotting.EpisodeStats`` holding, for each episode, the summed
            reward and the zero-based index of the last environment step.
        """
        self.env = Env(self.gym_env,
                       discount_factor,
                       epsilon,
                       action_type=NStepAction,
                       learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            # states[k] is the state reached after step k.
            states = list()
            state = self.env.reset()
            # env_list[k] is the (state, action_state) pair acted on at step k.
            env_list = list()
            # Placeholder for the episode length; replaced once `done` is seen.
            T = 1e10
            update_time = -1
            for t in itertools.count():
                if t < T:
                    action_state = state.get_next_action_state(
                        EGreedyPolicy(epsilon))
                    action_state.add_reward_calculator(t)
                    next_state, reward, done, _ = self.env.step(
                        action_state.get_gym_action())
                    env_list.append((state, action_state))
                    states.append(next_state)
                    # update_time is negative while t < n - 1. Clamp the slice
                    # start to 0 so it cannot wrap around to the tail of the
                    # list and skip action-states that are still pending.
                    first_pending = max(update_time + 1, 0)
                    for _, a_s in env_list[first_pending:]:
                        a_s.cache_reward(reward, step=t)
                    stats.episode_rewards[i_episode] += reward
                    stats.episode_lengths[i_episode] = t
                    if done:
                        T = t + 1
                    else:
                        state = next_state
                # Index of the step whose action-state is due for its update.
                update_time = t - self.n + 1
                if update_time >= 0:
                    action_state_update_time = env_list[update_time][1]
                    evaluated_state_index = update_time + self.n - 1
                    if evaluated_state_index < len(states):
                        state_update_time = states[evaluated_state_index]
                        action_state_update_time.update(
                            0,
                            state_update_time.get_actions(),
                            time_step=update_time)
                    else:
                        # No state was recorded for this index because the
                        # episode already ended; None replaces the action list.
                        action_state_update_time.update(0,
                                                        None,
                                                        time_step=update_time)
                if update_time == T - 1:
                    # Every step of the episode has been updated; reset the
                    # per-episode reward bookkeeping before the next episode.
                    a_ss = [a_s for _, a_s in env_list]
                    for a_s in a_ss:
                        a_s.clear_reward_calculator()
                    break
        return stats
 def run(self,
         num_episodes,
         discount_factor=1,
         epsilon=0.3,
         learning_rate=0.5):
     """Run ``num_episodes`` episodes and update each distinct action-state once.

     A new ``Env`` wrapping ``self.gym_env`` is created with ``FirstMCAction``
     as its action type and stored on ``self.env``. During an episode every
     reward is cached into all action-states seen so far; the updates happen
     only after the episode has finished.

     Args:
         num_episodes: Number of episodes to run.
         discount_factor: Forwarded to ``Env``.
         epsilon: Forwarded to ``Env`` and used for the per-step
             ``EGreedyPolicy``.
         learning_rate: Forwarded to ``Env``.

     Returns:
         None. No statistics are collected by this variant.
     """
     self.env = Env(self.gym_env,
                    discount_factor,
                    epsilon,
                    action_type=FirstMCAction,
                    learning_rate=learning_rate)
     for _ in tqdm(range(num_episodes)):
         # Distinct action-states of this episode, in first-visit order.
         action_states = []
         state = self.env.reset()
         # Episodes are cut off after 100 steps even if the environment has
         # not reported `done`.
         for t in range(100):
             action_state = state.get_next_action_state(
                 EGreedyPolicy(epsilon))
             next_state, reward, done, _ = self.env.step(
                 action_state.get_gym_action())
             action_state.add_reward_calculator(t)
             if action_state not in action_states:
                 action_states.append(action_state)
             for a_s in action_states:
                 a_s.cache_reward(reward, t)
             if done:
                 break
             state = next_state
         # NOTE(review): calculators are registered with the step index `t`,
         # but `time_step` below is the position in the first-visit list. The
         # two diverge as soon as an action-state repeats within an episode;
         # confirm against FirstMCAction that this is intended.
         for i, s in enumerate(action_states):
             s.update(0, [], time_step=i)
         for a_s in action_states:
             a_s.clear_reward_calculator()
Exemple #4
0
class McOfflinePolicy(Algorithm):
    """Off-policy Monte Carlo runner: episodes come from ``RandomPolicy`` and
    are evaluated backwards against ``GreedyPolicy``."""

    def run(self, num_episodes, discount_factor=0.8, epsilon=0.1):
        """Generate ``num_episodes`` random episodes and apply weighted updates.

        A new ``Env`` wrapping ``self.gym_env`` is created with
        ``McOfflineAction`` as its action type and stored on ``self.env``.
        Each episode is walked from its last transition to its first while
        accumulating the discounted return ``g`` and the weight ``w``.

        Args:
            num_episodes: Number of episodes to generate.
            discount_factor: Forwarded to ``Env`` and used to discount ``g``.
            epsilon: Forwarded to ``Env``.

        Returns:
            The state of the last transition that was processed, or ``None``
            when ``num_episodes`` is 0.
        """
        self.env = Env(self.gym_env, discount_factor, epsilon, action_type=McOfflineAction)
        n = self.env.env.action_space.n
        # Pre-bound so that a zero-episode run returns None instead of raising
        # UnboundLocalError at the final `return`.
        state = None
        for _ in tqdm(range(num_episodes)):
            episode = self.generate_one_episode_action_states_by_policy(RandomPolicy())
            w = 1
            g = 0
            for state, action_state, reward in reversed(episode):
                g = discount_factor * g + reward
                action_state.update_c(w)
                action_state.update_q(g, w)
                greedy_action = state.get_next_action_state(GreedyPolicy())
                # Earlier transitions are skipped once the taken action
                # differs from the greedy one.
                if greedy_action != action_state:
                    break
                # presumably RandomPolicy picks uniformly among the n actions,
                # making n the per-step importance ratio -- TODO confirm
                w = w * n
        return state

    def generate_one_episode_action_states_by_policy(self, policy):
        """Play one episode with ``policy`` and record its transitions.

        The episode stops when the environment reports ``done`` or after
        100 steps, whichever comes first.

        Args:
            policy: Policy object passed to ``state.get_next_action_state``.

        Returns:
            A list of ``(state, action_state, reward)`` tuples in step order.
        """
        actions = []
        state = self.env.reset()
        for t in range(100):
            action = state.get_next_action_state(policy)
            next_state, reward, done, _ = self.env.step(action.get_gym_action())
            actions.append((state, action, reward))
            if done:
                break
            state = next_state
        return actions
Exemple #5
0
 def run(self, num_episodes, discount_factor=0.8, epsilon=0.1):
     """Generate ``num_episodes`` random episodes and apply weighted updates.

     A new ``Env`` wrapping ``self.gym_env`` is created with
     ``McOfflineAction`` as its action type and stored on ``self.env``. Each
     episode produced by ``RandomPolicy`` is walked from its last transition
     to its first while accumulating the discounted return ``g`` and the
     weight ``w``.

     Args:
         num_episodes: Number of episodes to generate.
         discount_factor: Forwarded to ``Env`` and used to discount ``g``.
         epsilon: Forwarded to ``Env``.

     Returns:
         The state of the last transition that was processed, or ``None``
         when ``num_episodes`` is 0.
     """
     self.env = Env(self.gym_env, discount_factor, epsilon, action_type=McOfflineAction)
     n = self.env.env.action_space.n
     # Pre-bound so that a zero-episode run returns None instead of raising
     # UnboundLocalError at the final `return`.
     state = None
     for _ in tqdm(range(num_episodes)):
         episode = self.generate_one_episode_action_states_by_policy(RandomPolicy())
         w = 1
         g = 0
         for state, action_state, reward in reversed(episode):
             g = discount_factor * g + reward
             action_state.update_c(w)
             action_state.update_q(g, w)
             greedy_action = state.get_next_action_state(GreedyPolicy())
             # Earlier transitions are skipped once the taken action differs
             # from the greedy one.
             if greedy_action != action_state:
                 break
             # presumably RandomPolicy picks uniformly among the n actions,
             # making n the per-step importance ratio -- TODO confirm
             w = w * n
     return state
Exemple #6
0
 def run(self,
         num_episodes,
         discount_factor=1,
         epsilon=0.1,
         learning_rate=0.5):
     """Run ``num_episodes`` episodes of the delayed-update loop with
     per-step importance-sampling ratios.

     Actions are drawn from ``EGreedyPolicy(epsilon)``. For each drawn
     action the probability under that policy and under
     ``EGreedyPolicy(0.01)`` is queried, and their quotient is handed to
     ``cache_reward`` as ``one_step_importance_sampling_ratio``. The
     action-state chosen at step ``k`` is updated at loop iteration
     ``k + self.n - 1``.

     Args:
         num_episodes: Number of episodes to run.
         discount_factor: Forwarded to ``Env``.
         epsilon: Forwarded to ``Env`` and used for the acting policy.
         learning_rate: Forwarded to ``Env``.

     Returns:
         A ``plotting.EpisodeStats`` holding, for each episode, the summed
         reward and the zero-based index of the last environment step.
     """
     self.env = Env(self.gym_env,
                    discount_factor,
                    epsilon,
                    action_type=NStepAction,
                    learning_rate=learning_rate)
     stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                   episode_rewards=np.zeros(num_episodes))
     for i_episode in tqdm(range(num_episodes)):
         # reached[k]: state reached after step k.
         # trajectory[k]: (state, action_state) pair acted on at step k.
         reached = list()
         trajectory = list()
         state = self.env.reset()
         # Placeholder episode length until `done` is observed.
         T = 1e10
         update_time = -1
         log.debug('----------------------\n\n')
         for t in itertools.count():
             if t < T:
                 behaviour = EGreedyPolicy(epsilon)
                 chosen = state.get_next_action_state(behaviour)
                 behaviour_prob = state.get_action_probability(
                     behaviour, chosen)
                 target_prob = state.get_action_probability(
                     EGreedyPolicy(0.01), chosen)
                 rho = target_prob / behaviour_prob
                 log.debug('s:{}'.format(state))
                 log.debug('a:{}'.format(chosen))
                 log.debug('b:{0:.2f} pi:{1:.2f} ra:{2:.2f}'.format(
                     behaviour_prob, target_prob, rho))
                 chosen.add_reward_calculator(t)
                 next_state, reward, done, _ = self.env.step(
                     chosen.get_gym_action())
                 log.debug('done: {} reward: {}'.format(done, reward))
                 trajectory.append((state, chosen))
                 reached.append(next_state)
                 # Never let the slice start go negative while no update
                 # has happened yet.
                 first_pending = max(update_time + 1, 0)
                 for seen_state, pending in trajectory[first_pending:]:
                     log.debug('cache for s:{} a:{}'.format(
                         seen_state, pending))
                     pending.cache_reward(
                         reward,
                         step=t,
                         one_step_importance_sampling_ratio=rho)
                 stats.episode_rewards[i_episode] += reward
                 stats.episode_lengths[i_episode] = t
                 if not done:
                     state = next_state
                 else:
                     T = t + 1
             # Step whose action-state becomes due at this iteration.
             update_time = t - self.n + 1
             if update_time >= 0:
                 due = trajectory[update_time][1]
                 horizon_index = update_time + self.n - 1
                 if horizon_index >= len(reached):
                     # Nothing was recorded this far ahead: the episode has
                     # already ended, so None replaces the action list.
                     log.debug('<n')
                     horizon_actions = None
                 else:
                     log.debug('=n')
                     horizon_actions = reached[horizon_index].get_actions()
                 due.update(0, horizon_actions, time_step=update_time)
             if update_time == T - 1:
                 # All steps of the episode are updated; reset the reward
                 # bookkeeping on every recorded action-state and stop.
                 for _state, recorded in trajectory:
                     recorded.clear_reward_calculator()
                 break
     return stats