import itertools

import gym
import numpy as np
from tqdm import tqdm

# Project-local helpers (Algorithm, Env, the *Action classes, the policies,
# plotting and log) are assumed to be imported from elsewhere in this package.


class QLearning(Algorithm):
    """Off-policy TD control: Q-learning with an epsilon-greedy behaviour policy."""

    def run(self, num_episodes, discount_factor=1, epsilon=0.1, learning_rate=0.5):
        self.env = Env(self.gym_env, discount_factor, epsilon,
                       action_type=QAction, learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            state_actions = set()  # state-action pairs visited in this episode
            state = self.env.reset()
            for t in itertools.count():
                # Choose an action epsilon-greedily from the current Q estimates.
                action_state = state.get_next_action_state(EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(action_state.get_gym_action())
                state_actions.add(action_state)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                # Q-learning update: bootstrap from the actions of the next state.
                action_state.update(reward, next_state.get_actions())
                if done:
                    break
                state = next_state
        return stats
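
# Usage sketch, not part of the original code: roughly how these classes are
# driven. 'CliffWalking-v0' is an assumption; any discrete-action Gym
# environment with the classic 4-tuple step() API should work the same way.
def _demo_q_learning():
    env = gym.make('CliffWalking-v0')
    agent = QLearning(env)
    stats = agent.run(num_episodes=500, epsilon=0.1, learning_rate=0.5)
    print('mean reward over the last 100 episodes:',
          stats.episode_rewards[-100:].mean())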

class McOnline(Algorithm):
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy."""

    def run(self, num_episodes, discount_factor=1, epsilon=0.3, learning_rate=0.5):
        self.env = Env(self.gym_env, discount_factor, epsilon,
                       action_type=FirstMCAction, learning_rate=learning_rate)
        for _ in tqdm(range(num_episodes)):
            action_states = []
            state = self.env.reset()
            states = [state]
            # Generate one episode (capped at 100 steps), caching the rewards that
            # follow the first visit of each state-action pair.
            for t in range(100):
                action_state = state.get_next_action_state(EGreedyPolicy(epsilon))
                next_state, reward, done, _ = self.env.step(action_state.get_gym_action())
                action_state.add_reward_calculator(t)
                if state not in states:
                    states.append(state)
                if action_state not in action_states:
                    action_states.append(action_state)
                for a_s in action_states:
                    a_s.cache_reward(reward, t)
                if done:
                    break
                state = next_state
            # After the episode, update every visited state-action pair from its
            # cached return, then reset the per-episode reward calculators.
            for i, a_s in enumerate(action_states):
                a_s.update(0, [], time_step=i)
            for a_s in action_states:
                a_s.clear_reward_calculator()

class NStepSarsa(Algorithm):
    """On-policy n-step Sarsa: each Q(s, a) is updated from its n-step return."""

    def __init__(self, gym_env: gym.Env, n):
        super().__init__(gym_env)
        self.n = n

    def run(self, num_episodes, discount_factor=1, epsilon=0.1, learning_rate=0.5):
        self.env = Env(self.gym_env, discount_factor, epsilon,
                       action_type=NStepAction, learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            states = list()
            state = self.env.reset()
            env_list = list()  # (state, action_state) pairs in visit order
            T = 1e10           # episode length, unknown until the terminal step
            update_time = -1   # tau: the time step whose estimate is updated
            for t in itertools.count():
                if t < T:
                    action_state = state.get_next_action_state(EGreedyPolicy(epsilon))
                    action_state.add_reward_calculator(t)
                    next_state, reward, done, _ = self.env.step(action_state.get_gym_action())
                    env_list.append((state, action_state))
                    states.append(next_state)
                    # Cache the new reward for every pair whose n-step window is still open.
                    for _, a_s in env_list[update_time + 1:]:
                        a_s.cache_reward(reward, step=t)
                    stats.episode_rewards[i_episode] += reward
                    stats.episode_lengths[i_episode] = t
                    if done:
                        T = t + 1
                    else:
                        state = next_state
                update_time = t - self.n + 1
                if update_time >= 0:
                    action_state_update_time = env_list[update_time][1]
                    evaluated_state_index = update_time + self.n - 1
                    if evaluated_state_index < len(states):
                        # Full n-step window: bootstrap from the state n steps ahead.
                        state_update_time = states[evaluated_state_index]
                        action_state_update_time.update(0, state_update_time.get_actions(),
                                                        time_step=update_time)
                    else:
                        # Window truncated by the end of the episode: no bootstrap.
                        action_state_update_time.update(0, None, time_step=update_time)
                if update_time == T - 1:
                    for _, a_s in env_list:
                        a_s.clear_reward_calculator()
                    break
        return stats
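
# Usage sketch (an assumption, not from the original source): NStepSarsa differs
# from the one-step methods above only in that n is fixed at construction time;
# with n=1 it should behave like ordinary Sarsa.
def _demo_n_step_sarsa():
    env = gym.make('CliffWalking-v0')      # assumption: any discrete Gym env
    agent = NStepSarsa(env, n=4)           # back rewards up over 4-step windows
    stats = agent.run(num_episodes=300, epsilon=0.1, learning_rate=0.5)
    print('mean episode length:', stats.episode_lengths.mean())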

class McOfflinePolicy(Algorithm):
    """Off-policy Monte Carlo control with weighted importance sampling: episodes
    come from a uniform random behaviour policy and are evaluated against the
    greedy target policy."""

    def run(self, num_episodes, discount_factor=0.8, epsilon=0.1):
        self.env = Env(self.gym_env, discount_factor, epsilon, action_type=McOfflineAction)
        n = self.env.env.action_space.n
        for _ in tqdm(range(num_episodes)):
            episode = self.generate_one_episode_action_states_by_policy(RandomPolicy())
            w = 1  # importance-sampling weight
            g = 0  # return accumulated backwards through the episode
            for state, action_state, reward in reversed(episode):
                g = discount_factor * g + reward
                action_state.update_c(w)
                action_state.update_q(g, w)
                # Stop as soon as the behaviour action differs from the greedy one,
                # since the target-policy probability (and hence the weight) is zero.
                action = state.get_next_action_state(GreedyPolicy())
                if action != action_state:
                    break
                # The greedy target picks this action with probability 1 and the
                # uniform random behaviour policy with probability 1/n, so the
                # weight grows by a factor of n.
                w = w * n
        return state

    def generate_one_episode_action_states_by_policy(self, policy):
        actions = []
        state = self.env.reset()
        for _ in range(100):
            action = state.get_next_action_state(policy)
            next_state, reward, done, _ = self.env.step(action.get_gym_action())
            actions.append((state, action, reward))
            if done:
                break
            state = next_state
        return actions
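
# Usage sketch (assumption): unlike the TD methods, McOfflinePolicy.run returns
# the last visited state rather than an EpisodeStats, so the learned Q-values
# have to be inspected through the environment/action objects instead.
def _demo_mc_offline_policy():
    env = gym.make('CliffWalking-v0')      # assumption: any small discrete env
    agent = McOfflinePolicy(env)
    agent.run(num_episodes=1000, discount_factor=0.8)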

class OffNStepSarsa(Algorithm):
    """Off-policy n-step Sarsa: actions come from an epsilon-greedy behaviour
    policy, and each cached reward is weighted by the per-decision importance
    sampling ratio against a near-greedy target policy."""

    def __init__(self, gym_env: gym.Env, n):
        super().__init__(gym_env)
        self.n = n

    def run(self, num_episodes, discount_factor=1, epsilon=0.1, learning_rate=0.5):
        self.env = Env(self.gym_env, discount_factor, epsilon,
                       action_type=NStepAction, learning_rate=learning_rate)
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in tqdm(range(num_episodes)):
            states = list()
            state = self.env.reset()
            env_list = list()
            T = 1e10
            update_time = -1
            log.debug('----------------------\n\n')
            for t in itertools.count():
                if t < T:
                    # Behaviour policy b is epsilon-greedy; the target policy pi is
                    # nearly greedy (epsilon = 0.01).
                    policy = EGreedyPolicy(epsilon)
                    action_state = state.get_next_action_state(policy)
                    b = state.get_action_probability(policy, action_state)
                    pi = state.get_action_probability(EGreedyPolicy(0.01), action_state)
                    ratio = pi / b
                    log.debug('s:{}'.format(state))
                    log.debug('a:{}'.format(action_state))
                    log.debug('b:{0:.2f} pi:{1:.2f} ra:{2:.2f}'.format(b, pi, ratio))
                    action_state.add_reward_calculator(t)
                    # self.env.render()
                    next_state, reward, done, _ = self.env.step(action_state.get_gym_action())
                    log.debug('done: {} reward: {}'.format(done, reward))
                    env_list.append((state, action_state))
                    states.append(next_state)
                    # Cache the reward, weighted by this step's importance-sampling
                    # ratio, for every pair whose n-step window is still open.
                    accumulated_time = 0 if update_time + 1 < 0 else update_time + 1
                    for s, a_s in env_list[accumulated_time:]:
                        log.debug('cache for s:{} a:{}'.format(s, a_s))
                        a_s.cache_reward(reward, step=t,
                                         one_step_importance_sampling_ratio=ratio)
                    stats.episode_rewards[i_episode] += reward
                    stats.episode_lengths[i_episode] = t
                    if done:
                        T = t + 1
                    else:
                        state = next_state
                update_time = t - self.n + 1
                if update_time >= 0:
                    action_state_update_time = env_list[update_time][1]
                    evaluated_state_index = update_time + self.n - 1
                    if evaluated_state_index < len(states):
                        log.debug('full n-step window')
                        state_update_time = states[evaluated_state_index]
                        action_state_update_time.update(0, state_update_time.get_actions(),
                                                        time_step=update_time)
                    else:
                        log.debug('window truncated by episode end')
                        action_state_update_time.update(0, None, time_step=update_time)
                if update_time == T - 1:
                    for _, a_s in env_list:
                        a_s.clear_reward_calculator()
                    break
        return stats
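
# Worked example of the per-decision ratio computed in OffNStepSarsa, assuming
# EGreedyPolicy spreads epsilon uniformly (greedy action: 1 - eps + eps/|A|,
# every other action: eps/|A|). With |A| = 4, eps_b = 0.1, eps_pi = 0.01:
#   greedy action:      b = 0.925, pi = 0.9925, ratio ~ 1.07
#   non-greedy action:  b = 0.025, pi = 0.0025, ratio = 0.10
def _demo_is_ratio(num_actions=4, eps_b=0.1, eps_pi=0.01):
    b_greedy = 1 - eps_b + eps_b / num_actions
    pi_greedy = 1 - eps_pi + eps_pi / num_actions
    b_other = eps_b / num_actions
    pi_other = eps_pi / num_actions
    return pi_greedy / b_greedy, pi_other / b_other   # ~ (1.07, 0.10)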