Example no. 1
 def _learn(self, g, importance_sampling_ratio):
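     # Weighted incremental update: skip when the importance-sampling ratio is
     # zero, otherwise add the ratio to c and move the estimate q toward the
     # return g using the resulting learning rate.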
     # importance_sampling_ratio = 1
     # print(importance_sampling_ratio)
     log.debug('isr: {}'.format(importance_sampling_ratio))
     if importance_sampling_ratio == 0:
         return
     self.c += importance_sampling_ratio
     self.q = self.q + self.get_learning_rate(importance_sampling_ratio,
                                              self.c) * (g - self.q)
     log.debug('q:{} '.format(self.q))
Example no. 2
 def cache_reward(self, reward, time_step=9e20, **kwargs):
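     # Fold this step's ratio into the running importance-sampling product and
     # append the reward, discounted by how many rewards are already cached.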
     one_step_importance_sampling_ratio = kwargs[
         'one_step_importance_sampling_ratio']
     log.debug('isr: {:.2f} ->'.format(self.importance_sampling_ratio))
     self.importance_sampling_ratio *= one_step_importance_sampling_ratio
     log.debug('isr: {:.2f}'.format(self.importance_sampling_ratio))
     if self.initial_time_step <= time_step:
         # print('cache {} {} {}'.format(self.initial_time_step, time_step, reward))
         self.reward_cache += reward * (self.discount_factor**
                                        self.reward_cache_count)
         self.reward_cache_count += 1
Example no. 3
 def update(self, reward_calculator, next_actions, **kwargs):
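     # Build the return g from the cached rewards plus the discounted value of
     # the greedy next action (zero at the end of the episode), then learn with
     # the accumulated importance-sampling ratio. Note that the incoming
     # reward_calculator argument is replaced by the one stored for this step.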
     time_step = kwargs['time_step']
     evaluated_action_value = 0
     if next_actions:
         next_action = GreedyPolicy().pick_action(next_actions)
         evaluated_action_value = next_action.evaluate()
     reward_calculator = self.reward_calculators[time_step]
     g = (reward_calculator.get_reward() +
          reward_calculator.get_next_discount() * evaluated_action_value)
     log.debug('g: {}'.format(g))
     self._learn(g, reward_calculator.get_importance_sampling_ratio())
     del self.reward_calculators[time_step]
 def execute_one_episode(self):
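     # Play step by step until the environment reports the end of the episode,
     # then return the final reward.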
     log.debug('start')
     while True:
         log.debug(self.current_stat)
         is_end, reward = self.play()
         log.debug('reward={}'.format(reward))
         if is_end:
             log.debug('end')
             return reward
Example no. 5
 def run(self,
         num_episodes,
         discount_factor=1,
         epsilon=0.1,
         learning_rate=0.5):
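     # n-step control loop: act with an epsilon-greedy behaviour policy, cache
     # rewards and per-step importance-sampling ratios for each action-state,
     # and update estimates once a step falls n steps behind the current one.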
     self.env = Env(self.gym_env,
                    discount_factor,
                    epsilon,
                    action_type=NStepAction,
                    learning_rate=learning_rate)
     stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                   episode_rewards=np.zeros(num_episodes))
     for i_episode in tqdm(range(num_episodes)):
         states = list()
         state = self.env.reset()
         env_list = list()
         T = 1e10
         update_time = -1
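         # T marks the still-unknown terminal time step; update_time trails the
         # current step by n - 1 and is the step whose estimate gets updated.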
         log.debug('----------------------\n\n')
         for t in itertools.count():
             if t < T:
                 policy = EGreedyPolicy(epsilon)
                 action_state = state.get_next_action_state(policy)
                 b = state.get_action_probability(policy, action_state)
                 pi = state.get_action_probability(EGreedyPolicy(0.01),
                                                   action_state)
                 ratio = pi / b
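                 # Per-step importance-sampling ratio: probability of the
                 # action under the (near-greedy) target policy over its
                 # probability under the epsilon-greedy behaviour policy.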
                 log.debug('s:{}'.format(state))
                 log.debug('a:{}'.format(action_state))
                 log.debug('b:{0:.2f} pi:{1:.2f} ra:{2:.2f}'.format(
                     b, pi, ratio))
                 action_state.add_reward_calculator(t)
                 # self.env.render()
                 next_state, reward, done, _ = self.env.step(
                     action_state.get_gym_action())
                 log.debug('done: {} reward: {}'.format(done, reward))
                 env_list.append((state, action_state))
                 states.append(next_state)
                 accumulated_time = max(0, update_time + 1)
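                 # Cache the new reward and ratio for every action-state that
                 # has not been updated yet.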
                 for s, a_s in env_list[accumulated_time:]:
                     log.debug('cache for s:{} a:{}'.format(s, a_s))
                     a_s.cache_reward(
                         reward,
                         time_step=t,
                         one_step_importance_sampling_ratio=ratio)
                 stats.episode_rewards[i_episode] += reward
                 stats.episode_lengths[i_episode] = t
                 if done:
                     T = t + 1
                 else:
                     state = next_state
             update_time = t - self.n + 1
             if update_time >= 0:
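                 # Update the action-state from n steps back, bootstrapping on
                 # the state reached n steps later when it has been observed.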
                 action_state_update_time = env_list[update_time][1]
                 evaluated_state_index = update_time + self.n - 1
                 if evaluated_state_index < len(states):
                     log.debug('=n')
                     state_update_time = states[evaluated_state_index]
                     action_state_update_time.update(
                         0,
                         state_update_time.get_actions(),
                         time_step=update_time)
                 else:
                     log.debug('<n')
                     action_state_update_time.update(0,
                                                     None,
                                                     time_step=update_time)
             if update_time == T - 1:
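                 # Once the last step has been updated, drop the cached reward
                 # calculators and finish the episode.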
                 a_ss = [a_s for _, a_s in env_list]
                 for a_s in a_ss:
                     a_s.clear_reward_calculator()
                 break
     return stats
Example no. 6
 def choose_next_policy(self):
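     # Choose the next move and remember it in next_policy.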
     self.next_policy = self._choose_next_action()
     log.debug('choose:{}'.format(self.next_policy))
Example no. 7
 def get_reward(self) -> int:
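     # Score the outcome by comparing the player's sum with the dealer's.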
     dealer_sum = self.get_dealer_score()
     log.debug('dealer:{} --- player:{}'.format(dealer_sum,
                                                self.player_sum))
     return self._get_reward(self.player_sum, dealer_sum)
 def _get_next_action(self):
     action = self._choose_next_action()
     log.debug('choose:{}'.format(action))
     return action
 def choose_random_policy(self):
     actions = self.available_actions
     self.next_policy = actions[random.randint(0, len(actions) - 1)]
     log.debug('choose random:{}'.format(self.next_policy))
 def update_reward(self, reward):
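     # Propagate the episode's reward back to every state visited on the way.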
     log.debug('start update')
     for state in self.state_gone_through:
         state.update_reward(reward)
         log.debug(state)
     log.debug('end update')