Example #1
def update(self, reward_calculator, next_actions, **kwargs):
    time_step = kwargs['time_step']
    # Bootstrap from the greedy next action's value; 0 when there is no
    # next action (terminal step).
    evaluated_action_value = 0
    if next_actions:
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    # Use the reward calculator stored for this time step.
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward()
         + reward_calculator.get_next_discount() * evaluated_action_value)
    self.learn(g)
    del self.reward_calculators[time_step]
Example #2
def update(self, reward_calculator, next_actions, **kwargs):
    time_step = kwargs['time_step']
    # Bootstrap from the greedy next action's value; 0 when there is no
    # next action (terminal step).
    evaluated_action_value = 0
    if next_actions:
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward()
         + reward_calculator.get_next_discount() * evaluated_action_value)
    log.debug('g: {}'.format(g))
    # Off-policy variant: weight the update by the importance sampling ratio.
    self._learn(g, reward_calculator.get_importance_sampling_ratio())
    del self.reward_calculators[time_step]
Example #3
def test_should_return_correct_probabilities(self):
    action1 = Mock()
    action2 = Mock()
    action1.evaluate = MagicMock(return_value=1)
    action2.evaluate = MagicMock(return_value=2)
    actual = GreedyPolicy(0.1).action_to_probability([action1, action2])
    # The greedy policy puts all probability mass on the higher-valued action.
    self.assertAlmostEqual(1, actual[action2])
    self.assertAlmostEqual(0, actual[action1])
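The test above pins down the GreedyPolicy interface without showing it. Below is a minimal sketch of a class that would satisfy it, assuming an optional action evaluator as in the later examples; the numeric argument 0.1 used in the test is ignored here, and the class name and defaults are illustrative only, not the project's implementation.

class GreedyPolicySketch:
    """Hypothetical stand-in for GreedyPolicy; not the project's code."""

    def __init__(self, action_evaluator=None):
        # Later examples pass an evaluator such as `lambda action: action.q1`;
        # fall back to each action's own evaluate() method otherwise.
        if not callable(action_evaluator):
            action_evaluator = None
        self.action_evaluator = action_evaluator or (lambda action: action.evaluate())

    def pick_action(self, actions):
        # Return the action with the highest evaluated value.
        return max(actions, key=self.action_evaluator)

    def action_to_probability(self, actions):
        # All probability mass goes to the greedy action, as the test expects.
        best = self.pick_action(actions)
        return {action: (1.0 if action is best else 0.0) for action in actions}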
Example #4
def show_one_episode(self):
    state = self.env.reset()
    for t in itertools.count():
        self.env.render()
        # Act greedily and step the underlying gym environment.
        action_state = state.get_next_action_state(
            GreedyPolicy(self.action_evaluator))
        next_state, reward, done, _ = self.env.step(
            action_state.get_gym_action())
        if done:
            self.env.render()
            break
        state = next_state
Example #5
def run(self, num_episodes, discount_factor=0.8, epsilon=0.1):
    self.env = Env(self.gym_env, discount_factor, epsilon,
                   action_type=McOfflineAction)
    n = self.env.env.action_space.n
    for _ in tqdm(range(num_episodes)):
        # Generate an episode with the random behavior policy, then walk it
        # backwards applying weighted importance-sampling updates.
        action_states = self.generate_one_episode_action_states_by_policy(
            RandomPolicy())
        w = 1
        g = 0
        for state, action_state, reward in reversed(action_states):
            g = discount_factor * g + reward
            action_state.update_c(w)
            action_state.update_q(g, w)
            # Stop once the greedy (target) policy disagrees with the action
            # the behavior policy actually took.
            action = state.get_next_action_state(GreedyPolicy())
            if action != action_state:
                break
            w = w * n
    return state
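For context, the `w = w * n` step matches the usual per-step importance-sampling ratio when the behavior policy is uniform random over n actions and the target policy is greedy. A small illustrative sketch (the function and parameter names are hypothetical):

def step_importance_ratio(taken_action, greedy_action, n):
    # Target (greedy) policy: probability 1 for its own action, 0 otherwise.
    target_prob = 1.0 if taken_action == greedy_action else 0.0
    # Behavior policy (RandomPolicy): uniform probability 1 / n per action.
    behavior_prob = 1.0 / n
    # The ratio is n when the actions agree and 0 otherwise, which is why the
    # loop above multiplies w by n and breaks on disagreement.
    return target_prob / behavior_prob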
Example #6
def update(self, reward, next_actions, **kwargs):
    # One-step Q target bootstrapped from the greedy next action.
    next_q_value = GreedyPolicy().pick_action(next_actions).evaluate()
    # self.q = self.q + self.learning_rate * (reward + self.discount_factor * next_q_value - self.q)
    g = reward + self.discount_factor * next_q_value
    self.learn(g)

def get_result(self):
    # Return the final state together with its greedy gym action.
    return self.state, GreedyPolicy().pick_action(
        self.available_actions).get_gym_action()

def update_q2(self, reward, next_actions):
    # Select the next action by its q1 value and move q2 toward the
    # resulting target, then anneal the learning rate.
    next_action_state = GreedyPolicy(
        lambda action: action.q1).pick_action(next_actions)
    self.q2 = self.q2 + self.learning_rate * (
        reward + self.discount_factor * next_action_state.q1 - self.q2)
    self.learning_rate = self.anneal()
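The commented-out line in the first method suggests that learn(g) applies the standard incremental update toward the target g. A minimal sketch under that assumption (the class and attribute names are illustrative, not the project's):

class QEstimateSketch:
    def __init__(self, learning_rate=0.1):
        self.q = 0.0
        self.learning_rate = learning_rate

    def learn(self, g):
        # q <- q + alpha * (g - q); with g = reward + discount * next_q_value
        # this expands to the commented-out update above.
        self.q = self.q + self.learning_rate * (g - self.q)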