Example #1
def update(self, reward_calculator, next_actions, **kwargs):
    time_step = kwargs['time_step']
    # Bootstrap from the greedy next action's value; 0 when there is no
    # next action (terminal step).
    evaluated_action_value = 0
    if next_actions:
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    # Use the reward calculator stored for this time step.
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward()
         + reward_calculator.get_next_discount() * evaluated_action_value)
    self.learn(g)
    del self.reward_calculators[time_step]
Example #2
def update(self, reward_calculator, next_actions, **kwargs):
    time_step = kwargs['time_step']
    # Bootstrap from the greedy next action's value; 0 when there is no
    # next action (terminal step).
    evaluated_action_value = 0
    if next_actions:
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward()
         + reward_calculator.get_next_discount() * evaluated_action_value)
    log.debug('g: {}'.format(g))
    # Off-policy variant: weight the update by the importance sampling ratio.
    self._learn(g, reward_calculator.get_importance_sampling_ratio())
    del self.reward_calculators[time_step]
Example #3
def test_should_return_correct_probabilities(self):
    action1 = Mock()
    action2 = Mock()
    action1.evaluate = MagicMock(return_value=1)
    action2.evaluate = MagicMock(return_value=2)
    actual = GreedyPolicy(0.1).action_to_probability([action1, action2])
    # The greedy policy puts all probability mass on the higher-valued action.
    self.assertAlmostEqual(1, actual[action2])
    self.assertAlmostEqual(0, actual[action1])
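The test above pins down the GreedyPolicy interface without showing it. Below is a minimal sketch of a class that would satisfy it, assuming an optional action evaluator as in the later examples; the numeric argument 0.1 used in the test is ignored here, and the class name and defaults are illustrative only, not the project's implementation.

class GreedyPolicySketch:
    """Hypothetical stand-in for GreedyPolicy; not the project's code."""

    def __init__(self, action_evaluator=None):
        # Later examples pass an evaluator such as `lambda action: action.q1`;
        # fall back to each action's own evaluate() method otherwise.
        if not callable(action_evaluator):
            action_evaluator = None
        self.action_evaluator = action_evaluator or (lambda action: action.evaluate())

    def pick_action(self, actions):
        # Return the action with the highest evaluated value.
        return max(actions, key=self.action_evaluator)

    def action_to_probability(self, actions):
        # All probability mass goes to the greedy action, as the test expects.
        best = self.pick_action(actions)
        return {action: (1.0 if action is best else 0.0) for action in actions}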
Example #4
def show_one_episode(self):
    state = self.env.reset()
    for t in itertools.count():
        self.env.render()
        # Act greedily and step the underlying gym environment.
        action_state = state.get_next_action_state(
            GreedyPolicy(self.action_evaluator))
        next_state, reward, done, _ = self.env.step(
            action_state.get_gym_action())
        if done:
            self.env.render()
            break
        state = next_state
Example #5
def run(self, num_episodes, discount_factor=0.8, epsilon=0.1):
    self.env = Env(self.gym_env, discount_factor, epsilon,
                   action_type=McOfflineAction)
    n = self.env.env.action_space.n
    for _ in tqdm(range(num_episodes)):
        # Generate an episode with the random behavior policy, then walk it
        # backwards applying weighted importance-sampling updates.
        action_states = self.generate_one_episode_action_states_by_policy(
            RandomPolicy())
        w = 1
        g = 0
        for state, action_state, reward in reversed(action_states):
            g = discount_factor * g + reward
            action_state.update_c(w)
            action_state.update_q(g, w)
            # Stop once the greedy (target) policy disagrees with the action
            # the behavior policy actually took.
            action = state.get_next_action_state(GreedyPolicy())
            if action != action_state:
                break
            w = w * n
    return state
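For context, the `w = w * n` step matches the usual per-step importance-sampling ratio when the behavior policy is uniform random over n actions and the target policy is greedy. A small illustrative sketch (the function and parameter names are hypothetical):

def step_importance_ratio(taken_action, greedy_action, n):
    # Target (greedy) policy: probability 1 for its own action, 0 otherwise.
    target_prob = 1.0 if taken_action == greedy_action else 0.0
    # Behavior policy (RandomPolicy): uniform probability 1 / n per action.
    behavior_prob = 1.0 / n
    # The ratio is n when the actions agree and 0 otherwise, which is why the
    # loop above multiplies w by n and breaks on disagreement.
    return target_prob / behavior_prob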
Example #6
def update(self, reward, next_actions, **kwargs):
    # One-step Q target bootstrapped from the greedy next action.
    next_q_value = GreedyPolicy().pick_action(next_actions).evaluate()
    # self.q = self.q + self.learning_rate * (reward + self.discount_factor * next_q_value - self.q)
    g = reward + self.discount_factor * next_q_value
    self.learn(g)

def get_result(self):
    # Return the final state together with its greedy gym action.
    return self.state, GreedyPolicy().pick_action(
        self.available_actions).get_gym_action()

def update_q2(self, reward, next_actions):
    # Select the next action by its q1 value and move q2 toward the
    # resulting target, then anneal the learning rate.
    next_action_state = GreedyPolicy(
        lambda action: action.q1).pick_action(next_actions)
    self.q2 = self.q2 + self.learning_rate * (
        reward + self.discount_factor * next_action_state.q1 - self.q2)
    self.learning_rate = self.anneal()
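The commented-out line in the first method suggests that learn(g) applies the standard incremental update toward the target g. A minimal sketch under that assumption (the class and attribute names are illustrative, not the project's):

class QEstimateSketch:
    def __init__(self, learning_rate=0.1):
        self.q = 0.0
        self.learning_rate = learning_rate

    def learn(self, g):
        # q <- q + alpha * (g - q); with g = reward + discount * next_q_value
        # this expands to the commented-out update above.
        self.q = self.q + self.learning_rate * (g - self.q)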