def OnEpisodeFinishedCallback(
    self,
    env: Environment,
    brain: Brain,
    episode_idx: int,
    num_of_episodes: int,
    episode_reward: float,
    steps: int,
):
    """Records the brain's current values for every tracked state.

    Asks `brain` for the values of `self._states` and, for each returned
    entry and each action in `self._actions`, appends that action's value
    to `self._value_traces[action][state_position]`.

    Args:
        env: Unused here.
        brain: Object queried via `GetValues(self._states)`.
        episode_idx: Unused here.
        num_of_episodes: Unused here.
        episode_reward: Unused here.
        steps: Unused here.
    """
    per_state_values = brain.GetValues(self._states)
    for state_pos, state_values in enumerate(per_state_values):
        for action in self._actions:
            # One trace per (action, state position); grow it by one sample.
            trace = self._value_traces[action][state_pos]
            trace.append(state_values[action])
def Decide(
    self,
    env: base.Environment,
    brain: base.Brain,
    state: base.State,
    episode_idx: int,
    num_of_episodes: int,
) -> base.Action:
    """Samples an action, weighting each choice by the brain's output.

    The first entry of `brain.GetValues(state)` is passed to
    `numpy.random.choice` as the probability vector `p` over
    `env.GetActionSpaceSize()` choices, so it must be a valid probability
    distribution of that length.

    Args:
        env: Supplies the number of choices and maps the sampled choice
            to an action.
        brain: Queried for the values of `state`.
        state: The state to decide for.
        episode_idx: Unused here.
        num_of_episodes: Unused here.

    Returns:
        Whatever `env.GetActionFromChoice` returns for the sampled choice.
    """
    action_count = env.GetActionSpaceSize()
    # NOTE(review): [0] presumably selects the single row of a batched
    # result -- confirm against the brain implementation.
    probabilities = brain.GetValues(state)[0]
    sampled_choice = numpy.random.choice(action_count, p=probabilities)
    return env.GetActionFromChoice(sampled_choice)
def Decide(
    self,
    env: base.Environment,
    brain: base.Brain,
    state: base.State,
    episode_idx: int,
    num_of_episodes: int,
) -> base.Action:
    """Greedily picks the choice whose value is largest.

    Args:
        env: Maps the chosen index to an action.
        brain: Queried for the values of `state`.
        state: The state to decide for.
        episode_idx: Unused here.
        num_of_episodes: Unused here.

    Returns:
        Whatever `env.GetActionFromChoice` returns for the argmax index.
    """
    state_values = brain.GetValues(state)
    # argmax without an axis works on the flattened array.
    best_choice = int(numpy.argmax(state_values))
    logging.vlog(
        20,
        'making greedy decision for state %s using values: %s; choice: %d',
        state,
        state_values,
        best_choice,
    )
    return env.GetActionFromChoice(best_choice)