def future_rewards(self, state):
    """Calculate the maximum future reward reachable from a state.

    Args:
        state (nested list (3 x 3)): a Tictactoe board

    Returns:
        float: the highest Q-value among all actions available in
        ``state``, or 0 if no action is available (terminal state).
    """
    # Get all the available actions
    actions = Tictactoe.available_actions(state)

    # Terminal state: no future reward to collect
    if not actions:
        return 0

    # The best achievable reward is the largest Q-value over all actions
    return max(self.get_q_value(state, action) for action in actions)
def best_action(self, state, epsilon_true=False):
    """Choose an action for a state using an epsilon-greedy policy.

    Args:
        state (nested list (3 x 3)): a Tictactoe board
        epsilon_true (bool): when True, explore by picking a random
            action with probability ``self.epsilon``; otherwise always
            pick a greedy (highest Q-value) action

    Raises:
        Exception: If no action is available

    Returns:
        tuple (i, j): the chosen action
    """
    # Get all the available actions
    actions = Tictactoe.available_actions(state)

    # Raise if no action available
    if not actions:
        raise Exception('No action available')

    # Exploration branch: with probability epsilon, act randomly
    if epsilon_true and random.random() < self.epsilon:
        return random.choice(actions)

    # Exploitation branch: find the highest Q-value, then break ties
    # uniformly at random (the previous running coin-flip was biased
    # toward actions appearing later in the list)
    best_value = max(self.get_q_value(state, action) for action in actions)
    best_actions = [
        action for action in actions
        if self.get_q_value(state, action) == best_value
    ]
    return random.choice(best_actions)