def play_move(self, game: Game, moves: List[Move]):
    available_actions = [move.move_id for move in moves]
    start_state = game.get_nn_features(self)
    action = np.random.choice(
        game.total_num_actions,
        p=self.policy(start_state, available_actions=available_actions))
    move = self.action_to_move(action, moves)
    # Perform the action -> get the reward and observe the next state
    new_state, reward = self.env.step(move)
    # An on-policy (SARSA) variant would sample the next action here:
    # new_action = np.random.choice(
    #     game.total_num_actions,
    #     p=self.policy(new_state, available_actions=available_actions)
    # )
    q_values_new_state = self.estimator.predict(
        state_features=new_state, available_actions=available_actions)
    # The value we should have got. The Q-learning target policy is a
    # greedy one, hence the `max` (nanmax skips masked, unavailable actions).
    td_target = reward + self.discount_factor * np.nanmax(q_values_new_state)
    self.estimator.update(action, start_state, td_target)
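The `self.policy` helper is not shown in this snippet. As a point of reference, here is a minimal sketch of what it might look like, assuming an epsilon-greedy scheme over the full action space in which unavailable actions get zero probability. The factory function and its name are hypothetical; only the `estimator.predict` call matches the signature used above.

import numpy as np

def make_epsilon_greedy_policy(estimator, epsilon: float, num_actions: int):
    # Hypothetical helper: returns a function mapping a state to action
    # probabilities over all `num_actions`, with illegal actions zeroed.
    def policy(state_features, available_actions):
        probs = np.zeros(num_actions)
        # Spread epsilon uniformly over the legal actions only.
        probs[available_actions] = epsilon / len(available_actions)
        q_values = estimator.predict(state_features=state_features,
                                     available_actions=available_actions)
        # nanargmax ignores the NaNs that mask unavailable actions.
        best_action = np.nanargmax(q_values)
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy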
def play_move(self, game: Game, moves: List[Move]):
    available_actions = [move.move_id for move in moves]
    self.state = game.get_nn_features(self)
    if len(available_actions) == 1:
        self.action = available_actions[0]
    else:
        self.action = self._get_policy_action(game, available_actions)
    move = self._action_to_move(self.action, moves)
    # Perform the action -> get the reward and observe the next state
    new_state, reward = self.env.step(move)
    if self.train:
        td_target = self.get_td_target(new_state, reward, available_actions)
        self.estimator.update(self.state, td_target)
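The refactored version moves the target computation into `get_td_target`. Assuming it simply factors out the inline Q-learning target from the first version, a sketch might look like:

def get_td_target(self, new_state, reward, available_actions):
    # Hypothetical sketch: the inline computation from the first version,
    # factored out into its own method.
    q_values_new_state = self.estimator.predict(
        state_features=new_state, available_actions=available_actions)
    # Greedy (Q-learning) target: bootstrap from the best next-state value.
    return reward + self.discount_factor * np.nanmax(q_values_new_state)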
def _set_estimator(self, game: Game):
    if not self.estimator:
        self.estimator = LinearEstimator(game.total_num_actions,
                                         game.get_linear_features(self))
def _set_estimator(self, game: Game):
    if not self.estimator:
        self.estimator = TensorflowNNEstimator(game.total_num_actions,
                                               game.get_nn_features(self))
def _set_estimator(self, game: Game):
    if not self.estimator:
        self.estimator = TorchEstimator(game.total_num_actions,
                                        game.get_nn_features(self),
                                        load_model=self.load_model,
                                        name=self.name)
def _set_estimator(self, game: Game):
    if not self.estimator:
        self.estimator = NeuralNetworkEstimator(game.total_num_actions,
                                                game.get_nn_features(self))
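None of the estimator classes are shown here, but all four variants are constructed with the action count and an initial feature vector, and the `play_move` methods call `predict` and `update` on them. A minimal sketch of a `LinearEstimator` satisfying that interface, purely as an illustration: the `update` signature follows the first `play_move` snippet, and the learning-rate parameter is an assumption.

import numpy as np

class LinearEstimator:
    # Hypothetical sketch of a linear Q-value approximator:
    # Q(s, a) = w_a . x(s), one weight vector per action.

    def __init__(self, num_actions, initial_features, learning_rate=0.01):
        self.learning_rate = learning_rate
        self.weights = np.zeros((num_actions, len(initial_features)))

    def predict(self, state_features, available_actions):
        # NaN marks unavailable actions, so callers can use np.nanmax/nanargmax.
        q_values = np.full(self.weights.shape[0], np.nan)
        for action in available_actions:
            q_values[action] = self.weights[action] @ np.asarray(state_features)
        return q_values

    def update(self, action, state_features, td_target):
        # Semi-gradient step: move Q(s, a) toward the TD target.
        features = np.asarray(state_features)
        td_error = td_target - self.weights[action] @ features
        self.weights[action] += self.learning_rate * td_error * features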