class AIPlayer(Player):
    def __init__(self, identifier: int, config: MLPConfig):
        super().__init__(identifier=identifier)
        self.action_service = ActionService()
        self.model = config.load_model()

    def _predict(self, game_state: 'GameState'):
        return self.model.predict({
            "state": np.atleast_2d(game_state.create_numeral_representation(self))
        })

    @property
    def is_human(self):
        return False

    def _choose_action(self, game_state: 'GameState', verbose: bool = False) -> 'Action':
        predictions = self._predict(game_state=game_state)
        mask = self.action_service.get_valid_actions_mask(self, game_state.board)
        # Keep the predicted values of valid actions and push invalid actions
        # below the global minimum, so argmax can never pick an invalid one.
        proper_predictions = predictions * np.atleast_2d(mask)
        masked_predictions = np.atleast_2d(
            np.logical_not(mask) * (np.min(predictions) - 1))
        predictions = proper_predictions + masked_predictions
        action_idx = np.argmax(predictions[0])  # type: int
        return self.action_service.idx_to_action(action_idx)
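# A minimal, self-contained sketch of the masking arithmetic used in
# _choose_action above, with made-up Q-values and a made-up validity mask
# (the real values come from the model and ActionService). It shows why an
# invalid action can never win the argmax: it is replaced by a value strictly
# below the global minimum of the predictions.
import numpy as np

predictions = np.array([[0.3, -1.2, 0.9, 0.1]])  # hypothetical model output
mask = np.array([True, False, True, False])      # hypothetical valid actions

proper_predictions = predictions * np.atleast_2d(mask)
masked_predictions = np.atleast_2d(
    np.logical_not(mask) * (np.min(predictions) - 1))
combined = proper_predictions + masked_predictions

print(combined)                 # [[ 0.3 -2.2  0.9 -2.2]]
print(np.argmax(combined[0]))   # 2 -> index of the best *valid* action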
def play_game(env, train_net, target_net, epsilon, copy_step, print_exp_step):
    """Play one episode, storing each transition in the replay buffer and
    training the online network at every step."""
    rewards = 0
    iteration = 0
    done = False
    state = env.reset()
    while not done:
        actions_mask = env.get_current_actions_mask()
        action = train_net.get_action(state, actions_mask, epsilon)
        prev_state = state
        state, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            env.reset()

        exp = {
            's': prev_state,
            'a': action,
            'r': reward,
            'm': actions_mask,
            's2': state,
            'done': done
        }
        train_net.add_experience(exp)
        train_net.train(target_net)

        iteration += 1
        if iteration % print_exp_step == 0:
            print("Experience replay:")
            for exp_action in train_net.experience['a']:
                print(ActionService().idx_to_action(exp_action))
        if iteration % copy_step == 0:
            target_net.copy_weights(train_net)
    return rewards
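# A hedged sketch of how play_game could be driven from a training script,
# assuming a config object with the attributes defined in the configuration
# __init__ later in this section, and networks exposing the methods used
# above (get_action, add_experience, train, copy_weights). The function name
# and the once-per-game epsilon decay are assumptions, not the project's
# confirmed training loop.
def train(config, env, train_net, target_net):
    epsilon = config.epsilon
    total_rewards = []
    for iteration in range(config.number_iterations):
        # Decay exploration once per game, never below the configured floor.
        epsilon = max(config.min_epsilon, epsilon * config.decay)
        rewards = play_game(env, train_net, target_net, epsilon,
                            config.copy_step, config.print_exp_step)
        total_rewards.append(rewards)
        if iteration % 100 == 0:
            last = total_rewards[-100:]
            print("iteration {}: epsilon={:.3f}, avg reward (last {} games)={:.2f}"
                  .format(iteration, epsilon, len(last), sum(last) / len(last)))
    return total_rewards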
def step(self, action_idx: int):
    """
    The agent takes a step in the environment.

    Parameters
    ----------
    action_idx : int

    Returns
    -------
    ob, reward, episode_over, info : tuple
        ob (object) :
            An environment-specific object representing your observation of
            the environment.
        reward (float) :
            Amount of reward achieved by the previous action. The scale
            varies between environments, but the goal is always to increase
            your total reward.
        episode_over (bool) :
            Whether it's time to reset the environment again. Most (but not
            all) tasks are divided up into well-defined episodes, and done
            being True indicates the episode has terminated. (For example,
            perhaps the pole tipped too far, or you lost your last life.)
        info (dict) :
            Diagnostic information useful for debugging. It can sometimes be
            useful for learning (for example, it might contain the raw
            probabilities behind the environment's last state change).
            However, official evaluations of your agent are not allowed to
            use this for learning.
    """
    if self.game.is_finished():
        raise RuntimeError("Episode is done, please reset the game.")

    action = ActionService().idx_to_action(action_idx)  # type: Action
    self.curr_step += 1
    self._take_action(action, action_idx)
    reward = action.get_reward()

    # If the game is over and the last player who played a card is the winner,
    # add a winning bonus to the reward. Note that at this point the 'current
    # player' is not the player who took the last action, since _take_action()
    # already swapped the current player.
    if self.game.is_finished():
        player0 = self.game.players[0]
        player1 = self.game.players[1]
        if self.game.current_player == player0 and player0.score < player1.score:
            reward += 100
        if self.game.current_player == player1 and player0.score > player1.score:
            reward += 100

    observation = self._get_state()
    return observation, reward, self.game.is_finished(), {}
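# A minimal sketch of rolling out one episode against this gym-style step()
# interface, picking uniformly among the currently valid actions. The
# environment class name (CardGameEnv) is an assumption; reset(), step() and
# get_current_actions_mask() follow the code in this section.
import random

env = CardGameEnv()
state = env.reset()
done = False
total_reward = 0
while not done:
    mask = env.get_current_actions_mask()
    valid_indices = [i for i, valid in enumerate(mask) if valid]
    state, reward, done, info = env.step(random.choice(valid_indices))
    total_reward += reward
print("Episode finished with total reward {}".format(total_reward))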
def _choose_action(self, game_state: 'GameState', verbose: bool = False) -> 'Action':
    eligible_actions = ActionService().get_valid_actions(self, game_state)
    for i, action in enumerate(eligible_actions):
        print("{}: {}".format(i, action))

    action_index = None
    while action_index is None:
        try:
            action_index = input(
                "Type the number of the action you wish to execute:")
            action_index = int(action_index)
            if not 0 <= action_index < len(eligible_actions):
                print("Action number '{}' is out of range!".format(action_index))
                action_index = None
        except ValueError:
            print("Invalid action number '{}', please enter a number!"
                  .format(action_index))
            action_index = None
    return eligible_actions[action_index]
def __init__(self):
    # Model params
    self.hidden_units = [32, 32]

    # Environment params
    self.num_states = GameState.SIZE
    self.num_actions = ActionService().num_actions

    # Training params
    self.gamma = 0.99
    self.copy_step = 128
    self.print_exp_step = 100000000
    self.max_experiences = 1000
    self.min_experiences = 64
    self.batch_size = 64
    self.lr = 1e-2
    self.number_iterations = 10000
    self.epsilon = 0.999
    # With a multiplicative decay of 0.99995, epsilon halves roughly every
    # ln(0.5) / ln(0.99995) ~= 13,860 decay steps.
    self.decay = 0.99995
    self.min_epsilon = 0.1
    self.avg_rewards = 0
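# A quick check of the epsilon schedule implied by these values, assuming the
# decay is applied once per configured iteration (an assumption about the
# training loop, not project code). With decay = 0.99995, epsilon halves
# roughly every ~13,860 applications, so it never reaches min_epsilon within
# the 10,000 configured iterations.
import math

decay = 0.99995
epsilon = 0.999
halving_steps = math.log(0.5) / math.log(decay)
print(round(halving_steps))          # ~13863
print(epsilon * decay ** 10000)      # ~0.606 after 10,000 decay steps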
def test_take_card(self):
    # Execute a take card action for the first player
    target_player = self.game.players[0]
    original_deck_num_cards = self.game.board.deck.num_cards()
    original_player_num_cards = target_player.hand.num_cards()
    self.game.board.set_phase(GamePhase.DRAW_PHASE)
    take_card_action = TakeCardAction()

    # Ensure the player can perform the take card action
    validated_actions = ActionService().get_valid_actions(
        target_player, self.game.board)
    self.assertIn(take_card_action, validated_actions)

    # Execute the action
    take_card_action.execute(target_player, self.game.board)

    # Ensure a card has been added to the player's hand
    self.assertEqual(target_player.hand.num_cards(),
                     original_player_num_cards + 1)

    # Ensure the deck has lost 1 card
    self.assertEqual(self.game.board.deck.num_cards(),
                     original_deck_num_cards - 1)
def step(self, action_idx: int):
    """
    The agent takes a step in the environment.

    Parameters
    ----------
    action_idx : int

    Returns
    -------
    ob, reward, episode_over, info : tuple
        ob (object) :
            An environment-specific object representing your observation of
            the environment.
        reward (float) :
            Amount of reward achieved by the previous action. The scale
            varies between environments, but the goal is always to increase
            your total reward.
        episode_over (bool) :
            Whether it's time to reset the environment again. Most (but not
            all) tasks are divided up into well-defined episodes, and done
            being True indicates the episode has terminated. (For example,
            perhaps the pole tipped too far, or you lost your last life.)
        info (dict) :
            Diagnostic information useful for debugging. It can sometimes be
            useful for learning (for example, it might contain the raw
            probabilities behind the environment's last state change).
            However, official evaluations of your agent are not allowed to
            use this for learning.
    """
    if self.game.is_finished():
        raise RuntimeError("Episode is done, please reset the game.")

    action = ActionService().idx_to_action(action_idx)  # type: Action
    pre_action_score = self.game.player.score
    self.curr_step += 1
    self._take_action(action, action_idx)

    if self.game.player.broken:
        reward = self.penalty
    else:
        score = self.game.player.score
        # The reward grows quadratically with the score and is damped by the
        # distance to 21, so it peaks sharply at a score of exactly 21.
        reward = int(score ** 2 / (abs(score - 21) + 1))

    observation = self._get_state()
    # print(f"{pre_action_score} => {self.game.player.score} through {action} with reward {reward}")
    return observation, reward, self.game.is_finished(), {}
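# A small worked example of the reward shaping above (not part of the
# environment code): reward = int(score**2 / (abs(score - 21) + 1)) rewards
# scores close to 21 far more than scores further away.
def shaped_reward(score: int) -> int:
    return int(score ** 2 / (abs(score - 21) + 1))

for score in (15, 18, 20, 21, 22):
    print(score, shaped_reward(score))
# 15 -> 32, 18 -> 81, 20 -> 200, 21 -> 441, 22 -> 242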
def test_play_card(self):
    # Execute a play card action for the first player
    target_player = self.game.players[0]
    target_card = Card(12, HEARTS)
    self.game.board.set_phase(GamePhase.ACTION_PHASE)
    target_player.hand.clear()
    target_player.hand.add(Card(1, HEARTS))  # random card so hand isn't empty
    target_player.hand.add(target_card)
    play_card_action = PlayCardAction(target_card)

    # Ensure the player can perform the play card action
    validated_actions = ActionService().get_valid_actions(
        target_player, self.game.board)
    self.assertIn(play_card_action, validated_actions)

    # Execute the action
    play_card_action.execute(target_player, self.game.board)

    # Ensure the card has been played
    self.assertEqual(target_card, self.game.board.stack.look())

    # Ensure the card has been removed from the player's hand
    self.assertNotIn(target_card, target_player.hand)
def get_current_actions_mask(self) -> List[bool]:
    """Return a boolean mask representing the current valid actions."""
    return ActionService().get_valid_actions_mask(self.player, self.get_state())
def _choose_action(self, game_state: 'GameState', verbose: bool = False) -> 'Action':
    eligible_actions = ActionService().get_valid_actions(self, game_state)
    return random.choice(eligible_actions)
def __init__(self, identifier: int, config: MLPConfig):
    super().__init__(identifier=identifier)
    self.action_service = ActionService()
    self.model = config.load_model()
def __init__(self, config: MLPConfig):
    super().__init__()
    self.action_service = ActionService()
    self.model = config.load_model()