def __init__(self, game_state: GameState):
    self.game_state = game_state
    self.logger = get_class_logger(self)

    pygame.init()
    self.resolution = (1280, 800)
    self._screen = None
    self._card_assets = None
    self._fps_clock = pygame.time.Clock()

    # Every player has a "card surface" onto which their cards are drawn.
    # This card surface is then rotated and translated into position.
    p_card_surf_dims = (310, 170)
    p_text_surf_dims = (200, 140)
    self._player_card_surfs = [pygame.Surface(p_card_surf_dims) for _ in range(4)]
    self._player_text_surfs = [pygame.Surface(p_text_surf_dims) for _ in range(4)]

    # Surface in the middle, containing the "cards on the table".
    self._middle_trick_surf = pygame.Surface((300, 270))

    pygame.freetype.init()
    self._font = pygame.freetype.SysFont(None, 11)
    self._font.antialiased = True

    # Latest click - these values survive only for one draw call.
    # They are set during the draw call and read afterwards.
    self._clicked_pos = None
    self._clicked_card = None

    # For now, these are only Player 0's cards.
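A minimal sketch of how one of these per-player surfaces could be rotated and translated into its screen position, using only the standard pygame API; the draw_player_surface name, the angles, and the anchor positions are illustrative assumptions and not part of the class above:

def draw_player_surface(screen: pygame.Surface, card_surf: pygame.Surface, player_index: int):
    # Hypothetical layout: player 0 at the bottom (0 deg), 1 on the right (90), 2 at the top (180), 3 on the left (270).
    angle = player_index * 90
    rotated = pygame.transform.rotate(card_surf, angle)  # returns a new, rotated Surface
    # Hypothetical anchor positions for a 1280x800 window.
    positions = [(485, 610), (1090, 245), (485, 20), (20, 245)]
    screen.blit(rotated, positions[player_index])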
def __init__(self, player_id: int):
    super().__init__(player_id)
    self.logger = get_class_logger(self)

    # "Power" values for quickly determining which card can beat which.
    # Defining this here because we don't want to be dependent on the enum int values.
    self._suit_power = {Suit.eichel: 40, Suit.gras: 30, Suit.herz: 20, Suit.schellen: 10}
    self._pip_power = {Pip.sau: 8, Pip.zehn: 7, Pip.koenig: 6, Pip.ober: 5, Pip.unter: 4,
                       Pip.neun: 3, Pip.acht: 2, Pip.sieben: 1}
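For illustration, a hypothetical helper showing how these lookup tables allow card comparisons without depending on the enum int values; it assumes Card exposes suit and pip attributes, and the real rule-based logic (trumps, Ober/Unter ordering, following suit) is more involved and not shown here:

def _beats_same_suit(self, card_a: Card, card_b: Card) -> bool:
    # Hypothetical example: when both cards share the same non-trump suit, the higher pip power wins.
    assert card_a.suit == card_b.suit
    return self._pip_power[card_a.pip] > self._pip_power[card_b.pip]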
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--p0-agent", type=str, choices=['static', 'rule', 'random'], required=True)
    args = parser.parse_args()
    agent_choice = args.p0_agent

    # Init logging and adjust log levels for some classes.
    init_logging()
    logger = get_named_logger("{}.main".format(os.path.splitext(os.path.basename(__file__))[0]))
    get_class_logger(GameController).setLevel(logging.INFO)  # Don't log specifics of a single game

    # Create the agent for Player 0.
    if agent_choice == "rule":
        agent = RuleBasedAgent(0)
    elif agent_choice == "static":
        agent = StaticPolicyAgent(0)
    else:
        agent = RandomCardAgent(0)

    logger.info(f'Evaluating agent "{agent.__class__.__name__}"')
    perf = eval_agent(agent)
def __init__(self, players: List[Player], i_player_dealer=0, dealing_behavior: DealingBehavior = DealFairly(),
             forced_game_mode: GameMode = None):
    """
    Creates a GameController and, together with it, a GameState.
    The controller should be reused across games; call run_game() to simulate a single game.

    :param players: the players, along with their agents.
    :param i_player_dealer: the player who is the dealer at the start (player i+1 will lead in the first game).
    :param dealing_behavior: Optional - the dealing behavior. Default = fair.
    :param forced_game_mode: Optional - if not None, players cannot bid; every game is always the provided mode.
    """
    assert len(players) == 4

    self.logger = get_class_logger(self)
    self.logger.debug("Initializing game.")
    self.logger.debug("Players:")
    for p in players:
        self.logger.debug("Player {} with behavior {}.".format(p, p.agent))

    self.game_state = GameState(players, i_player_dealer=i_player_dealer)
    self.dealing_behavior = dealing_behavior
    self.forced_game_mode = forced_game_mode
    assert forced_game_mode is None or forced_game_mode.declaring_player_id is not None, "Must provide a specific player."
def main():
    # Game Setup:
    # - In every game, Player 0 will play a Herz-Solo.
    # - The cards are rigged so that Player 0 always receives a pretty good hand; most games should be winnable.

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="An experiment config file. Must always be specified.", required=True)
    args = parser.parse_args()

    # Init logging and adjust log levels for some classes.
    init_logging()
    logger = get_named_logger("{}.main".format(os.path.splitext(os.path.basename(__file__))[0]))
    get_class_logger(GameController).setLevel(logging.INFO)  # Don't log specifics of a single game

    # Load config.
    # Create the experiment dir and prepend it to all paths.
    # If it already exists, then training will simply resume from existing checkpoints in that dir.
    logger.info(f'Loading config from "{args.config}"...')
    config = load_config(args.config)
    experiment_dir = config["experiment_dir"]
    os.makedirs(experiment_dir, exist_ok=True)
    agent_checkpoint_paths = {i: os.path.join(experiment_dir, name)
                              for i, name in config["training"]["agent_checkpoint_names"].items()}

    # Create agents.
    agents = []
    for i in range(4):
        x = config["training"]["player_agents"][i]
        if x == "DQNAgent":
            agent = DQNAgent(i, config=config, training=True)
        elif x == "RandomCardAgent":
            agent = RandomCardAgent(i)
        elif x == "RuleBasedAgent":
            agent = RuleBasedAgent(i)
        else:
            raise ValueError(f'Unknown agent type: "{x}"')
        agents.append(agent)

    # Load weights for agents.
    for i, weights_path in agent_checkpoint_paths.items():
        if not os.path.exists(weights_path):
            logger.info('Weights file "{}" does not exist. Will create a new file.'.format(weights_path))
        else:
            agents[i].load_weights(weights_path)

    players = [Player(f"Player {i} ({a.__class__.__name__})", agent=a) for i, a in enumerate(agents)]

    # Rig the game so Player 0 has the cards to play a Herz-Solo. Force them to play it.
    game_mode = GameMode(GameContract.suit_solo, trump_suit=Suit.herz, declaring_player_id=0)
    controller = GameController(players, dealing_behavior=DealWinnableHand(game_mode), forced_game_mode=game_mode)

    n_episodes = config["training"]["n_episodes"]
    logger.info(f"Will train for {n_episodes} episodes.")

    # Calculate win% as a simple moving average (just for display in the logfile).
    # The real evaluation is done in eval_rl_agent.py, with training=False.
    win_rate = float('nan')
    n_won = 0
    sma_window_len = 1000
    won_deque = deque()

    save_every_s = config["training"]["save_checkpoints_every_s"]
    time_start = timer()
    time_last_save = timer()
    for i_episode in range(n_episodes):
        if i_episode > 0:
            # Calculate avg win%
            if i_episode < sma_window_len:
                win_rate = n_won / i_episode
            else:
                if won_deque.popleft() is True:
                    n_won -= 1
                win_rate = n_won / sma_window_len

            # Log
            if i_episode % 100 == 0:
                s_elapsed = timer() - time_start
                logger.info("Ran {} episodes. Win rate (last {} episodes) is {:.1%}. Speed is {:.0f} episodes/second.".format(
                    i_episode, sma_window_len, win_rate, i_episode / s_elapsed))

            # Save model checkpoint.
            # Also make a copy for evaluation - the eval jobs will sync on this file and later remove it.
            if timer() - time_last_save > save_every_s:
                for i, weights_path in agent_checkpoint_paths.items():
                    agents[i].save_weights(weights_path, overwrite=True)
                    shutil.copyfile(weights_path, f"{os.path.splitext(weights_path)[0]}.for_eval.h5")
                time_last_save = timer()

        winners = controller.run_game()
        won = winners[0]
        won_deque.append(won)
        if won:
            n_won += 1

    logger.info("Finished playing.")
    logger.info("Final win rate: {:.1%}".format(win_rate))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="A yaml config file. Must always be specified.", required=True)
    parser.add_argument("--loop", help="If set, then runs in an endless loop.", required=False, action="store_true")
    args = parser.parse_args()
    do_loop = args.loop is True

    # Init logging and adjust log levels for some classes.
    init_logging()
    logger = get_named_logger("{}.main".format(os.path.splitext(os.path.basename(__file__))[0]))
    get_class_logger(GameController).setLevel(logging.INFO)  # Don't log specifics of a single game

    # Load config and check experiment dir.
    logger.info(f'Loading config from "{args.config}"...')
    config = load_config(args.config)
    experiment_dir = config["experiment_dir"]
    while not os.path.exists(experiment_dir):
        logger.warning(f'The experiment dir specified in the config does not yet exist: "{experiment_dir}" - waiting...')
        sleep(10)
    agent_checkpoint_paths = {i: os.path.join(experiment_dir, name)
                              for i, name in config["training"]["agent_checkpoint_names"].items()}

    while True:
        # Wait until a ".for_eval" checkpoint exists (for any of possibly multiple agents). Then rename it to ".in_eval.[uniqueid]".
        # In this way, multiple eval scripts can run in parallel.
        # When the evaluation is done, we will rename it to ".{score}".
        for i_agent, cp_path in agent_checkpoint_paths.items():
            # If multiple agents are specified in the config, evaluate all of them.
            checkpoint_path_in = f"{os.path.splitext(cp_path)[0]}.for_eval.h5"
            checkpoint_path_tmp = f"{os.path.splitext(cp_path)[0]}.in_eval.pid{os.getpid()}.h5"
            if os.path.exists(checkpoint_path_in):
                # Load the latest checkpoint and evaluate it.
                try:
                    os.rename(checkpoint_path_in, checkpoint_path_tmp)
                    logger.info('Found a new checkpoint, evaluating...')

                    # Create agent
                    agent_type = config["training"]["player_agents"][i_agent]
                    if agent_type == "DQNAgent":
                        alphasheep_agent = DQNAgent(0, config=config, training=False)
                    else:
                        raise ValueError(f"Unknown agent type specified: {agent_type}")
                    alphasheep_agent.load_weights(checkpoint_path_tmp)

                    # Eval agent
                    current_perf = eval_agent(alphasheep_agent)

                    # Now we know the performance. Find the best-performing previous checkpoint that exists on disk.
                    logger.info("Comparing performance to previous checkpoints...")
                    splitext = os.path.splitext(cp_path)
                    checkpoints = glob.glob("{}-*{}".format(splitext[0], splitext[1]))
                    best_perf = 0.
                    for cp in checkpoints:
                        perf_str = re.findall(r"{}-(.*){}".format(os.path.basename(splitext[0]), splitext[1]), cp)
                        if len(perf_str) > 0:
                            p = float(perf_str[0])
                            if p > best_perf:
                                best_perf = p
                    if best_perf > 0:
                        logger.info("Previously best checkpoint has performance {}".format(best_perf))
                    else:
                        logger.info("Did not find any previous results.")

                    if current_perf > best_perf:
                        best_perf = current_perf
                        logger.info("Found new best-performing checkpoint!")
                        cp_best = "{}-{}{}".format(splitext[0], str(best_perf), splitext[1])
                        os.rename(checkpoint_path_tmp, cp_best)

                except OSError:
                    # Probably a concurrent rename by another worker; continue and try again.
                    logger.exception("Could not rename checkpoint!")

        logger.info("Waiting...")
        sleep(10)
        if not do_loop:
            # Run only once.
            return
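The resulting checkpoint-file protocol, illustrated with a hypothetical base name, pid, and score (the trainer writes the first file, an eval worker claims it by renaming, and a final rename records the measured performance if it beats all previous checkpoints):

# written by the trainer:          agent_p0.for_eval.h5
# claimed by this eval worker:     agent_p0.in_eval.pid12345.h5
# after evaluation, if new best:   agent_p0-0.82.h5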
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--p0-agent", type=str, choices=['static', 'rule', 'random', 'alphasheep', 'user'], required=True)
    parser.add_argument("--alphasheep-checkpoint", help="Checkpoint for AlphaSheep, if --p0-agent=alphasheep.", required=False)
    parser.add_argument("--agent-config", help="YAML file, containing agent specifications for AlphaSheep.", required=False)
    args = parser.parse_args()
    agent_choice = args.p0_agent
    as_checkpoint_path = args.alphasheep_checkpoint
    as_config_path = args.agent_config
    if agent_choice == "alphasheep" and (not as_checkpoint_path or not as_config_path):
        raise ValueError("Need to specify --alphasheep-checkpoint and --agent-config if --p0-agent=alphasheep.")

    # Init logging and adjust log levels for some classes.
    init_logging()
    logger = get_named_logger("{}.main".format(os.path.splitext(os.path.basename(__file__))[0]))
    get_class_logger(GameController).setLevel(logging.DEBUG)   # Log every single card.
    get_class_logger(Gui).setLevel(logging.DEBUG)              # Log mouse clicks.
    get_class_logger(RuleBasedAgent).setLevel(logging.DEBUG)   # Log decisions by the rule-based players.

    # Create the agent for Player 0.
    if agent_choice == "alphasheep":
        # Load config. We ignore the "training" and "experiment" sections, but we need "agent_config".
        logger.info(f'Loading config from "{as_config_path}"...')
        config = load_config(as_config_path)
        get_class_logger(DQNAgent).setLevel(logging.DEBUG)     # Log Q-values.
        alphasheep_agent = DQNAgent(0, config=config, training=False)
        alphasheep_agent.load_weights(as_checkpoint_path)
        p0 = Player("0-AlphaSheep", agent=alphasheep_agent)
    elif agent_choice == "user":
        p0 = Player("0-User", agent=GUIAgent(0))
    elif agent_choice == "rule":
        p0 = Player("0-Hans", agent=RuleBasedAgent(0))
    elif agent_choice == "static":
        p0 = Player("0-Static", agent=StaticPolicyAgent(0))
    else:
        p0 = Player("0-RandomGuy", agent=RandomCardAgent(0))

    # Players 1-3 are RuleBasedAgents.
    players = [
        p0,
        Player("1-Zenzi", agent=RuleBasedAgent(1)),
        Player("2-Franz", agent=RuleBasedAgent(2)),
        Player("3-Andal", agent=RuleBasedAgent(3)),
    ]

    # Rig the game so Player 0 has the cards to play a Herz-Solo.
    # Also, force them to play it.
    game_mode = GameMode(GameContract.suit_solo, trump_suit=Suit.herz, declaring_player_id=0)
    controller = GameController(players, dealing_behavior=DealWinnableHand(game_mode), forced_game_mode=game_mode)

    # The GUI initializes PyGame and registers on events provided by the controller. Everything is single-threaded.
    #
    # The controller runs the game as usual. Whenever the GUI receives an event, it can block execution, so the controller must wait
    # for the GUI to return control. Until then, the GUI can draw and wait for user input (mouse clicks, card choices, ...).
    logger.info("Starting GUI.")
    with Gui(controller.game_state) as gui:
        # Run an endless loop of single games.
        logger.info("Starting game loop...")
        try:
            while True:
                controller.run_game()
        except UserQuitGameException:   # Closing the window or pressing [Esc]
            logger.info("User quit game.")

    logger.info("Shutdown.")
def __init__(self, player_id: int, config: Dict, training: bool):
    """
    Creates a new DQNAgent.
    :param player_id: The unique id of the player (0-3).
    :param config: config dict containing an agent_config node.
    :param training: If True, will train during play. This usually means worse performance (because of exploration).
                     If False, the agent will always pick the highest-ranking valid action.
    """
    super().__init__(player_id)
    self.logger = get_class_logger(self)
    config = config["agent_config"]["dqn_agent"]
    self.config = config
    self.training = training

    # We encode cards as one-hot vectors of size 32.
    # Providing indices to perform quick lookups.
    self._id2card = new_deck()
    self._card2id = {card: i for i, card in enumerate(self._id2card)}

    # Determine length of state vector.
    state_lens = {
        "cards_in_hand": 32,
        "cards_in_trick": 3 * 32,
        "cards_already_played": 32
    }
    self._state_size = sum(state_lens[x] for x in config["state_contents"])

    # Action space: one action for every card.
    # Naturally, most actions will be invalid because the agent doesn't have the card or is not allowed to play it.
    self._action_size = 32

    # If True, then all unavailable actions are zeroed in the q-vector during learning. I thought this might improve training
    # speed, but it turned out to provide only a slight benefit. Incompatible with (and superseded by) allow_invalid_actions.
    self._zero_q_for_invalid_actions = config["zero_q_for_invalid_actions"]

    # If allowed, then the agent can choose an invalid card and get punished for it, while staying
    # in the same state. If not allowed, invalid actions are automatically skipped when playing.
    # See discussion in experiment_log.md
    self._allow_invalid_actions = config["allow_invalid_actions"]
    self._invalid_action_reward = config["invalid_action_reward"]
    if self._allow_invalid_actions and self._zero_q_for_invalid_actions:
        raise ValueError("allow_invalid_actions and zero_q_for_invalid_actions are mutually exclusive.")

    # Discount and exploration rate
    self._gamma = config["gamma"]
    self._epsilon = config["epsilon"]

    # Experience replay buffer for minibatch learning
    self.experience_buffer = deque(maxlen=config["experience_buffer_len"])

    # Remember the state and action (card) played in the previous trick, so we can judge it once we receive feedback.
    # Also remember which actions were valid at that time.
    self._prev_state = None
    self._prev_action = None
    self._prev_available_actions = None
    self._in_terminal_state = False

    # Create the Q-network (current state) and target network (successor state). The networks are synced after every episode (game).
    self.q_network = self._build_model()
    self.target_network = self._build_model()
    self._align_target_model()

    self._batch_size = config["batch_size"]

    # Don't retrain after every single experience.
    # Retraining every time is expensive and doesn't add much information (rewards are received only at the end of the game).
    # If we wait for more experiences to accumulate before retraining, we get more fresh data before doing expensive training.
    # NOTE: This kind of breaks the "sync networks after every game" idea, but nevertheless works very well to speed up training.
    self._retrain_every_n = config["retrain_every"]
    self._experiences_since_last_retrain = 0

    # Memory: here are some things the agent remembers between moves. This is basically feature engineering;
    # it would be more interesting to have the agent learn these with an RNN or so!
    self._mem_cards_already_played = set()

    # For display in the GUI
    self._current_q_vals = None
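A minimal sketch of how the "cards_in_hand" part of the state vector could be assembled from these lookups, assuming numpy is imported as np; encode_hand is a hypothetical helper used only for illustration, and the agent's real state assembly also covers "cards_in_trick" and "cards_already_played":

def encode_hand(self, cards_in_hand) -> np.ndarray:
    # Multi-hot encoding: a 32-dim vector with a 1 at the index of every card currently in hand.
    vec = np.zeros(32, dtype=np.float32)
    for card in cards_in_hand:
        vec[self._card2id[card]] = 1.
    return vec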
def __init__(self, player_id: int):
    super().__init__(player_id)
    self.logger = get_class_logger(self)

    self._select_card_callback = None