def evaluate_candidate(candidate_player, reference_player):
    wins = 0
    draws = 0
    losses = 0
    # Always choose best move
    candidate_player.tryhard_mode = True
    for _ in range(400):
        env = GameEnv()
        current_player = candidate_player if random.random() < 0.5 else reference_player
        while True:
            move = current_player.select_move(env)
            env.play_move(*move)  # Current player flips
            env.check_win()
            if env.winner and current_player is candidate_player:
                wins += 1
                break
            elif env.winner and current_player is reference_player:
                losses += 1
                break
            elif env.draw:
                draws += 1
                break
            else:
                current_player = reference_player if current_player is candidate_player else candidate_player
    candidate_player.tryhard_mode = False
    return wins, draws, losses
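# Example usage (a sketch, not part of the training loop): benchmark a previously
# saved model against the random baseline. Assumes a model saved by main() below
# exists at models/model_conv.h5 and uses the BigBrain/TinyBrain constructors shown there.
#
#   trained = BigBrain(load_model="models/model_conv.h5", tryhard_mode=False)
#   wins, draws, losses = evaluate_candidate(trained, TinyBrain())
#   print(wins, draws, losses)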
def print_diagnostics(candidate_player, episode):
    test_env_start = GameEnv()

    # First player (X) threatens to complete the top row at the top-right square
    # (reading moves as (row, column), X moving first):
    #   X X .
    #   . O .
    #   . . O
    test_env_about_to_win = GameEnv()
    test_env_about_to_win.play_move(0, 0)
    test_env_about_to_win.play_move(2, 2)
    test_env_about_to_win.play_move(0, 1)
    test_env_about_to_win.play_move(1, 1)

    # Second player (O) threatens to complete the bottom row at the bottom-left square:
    #   X . .
    #   X X .
    #   . O O
    test_env_about_to_win_p2 = GameEnv()
    test_env_about_to_win_p2.play_move(0, 0)
    test_env_about_to_win_p2.play_move(2, 2)
    test_env_about_to_win_p2.play_move(1, 1)
    test_env_about_to_win_p2.play_move(2, 1)
    test_env_about_to_win_p2.play_move(1, 0)

    initial_preferences = candidate_player.move_probabilities(
        test_env_start.state(), test_env_start.possible_actions()
    )
    about_to_win_preferences = candidate_player.move_probabilities(
        test_env_about_to_win.state(), test_env_about_to_win.possible_actions()
    )
    about_to_win_preferences_p2 = candidate_player.move_probabilities(
        test_env_about_to_win_p2.state(), test_env_about_to_win_p2.possible_actions()
    )

    print(f"Initial move preference after {episode} games")
    print(initial_preferences)
    print("Preferences when winning in top right")
    print(about_to_win_preferences)
    print("Preferences when winning in bottom left")
    print(about_to_win_preferences_p2)
def main():
    model_name = "models/model_conv.h5"
    reference_player = TinyBrain()
    # candidate_player = BigBrain(load_model=model_name, tryhard_mode=False)  # For continuing training
    candidate_player = BigBrain(tryhard_mode=False)  # For starting training
    episode = 1
    while True:
        env = GameEnv()
        first_move = True
        # immediate store is all the information we have immediately after a move:
        # (current state, move, whether the move was exploratory)
        immediate_store = []
        # delayed store is all the information we only get once the opponent has replied:
        # (next possible actions, reward, next state, terminated)
        delayed_store = []
        # Randomly choose who goes first
        current_player = candidate_player if random.random() < 0.5 else reference_player
        while True:
            state = env.state()
            move = current_player.select_move(env)
            if current_player is candidate_player:
                if first_move:
                    first_move = False
                else:
                    # Finish providing information for candidate player's last move
                    possible_actions = env.possible_actions()
                    delayed_store.append((possible_actions, 0, state, False))
                # Provide starting information for candidate player's current move
                do_explore = random.random() < 0.3
                move = current_player.select_move(env, explore=do_explore)
                immediate_store.append((state, move, do_explore))
            env.play_move(*move)  # Current player flips
            env.check_win()
            if env.winner or env.draw:
                # Game over: attach the terminal reward to the candidate player's final move
                if env.draw:
                    delayed_store.append((None, DRAW_REWARD, None, True))
                elif current_player is candidate_player:
                    # Winner is always whoever played last
                    delayed_store.append((None, WIN_REWARD, None, True))
                else:
                    delayed_store.append((None, LOSS_REWARD, None, True))
                for immediate, delayed in zip(immediate_store, delayed_store):
                    state, move, do_explore = immediate
                    # Only learn from moves the network chose itself, not random exploration
                    if not do_explore:
                        candidate_player.store(state, move, *delayed)
                break
            current_player = reference_player if current_player is candidate_player else candidate_player

        if episode % GAMES_PER_SET == 0:
            print(f"Training after {episode} episodes")
            candidate_player.retrain(batch_size=TRAINING_SIZE)
            candidate_player.align_target_model()
            candidate_player.save(model_name)

            wins, draws, losses = evaluate_candidate(candidate_player, TinyBrain())
            print(f"{wins}, {draws}, {losses}")
            if wins + losses > 0:
                percentage_wins = wins / (wins + losses)
            else:
                percentage_wins = 0
            print(f"percentage wins: {percentage_wins}")

            # Promote the candidate to be the new reference, then continue training
            # a fresh candidate that starts from the same network
            reference_player = candidate_player
            reference_player.tryhard_mode = False
            candidate_player = BigBrain(tryhard_mode=False)
            candidate_player.q_network = reference_player.q_network
            candidate_player.align_target_model()

        episode += 1
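# A minimal entry-point sketch, assuming this module is meant to be run directly;
# if the full file already invokes main() elsewhere, this guard is redundant.
if __name__ == "__main__":
    main()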