Example #1
import random


def evaluate_candidate(candidate_player, reference_player):
    """Play 400 games between the candidate and the reference player and tally the results.

    GameEnv and the player classes are defined elsewhere in the project.
    """
    wins = 0
    draws = 0
    losses = 0

    # Always choose best move
    candidate_player.tryhard_mode = True

    # Play 400 evaluation games, randomly alternating who moves first
    for _ in range(400):
        env = GameEnv()
        current_player = candidate_player if random.random() < 0.5 else reference_player
        while True:
            move = current_player.select_move(env)
            env.play_move(*move)  # Current player flips
            env.check_win()
            if env.winner and current_player is candidate_player:
                wins += 1
                break
            elif env.winner and current_player is reference_player:
                losses += 1
                break
            elif env.draw:
                draws += 1
                break
            else:
                current_player = reference_player if current_player is candidate_player else candidate_player

    candidate_player.tryhard_mode = False

    return wins, draws, losses
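
Neither snippet shows the environment itself. The loops only rely on a handful of calls: state(), possible_actions(), play_move(), check_win() and the winner / draw flags. The stand-in below is a minimal sketch of that assumed interface, using tic-tac-toe as a placeholder game; the project's real GameEnv, its board encoding and its rules may look quite different.

import numpy as np


class GameEnv:
    """Minimal tic-tac-toe stand-in for the interface the loops rely on.

    This is an illustrative placeholder, not the project's actual environment.
    """

    def __init__(self):
        # 0 = empty, 1 = player to move, -1 = opponent
        self.board = np.zeros((3, 3), dtype=int)
        self.winner = False
        self.draw = False

    def state(self):
        # Board as seen by the player about to move
        return self.board.copy()

    def possible_actions(self):
        # Every empty cell as a (row, col) move
        return [tuple(cell) for cell in np.argwhere(self.board == 0)]

    def play_move(self, row, col):
        # Place the mark, then flip the sign so the next player
        # always sees themselves as +1 ("current player flips")
        self.board[row, col] = 1
        self.board = -self.board

    def check_win(self):
        # After the flip, the player who just moved is -1 on the board
        just_moved = -self.board
        lines = list(just_moved) + list(just_moved.T) + [
            just_moved.diagonal(),
            np.fliplr(just_moved).diagonal(),
        ]
        self.winner = any(line.sum() == 3 for line in lines)
        self.draw = (not self.winner) and not (just_moved == 0).any()

A trivially random player with a select_move(env) method and a tryhard_mode attribute is enough to exercise evaluate_candidate() against this stand-in, which makes it easy to test the bookkeeping without the real model.
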
Example #2
import random

# WIN_REWARD, DRAW_REWARD, LOSS_REWARD, GAMES_PER_SET and TRAINING_SIZE are
# module-level constants defined elsewhere in the project.


def main():
    model_name = "models/model_conv.h5"
    reference_player = TinyBrain()
    # candidate_player = BigBrain(load_model=model_name, tryhard_mode=False)  # For continuing training from a saved model
    candidate_player = BigBrain(tryhard_mode=False)  # For starting training from scratch

    episode = 1
    while True:  # train indefinitely; stop manually when satisfied
        env = GameEnv()
        first_move = True
        # immediate_store holds everything we know as soon as the candidate moves:
        # (current state, move, whether the move was exploratory)
        immediate_store = []
        # delayed_store holds everything we only learn after play continues:
        # (possible actions in the next state, reward, next state, terminated)
        delayed_store = []

        # Randomly choose who goes first
        current_player = candidate_player if random.random() < 0.5 else reference_player

        while True:
            state = env.state()
            if current_player is candidate_player:
                if first_move:
                    first_move = False
                else:
                    # Finish providing information for candidate player's last move
                    possible_actions = env.possible_actions()
                    delayed_store.append((possible_actions, 0, state, False))
                # Provide starting information for candidate player's current move
                do_explore = random.random() < 0.3  # explore on roughly 30% of moves
                move = current_player.select_move(env, explore=do_explore)
                immediate_store.append((state, move, do_explore))
            else:
                move = current_player.select_move(env)

            env.play_move(*move)  # Current player flips
            env.check_win()

            if env.winner or env.draw:
                # If game has ended we need to give rewards to both players
                if env.draw:
                    delayed_store.append((None, DRAW_REWARD, None, True))
                elif current_player is candidate_player:
                    # Winner is always whoever played last
                    delayed_store.append((None, WIN_REWARD, None, True))
                else:
                    delayed_store.append((None, LOSS_REWARD, None, True))

                # Pair each candidate move with its delayed outcome;
                # exploratory moves are excluded from training
                for immediate, delayed in zip(immediate_store, delayed_store):
                    state, move, do_explore = immediate
                    if not do_explore:
                        candidate_player.store(state, move, *delayed)
                break

            current_player = reference_player if current_player is candidate_player else candidate_player

        if episode % GAMES_PER_SET == 0:
            print(f"Training after {episode} episodes")
            candidate_player.retrain(batch_size=TRAINING_SIZE)
            candidate_player.align_target_model()
            candidate_player.save(model_name)
            wins, draws, losses = evaluate_candidate(candidate_player, TinyBrain())
            print(f"{wins}, {draws}, {losses}")
            if wins + losses > 0:
                percentage_wins = wins / (wins + losses)
            else:
                percentage_wins = 0
            print(f"percentage wins: {percentage_wins}")

            # The trained candidate becomes the new reference, and a fresh
            # candidate continues from the same Q-network (shared by reference,
            # not copied) with its target network re-synced
            reference_player = candidate_player
            reference_player.tryhard_mode = False
            candidate_player = BigBrain(tryhard_mode=False)
            candidate_player.q_network = reference_player.q_network
            candidate_player.align_target_model()

        episode += 1
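
On the agent side, main() only exercises a few calls on BigBrain: select_move(), store(), retrain(batch_size=...), align_target_model() and save(). The sketch below shows one plausible shape of the replay-and-retraining part of that contract (select_move() and save() are omitted); the convolutional Q-network behind models/model_conv.h5 is replaced by a plain dict so the example stays self-contained. States and moves are assumed hashable here, and the name BigBrainSketch is hypothetical.

import random
from collections import deque


class BigBrainSketch:
    """Simplified stand-in for the agent interface used by main().

    The real BigBrain wraps a neural Q-network; a tabular Q-function is
    used here purely for illustration.
    """

    def __init__(self, tryhard_mode=False, gamma=0.95, learning_rate=0.1):
        self.tryhard_mode = tryhard_mode
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.q_network = {}       # online Q-values: (state, move) -> value
        self.target_network = {}  # frozen copy used for bootstrapping targets
        self.replay = deque(maxlen=10_000)

    def store(self, state, move, possible_actions, reward, next_state, terminated):
        # One transition per stored (non-exploratory) candidate move
        self.replay.append((state, move, possible_actions, reward, next_state, terminated))

    def retrain(self, batch_size):
        # One-step Q-learning update over a random replay batch
        batch = random.sample(list(self.replay), min(batch_size, len(self.replay)))
        for state, move, possible_actions, reward, next_state, terminated in batch:
            target = reward
            if not terminated:
                target += self.gamma * max(
                    self.target_network.get((next_state, action), 0.0)
                    for action in possible_actions
                )
            old = self.q_network.get((state, move), 0.0)
            self.q_network[(state, move)] = old + self.learning_rate * (target - old)

    def align_target_model(self):
        # Sync the bootstrapping target with the online Q-values
        self.target_network = dict(self.q_network)

The important point is the argument order of store(): it matches the (state, move) pair from immediate_store followed by the unpacked (possible actions, reward, next state, terminated) tuple from delayed_store.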