Example #1
def play(model_path, is_max_entropy):
    """
    Play a game against a model
    :param model_path: String. Path to the model
    :param is_max_entropy: Boolean. Whether the model uses entropy maximization
    """
    random.seed(int(time()))

    p1 = players.QPlayer(hidden_layers_size=layers_size,
                         learning_batch_size=batch_size,
                         gamma=gamma,
                         tau=tau,
                         batches_to_q_target_switch=batches_to_q_target_switch,
                         memory_size=memory_size,
                         session=tf.Session(),
                         maximize_entropy=is_max_entropy)
    p1.restore(model_path)

    p2 = players.Human()

    for g in range(1):
        print('STARTING NEW GAME (#{})\n-------------'.format(g))
        if g % 2 == 0:
            game = Game(p1, p2)
            print("Computer is X (1)")
        else:
            game = Game(p2, p1)
            print("Computer is O (-1)")
        while not game.game_status()['game_over']:
            game.print_field()
            if isinstance(game.active_player, players.Human):
                print("{}'s turn:".format(game.current_player))
            state = np.copy(game.board)
            # Force Q-Network to select different starting positions if it plays first
            if np.count_nonzero(game.board) == 0 and isinstance(game.active_player, players.QPlayer):
                action = random.randint(0, 399)
            else:
                action = int(game.active_player.select_cell(state, epsilon=0.0))
            print(game.current_player, action)
            game.play(action)
            if not game.game_status()['game_over']:
                game.next_player()
            if game._invalid_move_played:
                print("*")  # flag the invalid move and stop this game
                break
        print('-------------\nGAME OVER!')
        game.print_board()
        print(game.game_status())
        print('-------------')
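
A minimal invocation sketch; the checkpoint path is hypothetical, and the module-level hyperparameters (layers_size, batch_size, gamma, tau, batches_to_q_target_switch, memory_size) are assumed to be defined elsewhere in the module:

if __name__ == '__main__':
    # Hypothetical path; point this at wherever the trained weights were saved
    play(model_path='./models/q.ckpt', is_max_entropy=False)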
Example #2
def play():
    random.seed(int(time()))
    p1 = players.QPlayer([100, 160, 160, 100],
                         learning_batch_size=100,
                         gamma=0.95,
                         tau=0.95,
                         batches_to_q_target_switch=100,
                         memory_size=100000)
    p1.restore('./models/q.ckpt')
    p2 = players.Human()
    for g in range(4):
        print('STARTING NEW GAME (#{})\n-------------'.format(g))
        if g % 2 == 0:
            game = Game(p1, p2)
            print("Computer is X (1)")
        else:
            game = Game(p2, p1)
            print("Computer is O (-1)")
        while not game.game_status()['game_over']:
            if isinstance(game.active_player(), players.Human):
                game.print_board()
                print("{}'s turn:".format(game.current_player))
            state = np.copy(game.board)
            # Force Q-Network to select different starting positions if it plays first
            if np.count_nonzero(game.board) == 0 and isinstance(game.active_player(), players.QPlayer):
                action = random.randint(0, 8)
            else:
                action = int(game.active_player().select_cell(state, epsilon=0.0))
            game.play(action)
            if not game.game_status()['game_over']:
                game.next_player()
        print('-------------\nGAME OVER!')
        game.print_board()
        print(game.game_status())
        print('-------------')


#train()
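
This variant hard-codes its configuration, so a minimal (hypothetical) entry point is just:

if __name__ == '__main__':
    play()  # restores './models/q.ckpt' and plays four games against a human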
Example #3
def face_off(paths, rng=3, p1_name='Q', p2_name='E'):
    """
    Test different models against each other
    :param paths: List(String). Paths to the models
    :param rng: Integer. Number of models stored under each supplied path
    :param p1_name: String. Name of player 1
    :param p2_name: String. Name of player 2
    :return: Dict. Number of games won by each player
    """
    tie = 'TIE'
    results = {p1_name: 0, p2_name: 0, tie: 0}

    for path1 in paths:
        for i in range(rng):
            p1_dir = '{}/{}'.format(path1, i)
            print('Loading player {} [{}]...'.format(p1_name, p1_dir))
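            # A separate tf.Graph per checkpoint keeps the restored variable sets from colliding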
            graph1 = tf.Graph()
            with graph1.as_default():
                p1 = players.QPlayer(
                    hidden_layers_size=layers_size,
                    learning_batch_size=batch_size,
                    gamma=gamma,
                    tau=tau,
                    batches_to_q_target_switch=batches_to_q_target_switch,
                    memory_size=memory_size,
                    session=tf.Session(),
                    maximize_entropy=False)
                p1.restore('{}/{}.ckpt'.format(p1_dir, p1_name))
                p1.name = p1_name

            for path2 in paths:
                for j in range(rng):
                    p2_dir = '{}/{}'.format(path2, j)
                    print('Loading player {} [{}]...'.format(p2_name, p2_dir))
                    graph2 = tf.Graph()
                    with graph2.as_default():
                        p2 = players.QPlayer(
                            hidden_layers_size=layers_size,
                            learning_batch_size=batch_size,
                            gamma=gamma,
                            tau=tau,
                            batches_to_q_target_switch=batches_to_q_target_switch,
                            memory_size=memory_size,
                            session=tf.Session(),
                            maximize_entropy=True)
                        p2.restore('{}/{}.ckpt'.format(p2_dir, p2_name))
                        p2.name = p2_name

                    print('Playing...')
                    print('----------')
                    for g in range(18):
                        if g % 2 == 0:
                            game = Game(p1, p2)
                        else:
                            game = Game(p2, p1)
                        first_cell = g // 2  # 18 games: each of the 9 opening cells, once per seating
                        while not game.game_status()['game_over']:
                            state = np.copy(game.board)
                            if np.count_nonzero(game.board) == 0:
                                action = first_cell
                            else:
                                action = int(game.active_player.select_cell(state, epsilon=0.0))
                            game.play(action)
                            if not game.game_status()['game_over']:
                                game.next_player()
                        winner = game.game_status()['winner']
                        winner_name = game.player1.name if winner == 1 else (
                            game.player2.name if winner == -1 else tie)
                        print(
                            'GAME - player X: {p1}, player O: {p2} | First cell: {c} | Winner: {w}'
                            .format(p1=game.player1.name,
                                    p2=game.player2.name,
                                    c=first_cell,
                                    w=winner_name))
                        results[winner_name] += 1
                    print('----------')

    print('Final results: {}'.format(results))
    s = sum(results.values())
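    # Integer truncation (not rounding) to two decimal places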
    pct = {k: int(10000 * v / s) / 100 for k, v in results.items()}
    print('Percents: {}'.format(pct))
    return results
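
A hedged usage sketch; the directory layout '<path>/<i>/<name>.ckpt' is implied by the loader above, and the paths themselves are placeholders:

results = face_off(paths=['./models/run_a', './models/run_b'], rng=3, p1_name='Q', p2_name='E')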
Example #4
def train(p1_name,
          p2_name,
          p1_max_ent,
          p2_max_ent,
          p2_novice,
          num_of_games=1e6,
          savedir='./models'):
    """
    Initiate a single training process
    :param p1_name: String. Name of player 1 (will be used as file-name)
    :param p2_name: String. Name of player 2 (will be used as file-name)
    :param p1_max_ent: Boolean. Should player 1 use maximum-entropy learning
    :param p2_max_ent: Boolean. Should player 2 use maximum-entropy learning
    :param p2_novice: Boolean. Should player 2 be an instance of players.Novice
    :param num_of_games: Number. Number of games to train on
    :param savedir: String. Path to save trained weights
    """
    random.seed(int(time() * 1000))
    tf.reset_default_graph()
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # Initialize players
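    # (one tf.Graph and tf.Session per player, so the two Q-networks stay fully independent)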
    graph1 = tf.Graph()
    graph2 = tf.Graph()

    with graph1.as_default():
        p1 = players.QPlayer(
            tf.Session(),
            hidden_layers_size=layers_size,
            learning_batch_size=batch_size,
            gamma=gamma,
            batches_to_q_target_switch=batches_to_q_target_switch,
            tau=tau,
            memory_size=memory_size,
            maximize_entropy=p1_max_ent)
    p1.name = p1_name

    if p2_novice:
        p2 = players.Novice()
    else:
        with graph2.as_default():
            p2 = players.QPlayer(
                tf.Session(),
                hidden_layers_size=layers_size,
                learning_batch_size=batch_size,
                gamma=gamma,
                batches_to_q_target_switch=batches_to_q_target_switch,
                tau=tau,
                memory_size=memory_size,
                maximize_entropy=p2_max_ent)
    p2.name = p2_name

    total_rewards = {p1.name: 0, p2.name: 0}
    costs = {p1.name: [], p2.name: []}  # stores training costs, so we can plot them later
    rewards = {p1.name: [], p2.name: []}  # same, but for the players' total rewards

    # Start playing
    num_of_games = int(num_of_games)
    train_start_time = time()
    for g in range(1, num_of_games + 1):
        # Alternate seats so both players get to play X and O
        game = Game(p1, p2) if g % 2 == 0 else Game(p2, p1)
        # The last state each player acted from (None until their first move)
        last_phases = {p1.name: None, p2.name: None}
        while not game.game_status()['game_over']:
            if isinstance(game.active_player, players.Human):
                game.print_board()
                print("{}'s turn:".format(game.active_player.name))

            # If this is not the first move, store in memory the transition from the last state
            # the active player saw to this one
            state = np.copy(game.board)
            if last_phases[game.active_player.name] is not None:
                memory_element = last_phases[game.active_player.name]
                memory_element['next_state'] = state
                memory_element['game_over'] = False
                game.active_player.add_to_memory(memory_element)

            # Calculate annealed epsilon
            if g <= num_of_games // 4:
                max_eps = 0.6
            elif g <= num_of_games // 2:
                max_eps = 0.1
            else:
                max_eps = 0.05
            min_eps = 0.01
            eps = round(
                max(max_eps - round(g * (max_eps - min_eps) / num_of_games, 3),
                    min_eps), 3)
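            # Worked example, assuming the default num_of_games = 1e6: at g = 100000,
            # max_eps = 0.6, so eps = max(0.6 - round(100000 * 0.59 / 1e6, 3), 0.01) = 0.541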

            # Play and receive reward
            action = int(game.active_player.select_cell(state, epsilon=eps))
            play_status = game.play(action)
            game_over = play_status['game_over']
            if play_status['invalid_move']:
                r = game.invalid_move_reward
            elif game_over:
                if play_status['winner'] == 0:
                    r = game.tie_reward
                else:
                    r = game.winning_reward
            else:
                r = 0

            # Store the current state in temporary memory
            last_phases[game.active_player.name] = {
                'state': state,
                'action': action,
                'reward': r
            }
            total_rewards[game.active_player.name] += r
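            # When a move wins, the opponent's running total receives the losing reward as well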
            if r == game.winning_reward:
                total_rewards[game.inactive_player.name] += game.losing_reward

            # Activate learning procedure
            cost = game.active_player.learn(learning_rate=learning_rate)
            if cost is not None:
                costs[game.active_player.name].append(cost)

            # Next player's turn, if game hasn't ended
            if not game_over:
                game.next_player()

        # Adding last phase for winning (active) player
        memory_element = last_phases[game.active_player.name]
        memory_element['next_state'] = np.zeros(9)
        memory_element['game_over'] = True
        game.active_player.add_to_memory(memory_element)

        # Adding last phase for losing (inactive) player
        memory_element = last_phases[game.inactive_player.name]
        memory_element['next_state'] = np.zeros(9)
        memory_element['game_over'] = True
        # r still holds the last move's reward; if it won the game, the inactive
        # player takes the losing reward, otherwise the game ended in a tie
        memory_element['reward'] = game.losing_reward if r == game.winning_reward else game.tie_reward
        game.inactive_player.add_to_memory(memory_element)

        # Print statistics
        period = 100.0
        if g % int(period) == 0:
            print(
                'Game: {g} | Number of Trainings: {t1},{t2} | Epsilon: {e} | Average Rewards - {p1}: {r1}, {p2}: {r2}'
                .format(g=g,
                        p1=p1.name,
                        r1=total_rewards[p1.name] / period,
                        p2=p2.name,
                        r2=total_rewards[p2.name] / period,
                        t1=len(costs[p1.name]),
                        t2=len(costs[p2.name]),
                        e=eps))
            rewards[p1.name].append(total_rewards[p1.name] / period)
            rewards[p2.name].append(total_rewards[p2.name] / period)
            total_rewards = {p1.name: 0, p2.name: 0}

    # Save trained model and shutdown Tensorflow sessions
    training_time = time() - train_start_time
    minutes = int(training_time // 60)
    seconds = int(training_time % 60)
    print('Training took {m}:{s:02d} minutes'.format(m=minutes, s=seconds))

    # Plot graphs and close sessions
    cost_colors = {p1.name: 'b', p2.name: 'k'}
    reward_colors = {p1.name: 'g', p2.name: 'r'}
    graphs = {p1.name: graph1, p2.name: graph2}
    for pp in [p1, p2]:
        with graphs[pp.name].as_default():
            pp.save('{dir}/{name}.ckpt'.format(dir=savedir, name=pp.name))
            pp.shutdown()

        plt.scatter(range(len(costs[pp.name])),
                    costs[pp.name],
                    c=cost_colors[pp.name])
        plt.title('Cost of player {}'.format(pp.name))
        plt.show()
        plt.scatter(range(len(rewards[pp.name])),
                    rewards[pp.name],
                    c=reward_colors[pp.name])
        plt.title('Average rewards of player {}'.format(pp.name))
        plt.show()

        plt.scatter(range(len(costs[pp.name])),
                    costs[pp.name],
                    c=cost_colors[pp.name])
        plt.title('Cost of player {} [0,1]'.format(pp.name))
        plt.ylim(0, 1)
        plt.show()
        plt.scatter(range(len(rewards[pp.name])),
                    rewards[pp.name],
                    c=reward_colors[pp.name])
        plt.title('Average rewards of player {} [-1,1]'.format(pp.name))
        plt.ylim(-1, 1)
        plt.show()
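
A hypothetical call, mirroring the module's defaults; the player names and flags are placeholders:

train(p1_name='Q', p2_name='E', p1_max_ent=False, p2_max_ent=True,
      p2_novice=False, num_of_games=1e5)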
Example #5
def train():
    costs = []  # this will store the costs, so we can plot them later
    r1 = []  # same, but for the players' total rewards
    r2 = []
    random.seed(int(time() * 1000))
    tf.reset_default_graph()
    logging.basicConfig(level=logging.WARN, format='%(message)s')

    # Initialize players
    p1 = players.QPlayer([100, 160, 160, 100],
                         learning_batch_size=150,
                         batches_to_q_target_switch=1000,
                         gamma=0.95,
                         tau=0.95,
                         memory_size=100000)
    p1.restore('./models/q.ckpt')
    p1.name = 'Q'

    p2 = players.Novice()
    p2.name = 'N'

    total_rewards = {p1.name: 0, p2.name: 0}

    # Start playing
    num_of_games = 400000
    for g in range(1, num_of_games + 1):
        # Alternate seats so both players get to play X and O
        game = Game(p1, p2) if g % 2 == 0 else Game(p2, p1)
        # The last state each player acted from (None until their first move)
        last_phases = {p1.name: None, p2.name: None}
        while not game.game_status()['game_over']:
            if isinstance(game.active_player(), players.Human):
                game.print_board()
                print("{}'s turn:".format(game.active_player().name))

            # If this is not the first move, store in memory the transition from the last state
            # the active player saw to this one
            state = np.copy(game.board)
            if last_phases[game.active_player().name] is not None:
                memory_element = last_phases[game.active_player().name]
                memory_element['next_state'] = state
                memory_element['game_over'] = False
                game.active_player().add_to_memory(memory_element)

            # Calculate annealed epsilon
            if g <= num_of_games // 4:
                max_eps = 0.6
            elif g <= num_of_games // 2:
                max_eps = 0.01
            else:
                max_eps = 0.001
            min_eps = 0.01 if g <= num_of_games // 2 else 0.0
            eps = round(
                max(max_eps - round(g * (max_eps - min_eps) / num_of_games, 3),
                    min_eps), 3)
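            # Worked example: at g = 300000 of 400000, max_eps = 0.001 and min_eps = 0.0,
            # so eps = max(0.001 - round(300000 * 0.001 / 400000, 3), 0.0) = 0.0 (fully greedy)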

            # Play and receive reward
            action = int(game.active_player().select_cell(state, epsilon=eps))
            play_status = game.play(action)
            game_over = play_status['game_over']
            if play_status['invalid_move']:
                r = game.invalid_move_reward
            elif game_over:
                if play_status['winner'] == 0:
                    r = game.tie_reward
                else:
                    r = game.winning_reward
            else:
                r = 0

            # Store the current state in temporary memory
            last_phases[game.active_player().name] = {
                'state': state,
                'action': action,
                'reward': r
            }
            total_rewards[game.active_player().name] += r

            # Activate learning procedure
            cost = game.active_player().learn(learning_rate=0.0001)
            if cost is not None:
                costs.append(cost)

            # Next player's turn, if game hasn't ended
            if not game_over:
                game.next_player()

        # Adding last phase for winning (active) player
        memory_element = last_phases[game.active_player().name]
        memory_element['next_state'] = np.zeros(9)
        memory_element['game_over'] = True
        game.active_player().add_to_memory(memory_element)

        # Adding last phase for losing (inactive) player
        memory_element = last_phases[game.inactive_player().name]
        memory_element['next_state'] = np.zeros(9)
        memory_element['game_over'] = True
        memory_element['reward'] = game.losing_reward
        game.inactive_player().add_to_memory(memory_element)

        # Print statistics
        if g % 100 == 0:
            print(
                'Game: {g} | Number of Trainings: {t} | Epsilon: {e} | Average Rewards - {p1}: {r1}, {p2}: {r2}'
                .format(g=g,
                        p1=p1.name,
                        r1=total_rewards[p1.name] / 100.0,
                        p2=p2.name,
                        r2=total_rewards[p2.name] / 100.0,
                        t=len(costs),
                        e=eps))
            r1.append(total_rewards[p1.name] / 100.0)
            r2.append(total_rewards[p2.name] / 100.0)
            total_rewards = {p1.name: 0, p2.name: 0}

    # Save trained model and shutdown Tensorflow sessions
    p1.save('./models/q.ckpt')
    for pp in [p1, p2]:
        pp.shutdown()

    # Plot graphs
    plt.scatter(range(len(costs)), costs)
    plt.show()
    plt.scatter(range(len(r1)), r1, c='g')
    plt.show()
    plt.scatter(range(len(r2)), r2, c='r')
    plt.show()