Example #1
import random
import pickle

def main(rounds):
    global ticTacToe
    global wins
    global losses

    initial_q_value = 0.0
    alpha = 0.5  # Step size
    gamma = 1.0  # Discount factor
    epsilon = 0.2  # Exploration rate

    # All nine board cells are candidate actions for every state
    actions = [(i, j) for i in range(3) for j in range(3)]
    actions_per_state = {a: initial_q_value for a in actions}

    # The terminal state carries a single dummy None action with value 0.0
    Q_values = {'terminal': actions_per_state.copy()}
    Q_values['terminal'][None] = 0.0

    first_turn_random_count = 0
    # Running Q-Learning Q-value updates for many episodes
    for i in range(rounds):
        if i % 10000 == 0:
            print("Rounds done = {}".format(i), end=" | ")
            print("Wins = {}".format(wins), end=" | ")
            print("Losses = {}".format(losses))
            wins = 0
            losses = 0
        ticTacToe = TicTacToe()

        first_turn = random.choice(['random', 'computer'])
        if first_turn == 'random':
            first_turn_random_count += 1
            # Opposition player opens with an epsilon-greedy move
            selected_grid = epsilon_greedy_for_opposition(
                Q_values, ticTacToe.get_current_state(), actions_per_state,
                epsilon)
            if selected_grid not in ticTacToe.get_empty_cells():
                selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            ticTacToe.toggle_turn()

        Q_values = Q_Learning(Q_values, alpha, gamma, epsilon,
                              actions_per_state)

    # pprint(Q_values)
    policy = find_optimal_policy(Q_values)
    print("First turn by random players = {}%".format(first_turn_random_count *
                                                      100 / rounds))
    filename = "Q_values_{}_episodes_025_epsilon.p".format(rounds)
    pickle.dump(Q_values, open(filename, "wb"))
    filename = "policy_{}_episodes_025_epsilon.p".format(rounds)
    pickle.dump(policy, open(filename, "wb"))
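
The heavy lifting above is delegated to the project's own epsilon_greedy_for_opposition and Q_Learning helpers, which are not shown in this excerpt. As a rough, self-contained sketch of the standard techniques those names refer to, the snippet below shows a hypothetical epsilon-greedy action selection and the tabular Q-learning backup written against the same nested-dict layout (Q_values[state][action]); the function names, signatures, and the reward argument are illustrative assumptions, not the project's API.

import random

def epsilon_greedy(Q_values, state, actions_per_state, epsilon):
    # With probability epsilon pick a random action (explore); otherwise pick
    # the action with the highest recorded Q-value for this state (exploit).
    q_for_state = Q_values.get(state, actions_per_state)
    if random.random() < epsilon:
        return random.choice(list(q_for_state))
    return max(q_for_state, key=q_for_state.get)

def q_learning_update(Q_values, state, action, reward, next_state, alpha, gamma):
    # Tabular Q-learning backup:
    #   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = max(Q_values[next_state].values())
    td_target = reward + gamma * best_next
    Q_values[state][action] += alpha * (td_target - Q_values[state][action])
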
Example #2
import random

def main(rounds):
    global ticTacToe
    wins, losses = 0, 0

    first_turn_random_count = 0
    # Playing many evaluation episodes: learned policy vs. random opponent
    for i in range(rounds):
        if i % 10000 == 0:
            print("Rounds done = {}".format(i), end=" | ")
            print("Wins = {}".format(wins), end=" | ")
            print("Losses = {}".format(losses))
            wins = 0
            losses = 0
        ticTacToe = TicTacToe()
        current_state = None

        first_turn = random.choice(['random', 'computer'])
        if first_turn == 'random':
            first_turn_random_count += 1
            # Random player playing one turn
            selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            ticTacToe.toggle_turn()

        while current_state != 'terminal':
            current_state = ticTacToe.get_current_state()
            try:
                # `policy` is a module-level dict mapping state -> action,
                # presumably loaded from the pickle written in Example #1
                selected_grid = policy[current_state]
                if selected_grid not in ticTacToe.get_empty_cells():
                    selected_grid = random.choice(ticTacToe.get_empty_cells())
            except KeyError:
                # State not covered by the policy: fall back to a random move
                selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            solved, result = ticTacToe.is_solved()
            if solved:
                if result != 0:  # non-zero result: the policy player just won
                    wins += 1
                break
            ticTacToe.toggle_turn()

            selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            solved, result = ticTacToe.is_solved()
            if solved:
                if result != 0:  # non-zero result: the random opponent just won
                    losses += 1
                break
            ticTacToe.toggle_turn()
    print("First turn by random players = {}%".format(first_turn_random_count *
                                                      100 / rounds))
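
Example #2 looks moves up in a module-level policy dict that is never defined in the excerpt; it is presumably the greedy policy pickled at the end of Example #1. Below is a minimal loading sketch under that assumption; the episode count in the filename is a placeholder and must match the training run.

import pickle

rounds_trained = 100000  # assumption: must equal the `rounds` used during training
filename = "policy_{}_episodes_025_epsilon.p".format(rounds_trained)
with open(filename, "rb") as f:
    policy = pickle.load(f)  # dict mapping board state -> (row, col) action
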