Ejemplo n.º 1
0
def test_rush_policy():
    """Check the move table of a default-constructed TabularPolicy.

    Three boards are exercised: one with a single empty cell, one with
    several empty cells, and one that is completely filled (for which the
    lookup is expected to raise KeyError).
    """
    # Only one possible move.
    start = State(board=[[1, 2, 1], [2, 2, 1], [0, 1, 2]], turn=1)
    rush = TabularPolicy()
    reached = State(from_base10=rush.move_dict[start.get_num()])
    wanted = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    assert reached.board == wanted.board
    assert reached.turn == wanted.turn

    # Multiple possible moves.
    start = State(board=[[1, 0, 0], [2, 2, 1], [0, 1, 2]], turn=2)
    rush = TabularPolicy()
    reached = State(from_base10=rush.move_dict[start.get_num()])
    assert reached.board == [[1, 2, 0], [2, 2, 1], [0, 1, 2]]
    assert reached.turn == 1

    # Filled board
    start = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    rush = TabularPolicy()
    with pytest.raises(KeyError):
        State(from_base10=rush.move_dict[start.get_num()])
Ejemplo n.º 2
0
def test_get_num_from_state():
    """Check State.get_num() against hand-written base-3 numerals."""
    # Explicit turn: the expected numeral is the turn digit followed by the
    # nine board cells read row by row.
    with_turn = State(board=[[0, 1, 2], [0, 1, 2], [0, 0, 0]], turn=2)
    assert with_turn.get_num() == int('2012012000', 3)

    # No turn argument: the expected numeral starts with the digit 1.
    without_turn = State(board=[[1, 2, 1], [2, 1, 2], [1, 2, 2]])
    assert without_turn.get_num() == int('1121212122', 3)
Ejemplo n.º 3
0
def test_be_greedy():
    """Check that be_greedy() reports a change once and then settles.

    One afterstate is given value 1; after the first be_greedy() call the
    move stored for a default-constructed State must point at it.
    """
    policy = TabularPolicy()
    favourite_num = State(board=[[0, 0, 0], [1, 0, 0], [0, 0, 0]],
                          turn=2).get_num()
    policy.v_dict[favourite_num] = 1

    changed = policy.be_greedy()
    assert changed

    default_state = State()
    assert policy.move_dict[default_state.get_num()] == favourite_num

    # No more change when run the second time
    changed_again = policy.be_greedy()
    assert not changed_again
Ejemplo n.º 4
0
 def TrainOneRound(self, afterstate_num, alpha=.1):
     """ Q learning following Sutton and Barto 6.5

     Input:
         afterstate_num: base-10 number of the afterstate of target_policy
             to start training with.
         alpha: step size applied to every value update.
     Note that the opponent makes a move first, then the target policy.

     Every move in the episode is drawn from self.random_policy; only
     self.target_policy.v_dict is modified and nothing is returned.
     """
     target = self.target_policy
     afterstate = State(from_base10=afterstate_num)
     while not afterstate.is_terminal():
         key = afterstate.get_num()
         # opponent makes a move
         beforestate_num = self.random_policy.move(key)
         beforestate = State(from_base10=beforestate_num)

         if beforestate.is_terminal():
             # The episode ended on the opponent's move: back up its reward.
             outcome = beforestate.get_reward()
             target.v_dict[key] += alpha * (outcome - target.v_dict[key])
             break

         # Reward plus stored value for every afterstate reachable from here.
         candidates = [
             State(from_base10=s_prime).get_reward() + target.v_dict[s_prime]
             for s_prime in beforestate.legal_afterstates()
         ]
         # Back up the best candidate for whoever is to move: max when it is
         # player 1's turn, min otherwise.
         choose = max if beforestate.turn == 1 else min
         target.v_dict[key] += alpha * (
             choose(candidates) - target.v_dict[key])

         afterstate_num = self.random_policy.move(beforestate_num)
         afterstate = State(from_base10=afterstate_num)
 def AutoPlay(self, policy_1, policy_2, n_games=100):
     """ Let policy_1 and policy_2 play against each other for n_games

     Input:
         policy_1, policy_2: objects whose move_dict maps a state number to
             the number of the state reached after that policy's move.
         n_games: how many games to play.
     Returns:
          A list with one entry per game: get_reward() of the final state.
     """
     def advance(policy, current):
         # Look up the policy's stored move for the current position.
         return State(from_base10=policy.move_dict[current.get_num()])

     game_results = []
     for _ in range(n_games):
         state = self.GetInitialState()
         # When the starting state has player 2 to move, policy_2 opens.
         if state.turn == 2:
             state = advance(policy_2, state)
         while not state.is_terminal():
             state = advance(policy_1, state)
             if state.is_terminal():
                 break
             state = advance(policy_2, state)
         game_results.append(state.get_reward())
     return game_results
Ejemplo n.º 6
0
 def TrainOneRound(self, afterstate_num, alpha=0.1):
     """ TD(0) following Sutton and Barto 6.1

     Input:
         afterstate_num: base-10 number of the afterstate of target_policy
             to start training with.
         alpha: step size applied to every value update.
     Note that the opponent makes a move first, then the target policy.

     Only self.target_policy.v_dict is modified; nothing is returned.
     """
     target = self.target_policy
     afterstate = State(from_base10=afterstate_num)
     while not afterstate.is_terminal():
         key = afterstate.get_num()
         beforestate_num = self.opponent_policy.move(key)
         beforestate = State(from_base10=beforestate_num)

         if beforestate.is_terminal():
             # The opponent's move ended the game: back up its reward.
             outcome = beforestate.get_reward()
             target.v_dict[key] += alpha * (outcome - target.v_dict[key])
             break

         # The target policy replies; bootstrap from the afterstate it reaches.
         s_prime_num = target.move(beforestate_num)
         s_prime = State(from_base10=s_prime_num)
         reward = s_prime.get_reward()
         target.v_dict[key] += alpha * (
             reward + target.v_dict[s_prime_num] - target.v_dict[key])
         afterstate = s_prime
    def PolicyEvaluation(self):
        """Policy Evaluation following Sutton Barto 4.3
           Against rush opponent, with afterstates

        Repeatedly sweeps every state number from int('1000000000', 3) to
        int('2222222222', 3) and rewrites self.policy_1.v_dict in place until
        the largest change within one sweep drops below theta (0.01).
        self.i_epoch is incremented once per sweep.

        A checkpoint (self.policy_1, self.i_epoch) is pickled to
        self.write_path on convergence and whenever more than 10 seconds
        have passed since the previous timed checkpoint.
        """
        def save_checkpoint():
            # A context manager closes the file right after the dump; the
            # handle used to be left open until garbage collection.
            with open(self.write_path, "wb") as checkpoint_file:
                pickle.dump((self.policy_1, self.i_epoch), checkpoint_file)

        theta = 0.01
        # The sweep bounds never change, so compute them once.
        first_num = int('1' + '0' * 9, 3)
        last_num = int('2' * 10, 3)
        t = time.time()
        while True:
            delta = 0
            for num in range(first_num, last_num + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)  # here s is afterstate

                # terminal state, v function equals game result (no reward for transition)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    # non-terminal afterstates
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[
                            num] = opponent_afterstate.get_reward()
                    else:
                        # Copy the value of the afterstate policy_1 reaches
                        # in reply to the opponent's move.
                        s_prime_num = self.policy_1.move_dict[
                            opponent_afterstate.get_num()]
                        self.policy_1.v_dict[num] = self.policy_1.v_dict[
                            s_prime_num]

                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))

            self.i_epoch += 1

            if delta < theta:
                print('Value function has converged!')
                print("Trained %i epochs so far." % self.i_epoch)
                save_checkpoint()
                break

            if time.time() - t > 10:
                t = time.time()
                print("Trained %i epochs so far." % self.i_epoch)
                save_checkpoint()
Ejemplo n.º 8
0
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 15 19:58:57 2019

@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

# Load the pickled (policy, epoch count) pair from the parent of the current
# working directory. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open.
# NOTE: pickle.load can run arbitrary code; only load a trusted file.
with open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl',
          'rb') as policy_file:
    policy, i_epoch = pickle.load(policy_file)

print('This value function has been trained for %i epochs.' % i_epoch)
# Absolute tolerance used when comparing stored values with expected ones.
theta = 0.01
print('Accuracy %f' % theta)

state = State(board=[[2, 1, 2], [0, 1, 0], [0, 0, 0]])
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Player 1 can win next step, expect value 1, got %f' % policy.v_dict[
    state.get_num()]
# Keep this print statement at the end
print('All assertions passed.')
Ejemplo n.º 9
0
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

# Load the pickled (policy, epoch count) pair from the parent of the current
# working directory. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open.
# NOTE: pickle.load can run arbitrary code; only load a trusted file.
with open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl',
          'rb') as policy_file:
    policy, i_epoch = pickle.load(policy_file)

print('This value function has been trained for %i epochs.' % i_epoch)
# Absolute tolerance used when comparing stored values with expected ones.
theta = 0.01
print('Accuracy %f' % theta)

state = State(board=[[1, 2, 1], [2, 2, 1], [1, 0, 0]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    -0.5, abs=theta
), 'Player 2 plays random, one move is winning and one move is leading to a tie, expect value -0.5. Got %f' % policy.v_dict[
    state.get_num()]

state = State(board=[[2, 1, 0], [2, 1, 0], [1, 2, 0]])
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1. / 3,
    abs=theta), 'Player 1 players random, one move is winning, the other\
        two moves lead to a draw because player 2 (target policy) plays rush. \
        expect value 1/3. Got %f' % policy.v_dict[state.get_num()]
# Keep this print statement at the end
print('All assertions passed.')
Ejemplo n.º 10
0
"""
from ttt_play import State
import os
import pickle
import pytest

# Load the pickled (policy, epoch count) pair from the parent of the current
# working directory. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open.
# NOTE: pickle.load can run arbitrary code; only load a trusted file.
with open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl',
          'rb') as policy_file:
    policy, i_epoch = pickle.load(policy_file)

print('This value function has been trained for %i epochs.' % i_epoch)
# Absolute tolerance used when comparing stored values with expected ones.
theta = 0.01
print('Policy iteration against rush opponent. Accuracy %f' % theta)

state = State(board=[[1, 1, 1], [2, 2, 1], [2, 2, 1]], turn=2)
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Player 1 wins, expect value 1. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 1, 2], [2, 2, 1], [1, 2, 1]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    0,
    abs=theta), 'Tied. Expect value 0. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 1, 0], [2, 2, 1], [2, 1, 0]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    -1, abs=theta
), 'One step before losing. Expect value -1. Got %f' % policy.v_dict[
    state.get_num()]

state = State(board=[[1, 1, 0], [2, 2, 1], [0, 2, 1]], turn=2)
Ejemplo n.º 11
0
import os
import pickle

# Load the pickled (policy, epoch count) pair from the parent of the current
# working directory. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open.
# NOTE: pickle.load can run arbitrary code; only load a trusted file.
with open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl',
          'rb') as policy_file:
    policy, i_epoch = pickle.load(policy_file)

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

# NOTE(review): epsilon=1 presumably makes the opponent fully random, as the
# summary line printed below says - confirm against TabularPolicy.
opponent_policy = TabularPolicy(epsilon=1)
results = []
for i in range(1000):
    state = State()
    while True:
        # The loaded policy moves first, using its stored move table.
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        # Then the opponent replies.
        state = State(from_base10=opponent_policy.move(state.get_num()))
        if state.is_terminal():
            break
    results.append(state.get_reward())

print("Average reward %f over 1000 games as player X against random policy." %
      (sum(results) / 1000.))

results = []
for i in range(1000):
    state = State()
    while True:
Ejemplo n.º 12
0
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

# Load the pickled (policy, epoch count) pair from the parent of the current
# working directory. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open.
# NOTE: pickle.load can run arbitrary code; only load a trusted file.
with open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl',
          'rb') as policy_file:
    policy, i_epoch = pickle.load(policy_file)

print('This value function has been trained for %i epochs.' % i_epoch)
# Absolute tolerance used when comparing stored values with expected ones.
theta = 0.01
print('Accuracy %f' % theta)

# state = State(board=[[1, 2, 1],
#                     [2, 2, 1],
#                     [1, 0, 0]], turn=2)
# state.print_board()
# assert policy.v_dict[state.get_num()] == pytest.approx(
#    -0.5, abs=theta), 'Player 2 plays random, one move is winning and one move is leading to a tie, expect value -0.5. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 0, 0], [0, 0, 0], [0, 0, 0]], turn=2)
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Both play rush, player 1 will win. Got %f' % policy.v_dict[state.get_num()]
# Keep this print statement at the end
print('All assertions passed.')