Example no. 1
import pytest

from ttt_play import State
from ttt_policies import TabularPolicy


def test_rush_policy():
    """
    Only one possible move.
    """
    state = State(board=[[1, 2, 1], [2, 2, 1], [0, 1, 2]], turn=1)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_after_state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]],
                                 turn=2)
    assert after_state.board == expected_after_state.board
    assert after_state.turn == expected_after_state.turn
    """
    Multiple possible moves.
    """
    state = State(board=[[1, 0, 0], [2, 2, 1], [0, 1, 2]], turn=2)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_board = [[1, 2, 0], [2, 2, 1], [0, 1, 2]]
    assert after_state.board == expected_board
    assert after_state.turn == 1
    """
    Filled board
    """
    state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    policy = TabularPolicy()
    with pytest.raises(KeyError):
        after_state = State(from_base10=policy.move_dict[state.get_num()])
Example no. 2
def test_random_move():
    policy = TabularPolicy(epsilon=1)
    collect = []
    for _ in range(10000):
        collect.append(policy.move(31206))
    assert collect.count(51618) == pytest.approx(3333, abs=100)
    assert collect.count(50916) == pytest.approx(3333, abs=100)
    assert collect.count(50890) == pytest.approx(3333, abs=100)
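The literal state numbers in this test are base-3 encodings: from the tests and the start_num definition in later examples, the leading digit appears to be the player to move and the remaining nine digits the board cells in row-major order. A small decoding sketch under that assumption (decode_num is a throwaway helper, not part of the codebase):

def decode_num(num):
    # Split a state number into its ten base-3 digits, most significant first.
    digits = []
    for _ in range(10):
        digits.append(num % 3)
        num //= 3
    digits.reverse()
    turn, cells = digits[0], digits[1:]
    return turn, [cells[0:3], cells[3:6], cells[6:9]]

print(decode_num(31206))
# (1, [[1, 2, 0], [2, 1, 0], [2, 1, 0]]) -- three empty cells, player 1 to move.
# 51618, 50916 and 50890 are the three afterstates that fill one of those cells
# and flip the leading digit to 2, which is why a fully random policy
# (epsilon=1) picks each of them roughly a third of the time.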
Example no. 3
def test_be_greedy():
    policy = TabularPolicy()
    best = State(board=[[0, 0, 0], [1, 0, 0], [0, 0, 0]], turn=2)
    policy.v_dict[best.get_num()] = 1
    assert policy.be_greedy()
    state = State()
    assert policy.move_dict[state.get_num()] == best.get_num()
    assert not policy.be_greedy()  # No more change when run the second time
Example no. 4
import pickle
import time

from ttt_play import State
from ttt_policies import TabularPolicy


class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.opponent_policy = TabularPolicy()
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(
                    self.target_policy.move_dict[self.start_num])
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            pickle.dump((self.target_policy, self.i_epoch),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=0.1):
        """ TD(0) following Sutton and Barto 6.1
        Input:
            afterstate_num: the afterstate of target_policy to start training with.
            Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            beforestate_num = self.opponent_policy.move(afterstate.get_num())
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                s_prime_num = self.target_policy.move(beforestate_num)
                s_prime = State(from_base10=s_prime_num)
                r = s_prime.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r + self.target_policy.v_dict[s_prime_num] -
                    self.target_policy.v_dict[afterstate.get_num()])
                afterstate = s_prime
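A minimal way to run the TD(0) trainer above, assuming the imports shown with the class; the pickle file name is made up for illustration:

if __name__ == '__main__':
    trainer = Train(path='td0_policy.pkl', read_first=False)
    # Alternates the target policy between player 1 and player 2 and
    # checkpoints the (policy, epoch count) tuple roughly every 10 seconds.
    trainer.TrainContinuously(n_epoch=100000)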
Example no. 5
def test_get_trajectory():
    trainer = Train(path='foo', read_first=False)
    trainer.epsilon = 0
    trajectory = trainer.GetOneTrajectory(TabularPolicy(), TabularPolicy())
    num1 = State(board=[[0, 0, 0], [0, 0, 0], [0, 0, 0]]).get_num()
    num2 = State(board=[[1, 0, 0], [0, 0, 0], [0, 0, 0]], turn=2).get_num()
    num3 = State(board=[[1, 2, 0], [0, 0, 0], [0, 0, 0]]).get_num()
    num4 = State(board=[[1, 2, 1], [0, 0, 0], [0, 0, 0]], turn=2).get_num()
    num5 = State(board=[[1, 2, 1], [2, 0, 0], [0, 0, 0]]).get_num()
    num6 = State(board=[[1, 2, 1], [2, 1, 0], [0, 0, 0]], turn=2).get_num()
    num7 = State(board=[[1, 2, 1], [2, 1, 2], [0, 0, 0]]).get_num()
    num8 = State(board=[[1, 2, 1], [2, 1, 2], [1, 0, 0]], turn=2).get_num()
    assert trajectory == [num1, num2, num3, num4, num5, num6, num7, num8]
Example no. 6
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.policy_1, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy()
            self.i_epoch = 0
        self.path = path
        self.policy_stable = True
class Train:
    def __init__(self, path, read_first=False, epsilon=.9):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
             epsilon: exploration parameter passed to the epsilon-greedy policy
        """
        if read_first:
            self.policy_1, self.i_epoch, self.returns = pickle.load(
                open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy(epsilon=epsilon)
            self.i_epoch = 0
            self.returns = dict()
        self.path = path
        self.policy_stable = True
        self.epsilon = epsilon
        self.policy_1.epsilon = epsilon

    def OnPolicyMCControl(self):
        """ On-policy MC control following Sutton Barto 5.4
        """
        t = time.time()
        while True:
            while time.time() - t < 10:
                num = State().get_num()
                history = [num]
                while not State(from_base10=num).is_terminal():
                    num = self.policy_1.move(num)
                    history.append(num)
                # g is a constant for our case
                g = State(from_base10=num).get_reward()
                for num in history:
                    if num in self.returns:
                        self.returns[num].append(g)
                    else:
                        self.returns[num] = [g]
                    self.policy_1.v_dict[num] = np.average(self.returns[num])
                if self.policy_1.be_greedy(history):
                    self.policy_stable = False
                self.i_epoch += 1

            t = time.time()
            pickle.dump((self.policy_1, self.i_epoch, self.returns),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)
    def MCPrediction(self, n_epoch):
        """ MC prediction following Sutton Barto 5.1
            Against rush opponent
        Input:
             n_epoch: the number of episodes to train for
        """
        self.policy_2 = TabularPolicy()
        returns = dict()
        for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
            returns[num] = []
        for _ in range(n_epoch):
            # generate an episode following policy_1
            s = State().get_num()
            history = [s]
            while not State(from_base10=s).is_terminal():
                s = self.policy_1.move_dict[s]
                history.append(s)
                if State(from_base10=s).is_terminal():
                    break
                s = self.policy_2.move_dict[s]
                history.append(s)
            # in our special case, g is a constant
            g = State(from_base10=s).get_reward()
            for i, s in enumerate(history):
                returns[s].append(g)
                if i % 2 == 0:
                    self.policy_1.v_dict[s] = np.average(returns[s])
                else:
                    self.policy_2.v_dict[s] = np.average(returns[s])
        for num in range(int('2' + '0' * 9, 3), int('2' * 10, 3) + 1):
            self.policy_1.v_dict[num] = self.policy_2.v_dict[num]
        self.i_epoch += 1
        pickle.dump((self.policy_1, self.i_epoch), open(self.path, "wb"))
        print('MC prediction finished.')
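The range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1) sweeps used here and in later examples cover every ten-digit base-3 number whose leading digit is 1 or 2, i.e. every encodable (turn, board) pair, including boards that can never occur in play. A quick check of the bounds:

lo = int('1' + '0' * 9, 3)  # 19683 == 3 ** 9: empty board, player 1 to move
hi = int('2' * 10, 3)       # 59048 == 3 ** 10 - 1: all cells 2, player 2 to move
print(lo, hi, hi - lo + 1)  # 19683 59048 39366 candidate state numbers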
Example no. 9
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.opponent_policy = TabularPolicy()
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)
    def __init__(self, write_path, read_path=None, self_play=False):
        """
        Input:
             read_path, write_path: paths for reading or saving the model
             self_play: if True, policy_2 is the same policy object as policy_1
        """
        self.read_path = read_path
        self.write_path = write_path
        if read_path:
            self.policy_1, self.i_epoch = pickle.load(
                open(self.read_path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy()
            self.i_epoch = 0
            print('Training new policy.')
            self.read_path = self.write_path  # for later iterative training
        if self_play:
            self.policy_2 = self.policy_1
        else:
            self.policy_2 = TabularPolicy()
        self.policy_ever_changed = True  # Set to True to start iterative training
Example no. 11
class TrainOneRound:
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.policy_1, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy()
            self.i_epoch = 0
        self.path = path
        self.policy_stable = True

    def MCES(self):
        """ MC exploring start following Sutton Barto 5.3
            Against rush opponent
        """
        t = time.time()
        # No need to use a list of returns, since the game is deterministic
        for s in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
            history = [s]
            while not State(from_base10=s).is_terminal():
                s = self.policy_1.move_dict[s]
                history.append(s)
                if State(from_base10=s).is_terminal():
                    break
                s = self.policy_1.move_dict[s]
                history.append(s)
            g = State(from_base10=s).get_reward()
            for s in history:
                self.policy_1.v_dict[s] = g
            if self.policy_1.be_greedy(history):
                self.policy_stable = False
            self.i_epoch += 1
            if time.time() - t > 10:
                t = time.time()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.path, "wb"))
                print("Trained %i epochs so far." % self.i_epoch)

        pickle.dump((self.policy_1, self.i_epoch), open(self.path, "wb"))
        print('MC exploring start finished.')
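A short usage sketch for the exploring-starts trainer above; the file name is illustrative, and looping until policy_stable survives a whole sweep is one plausible way to use that flag rather than something the original spells out:

trainer = TrainOneRound(path='mces_policy.pkl', read_first=False)
trainer.policy_stable = False
while not trainer.policy_stable:
    trainer.policy_stable = True
    trainer.MCES()  # one pass over every possible starting state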
class TrainOneRound:
    def __init__(self, write_path, read_path=None, self_play=False):
        """
        Input:
             read_path, write_path: paths for reading or saving the model
             self_play: if True, policy_2 is the same policy object as policy_1
        """
        self.read_path = read_path
        self.write_path = write_path
        if read_path:
            self.policy_1, self.i_epoch = pickle.load(
                open(self.read_path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy()
            self.i_epoch = 0
            print('Training new policy.')
            self.read_path = self.write_path  # for later iterative training
        if self_play:
            self.policy_2 = self.policy_1
        else:
            self.policy_2 = TabularPolicy()
        self.policy_ever_changed = True  # Set to True to start iterative training

    def ValueIteration(self, theta=0.01):
        t = time.time()
        while True:
            delta = 0
            for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[
                            num] = opponent_afterstate.get_reward()
                    else:
                        s_prime_choices = opponent_afterstate.legal_afterstates(
                        )
                        if state.turn == 2:
                            vi_update = max([
                                self.policy_1.v_dict[x]
                                for x in s_prime_choices
                            ])
                        else:
                            vi_update = min([
                                self.policy_1.v_dict[x]
                                for x in s_prime_choices
                            ])
                        self.policy_1.v_dict[num] = vi_update
                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))

            self.i_epoch += 1

            if delta < theta:
                print('Value function has converged!')
                print("Trained %i epochs so far." % self.i_epoch)
                self.policy_ever_changed = self.policy_1.be_greedy()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
                break

            if time.time() - t > 10:
                t = time.time()
                print("Trained %i epochs so far." % self.i_epoch)
                self.policy_ever_changed = self.policy_1.be_greedy()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
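A hedged sketch of how the value-iteration trainer above might be driven; the paths are illustrative, and the reload-and-repeat pattern is an assumption based on the policy_ever_changed flag and the 'for later iterative training' comment:

trainer = TrainOneRound(write_path='vi_policy.pkl', self_play=True)
trainer.ValueIteration(theta=0.01)
while trainer.policy_ever_changed:
    # Re-read the greedified policy and sweep again until it stops changing.
    trainer = TrainOneRound(write_path='vi_policy.pkl',
                            read_path='vi_policy.pkl', self_play=True)
    trainer.ValueIteration(theta=0.01)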
Example no. 13
"""
@author: daugh
"""
from ttt_play import State
from ttt_policies import TabularPolicy
import os
import pickle

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

opponent_policy = TabularPolicy(epsilon=1)
results = []
for _ in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        else:
            state = State(from_base10=opponent_policy.move(state.get_num()))
            if state.is_terminal():
                break
    results.append(state.get_reward())

print("Average reward %f over 1000 games as player X against random policy." %
      (sum(results) / 1000.))
class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch, self.c = pickle.load(
                open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
            self.c = {}

        self.behavior_policy = TabularPolicy(epsilon=1)
        self.path = path
        self.policy_stable = True

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                self.TrainOneRound()
                self.i_epoch += 1
            t = time.time()
            pickle.dump((self.target_policy, self.i_epoch, self.c),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self):
        """ Off-policy MC prediction following Sutton Barto 5.6
        """
        # behavior policy playing player 1
        trajectory = self.GetOneTrajectory(
            self.behavior_policy, self.target_policy)
        self.OffPolicyMCControl(trajectory, 1)
        # behavior policy playing player 2
        trajectory = self.GetOneTrajectory(
            self.target_policy, self.behavior_policy)
        self.OffPolicyMCControl(trajectory, 2)

    def GetOneTrajectory(self, policy_1, policy_2):
        """ 
        Returns: list of state nums of a trajectory
        """
        num = State().get_num()
        trajectory = [num]
        while not State(from_base10=num).is_terminal():
            num = policy_1.move(num)
            trajectory.append(num)
            if not State(from_base10=num).is_terminal():
                num = policy_2.move(num)
                trajectory.append(num)
            else:
                break
        return trajectory

    def OffPolicyMCControl(self, trajectory, role_behavior_policy):
        """ Incremental implementation of off-policy MC prediction
        Input:
            trajectory
            role_behavior_policy: 1 or 2, denoting which player the behavior policy acted as in this trajectory
        """
        # g is a constant for our case
        g = State(from_base10=trajectory[-1]).get_reward()
        w = 1.
        for i, state in reversed(list(enumerate(trajectory))):
            if i == len(trajectory) - 1:
                # ignore the very last state, which is not a beforestate
                continue
            if (i % 2 + 1) != role_behavior_policy:
                # i denotes the number of pieces on the board. i%2+1 is 1 if
                # this is player 1's before state, and is 2 if this is player
                # 2's before state.
                continue
            afterstate = trajectory[i+1]
            if afterstate in self.c:
                self.c[afterstate] += w
            else:
                self.c[afterstate] = w
            self.target_policy.v_dict[afterstate] += w / \
                self.c[afterstate] * \
                (g - self.target_policy.v_dict[afterstate])
            self.target_policy.be_greedy([state])
            if self.target_policy.move_dict[trajectory[i]] != afterstate:
                break
            else:
                w = w * \
                    len(State(from_base10=trajectory[i]).legal_afterstates())
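A short usage sketch for the off-policy trainer, with the importance-sampling weight spelled out; the pickle name is illustrative:

trainer = Train(path='offpolicy_mc.pkl', read_first=False)
trainer.TrainContinuously(n_epoch=50000)
# The update w *= len(legal_afterstates) in OffPolicyMCControl is the
# importance-sampling ratio pi/b: the behavior policy is uniform random
# (epsilon=1), so b(a|s) = 1 / len(legal_afterstates), while the greedy
# target policy puts probability 1 on the single move it keeps following.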
Example no. 15
class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.random_policy = TabularPolicy(epsilon=1)
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(self.random_policy.move(self.start_num))
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            self.target_policy.be_greedy()
            pickle.dump((self.target_policy, self.i_epoch),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=.1):
        """ Q learning following Sutton and Barto 6.5
        Input:
            afterstate_num: the afterstate of target_policy to start training with.
            Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            beforestate_num = self.random_policy.move(
                afterstate.get_num())  # opponent makes a move
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                s_primes = beforestate.legal_afterstates()
                candidates = []
                for s_prime in s_primes:
                    r = State(from_base10=s_prime).get_reward()
                    q = self.target_policy.v_dict[s_prime]
                    candidates.append(r + q)
                if beforestate.turn == 1:
                    self.target_policy.v_dict[
                        afterstate.get_num()] += alpha * (
                            max(candidates) -
                            self.target_policy.v_dict[afterstate.get_num()])
                else:
                    self.target_policy.v_dict[
                        afterstate.get_num()] += alpha * (
                            min(candidates) -
                            self.target_policy.v_dict[afterstate.get_num()])
                afterstate_num = self.random_policy.move(beforestate_num)
                afterstate = State(from_base10=afterstate_num)
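Finally, a minimal sketch for running the Q-learning trainer and handing the result to the evaluation script in Example no. 13; the file name is illustrative:

trainer = Train(path='q_learning_policy.pkl', read_first=False)
trainer.TrainContinuously(n_epoch=200000)
# The dump is a (policy, i_epoch) tuple, the same layout the evaluation
# script unpickles before playing 1000 games against a random opponent.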