Example #1

import pickle
import time

# TabularPolicy and State are this project's tic-tac-toe classes (defined elsewhere).

class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            self.target_policy.epsilon = 0.3
            print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
        else:
            self.target_policy = TabularPolicy(epsilon=0.3)
            self.i_epoch = 0
        self.opponent_policy = TabularPolicy(epsilon=1)
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)
        self.counter = {}
        self.counter[24138] = 0

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(self.target_policy.move(self.start_num))
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            pickle.dump((self.target_policy, self.i_epoch),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=.1):
        """ Sarsa following Sutton and Barto 6.2
        Input:
            afterstate_num: the afterstate of target_policy to start training with.
        Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            beforestate_num = self.opponent_policy.move(afterstate.get_num())
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                self.target_policy.be_greedy([beforestate_num])
                s_prime_num = self.target_policy.move(beforestate_num)
                s_prime = State(from_base10=s_prime_num)
                r = s_prime.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r + self.target_policy.v_dict[s_prime_num]
                    - self.target_policy.v_dict[afterstate.get_num()])
                afterstate = s_prime
        self.target_policy.be_greedy([self.start_num])
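
A minimal usage sketch for the class above, assuming Train is importable together with TabularPolicy and State; the file name and epoch budget are illustrative only. Each call to TrainOneRound applies the TD update V(s) <- V(s) + alpha * (r + V(s') - V(s)) along one game.

# Hypothetical driver script: train a fresh Sarsa policy, checkpointing every
# ~10 seconds, then resume from the same pickle file in a later session.
trainer = Train('sarsa_policy.pkl', read_first=False)
trainer.TrainContinuously(n_epoch=50000)

resumed = Train('sarsa_policy.pkl', read_first=True)
resumed.TrainContinuously(n_epoch=100000)
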
Example #2

import pickle
import time

import numpy as np
import pytest

# TabularPolicy and State are this project's tic-tac-toe classes (defined elsewhere).

def test_random_move():
    policy = TabularPolicy(epsilon=1)
    collect = []
    for _ in range(10000):
        collect.append(policy.move(31206))
    assert collect.count(51618) == pytest.approx(3333, abs=100)
    assert collect.count(50916) == pytest.approx(3333, abs=100)
    assert collect.count(50890) == pytest.approx(3333, abs=100)
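
The magic numbers in this test come from the base-3 state encoding used throughout these examples, which appears to be a leading digit followed by the nine board cells (cf. start_num = int('1' + '0' * 9, 3) in Example #1). A quick, hypothetical decoding sketch shows why three moves are expected, each roughly a third of the time:

# Decode the state number used in the test above (helper written for this note,
# not part of the original code).
def to_base3_digits(num, width=10):
    digits = []
    for _ in range(width):
        digits.append(num % 3)
        num //= 3
    return list(reversed(digits))

board = to_base3_digits(31206)[1:]   # drop the leading digit, keep the 9 cells
print(board)                         # [1, 2, 0, 2, 1, 0, 2, 1, 0]
print(board.count(0))                # 3 empty cells -> 3 legal moves,
                                     # so each appears ~10000/3 times
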
class Train:
    def __init__(self, path, read_first=False, epsilon=.9):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
             epsilon: exploration rate for the epsilon-greedy TabularPolicy
        """
        if read_first:
            self.policy_1, self.i_epoch, self.returns = pickle.load(
                open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy(epsilon=epsilon)
            self.i_epoch = 0
            self.returns = dict()
        self.path = path
        self.policy_stable = True
        self.epsilon = epsilon
        self.policy_1.epsilon = epsilon

    def OnPolicyMCControl(self):
        """ On-policy MC control following Sutton Barto 5.4
        """
        t = time.time()
        while True:
            while time.time() - t < 10:
                num = State().get_num()
                history = [num]
                while not State(from_base10=num).is_terminal():
                    num = self.policy_1.move(num)
                    history.append(num)
                # The return g is the same for every state in this episode:
                # rewards are undiscounted and only the terminal state pays out.
                g = State(from_base10=num).get_reward()
                for num in history:
                    if num in self.returns:
                        self.returns[num].append(g)
                    else:
                        self.returns[num] = [g]
                    self.policy_1.v_dict[num] = np.average(self.returns[num])
                if self.policy_1.be_greedy(history):
                    self.policy_stable = False
                self.i_epoch += 1

            t = time.time()
            pickle.dump((self.policy_1, self.i_epoch, self.returns),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)
Example #4

import os
import pickle
import time

# TabularPolicy and State are this project's tic-tac-toe classes (defined elsewhere).

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

opponent_policy = TabularPolicy(epsilon=1)
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        else:
            state = State(from_base10=opponent_policy.move(state.get_num()))
            if state.is_terminal():
                break
    results.append(state.get_reward())

print("Average reward %f over 1000 games as player X against random policy." %
      (sum(results) / 1000.))

results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=opponent_policy.move(state.get_num()))
        if state.is_terminal():
            break
        else:
            state = State(from_base10=policy.move_dict[state.get_num()])
            if state.is_terminal():
                break
    results.append(state.get_reward())

print("Average reward %f over 1000 games as player O against random policy." %
      (sum(results) / 1000.))

class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
             path: the path to save the policy
             read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.random_policy = TabularPolicy(epsilon=1)
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(self.random_policy.move(self.start_num))
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            self.target_policy.be_greedy()
            pickle.dump((self.target_policy, self.i_epoch),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=.1):
        """ Q learning following Sutton and Barto 6.5
        Input:
            afterstate_num: the afterstate of target_policy to start training with.
        Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            beforestate_num = self.random_policy.move(
                afterstate.get_num())  # opponent makes a move
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                s_primes = beforestate.legal_afterstates()
                candidates = []
                for s_prime in s_primes:
                    r = State(from_base10=s_prime).get_reward()
                    q = self.target_policy.v_dict[s_prime]
                    candidates.append(r + q)
                if beforestate.turn == 1:
                    self.target_policy.v_dict[
                        afterstate.get_num()] += alpha * (
                            max(candidates) -
                            self.target_policy.v_dict[afterstate.get_num()])
                else:
                    self.target_policy.v_dict[
                        afterstate.get_num()] += alpha * (
                            min(candidates) -
                            self.target_policy.v_dict[afterstate.get_num()])
                afterstate_num = self.random_policy.move(beforestate_num)
                afterstate = State(from_base10=afterstate_num)
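
As with Example #1, a minimal usage sketch with an illustrative file name and epoch budget. Here the behavior policy is purely random, and each update moves V(afterstate) toward the Q-learning target: the max (or min, depending on whose turn it is) of r + V(s') over the legal afterstates s'.

trainer = Train('q_learning_policy.pkl', read_first=False)
trainer.TrainContinuously(n_epoch=200000)

# The resulting greedy policy can then be evaluated against a random opponent
# with a loop like the one at the top of this example.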