def TrainOneRound(self, afterstate_num, alpha=.1):
    """ Q learning following Sutton and Barto 6.5

    Plays one episode starting from an afterstate of the target policy.
    In every step the opponent (self.random_policy) moves first; the value
    of the current afterstate in self.target_policy.v_dict is then moved
    towards either the terminal reward or the best (max when
    beforestate.turn == 1, min otherwise) "reward + value" among the legal
    afterstates. The move actually played next is again picked by
    self.random_policy.

    Input:
        afterstate_num: base-10 number of the afterstate to start
            training with.
        alpha: step size of the update.
    """
    current = State(from_base10=afterstate_num)
    while not current.is_terminal():
        # opponent makes a move
        beforestate_num = self.random_policy.move(current.get_num())
        beforestate = State(from_base10=beforestate_num)

        if beforestate.is_terminal():
            # The opponent ended the game: the target is the final reward.
            target = beforestate.get_reward()
            values = self.target_policy.v_dict
            key = current.get_num()
            values[key] += alpha * (target - values[key])
            break

        # Score every legal reply as (immediate reward + stored value).
        candidates = [
            State(from_base10=option).get_reward() +
            self.target_policy.v_dict[option]
            for option in beforestate.legal_afterstates()
        ]
        pick_best = max if beforestate.turn == 1 else min
        target = pick_best(candidates)

        values = self.target_policy.v_dict
        key = current.get_num()
        values[key] += alpha * (target - values[key])

        # Behaviour move: the next afterstate comes from random_policy.
        afterstate_num = self.random_policy.move(beforestate_num)
        current = State(from_base10=afterstate_num)
Beispiel #2
0
def test_is_terminal():
    """Finished boards must be reported as terminal."""
    # Board not full, but player 1 has won (three 1s on the anti-diagonal).
    won_early = State(board=[[0, 2, 1], [0, 1, 2], [1, 2, 2]])
    assert won_early.is_terminal()

    # Board full.
    full_board = State(board=[[1, 2, 1], [2, 1, 2], [1, 2, 2]])
    assert full_board.is_terminal()
    def ValueIteration(self, theta=0.01):
        """Run value iteration on policy_1's value table until convergence.

        Each sweep visits every number in
        [int('1' + '0' * 9, 3), int('2' * 10, 3)] and rewrites
        self.policy_1.v_dict[num]:

        * terminal state: the state's own reward;
        * otherwise self.policy_2.move_dict[num] gives the opponent's
          reply; if that reply is terminal its reward is used, else the
          max (when state.turn == 2) or min of the values of the reply's
          legal afterstates.

        self.i_epoch is incremented after every sweep. Once the largest
        absolute change of a sweep is below theta, policy_1 is made greedy,
        a checkpoint is pickled to self.write_path and the loop ends. The
        same checkpoint is written whenever more than 10 seconds have
        passed since the last timed checkpoint.

        Input:
            theta: convergence threshold on the largest per-sweep change.
        """

        def save_checkpoint():
            # Progress report, greedy refresh and on-disk snapshot.
            print("Trained %i epochs so far." % self.i_epoch)
            self.policy_ever_changed = self.policy_1.be_greedy()
            # The context manager closes the file even if pickling fails;
            # the bare open() used before leaked the handle.
            with open(self.write_path, "wb") as checkpoint_file:
                pickle.dump((self.policy_1, self.i_epoch), checkpoint_file)

        t = time.time()
        while True:
            delta = 0
            for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[
                            num] = opponent_afterstate.get_reward()
                    else:
                        s_prime_choices = (
                            opponent_afterstate.legal_afterstates())
                        # Best reply value from the mover's point of view.
                        best = max if state.turn == 2 else min
                        self.policy_1.v_dict[num] = best(
                            self.policy_1.v_dict[x] for x in s_prime_choices)
                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))

            self.i_epoch += 1

            if delta < theta:
                print('Value function has converged!')
                save_checkpoint()
                break

            if time.time() - t > 10:
                t = time.time()
                save_checkpoint()
 def AutoPlay(self, policy_1, policy_2, n_games=100):
     """ Let policy_1 and policy_2 play n_games games against each other.

     Each game starts from self.GetInitialState(). If that state has
     turn == 2, policy_2 moves once first; after that the policies
     alternate, policy_1 first, until a terminal state is reached.

     Input:
         policy_1, policy_2: policies exposing a move_dict that maps a
             state number to the number of the resulting state.
         n_games: number of games to play.
     Returns:
          A list with the terminal state's get_reward() of every game
          (per the original note: the reward for player 1).
     """

     def advance(policy, state):
         # Look up the policy's reply and materialise it as a State.
         return State(from_base10=policy.move_dict[state.get_num()])

     outcomes = []
     for _ in range(n_games):
         current = self.GetInitialState()
         if current.turn == 2:
             current = advance(policy_2, current)
         mover, waiter = policy_1, policy_2
         while not current.is_terminal():
             current = advance(mover, current)
             mover, waiter = waiter, mover
         outcomes.append(current.get_reward())
     return outcomes
    def PolicyEvaluation(self, theta=0.01):
        """Policy Evaluation following Sutton Barto 4.3
           Against rush opponent, with afterstates

        Each sweep visits every number in
        [int('1' + '0' * 9, 3), int('2' * 10, 3)] (treated as an
        afterstate) and rewrites self.policy_1.v_dict[num]:

        * terminal state: the state's own reward;
        * otherwise self.policy_2.move_dict[num] gives the opponent's
          reply; if that reply is terminal its reward is used, else the
          value of the afterstate policy_1.move_dict picks from the reply.

        self.i_epoch is incremented after every sweep. A checkpoint
        (policy_1, i_epoch) is pickled to self.write_path on convergence
        and whenever more than 10 seconds have passed since the last timed
        checkpoint.

        Input:
            theta: convergence threshold on the largest per-sweep change
                (defaults to the previously hard-coded 0.01).
        """

        def save_checkpoint():
            # Progress report and on-disk snapshot.
            print("Trained %i epochs so far." % self.i_epoch)
            # The context manager closes the file even if pickling fails;
            # the bare open() used before leaked the handle.
            with open(self.write_path, "wb") as checkpoint_file:
                pickle.dump((self.policy_1, self.i_epoch), checkpoint_file)

        t = time.time()
        while True:
            delta = 0
            for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)  # here s is afterstate

                # terminal state, v function equals game result
                # (no reward for transition)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    # non-terminal afterstates
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[
                            num] = opponent_afterstate.get_reward()
                    else:
                        s_prime_num = self.policy_1.move_dict[
                            opponent_afterstate.get_num()]
                        self.policy_1.v_dict[num] = self.policy_1.v_dict[
                            s_prime_num]

                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))

            self.i_epoch += 1

            if delta < theta:
                print('Value function has converged!')
                save_checkpoint()
                break

            if time.time() - t > 10:
                t = time.time()
                save_checkpoint()
Beispiel #6
0
 def TrainOneRound(self, afterstate_num, alpha=0.1):
     """ TD(0) following Sutton and Barto 6.1

     Plays one episode starting from an afterstate of the target policy.
     In every step the opponent (self.opponent_policy) moves first, then
     the target policy replies, and the value of the previous afterstate
     in self.target_policy.v_dict is moved towards
     "reward + value of the new afterstate" (or towards the terminal
     reward if the opponent's move ended the game).

     Input:
         afterstate_num: base-10 number of the afterstate to start
             training with.
         alpha: step size of the update.
     """
     current = State(from_base10=afterstate_num)
     while not current.is_terminal():
         # Opponent moves first.
         beforestate_num = self.opponent_policy.move(current.get_num())
         beforestate = State(from_base10=beforestate_num)

         if beforestate.is_terminal():
             # Game over after the opponent's move: target is the reward.
             reward = beforestate.get_reward()
             values = self.target_policy.v_dict
             key = current.get_num()
             values[key] += alpha * (reward - values[key])
             break

         # Target policy replies; bootstrap from the resulting afterstate.
         next_num = self.target_policy.move(beforestate_num)
         next_state = State(from_base10=next_num)
         reward = next_state.get_reward()
         values = self.target_policy.v_dict
         key = current.get_num()
         values[key] += alpha * (reward + values[next_num] - values[key])
         current = next_state
 def PolicyImprovement(self):
     """ Policy Improvement following Sutton Barto 4.3
         Against rush opponent, with afterstates

     For every non-terminal state number in
     [int('1' + '0' * 9, 3), int('2' * 10, 3)], policy_1.move_dict is
     pointed at the legal afterstate with the highest value (when
     state.turn == 1) or the lowest value (otherwise) in policy_1.v_dict.

     self.policy_stable is reset to True at the start and set to False as
     soon as any entry changes; self.policy_ever_changed is set to True in
     that case as well.
     """
     first_num = int('1' + '0' * 9, 3)
     last_num = int('2' * 10, 3)

     self.policy_stable = True
     for num in range(first_num, last_num + 1):
         state = State(from_base10=num)
         if state.is_terminal():
             continue

         previous_choice = self.policy_1.move_dict[num]

         # Rank the legal afterstates by their current value estimate.
         options = state.legal_afterstates()
         option_values = [self.policy_1.v_dict[x] for x in options]
         if state.turn == 1:
             pick = np.argmax(option_values)
         else:
             pick = np.argmin(option_values)
         self.policy_1.move_dict[num] = options[pick]

         if previous_choice != self.policy_1.move_dict[num]:
             self.policy_stable = False
             self.policy_ever_changed = True
Beispiel #8
0
import pickle

# Load the (policy, epoch count) checkpoint from the parent of the current
# working directory. The context manager closes the file after loading;
# the bare open() used before leaked the handle.
# NOTE(review): pickle.load can execute arbitrary code from the file, so
# only load checkpoints produced by this project.
with open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl',
          'rb') as checkpoint_file:
    policy, i_epoch = pickle.load(checkpoint_file)

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

# presumably epsilon=1 makes the opponent fully random (the summary line
# printed below calls it a random policy) -- confirm against TabularPolicy
opponent_policy = TabularPolicy(epsilon=1)

# The loaded policy moves first in every game; the loop ends as soon as
# either side's move produces a terminal state.
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        state = State(from_base10=opponent_policy.move(state.get_num()))
        if state.is_terminal():
            break
    results.append(state.get_reward())

print("Average reward %f over 1000 games as player X against random policy." %
      (sum(results) / 1000.))

results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=opponent_policy.move(state.get_num()))