def test_rush_policy():
    """ Only one possible move. """
    state = State(board=[[1, 2, 1], [2, 2, 1], [0, 1, 2]], turn=1)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_after_state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    assert after_state.board == expected_after_state.board
    assert after_state.turn == expected_after_state.turn

    """ Multiple possible moves. """
    state = State(board=[[1, 0, 0], [2, 2, 1], [0, 1, 2]], turn=2)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_board = [[1, 2, 0], [2, 2, 1], [0, 1, 2]]
    assert after_state.board == expected_board
    assert after_state.turn == 1

    """ Filled board. """
    state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    policy = TabularPolicy()
    with pytest.raises(KeyError):
        after_state = State(from_base10=policy.move_dict[state.get_num()])

def test_get_num_from_state():
    state = State(board=[[0, 1, 2], [0, 1, 2], [0, 0, 0]], turn=2)
    num = state.get_num()
    assert num == int('2012012000', 3)

    state = State(board=[[1, 2, 1], [2, 1, 2], [1, 2, 2]])
    num = state.get_num()
    assert num == int('1121212122', 3)

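# Illustration only, not part of ttt_play: a minimal sketch of the base-3
# encoding that the assertions above imply -- the leading digit is the player
# to move, followed by the nine board cells read row by row. `encode_board`
# is a hypothetical helper name used just for this sketch.
def encode_board(board, turn=1):
    digits = str(turn) + ''.join(str(cell) for row in board for cell in row)
    return int(digits, 3)


assert encode_board([[0, 1, 2], [0, 1, 2], [0, 0, 0]], turn=2) == int('2012012000', 3)
assert encode_board([[1, 2, 1], [2, 1, 2], [1, 2, 2]]) == int('1121212122', 3)
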
def test_be_greedy():
    policy = TabularPolicy()
    best = State(board=[[0, 0, 0], [1, 0, 0], [0, 0, 0]], turn=2)
    policy.v_dict[best.get_num()] = 1
    assert policy.be_greedy()
    state = State()
    assert policy.move_dict[state.get_num()] == best.get_num()
    assert not policy.be_greedy()  # no further change on a second run

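# Sketch of how be_greedy can drive policy improvement, assuming (as the test
# above does) that it returns True only when at least one entry of move_dict
# changed. In full policy iteration, v_dict would be re-evaluated between
# sweeps; here we only show the greedification loop.
policy = TabularPolicy()
n_sweeps = 0
while policy.be_greedy():  # keep sweeping until the policy is stable
    n_sweeps += 1
print('Greedified policy after %i sweeps.' % n_sweeps)
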
def TrainOneRound(self, afterstate_num, alpha=0.1):
    """ Q-learning following Sutton and Barto 6.5.

    Input:
        afterstate_num: the afterstate of target_policy to start training with.
    Note that the opponent makes a move first, then the target policy.
    """
    afterstate = State(from_base10=afterstate_num)
    while not afterstate.is_terminal():
        # opponent makes a move
        beforestate_num = self.random_policy.move(afterstate.get_num())
        beforestate = State(from_base10=beforestate_num)
        if beforestate.is_terminal():
            r = beforestate.get_reward()
            self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                r - self.target_policy.v_dict[afterstate.get_num()])
            break
        else:
            s_primes = beforestate.legal_afterstates()
            candidates = []
            for s_prime in s_primes:
                r = State(from_base10=s_prime).get_reward()
                q = self.target_policy.v_dict[s_prime]
                candidates.append(r + q)
            if beforestate.turn == 1:
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    max(candidates) -
                    self.target_policy.v_dict[afterstate.get_num()])
            else:
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    min(candidates) -
                    self.target_policy.v_dict[afterstate.get_num()])
            afterstate_num = self.random_policy.move(beforestate_num)
            afterstate = State(from_base10=afterstate_num)

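# Illustration only, not part of the class: the Q-learning backup applied above,
# isolated as a pure function. v is a value table like target_policy.v_dict;
# candidates holds r(s') + v[s'] over the legal afterstates s'; player 1
# maximises, player 2 minimises.
def q_learning_backup(v, afterstate_num, candidates, turn, alpha=0.1):
    target = max(candidates) if turn == 1 else min(candidates)
    v[afterstate_num] += alpha * (target - v[afterstate_num])
    return v[afterstate_num]
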
def AutoPlay(self, policy_1, policy_2, n_games=100):
    """ Let policy_1 and policy_2 play against each other for n_games.

    Input: self-explanatory.
    Returns: a list of game results, i.e. the reward for player 1 in each game.
    """
    game_results = []
    for i in range(n_games):
        state = self.GetInitialState()
        if state.turn == 2:
            state = State(from_base10=policy_2.move_dict[state.get_num()])
        while not state.is_terminal():
            state = State(from_base10=policy_1.move_dict[state.get_num()])
            if state.is_terminal():
                break
            state = State(from_base10=policy_2.move_dict[state.get_num()])
        game_results.append(state.get_reward())
    return game_results

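# Hedged usage sketch: `arena` is a stand-in name (an assumption, the real
# class name is not shown in this excerpt) for an instance of the class that
# defines AutoPlay. Rewards are +1 / 0 / -1 from player 1's perspective, as
# asserted in the validation scripts below.
results = arena.AutoPlay(TabularPolicy(), TabularPolicy(), n_games=1000)
wins = sum(1 for r in results if r == 1)
draws = sum(1 for r in results if r == 0)
losses = sum(1 for r in results if r == -1)
print('Player 1: %i wins, %i draws, %i losses.' % (wins, draws, losses))
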
def TrainOneRound(self, afterstate_num, alpha=0.1):
    """ TD(0) following Sutton and Barto 6.1.

    Input:
        afterstate_num: the afterstate of target_policy to start training with.
    Note that the opponent makes a move first, then the target policy.
    """
    afterstate = State(from_base10=afterstate_num)
    while not afterstate.is_terminal():
        beforestate_num = self.opponent_policy.move(afterstate.get_num())
        beforestate = State(from_base10=beforestate_num)
        if beforestate.is_terminal():
            r = beforestate.get_reward()
            self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                r - self.target_policy.v_dict[afterstate.get_num()])
            break
        else:
            s_prime_num = self.target_policy.move(beforestate_num)
            s_prime = State(from_base10=s_prime_num)
            r = s_prime.get_reward()
            self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                r + self.target_policy.v_dict[s_prime_num] -
                self.target_policy.v_dict[afterstate.get_num()])
            afterstate = s_prime

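# Illustration only, not part of the class: the TD(0) backup used above,
# isolated as a pure function. Unlike the Q-learning variant, the bootstrap
# target uses the single afterstate s' actually chosen by target_policy,
# not a max/min over all candidate afterstates.
def td0_backup(v, afterstate_num, s_prime_num, r, alpha=0.1):
    v[afterstate_num] += alpha * (r + v[s_prime_num] - v[afterstate_num])
    return v[afterstate_num]
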
def PolicyEvaluation(self):
    """ Policy evaluation following Sutton and Barto 4.3.

    Against a rush opponent, with afterstates.
    """
    theta = 0.01
    t = time.time()
    while True:
        delta = 0
        for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
            v = self.policy_1.v_dict[num]
            state = State(from_base10=num)  # here state is an afterstate
            if state.is_terminal():
                # terminal state: the value equals the game result
                # (no reward for the transition)
                self.policy_1.v_dict[num] = state.get_reward()
            else:
                # non-terminal afterstates
                opponent_afterstate = State(
                    from_base10=self.policy_2.move_dict[num])
                if opponent_afterstate.is_terminal():
                    self.policy_1.v_dict[num] = opponent_afterstate.get_reward()
                else:
                    s_prime_num = self.policy_1.move_dict[
                        opponent_afterstate.get_num()]
                    self.policy_1.v_dict[num] = self.policy_1.v_dict[s_prime_num]
            delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))
        self.i_epoch += 1
        if delta < theta:
            print('Value function has converged!')
            print("Trained %i epochs so far." % self.i_epoch)
            pickle.dump((self.policy_1, self.i_epoch),
                        open(self.write_path, "wb"))
            break
        if time.time() - t > 10:
            t = time.time()
            print("Trained %i epochs so far." % self.i_epoch)
            pickle.dump((self.policy_1, self.i_epoch),
                        open(self.write_path, "wb"))

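# Sketch of what the sweep bounds above cover: every 10-digit base-3 number,
# i.e. every (turn, board) encoding whose leading turn digit is 1 or 2 (this
# includes unreachable boards, which is harmless for the sweep).
lo = int('1' + '0' * 9, 3)   # turn 1, empty board
hi = int('2' * 10, 3)        # turn 2, board filled with 2s
print('Sweeping %i encodings per epoch.' % (hi - lo + 1))
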
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 15 19:58:57 2019

@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))
print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

state = State(board=[[2, 1, 2],
                     [0, 1, 0],
                     [0, 0, 0]])
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Player 1 can win next step, expect value 1, got %f' % policy.v_dict[
    state.get_num()]

""" Keep this print statement at the end """
print('All assertions passed.')

"""
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))
print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

state = State(board=[[1, 2, 1],
                     [2, 2, 1],
                     [1, 0, 0]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    -0.5, abs=theta
), 'Player 2 plays random; one move wins and the other leads to a tie, so expect value -0.5. Got %f' % policy.v_dict[
    state.get_num()]

state = State(board=[[2, 1, 0],
                     [2, 1, 0],
                     [1, 2, 0]])
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1. / 3, abs=theta
), 'Player 1 plays random; one move wins and the other two lead to a draw because player 2 (target policy) plays rush, so expect value 1/3. Got %f' % policy.v_dict[
    state.get_num()]

""" Keep this print statement at the end """
print('All assertions passed.')

""" from ttt_play import State import os import pickle import pytest policy, i_epoch = pickle.load( open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb')) print('This value function has been trained for %i epochs.' % i_epoch) theta = 0.01 print('Policy iteration against rush opponent. Accuracy %f' % theta) state = State(board=[[1, 1, 1], [2, 2, 1], [2, 2, 1]], turn=2) state.print_board() assert policy.v_dict[state.get_num()] == pytest.approx( 1, abs=theta ), 'Player 1 wins, expect value 1. Got %f' % policy.v_dict[state.get_num()] state = State(board=[[1, 1, 2], [2, 2, 1], [1, 2, 1]], turn=2) assert policy.v_dict[state.get_num()] == pytest.approx( 0, abs=theta), 'Tied. Expect value 0. Got %f' % policy.v_dict[state.get_num()] state = State(board=[[1, 1, 0], [2, 2, 1], [2, 1, 0]], turn=2) assert policy.v_dict[state.get_num()] == pytest.approx( -1, abs=theta ), 'One step before losing. Expect value -1. Got %f' % policy.v_dict[ state.get_num()] state = State(board=[[1, 1, 0], [2, 2, 1], [0, 2, 1]], turn=2)
# assumption: TabularPolicy is imported from ttt_play alongside State
from ttt_play import State, TabularPolicy
import os
import pickle

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))
print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

opponent_policy = TabularPolicy(epsilon=1)
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        else:
            state = State(from_base10=opponent_policy.move(state.get_num()))
            if state.is_terminal():
                break
    results.append(state.get_reward())
print("Average reward %f over 1000 games as player X against random policy." %
      (sum(results) / 1000.))

results = []
for i in range(1000):
    state = State()
    while True:
"""
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))
print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

# state = State(board=[[1, 2, 1],
#                      [2, 2, 1],
#                      [1, 0, 0]], turn=2)
# state.print_board()
# assert policy.v_dict[state.get_num()] == pytest.approx(
#     -0.5, abs=theta
# ), 'Player 2 plays random, one move is winning and one move is leading to a tie, expect value -0.5. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 0, 0],
                     [0, 0, 0],
                     [0, 0, 0]], turn=2)
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Both play rush, player 1 will win. Got %f' % policy.v_dict[state.get_num()]

""" Keep this print statement at the end """
print('All assertions passed.')