def test_rush_policy():
    # Only one possible move.
    state = State(board=[[1, 2, 1], [2, 2, 1], [0, 1, 2]], turn=1)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_after_state = State(
        board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    assert after_state.board == expected_after_state.board
    assert after_state.turn == expected_after_state.turn

    # Multiple possible moves.
    state = State(board=[[1, 0, 0], [2, 2, 1], [0, 1, 2]], turn=2)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_board = [[1, 2, 0], [2, 2, 1], [0, 1, 2]]
    assert after_state.board == expected_board
    assert after_state.turn == 1

    # Filled board.
    state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    policy = TabularPolicy()
    with pytest.raises(KeyError):
        after_state = State(from_base10=policy.move_dict[state.get_num()])
def test_random_move():
    policy = TabularPolicy(epsilon=1)
    collect = []
    for _ in range(10000):
        collect.append(policy.move(31206))
    assert collect.count(51618) == pytest.approx(3333, abs=100)
    assert collect.count(50916) == pytest.approx(3333, abs=100)
    assert collect.count(50890) == pytest.approx(3333, abs=100)
def test_be_greedy():
    policy = TabularPolicy()
    best = State(board=[[0, 0, 0], [1, 0, 0], [0, 0, 0]], turn=2)
    policy.v_dict[best.get_num()] = 1
    assert policy.be_greedy()
    state = State()
    assert policy.move_dict[state.get_num()] == best.get_num()
    assert not policy.be_greedy()  # No more change when run the second time
class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.opponent_policy = TabularPolicy()
        self.path = path
        # num for the state with an empty board and player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(
                    self.target_policy.move_dict[self.start_num])
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            pickle.dump((self.target_policy, self.i_epoch),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=0.1):
        """
        TD(0) following Sutton and Barto 6.1
        Input:
            afterstate_num: the afterstate of target_policy to start training with
        Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            beforestate_num = self.opponent_policy.move(afterstate.get_num())
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                s_prime_num = self.target_policy.move(beforestate_num)
                s_prime = State(from_base10=s_prime_num)
                r = s_prime.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r + self.target_policy.v_dict[s_prime_num]
                    - self.target_policy.v_dict[afterstate.get_num()])
                afterstate = s_prime
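# Usage sketch (not part of the original source): how the TD(0) trainer above
# might be run. The save path 'td0_policy.pkl' and the epoch count are
# hypothetical; Train, TabularPolicy, and State are assumed importable from
# this project.
if __name__ == '__main__':
    trainer = Train(path='td0_policy.pkl', read_first=False)
    # Checkpoints to the pickle file roughly every 10 seconds of training.
    trainer.TrainContinuously(n_epoch=100000)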
def test_get_trajectory():
    trainer = Train(path='foo', read_first=False)
    trainer.epsilon = 0
    trajectory = trainer.GetOneTrajectory(TabularPolicy(), TabularPolicy())
    num1 = State(board=[[0, 0, 0], [0, 0, 0], [0, 0, 0]]).get_num()
    num2 = State(board=[[1, 0, 0], [0, 0, 0], [0, 0, 0]], turn=2).get_num()
    num3 = State(board=[[1, 2, 0], [0, 0, 0], [0, 0, 0]]).get_num()
    num4 = State(board=[[1, 2, 1], [0, 0, 0], [0, 0, 0]], turn=2).get_num()
    num5 = State(board=[[1, 2, 1], [2, 0, 0], [0, 0, 0]]).get_num()
    num6 = State(board=[[1, 2, 1], [2, 1, 0], [0, 0, 0]], turn=2).get_num()
    num7 = State(board=[[1, 2, 1], [2, 1, 2], [0, 0, 0]]).get_num()
    num8 = State(board=[[1, 2, 1], [2, 1, 2], [1, 0, 0]], turn=2).get_num()
    assert trajectory == [num1, num2, num3, num4, num5, num6, num7, num8]
def __init__(self, path, read_first=False):
    """
    Input:
        path: the path to save the policy
        read_first: if true, read from the path first
    """
    if read_first:
        self.policy_1, self.i_epoch = pickle.load(open(path, 'rb'))
        print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
    else:
        self.policy_1 = TabularPolicy()
        self.i_epoch = 0
    self.path = path
    self.policy_stable = True
class Train:
    def __init__(self, path, read_first=False, epsilon=.9):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
            epsilon: exploration rate of the epsilon-greedy policy
        """
        if read_first:
            self.policy_1, self.i_epoch, self.returns = pickle.load(
                open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy(epsilon=epsilon)
            self.i_epoch = 0
            self.returns = dict()
        self.path = path
        self.policy_stable = True
        self.epsilon = epsilon
        self.policy_1.epsilon = epsilon

    def OnPolicyMCControl(self):
        """
        On-policy MC control following Sutton and Barto 5.4
        """
        t = time.time()
        while True:
            while time.time() - t < 10:
                num = State().get_num()
                history = [num]
                while not State(from_base10=num).is_terminal():
                    num = self.policy_1.move(num)
                    history.append(num)
                # g is a constant for our case
                g = State(from_base10=num).get_reward()
                for i, num in enumerate(history):
                    if num in self.returns:
                        self.returns[num].append(g)
                    else:
                        self.returns[num] = [g]
                    self.policy_1.v_dict[num] = np.average(self.returns[num])
                if self.policy_1.be_greedy(history):
                    self.policy_stable = False
                self.i_epoch += 1
            t = time.time()
            pickle.dump((self.policy_1, self.i_epoch, self.returns),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)
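# Usage sketch (an assumption, not in the original source): run on-policy
# epsilon-greedy MC control. The path 'mc_onpolicy.pkl' and the epsilon value
# are hypothetical.
if __name__ == '__main__':
    trainer = Train(path='mc_onpolicy.pkl', read_first=False, epsilon=0.1)
    # Runs until interrupted, saving the policy and returns every ~10 seconds.
    trainer.OnPolicyMCControl()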
def MCPrediction(self, n_epoch):
    """
    MC prediction following Sutton and Barto 5.1, against the rush opponent.
    Input:
        n_epoch: the number of episodes to be trained
    """
    self.policy_2 = TabularPolicy()
    returns = dict()
    for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
        returns[num] = []
    for _ in range(n_epoch):
        # generate an episode following policy_1
        s = State().get_num()
        history = [s]
        while not State(from_base10=s).is_terminal():
            s = self.policy_1.move_dict[s]
            history.append(s)
            if State(from_base10=s).is_terminal():
                break
            s = self.policy_2.move_dict[s]
            history.append(s)
        # in our special case, g is a constant
        g = State(from_base10=s).get_reward()
        for i, s in enumerate(history):
            returns[s].append(g)
            if i % 2 == 0:
                self.policy_1.v_dict[s] = np.average(returns[s])
            else:
                self.policy_2.v_dict[s] = np.average(returns[s])
    for num in range(int('2' + '0' * 9, 3), int('2' * 10, 3) + 1):
        self.policy_1.v_dict[num] = self.policy_2.v_dict[num]
    self.i_epoch += 1
    pickle.dump((self.policy_1, self.i_epoch), open(self.path, "wb"))
    print('MC prediction finished.')
def __init__(self, path, read_first=False):
    """
    Input:
        path: the path to save the policy
        read_first: if true, read from the path first
    """
    if read_first:
        self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
        print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
    else:
        self.target_policy = TabularPolicy()
        self.i_epoch = 0
    self.opponent_policy = TabularPolicy()
    self.path = path
    # num for the state with an empty board and player 1 to make a move.
    self.start_num = int('1' + '0' * 9, 3)
def __init__(self, write_path, read_path=None, self_play=False):
    """
    Input:
        write_path: path for saving the model
        read_path: optional path for reading a previously saved model
        self_play: if true, the opponent shares the same policy object
    """
    self.read_path = read_path
    self.write_path = write_path
    if read_path:
        self.policy_1, self.i_epoch = pickle.load(open(self.read_path, 'rb'))
        print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
    else:
        self.policy_1 = TabularPolicy()
        self.i_epoch = 0
        print('Training new policy.')
    self.read_path = self.write_path  # for later iterative training
    if self_play:
        self.policy_2 = self.policy_1
    else:
        self.policy_2 = TabularPolicy()
    self.policy_ever_changed = True  # Set to true to start iterative training
class TrainOneRound:
    def __init__(self, path, read_first=False):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.policy_1, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy()
            self.i_epoch = 0
        self.path = path
        self.policy_stable = True

    def MCES(self):
        """
        MC exploring starts following Sutton and Barto 5.3, against the rush
        opponent.
        """
        t = time.time()
        # No need to use a list of returns, since the game is deterministic
        for s in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
            history = [s]
            while not State(from_base10=s).is_terminal():
                s = self.policy_1.move_dict[s]
                history.append(s)
                if State(from_base10=s).is_terminal():
                    break
                s = self.policy_1.move_dict[s]
                history.append(s)
            g = State(from_base10=s).get_reward()
            for i, s in enumerate(history):
                self.policy_1.v_dict[s] = g
            if self.policy_1.be_greedy(history):
                self.policy_stable = False
            self.i_epoch += 1
            if time.time() - t > 10:
                t = time.time()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.path, "wb"))
                print("Trained %i epochs so far." % self.i_epoch)
        pickle.dump((self.policy_1, self.i_epoch), open(self.path, "wb"))
        print('MC exploring starts finished.')
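# Usage sketch (assumed, not in the original source): repeat MCES sweeps until
# the greedy policy stops changing. The path is hypothetical; policy_stable is
# reset by hand because MCES() only ever clears it.
if __name__ == '__main__':
    trainer = TrainOneRound(path='mces_policy.pkl', read_first=False)
    while True:
        trainer.policy_stable = True
        trainer.MCES()
        if trainer.policy_stable:
            break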
class TrainOneRound:
    def __init__(self, write_path, read_path=None, self_play=False):
        """
        Input:
            write_path: path for saving the model
            read_path: optional path for reading a previously saved model
            self_play: if true, the opponent shares the same policy object
        """
        self.read_path = read_path
        self.write_path = write_path
        if read_path:
            self.policy_1, self.i_epoch = pickle.load(
                open(self.read_path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.policy_1 = TabularPolicy()
            self.i_epoch = 0
            print('Training new policy.')
        self.read_path = self.write_path  # for later iterative training
        if self_play:
            self.policy_2 = self.policy_1
        else:
            self.policy_2 = TabularPolicy()
        self.policy_ever_changed = True  # Set to true to start iterative training

    def ValueIteration(self, theta=0.01):
        """
        Value iteration over all state nums until the largest value change in
        a sweep falls below theta.
        """
        t = time.time()
        while True:
            delta = 0
            for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[num] = (
                            opponent_afterstate.get_reward())
                    else:
                        s_prime_choices = (
                            opponent_afterstate.legal_afterstates())
                        if state.turn == 2:
                            vi_update = max(
                                [self.policy_1.v_dict[x]
                                 for x in s_prime_choices])
                        else:
                            vi_update = min(
                                [self.policy_1.v_dict[x]
                                 for x in s_prime_choices])
                        self.policy_1.v_dict[num] = vi_update
                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))
            self.i_epoch += 1
            if delta < theta:
                print('Value function has converged!')
                print("Trained %i epochs so far." % self.i_epoch)
                self.policy_ever_changed = self.policy_1.be_greedy()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
                break
            if time.time() - t > 10:
                t = time.time()
                print("Trained %i epochs so far." % self.i_epoch)
                self.policy_ever_changed = self.policy_1.be_greedy()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
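# Usage sketch (an assumption, not in the original source): alternate value
# iteration with greedy policy improvement via self-play until be_greedy()
# reports no further change. The write path is hypothetical.
if __name__ == '__main__':
    trainer = TrainOneRound(write_path='vi_policy.pkl', self_play=True)
    while trainer.policy_ever_changed:
        trainer.ValueIteration(theta=0.01)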
"""
@author: daugh
"""
from ttt_play import State
from ttt_policies import TabularPolicy
import os
import pickle

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))
print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)
opponent_policy = TabularPolicy(epsilon=1)
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        else:
            state = State(from_base10=opponent_policy.move(state.get_num()))
            if state.is_terminal():
                break
    results.append(state.get_reward())
print("Average reward %f over 1000 games as player X against random policy."
      % (sum(results) / 1000.))
class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch, self.c = pickle.load(
                open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
            self.c = {}
        self.behavior_policy = TabularPolicy(epsilon=1)
        self.path = path
        self.policy_stable = True

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                self.TrainOneRound()
                self.i_epoch += 1
            t = time.time()
            pickle.dump((self.target_policy, self.i_epoch, self.c),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self):
        """
        Off-policy MC control following Sutton and Barto 5.6-5.7
        """
        # behavior policy playing player 1
        trajectory = self.GetOneTrajectory(
            self.behavior_policy, self.target_policy)
        self.OffPolicyMCControl(trajectory, 1)
        # behavior policy playing player 2
        trajectory = self.GetOneTrajectory(
            self.target_policy, self.behavior_policy)
        self.OffPolicyMCControl(trajectory, 2)

    def GetOneTrajectory(self, policy_1, policy_2):
        """
        Returns: list of state nums of a trajectory
        """
        num = State().get_num()
        trajectory = [num]
        while not State(from_base10=num).is_terminal():
            num = policy_1.move(num)
            trajectory.append(num)
            if not State(from_base10=num).is_terminal():
                num = policy_2.move(num)
                trajectory.append(num)
            else:
                break
        return trajectory

    def OffPolicyMCControl(self, trajectory, role_behavior_policy):
        """
        Incremental implementation of off-policy MC control
        Input:
            trajectory
            role_behavior_policy: 1 or 2, denoting which player the behavior
                policy acted as in this trajectory
        """
        # g is a constant for our case
        g = State(from_base10=trajectory[-1]).get_reward()
        w = 1.
        for i, state in reversed(list(enumerate(trajectory))):
            if i == len(trajectory) - 1:
                # ignore the very last state, which is not a beforestate
                continue
            if (i % 2 + 1) != role_behavior_policy:
                # i denotes the number of pieces on the board. i % 2 + 1 is 1
                # if this is player 1's beforestate, and 2 if it is player 2's
                # beforestate.
                continue
            afterstate = trajectory[i + 1]
            if afterstate in self.c:
                self.c[afterstate] += w
            else:
                self.c[afterstate] = w
            self.target_policy.v_dict[afterstate] += w / self.c[afterstate] * (
                g - self.target_policy.v_dict[afterstate])
            self.target_policy.be_greedy([state])
            if self.target_policy.move_dict[trajectory[i]] != afterstate:
                break
            else:
                w = w * len(
                    State(from_base10=trajectory[i]).legal_afterstates())
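# Usage sketch (an assumption, not in the original source): the off-policy MC
# control trainer already alternates the random behavior policy between player
# 1 and player 2 inside TrainOneRound, so a plain continuous run is enough.
# The path is hypothetical.
if __name__ == '__main__':
    trainer = Train(path='offpolicy_mc.pkl', read_first=False)
    trainer.TrainContinuously(n_epoch=200000)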
class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' %
                  self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.random_policy = TabularPolicy(epsilon=1)
        self.path = path
        # num for the state with an empty board and player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(self.random_policy.move(self.start_num))
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            self.target_policy.be_greedy()
            pickle.dump((self.target_policy, self.i_epoch),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=.1):
        """
        Q-learning following Sutton and Barto 6.5
        Input:
            afterstate_num: the afterstate of target_policy to start training with
        Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            # opponent makes a move
            beforestate_num = self.random_policy.move(afterstate.get_num())
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                s_primes = beforestate.legal_afterstates()
                candidates = []
                for s_prime in s_primes:
                    r = State(from_base10=s_prime).get_reward()
                    q = self.target_policy.v_dict[s_prime]
                    candidates.append(r + q)
                if beforestate.turn == 1:
                    self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                        max(candidates)
                        - self.target_policy.v_dict[afterstate.get_num()])
                else:
                    self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                        min(candidates)
                        - self.target_policy.v_dict[afterstate.get_num()])
                afterstate_num = self.random_policy.move(beforestate_num)
                afterstate = State(from_base10=afterstate_num)
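# Usage sketch (assumed, not in the original source): train the Q-learning
# agent against the random opponent, then reload the pickled policy. The path
# 'q_learning.pkl' and the epoch count are hypothetical.
if __name__ == '__main__':
    trainer = Train(path='q_learning.pkl', read_first=False)
    trainer.TrainContinuously(n_epoch=500000)
    target_policy, i_epoch = pickle.load(open('q_learning.pkl', 'rb'))
    print('Loaded policy trained for %i epochs.' % i_epoch)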