import pickle
import time

# State and TabularPolicy are this project's own classes and are assumed to be
# available in this module.


class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            self.target_policy.epsilon = 0.3
            print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
        else:
            self.target_policy = TabularPolicy(epsilon=0.3)
            self.i_epoch = 0
        self.opponent_policy = TabularPolicy(epsilon=1)
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)
        self.counter = {}
        self.counter[24138] = 0

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(self.target_policy.move(self.start_num))
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            pickle.dump((self.target_policy, self.i_epoch), open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=.1):
        """
        Sarsa following Sutton and Barto 6.2

        Input:
            afterstate_num: the afterstate of target_policy to start training with

        Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            beforestate_num = self.opponent_policy.move(afterstate.get_num())
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                self.target_policy.be_greedy([beforestate_num])
                s_prime_num = self.target_policy.move(beforestate_num)
                s_prime = State(from_base10=s_prime_num)
                r = s_prime.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r + self.target_policy.v_dict[s_prime_num]
                    - self.target_policy.v_dict[afterstate.get_num()])
                afterstate = s_prime
        self.target_policy.be_greedy([self.start_num])
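# A minimal usage sketch, not part of the original source: the checkpoint path
# and epoch count below are hypothetical. TrainContinuously() pickles the
# policy roughly every ten seconds and keeps training until self.i_epoch
# reaches n_epoch.
if __name__ == '__main__':
    trainer = Train('sarsa_selfplay.pkl', read_first=False)
    trainer.TrainContinuously(n_epoch=200000)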
import pytest

# TabularPolicy is this project's own class and is assumed to be available in
# this module.


def test_random_move():
    # With epsilon=1 the policy moves uniformly at random, so each of the three
    # legal afterstates of state 31206 should be chosen roughly a third of the
    # time over 10000 samples.
    policy = TabularPolicy(epsilon=1)
    collect = []
    for _ in range(10000):
        collect.append(policy.move(31206))
    assert collect.count(51618) == pytest.approx(3333, abs=100)
    assert collect.count(50916) == pytest.approx(3333, abs=100)
    assert collect.count(50890) == pytest.approx(3333, abs=100)
import pickle
import time

import numpy as np

# State and TabularPolicy are this project's own classes and are assumed to be
# available in this module.


class Train:
    def __init__(self, path, read_first=False, epsilon=.9):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.policy_1, self.i_epoch, self.returns = pickle.load(
                open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
        else:
            self.policy_1 = TabularPolicy(epsilon=epsilon)
            self.i_epoch = 0
            self.returns = dict()
        self.path = path
        self.policy_stable = True
        self.epsilon = epsilon
        self.policy_1.epsilon = epsilon

    def OnPolicyMCControl(self):
        """
        On-policy MC control following Sutton and Barto 5.4
        """
        t = time.time()
        while True:
            while time.time() - t < 10:
                # Play one self-play episode with the epsilon-soft policy and
                # record every state num visited.
                num = State().get_num()
                history = [num]
                while not State(from_base10=num).is_terminal():
                    num = self.policy_1.move(num)
                    history.append(num)
                # g is a constant for our case
                g = State(from_base10=num).get_reward()
                for i, num in enumerate(history):
                    if num in self.returns:
                        self.returns[num].append(g)
                    else:
                        self.returns[num] = [g]
                    self.policy_1.v_dict[num] = np.average(self.returns[num])
                if self.policy_1.be_greedy(history):
                    self.policy_stable = False
                self.i_epoch += 1
            t = time.time()
            pickle.dump((self.policy_1, self.i_epoch, self.returns),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)
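# A minimal sketch, not part of the original source, of the bookkeeping the
# loop above performs: every state num visited in an episode has the episode's
# terminal reward g appended to its list of returns, and its value estimate is
# the sample mean of that list.
def mc_update_sketch(returns, v_dict, history, g):
    for num in history:
        returns.setdefault(num, []).append(g)
        v_dict[num] = sum(returns[num]) / len(returns[num])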
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))
print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)
opponent_policy = TabularPolicy(epsilon=1)

# Play 1000 games with the learned policy moving first (as player X) against a
# uniformly random opponent.
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        else:
            state = State(from_base10=opponent_policy.move(state.get_num()))
            if state.is_terminal():
                break
    results.append(state.get_reward())
print("Average reward %f over 1000 games as player X against random policy."
      % (sum(results) / 1000.))

# Play 1000 games with the random opponent moving first.
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=opponent_policy.move(state.get_num()))
        if state.is_terminal():
            break
        else:
import pickle
import time

# State and TabularPolicy are this project's own classes and are assumed to be
# available in this module.


class Train:
    def __init__(self, path, read_first=False):
        """
        Input:
            path: the path to save the policy
            read_first: if true, read from the path first
        """
        if read_first:
            self.target_policy, self.i_epoch = pickle.load(open(path, 'rb'))
            print('Policy read from file. Trained for %i epochs.' % self.i_epoch)
        else:
            self.target_policy = TabularPolicy()
            self.i_epoch = 0
        self.random_policy = TabularPolicy(epsilon=1)
        self.path = path
        # num for the state with an empty board and with player 1 to make a move.
        self.start_num = int('1' + '0' * 9, 3)

    def TrainContinuously(self, n_epoch=1e99):
        t = time.time()
        while self.i_epoch < n_epoch:
            while time.time() - t < 10 and self.i_epoch < n_epoch:
                # Target policy as player 1
                self.TrainOneRound(self.random_policy.move(self.start_num))
                self.i_epoch += 1
                # Target policy as player 2
                self.TrainOneRound(self.start_num)
            t = time.time()
            self.target_policy.be_greedy()
            pickle.dump((self.target_policy, self.i_epoch), open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)

    def TrainOneRound(self, afterstate_num, alpha=.1):
        """
        Q-learning following Sutton and Barto 6.5

        Input:
            afterstate_num: the afterstate of target_policy to start training with

        Note that the opponent makes a move first, then the target policy.
        """
        afterstate = State(from_base10=afterstate_num)
        while not afterstate.is_terminal():
            # Opponent makes a move.
            beforestate_num = self.random_policy.move(afterstate.get_num())
            beforestate = State(from_base10=beforestate_num)
            if beforestate.is_terminal():
                r = beforestate.get_reward()
                self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                    r - self.target_policy.v_dict[afterstate.get_num()])
                break
            else:
                # Q-learning target: the max over legal afterstates when player 1
                # is to move, the min when player 2 is to move.
                s_primes = beforestate.legal_afterstates()
                candidates = []
                for s_prime in s_primes:
                    r = State(from_base10=s_prime).get_reward()
                    q = self.target_policy.v_dict[s_prime]
                    candidates.append(r + q)
                if beforestate.turn == 1:
                    self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                        max(candidates)
                        - self.target_policy.v_dict[afterstate.get_num()])
                else:
                    self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                        min(candidates)
                        - self.target_policy.v_dict[afterstate.get_num()])
                afterstate_num = self.random_policy.move(beforestate_num)
                afterstate = State(from_base10=afterstate_num)
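# The backup above is the Q-learning update of Sutton and Barto 6.5 applied to
# afterstate values, with a minimax twist for the two-player game. A sketch of
# the rule it implements (notation mine, not from the original source), where
# s is the target policy's last afterstate and s' ranges over the legal
# afterstates reachable once the opponent has replied:
#
#     player 1 to move:  V(s) <- V(s) + alpha * (max over s' of [r(s') + V(s')] - V(s))
#     player 2 to move:  V(s) <- V(s) + alpha * (min over s' of [r(s') + V(s')] - V(s))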