def step(self, action):
    # Place the current player's mark and hand the turn to the other player.
    self.state[action] = self.turn
    self.turn = self.turn % 2 + 1
    done, winner = is_finished(self.state)
    # Reward is from player 1's point of view: +1 win, -1 loss, 0 otherwise.
    reward = 0
    if done and winner == 1:
        reward = 1
    if done and winner == 2:
        reward = -1
    if done and winner == 0:
        reward = 0
    return self.state, done, reward, winner
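The snippets in this section lean on a few board helpers (is_finished, available_actions, ret_turn, encode) and on the standard-library modules random, itertools, and copy, none of which are shown here. Below is a minimal sketch of what those helpers might look like, assuming the board is a flat list of nine cells with 0 = empty, 1 = X, 2 = O; the actual definitions in the project may differ.

import copy
import itertools
import random

# Hypothetical helper sketches; assumed, not taken from the original code.
WIN_LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6)]              # diagonals

def is_finished(state):
    # Return (done, winner): winner is 1 or 2, or 0 for a draw / ongoing game.
    for a, b, c in WIN_LINES:
        if state[a] != 0 and state[a] == state[b] == state[c]:
            return True, state[a]
    if 0 not in state:          # board full with no winner -> draw
        return True, 0
    return False, 0

def available_actions(state):
    # Indices of the empty cells.
    return [i for i, cell in enumerate(state) if cell == 0]

def ret_turn(state):
    # Player 1 moves when an even number of cells are filled.
    filled = sum(1 for cell in state if cell != 0)
    return 1 if filled % 2 == 0 else 2

def encode(state):
    # Encode the board as a base-3 integer so it can key a dictionary.
    code = 0
    for cell in state:
        code = code * 3 + cell
    return code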
def policy(self, state, turn, available, epsilon=0):
    # Prefer any move that immediately ends the game; otherwise move at random.
    action_list = []
    for i in available:
        state[i] = turn
        done, winner = is_finished(state)
        state[i] = 0
        if done:
            action_list.append(i)
    if len(action_list) == 0:
        action_list = available
    return random.choice(action_list)
def init_value(self):
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        encoded = encode(state)
        done, winner = is_finished(state)
        if not done:
            # Non-terminal states start with a small random value and a random action.
            self._value[encoded] = random.uniform(-0.5, 0.5)
            self._policy[encoded] = random.choice(available_actions(state))
        else:
            # Terminal state value
            self._value[encoded] = 0
def predict(state, action):
    # One-step lookahead: apply the action to a copy of the board and
    # return the resulting state together with its immediate reward.
    next_state = copy.copy(state)
    turn = ret_turn(state)
    next_state[action] = turn
    done, winner = is_finished(next_state)
    reward = 0
    if done and winner == 1:
        reward = 1
    if done and winner == 2:
        reward = -1
    if done and winner == 0:
        reward = 0
    return next_state, reward
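A quick usage sketch for predict, purely illustrative and assuming ret_turn gives player 1 the first move on an empty board:

empty = [0] * 9
next_state, reward = predict(empty, 4)   # player 1 takes the centre
# next_state -> [0, 0, 0, 0, 1, 0, 0, 0, 0]; the game is not over, so reward == 0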
def init_value(self):
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        done, winner = is_finished(state)
        encoded = encode(state)
        if not done:
            self.value[encoded] = 0
        elif winner == 1:
            self.value[encoded] = 1
        elif winner == 2:
            self.value[encoded] = -1
        else:
            self.value[encoded] = 0
def policy_improvement(agent):
    policy_stable = True
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        done, winner = is_finished(state)
        if not done:  # except for terminal states
            available = available_actions(state)
            turn = ret_turn(state)
            old_action = agent.policy(state)
            max_value = -9999999
            min_value = 9999999
            # Player 1 picks the value-maximising move, player 2 the minimising one.
            if turn == 1:
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value > max_value:
                        max_value = value
                        new_action = action
            else:
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value < min_value:
                        min_value = value
                        new_action = action
            agent.assign_policy(state, new_action)
            if old_action != new_action:
                policy_stable = False
    return policy_stable
def policy_evaluation(agent):
    theta = 1e-9
    while True:
        delta = 0.0
        state_list = itertools.product([0, 1, 2], repeat=9)
        for state in state_list:
            state = list(state)
            done, winner = is_finished(state)
            if not done:  # except for terminal states
                v = agent.value(state)
                action = agent.policy(state)
                next_state, reward = predict(state, action)
                agent.assign_value(
                    state, reward + discount_factor * agent.value(next_state))
                delta = max([delta, abs(v - agent.value(state))])
        if delta < theta:
            break
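The two routines above are meant to alternate in the usual policy-iteration loop. A minimal driver might look like the sketch below; how the agent is constructed and initialised is assumed, not shown in the original code.

def policy_iteration(agent):
    # Alternate full evaluation sweeps and greedy improvement until the
    # greedy policy stops changing (policy_improvement returns True).
    while True:
        policy_evaluation(agent)
        if policy_improvement(agent):
            break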
def step(self, action):
    # Variant of step that does not compute a reward: place the mark,
    # switch turns, and report whether the game has ended and who won.
    self.state[action] = self.turn
    self.turn = self.turn % 2 + 1
    done, winner = is_finished(self.state)
    return self.state, done, winner