Example #1
    def step(self, action):
        self.state[action] = self.turn      # place the current player's mark
        self.turn = self.turn % 2 + 1       # pass the turn: 1 -> 2, 2 -> 1
        done, winner = is_finished(self.state)
        # reward is defined from player 1's point of view:
        # +1 if player 1 wins, -1 if player 2 wins, 0 for a draw
        reward = 0
        if done and winner == 1: reward = 1
        if done and winner == 2: reward = -1
        if done and winner == 0: reward = 0

        return self.state, done, reward, winner
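These listings lean on a helper is_finished(state) that is not included in the excerpts. A minimal sketch of what it is assumed to do, treating the board as a flat list of 9 cells (0 empty, 1 and 2 for the two players):

def is_finished(state):
    # every winning line of the 3x3 board
    lines = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6)]              # diagonals
    for a, b, c in lines:
        if state[a] != 0 and state[a] == state[b] == state[c]:
            return True, state[a]               # that player has three in a row
    if 0 not in state:
        return True, 0                          # board full with no winner: draw
    return False, 0                             # game still in progress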
Example #2
    def policy(self, state, turn, available, epsilon=0):
        action_list = []

        # collect every move that immediately ends the game for the current player
        # (epsilon is not used in this excerpt)
        for i in available:
            state[i] = turn
            done, winner = is_finished(state)
            state[i] = 0
            if done:
                action_list.append(i)
        # otherwise choose uniformly among all available moves
        if len(action_list) == 0:
            action_list = available

        return random.choice(action_list)
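As a quick illustration (agent standing in for an instance of whatever class this method belongs to), with player 1 to move and position 2 completing the top row, the policy should return 2:

board = [1, 1, 0,
         2, 2, 0,
         0, 0, 0]
available = [i for i, cell in enumerate(board) if cell == 0]
action = agent.policy(board, turn=1, available=available)   # expected: 2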
Example #3
    def init_value(self):
        # enumerate all 3**9 possible boards (each cell: 0 empty, 1 or 2 for a mark)
        state_list = itertools.product([0, 1, 2], repeat=9)

        for state in state_list:
            state = list(state)
            encoded = encode(state)
            done, winner = is_finished(state)
            if not done:
                self._value[encoded] = random.uniform(-0.5, 0.5)
                self._policy[encoded] = random.choice(available_actions(state))
            # terminal state value
            else:
                self._value[encoded] = 0
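init_value also relies on encode(state) and available_actions(state), neither of which is part of this listing. One plausible sketch, assuming each board is keyed by its base-3 value:

def encode(state):
    # map the 9-cell board (each cell 0, 1 or 2) to a unique base-3 integer key
    code = 0
    for cell in state:
        code = code * 3 + cell
    return code

def available_actions(state):
    # indices of the empty cells
    return [i for i, cell in enumerate(state) if cell == 0]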
Example #4
import copy

def predict(state, action):
    # one-step model: simulate taking `action` from `state` without touching the environment
    next_state = copy.copy(state)
    turn = ret_turn(state)

    next_state[action] = turn
    done, winner = is_finished(next_state)

    reward = 0
    if done and winner == 1: reward = 1
    if done and winner == 2: reward = -1
    if done and winner == 0: reward = 0

    return next_state, reward
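predict() in turn calls ret_turn(state), which is also assumed rather than shown. Since player 1 always moves first, whose turn it is can be recovered from the mark counts:

def ret_turn(state):
    # player 1 moves first, so it is player 1's turn whenever
    # both players have placed the same number of marks
    return 1 if state.count(1) == state.count(2) else 2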
Example #5
    def init_value(self):
        # sweep every possible board and seed the value table
        state_list = itertools.product([0, 1, 2], repeat=9)

        for state in state_list:
            state = list(state)
            done, winner = is_finished(state)
            encoded = encode(state)
            if not done:
                self.value[encoded] = 0       # non-terminal states start at 0
            elif winner == 1:
                self.value[encoded] = 1       # terminal state: player 1 won
            elif winner == 2:
                self.value[encoded] = -1      # terminal state: player 2 won
            else:
                self.value[encoded] = 0       # terminal state: draw
Example #6
def policy_improvement(agent):
    policy_stable = True

    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        done, winner = is_finished(state)
        if not done:  # except for terminal state
            available = available_actions(state)
            turn = ret_turn(state)

            old_action = agent.policy(state)

            max_value = float('-inf')
            min_value = float('inf')
            if turn == 1:  # player 1 picks the value-maximizing move
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value > max_value:
                        max_value = value
                        new_action = action
            else:          # player 2 picks the value-minimizing move
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value < min_value:
                        min_value = value
                        new_action = action

            agent.assign_policy(state, new_action)

            if old_action != new_action:
                policy_stable = False

    return policy_stable
Example #7
def policy_evaluation(agent):
    theta = 1e-9  # stop sweeping once the largest value change in a pass falls below this threshold

    while True:
        delta = 0.0

        state_list = itertools.product([0, 1, 2], repeat=9)
        for state in state_list:
            state = list(state)
            done, winner = is_finished(state)
            if not done:  # except for terminal state
                v = agent.value(state)

                action = agent.policy(state)
                next_state, reward = predict(state, action)
                # in-place backup: V(s) <- r + discount_factor * V(s')
                agent.assign_value(
                    state, reward + discount_factor * agent.value(next_state))

                delta = max([delta, abs(v - agent.value(state))])

        if delta < theta:
            break
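policy_evaluation and policy_improvement are meant to be alternated until the greedy policy stops changing. A minimal driver loop under that assumption (policy_iteration is a name chosen here, not one from the listings):

def policy_iteration(agent):
    while True:
        policy_evaluation(agent)        # make the value table consistent with the current policy
        if policy_improvement(agent):   # greedify; True means the policy no longer changed
            break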
Example #8
    def step(self, action):
        self.state[action] = self.turn
        self.turn = self.turn % 2 + 1
        done, winner = is_finished(self.state)
        return self.state, done, winner
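As a usage sketch, one game with this environment might look like the following; Env, agent, and an initially empty env.state are assumptions for illustration, with the policy interface taken from Example #2:

env = Env()        # hypothetical environment class exposing the step() above
state, done, winner = env.state, False, None
while not done:
    available = available_actions(state)
    action = agent.policy(state, env.turn, available)
    state, done, winner = env.step(action)
print('winner:', winner)   # 1, 2, or 0 for a draw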