Example #1
def play():
    done = 0
    env.reset()
    state = copy.copy(env.state)

    i = 0
    while not done:
        i += 1
        turn = copy.copy(env.turn)
        if i % 2 == 1:
            action = agent1.policy(state,
                                   turn,
                                   available_actions(state),
                                   epsilon=0)
        else:
            action = agent2.policy(state,
                                   turn,
                                   available_actions(state),
                                   epsilon=0)
        next_state, done, winner = env.step(action)

        # update AI agent
        update(agent2, state, next_state)

        state = copy.copy(next_state)
        env.render()

    if winner == 0:
        print("Draw!")
    else:
        print("Winner is agent %d!" % winner)

    # save data
    agent2.save()
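
All of the snippets in this section rely on an available_actions(state) helper that is not shown. A minimal sketch, assuming the board is a flat list of nine cells where 0 marks an empty square, 1 is player 1 and 2 is player 2 (as suggested by Example #7 below), could look like this:

# Hypothetical helper, not part of the original examples: returns the indices
# of the empty cells, assuming state is a flat list of 9 values where
# 0 = empty, 1 = player 1, 2 = player 2.
def available_actions(state):
    return [i for i, cell in enumerate(state) if cell == 0]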
Example #2
    def policy(self, state, turn, epsilon=0.08):
        maxvalue = -999999
        minvalue = 999999
        available = available_actions(state)
        action_list = []

        if np.random.rand(1) < epsilon:
            action_list = available
        else:
            value = self.action_value.predict(state)
            value = np.reshape(value, 9)

            if turn == 1:
                for action in available:
                    if value[action] > maxvalue:
                        action_list = []
                        maxvalue = value[action]
                        action_list.append(action)
                    elif value[action] == maxvalue:
                        action_list.append(action)
            else:
                for action in available:
                    if value[action] < minvalue:
                        action_list = []
                        minvalue = value[action]
                        action_list.append(action)
                    elif value[action] == minvalue:
                        action_list.append(action)

        return random.choice(action_list)
Example #3
    def policy(self, state, turn, epsilon=0.08):
        maxvalue = -99999
        minvalue = 99999
        encoded_state = encode(state)
        available = available_actions(state)
        action_list = []

        if np.random.rand(1) < epsilon:
            action_list = available
        else:
            if turn == 1:
                for action in available:
                    encoded = encoded_state + str(action)
                    if self.action_value[encoded] > maxvalue:
                        action_list = []
                        maxvalue = self.action_value[encoded]
                        action_list.append(action)
                    elif self.action_value[encoded] == maxvalue:
                        action_list.append(action)
            else:
                for action in available:
                    encoded = encoded_state + str(action)
                    if self.action_value[encoded] < minvalue:
                        action_list = []
                        minvalue = self.action_value[encoded]
                        action_list.append(action)
                    elif self.action_value[encoded] == minvalue:
                        action_list.append(action)

        return random.choice(action_list)
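
Examples #3, #5 and #6 index their lookup tables with encode(state), optionally concatenated with the action. The helper itself is not shown; a plausible sketch, assuming it simply flattens the nine cell values into a string key, would be:

# Hypothetical implementation of the encode() helper used to build dictionary keys.
# Assumes state is a flat list of 9 cells, e.g. [0, 1, 2, 0, 0, 0, 0, 0, 0] -> "012000000".
def encode(state):
    return ''.join(str(cell) for cell in state)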
Example #4
    def policy(self, state, turn, epsilon=0):
        available = available_actions(state)

        while True:
            ret = int(input("input [0 1 2 / 3 4 5 / 6 7 8] : "))
            if ret in available:
                break
        return ret
Example #5
    def init_value(self):
        state_list = itertools.product([0, 1, 2], repeat=9)

        for state in state_list:
            state = list(state)
            encoded_state = encode(state)
            available = available_actions(state)

            for action in available:
                encoded = encoded_state + str(action)
                self.action_value[encoded] = 0
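
Note on the table size: itertools.product([0, 1, 2], repeat=9) enumerates all 3^9 = 19,683 cell assignments, including boards that can never occur in play, and init_value creates one zero-initialized entry per (state, action) pair, so the action-value table easily fits in a plain dictionary.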
Example #6
    def init_value(self):
        state_list = itertools.product([0, 1, 2], repeat=9)

        for state in state_list:
            state = list(state)
            encoded = encode(state)
            done, winner = is_finished(state)
            if not done:
                self._value[encoded] = random.uniform(-0.5, 0.5)
                self._policy[encoded] = random.choice(available_actions(state))
            # terminal state value
            else:
                self._value[encoded] = 0
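
Several examples also call is_finished(state), which is not shown either. A minimal sketch, assuming the flat nine-cell board from above and a return value of (done, winner) with winner == 0 for a draw or an unfinished board, might be:

# Hypothetical is_finished() helper: checks all rows, columns and diagonals.
# Returns (done, winner); winner is 1 or 2 for a win, 0 for a draw or an
# unfinished board.
def is_finished(state):
    lines = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6)]              # diagonals
    for a, b, c in lines:
        if state[a] != 0 and state[a] == state[b] == state[c]:
            return True, state[a]
    if 0 not in state:        # board full, no winner
        return True, 0
    return False, 0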
Example #7
    def policy(self, state, turn, epsilon=0):
        available = available_actions(state)
        action_list = []

        for i in available:
            state[i] = turn
            done, winner = is_finished(state)
            state[i] = 0
            if done:
                action_list.append(i)
        if len(action_list) == 0:
            action_list = available

        return random.choice(action_list)
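
This baseline policy ignores epsilon and any learned values: it collects every move that immediately ends the game (a winning move, or the last empty square) and plays one of them at random; if no such move exists, it falls back to a uniformly random legal move. It is presumably the agent_base opponent used in the verification stage of Example #9.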
Example #8
def policy_improvement(agent):
    policy_stable = True

    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        done, winner = is_finished(state)
        if not done:  # except for terminal state
            available = available_actions(state)
            turn = ret_turn(state)

            old_action = agent.policy(state)

            max_value = -9999999
            min_value = 9999999
            if turn == 1:
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value > max_value:
                        max_value = value
                        new_action = action
            else:
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value < min_value:
                        min_value = value
                        new_action = action

            agent.assign_policy(state, new_action)

            if old_action != new_action:
                policy_stable = False

    return policy_stable
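
policy_improvement returns whether the greedy policy stayed unchanged across all states, which is exactly the stopping criterion of policy iteration. A minimal driver loop, assuming a matching policy_evaluation(agent) step exists alongside it, could look like this:

# Hypothetical policy-iteration driver; policy_evaluation() is assumed to
# update the agent's state values under its current policy until convergence.
def policy_iteration(agent):
    while True:
        policy_evaluation(agent)          # evaluate the current policy
        if policy_improvement(agent):     # greedy improvement step
            break                         # policy stable -> stop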
Example #9
def train():
    win_rate_list = []
    win_rate_mean = []

    episode = 0
    while True:  # episode < total_episode:

        # epsilon = random.choice(epsilon_list)

        # training stage1 (self-training)
        for _ in range(train_episode1):
            episode += 1
            done = 0
            env.reset()
            state = copy.copy(env.state)

            while not done:
                turn = copy.copy(env.turn)
                action = agent.policy(state,
                                      turn,
                                      available_actions(state),
                                      epsilon=epsilon)
                next_state, done, winner = env.step(action)
                update(agent, state, next_state, learning_rate=learning_rate)
                state = copy.copy(next_state)

        # training stage2 (vs agent_base)
        for i in range(train_episode2):
            episode += 1
            done = 0
            env.reset()
            state = copy.copy(env.state)

            j = 0
            while not done:
                j += 1
                turn = copy.copy(env.turn)
                if (i + j) % 2 == 1:
                    action = agent.policy(state,
                                          turn,
                                          available_actions(state),
                                          epsilon=epsilon)
                else:
                    action = agent_base.policy(state, turn,
                                               available_actions(state))
                next_state, done, winner = env.step(action)
                if done:
                    update(agent,
                           state,
                           next_state,
                           learning_rate=learning_rate)
                state = copy.copy(next_state)

        # verification stage
        win = lose = draw = 0
        for i in range(verify_episode):
            done = 0
            env.reset()
            state = copy.copy(env.state)

            j = 0
            while not done:
                j += 1
                turn = copy.copy(env.turn)
                if (i + j) % 2 == 1:
                    # epsilon 0
                    action = agent.policy(state,
                                          turn,
                                          available_actions(state),
                                          epsilon=0)
                else:
                    action = agent_base.policy(state, turn,
                                               available_actions(state))
                next_state, done, winner = env.step(action)
                state = copy.copy(next_state)

            if winner == 0:
                draw += 1
            elif (i + j) % 2 == 1:
                win += 1
            else:
                lose += 1
        win_rate = (win + draw) / verify_episode
        print("[Episode %d] Win : %d Draw : %d Lose : %d Win_rate: %.2f" %
              (episode, win, draw, lose, win_rate))
        agent.save()

        if win_rate > 0.97:
            break

        # print status (every (train_episode1 + train_episode2) * 100 episodes)
        win_rate_list.append(win_rate)
        if episode % ((train_episode1 + train_episode2) * 100) == 0:
            mean = np.mean(win_rate_list)
            win_rate_mean.append(np.round(mean, 2))
            win_rate_list.clear()
            print("[ ", end='')
            for x in win_rate_mean:
                print("%.2f" % x, end=' ')
            print("]")