Code Example #1
File: player.py Project: jaywon99/tictactoe
    def _choose(self, state, available_actions):
        if self.is_train_mode and random.random() < self.exploit_rate:
            return random.choice(available_actions)

        ob = OptimalBoard(state)
        converted_actions = ob.convert_action_to_optimal(available_actions)
        action = self.q.rargmax(ob.board_id, converted_actions)
        return ob.convert_action_to_original(action)
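The Q-table's rargmax helper is not part of this listing. A minimal sketch of a random-tie-breaking argmax with the signature used above (the table layout is an assumption, not the project's actual class):

import random

class QTable:
    # Hypothetical sketch: Q-values keyed by board id, one entry per cell.
    def __init__(self, n_actions=9):
        self.n_actions = n_actions
        self.table = {}

    def rargmax(self, board_id, actions):
        # Argmax over the allowed actions, breaking ties uniformly at random.
        row = self.table.setdefault(board_id, [0.0] * self.n_actions)
        best = max(row[a] for a in actions)
        return random.choice([a for a in actions if row[a] == best])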
Code Example #2
File: player.py Project: jaywon99/tictactoe
    def _feedback(self, state, action, next_state, reward, done):
        state_ob = OptimalBoard(state)
        converted_action = state_ob.convert_action_to_optimal(action)
        converted_state = self.convert_state(state_ob.optimal_board)
        next_ob = OptimalBoard(next_state)
        converted_next_state = self.convert_state(next_ob.optimal_board)

        self.network.add_train_set(converted_state, converted_action, reward,
                                   converted_next_state, done)
        self.network.study()
Code Example #3
File: find_all.py Project: jaywon99/tictactoe
def find_next(board, color, seq):
    actions = SP.available_actions(board)

    for action in actions:
        new_board = board[:]
        reward, done = SP.play(new_board, action, color)
        if done:
            # print it?
            if reward == 0:
                print(seq + str(action), '=', OB.board_to_id(new_board))
            else:
                print(seq + str(action), MARKER[color],
                      OB.board_to_id(new_board))
        else:
            find_next(new_board, SP.next(color), seq + str(action))
Code Example #4
    def _choose(self, state, actions):
        if self.is_train_mode and random.random() < self.exploit_rate:
            next_pos = random.choice(actions)
            if self.debug: print("SELECT", actions, "RANDOM", next_pos)

            return next_pos

        found_p = -1.0
        found_c = []
        
        ob = OptimalBoard(state)
        _id = ob.board_id
        if self.debug: print("FROM", _id)

        scores = self.p_table.lookup(_id)
        converted_actions = ob.convert_action_to_optimal(actions)
        for action in converted_actions:
            p = scores[action]
            if self.debug: print("ACTION", ob.convert_action_to_original(action), p)
            if p > found_p:
                found_p = p
                found_c = [ob.convert_action_to_original(action)]
            elif p == found_p:
                found_c.append(ob.convert_action_to_original(action))

        next_pos = random.choice(found_c)
        if self.debug: print("SELECT", found_c, found_p, next_pos)

        return next_pos
Code Example #5
File: player.py Project: jaywon99/tictactoe
    def _calculate_reward(self, history, final_reward):
        ''' convert the turn history into training data and
        calculate the discounted reward (multiplied by GAMMA)
        '''

        replay_buffer = []
        size = len(history)
        for idx, turn in enumerate(history):
            optimal_board = OptimalBoard(turn[self.HISTORY_STATE])
            converted_action = optimal_board.convert_action_to_optimal(
                turn[self.HISTORY_ACTION])
            converted_state = self.convert_state(optimal_board.optimal_board)
            replay_buffer.append([
                converted_state, converted_action,
                final_reward * GAMMA**(size - idx - 1)
            ])

        running_add = final_reward
        for i in reversed(range(len(replay_buffer))):
            replay_buffer[i][2] = running_add  # index 2 holds the reward for each turn
            running_add = running_add * GAMMA

        return replay_buffer
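Note that both loops above assign the same discounted values; the second pass simply overwrites the rewards computed in the first. As an illustration only (GAMMA and the reward value are assumed here, not taken from the project), a final reward of 1.0 discounted with GAMMA = 0.9 over a 3-turn history gives:

GAMMA = 0.9
final_reward = 1.0
size = 3
print([round(final_reward * GAMMA ** (size - idx - 1), 4) for idx in range(size)])
# -> [0.81, 0.9, 1.0]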
Code Example #6
    def negamax(self, state, color, depth=10):
        ''' implement negamax algorithm
        https://en.wikipedia.org/wiki/Negamax
        '''
        # negamax.counter += 1

        # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH = 0 BECAUSE TicTacToe is too small
        # LEAF NODE is checked on play time

        # Transposition Table related work
        # ob = OptimalBoard(state)
        # _id = ob.board_id
        _id = OptimalBoard.board_to_id(state)

        cache = self.tp.get(_id)
        if cache is not None:  # BUG FIX: a cached value can be 0, so test for None explicitly
            # case 1
            # return cache
            # case 2
            return cache[0], random.choice(cache[1])

        # RECURSIVE
        actions = SP.available_actions(state)
        random.shuffle(actions)  # move ordering improves alpha-beta pruning performance
        best_score = -math.inf
        best_actions = []
        for action in actions:
            next_s = state[:]
            score, done = SP.play(next_s, action, color)
            if not done:
                score, _ = self.negamax(next_s, SP.next(color), depth - 1)
                score = -score  # negamax

            # pick from all best moves
            if score > best_score:
                best_score = score
                best_actions = [action]
            elif score == best_score:
                best_actions.append(action)

        # case 1: choose random value 1 time
        # choosed_result = random.choice(best_scores)
        # tp.put(_id, choosed_result)
        # return choosed_result

        # case 2: choose random value every time
        self.tp.put(_id, (best_score, best_actions))
        return (best_score, random.choice(best_actions))
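The transposition table self.tp is not included in this listing. A minimal dict-backed sketch consistent with the get()/put() calls above (an assumed interface, not the project's class):

class TranspositionTable:
    # Minimal cache: get() returns None on a miss, put() stores one value per key.
    def __init__(self):
        self._table = {}

    def get(self, key):
        return self._table.get(key)

    def put(self, key, value):
        self._table[key] = value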
Code Example #7
File: player.py Project: jaywon99/tictactoe
    def _choose(self, state, available_actions):
        optimal_board = OptimalBoard(state)
        converted_actions = optimal_board.convert_action_to_optimal(
            available_actions)
        converted_state = self.convert_state(optimal_board.optimal_board)
        ###
        if self.is_train_mode:
            if random.random() < self.egreedy:
                action = random.choice(converted_actions)
            else:
                action = self.network.predict_one(converted_state)
        else:
            action = self.network.predict_one(converted_state)

        if action not in converted_actions:
            # still deciding what to feed into training here,
            # or whether to add a filter inside predict_one instead
            self.network.add_train_set(converted_state, action, -1,
                                       self.convert_state([-1] * 9), True)
            action = random.choice(converted_actions)

        original_action = optimal_board.convert_action_to_original(action)

        return original_action
Code Example #8
    def _episode_feedback(self, reward):
        # for winner
        history_left = reversed(self.all_history())
        (state, action, _, _, _) = next(history_left)  # take the final turn from the history
        ob = OptimalBoard(state)
        reward = self.p_table.set(ob.board_id, ob.convert_action_to_optimal(action), reward)

        for (state, action, _, _, _) in history_left:
            ob = OptimalBoard(state)
            reward = self.p_table.learn(ob.board_id, ob.convert_action_to_optimal(action), reward)
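The p_table used here and in the earlier _choose() is also not part of this listing. One plausible shape, consistent with the lookup()/set()/learn() calls above (the table layout, learning rate, and return values are all assumptions):

class PTable:
    # Hypothetical sketch of a per-state score table.
    def __init__(self, n_actions=9, alpha=0.1, initial=0.5):
        self.n_actions = n_actions
        self.alpha = alpha
        self.initial = initial
        self.table = {}  # board_id -> one score per cell

    def lookup(self, board_id):
        return self.table.setdefault(board_id, [self.initial] * self.n_actions)

    def set(self, board_id, action, value):
        # Pin the terminal value and hand it back for back-propagation.
        self.lookup(board_id)[action] = value
        return value

    def learn(self, board_id, action, target):
        # Nudge the score toward the value propagated from the following turn.
        row = self.lookup(board_id)
        row[action] += self.alpha * (target - row[action])
        return row[action]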
Code Example #9
    def negamax_alpha_beta_pruning(self,
                                   state,
                                   color,
                                   alpha=-math.inf,
                                   beta=math.inf,
                                   depth=10):
        ''' implement negamax algorithm with alpha-beta pruning
        https://en.wikipedia.org/wiki/Negamax
        '''
        # negamax.counter += 1

        # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH = 0 BECAUSE TicTacToe is too small
        # LEAF NODE is checked on play time

        orig_alpha = alpha

        # Transposition Table related work
        # ob = OptimalBoard(state)
        # _id = ob.board_id
        _id = OptimalBoard.board_to_id(state)
        cache = self.tp.get(_id)
        if cache and cache['depth'] >= depth:
            (cached_score, cached_action) = cache['value']
            if cache['flag'] == self.tp.EXACT:
                return (cached_score, cached_action)
            elif cache['flag'] == self.tp.LOWERBOUND:
                alpha = max(alpha, cached_score)
            elif cache['flag'] == self.tp.UPPERBOUND:
                beta = min(beta, cached_score)
            if alpha >= beta:
                return cached_score, cached_action
        # else:
        #     print("MISS", t.seq)

        # RECURSIVE
        actions = SP.available_actions(state)
        random.shuffle(actions)  # move ordering improves alpha-beta pruning performance
        best_score = -math.inf
        best_move = -1
        for action in actions:
            next_s = state[:]
            score, done = SP.play(next_s, action, color)
            if not done:
                score, _ = self.negamax_alpha_beta_pruning(next_s,
                                                           SP.next(color),
                                                           alpha=-beta,
                                                           beta=-alpha,
                                                           depth=depth - 1)
                score = -score  # negamax

            # keep just one best move (random.shuffle and the coin flip below provide the randomness)
            if best_score < score or (score == best_score
                                      and random.random() < 0.5):
                best_score = score
                best_move = action

            if alpha < score:
                alpha = score
                # effectively alpha = max(alpha, best_score)
                if alpha > beta:
                    break

        if best_score <= orig_alpha:
            flag = self.tp.UPPERBOUND
        elif best_score >= beta:
            flag = self.tp.LOWERBOUND
        else:
            flag = self.tp.EXACT

        self.tp.put(key=_id,
                    depth=depth,
                    value=(best_score, best_move),
                    flag=flag)

        return (alpha, best_move)
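The pruning version stores extra bookkeeping per cache entry (search depth and a bound flag). A hypothetical table matching the cache['depth'] / cache['value'] / cache['flag'] usage above:

class BoundedTranspositionTable:
    # Hypothetical sketch: each entry records the search depth and whether the
    # stored score is exact or only a lower/upper bound.
    EXACT, LOWERBOUND, UPPERBOUND = 0, 1, 2

    def __init__(self):
        self._table = {}

    def get(self, key):
        return self._table.get(key)  # None on a miss

    def put(self, key, depth, value, flag):
        self._table[key] = {'depth': depth, 'value': value, 'flag': flag}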
Code Example #10
File: player.py Project: jaywon99/tictactoe
    def _feedback(self, state, action, next_state, reward, done):
        ob1 = OptimalBoard(state)
        ob2 = OptimalBoard(next_state)
        self.q.learn(ob1.board_id, ob1.convert_action_to_optimal(action),
                     reward, ob2.board_id)
Code Example #11
def to_board_id(board):
    ''' board id used to make a node '''
    return OptimalBoard(board).board_id