Example #1
    def get_pv(self, root_id):
        state = utils.get_state_pt(root_id, self.board_size, self.inplanes)
        self.model.eval()
        with torch.no_grad():
            state_input = torch.tensor([state]).to(device).float()
            policy, value = self.model(state_input)
            p = policy.data.cpu().numpy()[0]
            v = value.data.cpu().numpy()[0]
        return p, v
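Example #1 calls self.model without showing its definition. The sketch below is a minimal stand-in (a hypothetical ToyPVNet, with assumed board_size = 9 and inplanes = 5) whose only purpose is to make the tensor shapes of the forward pass explicit: one policy entry per board square and a single scalar value, which is the usual AlphaZero-style layout the code appears to assume.

import numpy as np
import torch
import torch.nn as nn

board_size, inplanes = 9, 5                  # assumed hyperparameters
device = torch.device('cpu')

class ToyPVNet(nn.Module):
    """Hypothetical stand-in for self.model; not the project's network."""
    def __init__(self):
        super().__init__()
        self.body = nn.Conv2d(inplanes, 8, 3, padding=1)
        self.policy_head = nn.Linear(8 * board_size**2, board_size**2)
        self.value_head = nn.Linear(8 * board_size**2, 1)

    def forward(self, x):
        h = torch.relu(self.body(x)).flatten(1)
        return (torch.softmax(self.policy_head(h), dim=1),
                torch.tanh(self.value_head(h)))

model = ToyPVNet().to(device).eval()
state = np.zeros((inplanes, board_size, board_size))  # what get_state_pt returns
with torch.no_grad():
    p, v = model(torch.tensor([state]).to(device).float())
print(p.shape, v.shape)  # torch.Size([1, 81]) torch.Size([1, 1])
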
Example #2
    def _expansion_evaluation(self, leaf_id, win_index):
        leaf_state = utils.get_state_pt(leaf_id, self.board_size,
                                        self.inplanes)
        self.model.eval()
        with torch.no_grad():
            state_input = torch.tensor([leaf_state]).to(device).float()
            policy, value = self.model(state_input)
            policy = policy.cpu().numpy()[0]
            value = value.cpu().numpy()[0]

        if win_index == 0:
            # expansion
            actions = utils.legal_actions(leaf_id, self.board_size)
            prior_prob = np.zeros(self.board_size**2)

            # re-normalization
            for action_index in actions:
                prior_prob[action_index] = policy[action_index]

            prior_prob /= prior_prob.sum()

            if self.noise:
                # root node noise
                if leaf_id == self.root_id:
                    noise_probs = np.random.dirichlet(self.alpha *
                                                      np.ones(len(actions)))

            for i, action_index in enumerate(actions):
                child_id = leaf_id + (action_index, )

                prior_p = prior_prob[action_index]

                if self.noise:
                    if leaf_id == self.root_id:
                        prior_p = 0.75 * prior_p + 0.25 * noise_probs[i]

                self.tree[child_id] = {
                    'child': [],
                    'n': 0.,
                    'w': 0.,
                    'q': 0.,
                    'p': prior_p
                }

                self.tree[leaf_id]['child'].append(action_index)
            # return value
            reward = False
            return value, reward
        else:
            # terminal node
            # return reward
            reward = 1.
            value = False
            return value, reward
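The root-noise branch above mixes the network prior with Dirichlet noise only at the root node. The standalone sketch below reproduces that mixing with made-up numbers; alpha and the list of legal actions are illustrative, while the 0.75 / 0.25 weights match the code above.

import numpy as np

alpha = 0.15                              # illustrative value for self.alpha
actions = [3, 7, 12, 40]                  # hypothetical legal moves at the root
prior = np.array([0.1, 0.4, 0.3, 0.2])    # re-normalized policy over those moves

noise = np.random.dirichlet(alpha * np.ones(len(actions)))
mixed = 0.75 * prior + 0.25 * noise       # the per-child 'p' stored in the tree
print(mixed, mixed.sum())                 # still a valid distribution (sums to 1)
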
Example #3
    def get_pv(self, root_id):
        # state planes:
        #   s[t], t = 0 .. inplanes-2: one-hot board of every stone belonging to
        #       the player who moved (inplanes - 2 - t) turns ago
        #   s[inplanes-1]: color feature (all 0 if the last move was Black's,
        #       all 1 if it was White's)
        state = utils.get_state_pt(root_id, self.board_size, self.inplanes)
        self.model.eval()  # put dropout / batch norm into evaluation mode
        with torch.no_grad():  # disable gradient tracking to save memory
            # build the input tensor on the target device
            state_input = torch.tensor([state]).to(device).float()
            policy, value = self.model(state_input)  # forward pass
            # policy: higher for moves judged more likely to lead to a win
            p = policy.data.cpu().numpy()[0]
            # value: in (-1, 1); low when the player who just moved is likely to win
            v = value.data.cpu().numpy()[0]

        return p, v
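The comments in Example #3 describe the layout of the feature planes but not how they are built. The sketch below is one plausible construction of that layout from a raw move list; it is a guess at what utils.get_state_pt does, with a made-up board size, plane count, and move sequence, and the real implementation may differ in detail.

import numpy as np

board_size, inplanes = 9, 5
moves = [(4, 4), (4, 5), (3, 3)]        # Black, White, Black (example game)

planes = np.zeros((inplanes, board_size, board_size), dtype=np.float32)
# s[t], t = 0 .. inplanes-2: stones of the player who moved (inplanes-2-t) turns ago
for t in range(inplanes - 1):
    turns_ago = inplanes - 2 - t
    move_idx = len(moves) - 1 - turns_ago
    if move_idx < 0:
        continue                        # not enough history yet: leave the plane empty
    mover = move_idx % 2                # 0 = Black, 1 = White in this sketch
    for i, (r, c) in enumerate(moves[:move_idx + 1]):
        if i % 2 == mover:
            planes[t, r, c] = 1.0
# s[inplanes-1]: color feature, all 0 if the last move was Black's, all 1 if White's
planes[-1, :, :] = float((len(moves) - 1) % 2)
print(planes.shape)                     # (5, 9, 9)
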
Example #4
def self_play(n_selfplay):
    global cur_memory, rep_memory
    global Agent

    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()

    if RESIGN_MODE:
        resign_val_black = []
        resign_val_white = []
        resign_val = []
        resign_v = -1.0
        n_resign_thres = N_SELFPLAY // 4

    for episode in range(n_selfplay):
        if (episode + 1) % 10 == 0:
            logging.warning('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0, )
        win_index = 0
        time_steps = 0
        action_index = None

        if RESIGN_MODE:
            resign_index = 0

        while win_index == 0:
            if PRINT_SELFPLAY:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #

            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = Agent.get_pi(root_id, tau)

            # ===================== collect samples ======================== #

            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #

            action, action_index = utils.get_action(pi)
            root_id += (action_index, )

            # ====================== print evaluation ====================== #

            if PRINT_SELFPLAY:
                Agent.model.eval()
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = Agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                    print('\nPi:\n{}'.format(
                        pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                    print('\nPolicy:\n{}'.format(
                        p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_black.append(v)
                        elif v < resign_v:
                            resign_index = 2
                            if PRINT_SELFPLAY:
                                print('"Black Resign!"')
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_white.append(v)
                        elif v < resign_v:
                            resign_index = 1
                            if PRINT_SELFPLAY:
                                print('"White Resign!"')

            # =========================== step ============================= #

            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #

            if RESIGN_MODE:
                if resign_index != 0:
                    win_index = resign_index
                    result['Resign'] += 1

            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_black:
                                resign_val.append(val)
                            resign_val_black.clear()
                            resign_val_white.clear()

                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_white:
                                resign_val.append(val)
                            resign_val_white.clear()
                            resign_val_black.clear()
                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_black:
                                resign_val.append(val)
                            for val in resign_val_white:
                                resign_val.append(val)
                            resign_val_black.clear()
                            resign_val_white.clear()

                if RESIGN_MODE:
                    if episode + 1 == n_resign_thres:
                        resign_v = min(resign_val)
                        resign_val.clear()

                    if PRINT_SELFPLAY:
                        print('Resign win%: {:.2f}%'.format(
                            (resign_v + 1) / 2 * 100))

            # ====================== store in memory ======================= #

                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

            # =========================  result  =========================== #

                if PRINT_SELFPLAY:
                    utils.render_str(board, BOARD_SIZE, action_index)

                    bw, ww, dr, rs = result['Black'], result['White'], \
                        result['Draw'], result['Resign']
                    print('')
                    print('=' * 20, " {:3} Game End   ".format(episode + 1),
                          '=' * 20)
                    print('Black Win: {:3}   '
                          'White Win: {:3}   '
                          'Draw: {:2}   '
                          'Win%: {:.2f}%'
                          '\nResign: {:2}'.format(bw, ww, dr, (bw + 0.5 * dr) /
                                                  (bw + ww + dr) * 100, rs))
                    print('current memory size:', len(cur_memory))

                Agent.reset()

    rep_memory.extend(utils.augment_dataset(cur_memory, BOARD_SIZE))
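In Example #4 the temperature tau is 1 for the first TAU_THRES moves and 0 afterwards, but Agent.get_pi and utils.get_action themselves are not shown. The sketch below follows the standard AlphaZero convention that this tau handling suggests: pi is proportional to the root visit counts raised to 1/tau, and tau = 0 collapses to the most-visited move. The visit counts are made up.

import numpy as np

def pi_from_visits(visit_counts, tau):
    """Turn MCTS root visit counts into a move distribution."""
    if tau == 0:                        # greedy: all mass on the most-visited move
        pi = np.zeros_like(visit_counts, dtype=np.float64)
        pi[np.argmax(visit_counts)] = 1.0
        return pi
    scaled = visit_counts ** (1.0 / tau)
    return scaled / scaled.sum()

visits = np.array([10., 40., 25., 5.])              # hypothetical visit counts
pi = pi_from_visits(visits, tau=1)
action_index = np.random.choice(len(pi), p=pi)      # sample while tau == 1
print(pi.round(2), action_index)
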
Example #5
def self_play(agent, cur_memory, rank=0):
    agent.model.eval()
    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()
    episode = 0
    while True:
        if (episode + 1) % 10 == 0:
            logging.info('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0, )
        win_index = 0
        time_steps = 0
        action_index = None

        while win_index == 0:
            if PRINT_SELFPLAY and rank == 0:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #

            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = agent.get_pi(root_id, tau, rank)

            # ===================== collect samples ======================== #

            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #

            action, action_index = utils.get_action(pi)
            root_id += (action_index, )

            # ====================== print evaluation ====================== #

            if PRINT_SELFPLAY and rank == 0:
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                    print('\nPi:\n{}'.format(
                        pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                    print('\nPolicy:\n{}'.format(
                        p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))

            # =========================== step ============================= #

            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #

            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1

                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1

                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

            # ====================== store in memory ======================= #

                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

            # =========================  result  =========================== #

                if PRINT_SELFPLAY and rank == 0:
                    utils.render_str(board, BOARD_SIZE, action_index)

                    bw, ww, dr = result['Black'], result['White'], \
                        result['Draw']
                    print('')
                    print('=' * 20, " {:3} Game End   ".format(episode + 1),
                          '=' * 20)
                    print('Black Win: {:3}   '
                          'White Win: {:3}   '
                          'Draw: {:2}   '
                          'Win%: {:.2f}%'.format(bw, ww, dr, (bw + 0.5 * dr) /
                                                 (bw + ww + dr) * 100))
                    print('current memory size:', len(cur_memory))
                episode += 1
                agent.reset()
                if len(cur_memory) >= MEMORY_SIZE:
                    return utils.augment_dataset(cur_memory, BOARD_SIZE)
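Both self-play variants finish by passing cur_memory through utils.augment_dataset, which is not shown in these examples. For a square board the common choice is 8-fold dihedral augmentation (four rotations, each optionally mirrored) applied to the state planes and the pi vector together; the sketch below is that generic technique under made-up shapes, not necessarily the project's implementation.

import numpy as np

def augment_8fold(state, pi, reward, board_size):
    """Yield the 8 symmetric copies of one (state, pi, reward) sample."""
    pi_board = np.asarray(pi).reshape(board_size, board_size)
    s = np.asarray(state)                          # (planes, H, W)
    for k in range(4):
        rs = np.rot90(s, k, axes=(1, 2))           # rotate the spatial axes
        rp = np.rot90(pi_board, k)                 # rotate pi the same way
        yield rs.copy(), rp.flatten(), reward
        yield np.flip(rs, axis=2).copy(), np.fliplr(rp).flatten(), reward

# Usage with a dummy sample (reward is symmetry-invariant, so it is unchanged):
sample = (np.zeros((5, 9, 9)), np.full(81, 1 / 81), 1.0)
augmented = list(augment_8fold(*sample, board_size=9))
print(len(augmented))   # 8
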
Example #6
    def _expansion_evaluation(self, leaf_id, win_index):
        # one-hot Black / White stone positions for the last few turns,
        # plus the color feature plane
        leaf_state = utils.get_state_pt(leaf_id, self.board_size,
                                        self.inplanes)
        self.model.eval()  # put dropout / batch norm into evaluation mode
        with torch.no_grad():  # disable gradient tracking to save memory
            # build the input tensor on the target device
            state_input = torch.tensor([leaf_state]).to(device).float()
            policy, value = self.model(state_input)  # forward pass
            # policy: higher for moves judged more likely to lead to a win
            policy = policy.cpu().numpy()[0]
            # value: in (-1, 1); low when the player who just moved is likely to win
            value = value.cpu().numpy()[0]

        if win_index == 0:  # game not yet decided
            # expansion
            # all legal moves from the leaf position
            actions = utils.legal_actions(leaf_id, self.board_size)
            # array that will hold the re-normalized prior probabilities
            prior_prob = np.zeros(self.board_size**2)

            # re-normalization
            for action_index in actions:
                prior_prob[action_index] = policy[action_index]

            prior_prob /= prior_prob.sum()

            if self.noise:  # generate exploration noise
                # root node noise
                if leaf_id == self.root_id:
                    noise_probs = np.random.dirichlet(self.alpha *
                                                      np.ones(len(actions)))

            # create a child node for every legal move from this leaf
            for i, action_index in enumerate(actions):
                child_id = leaf_id + (action_index, )

                prior_p = prior_prob[action_index]

                if self.noise:
                    if leaf_id == self.root_id:
                        prior_p = 0.75 * prior_p + 0.25 * noise_probs[i]

                # add the child node to the tree
                self.tree[child_id] = {
                    'child': [],
                    'n': 0.,
                    'w': 0.,
                    'q': 0.,
                    'p': prior_p
                }

                # register the move under the leaf's child list
                self.tree[leaf_id]['child'].append(action_index)
            # return value
            reward = False
            return value, reward
        else:  # the game result is already decided
            # terminal node
            # return reward
            reward = 1.
            value = False
            return value, reward
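The re-normalization loop in _expansion_evaluation zeroes out illegal moves and rescales the remaining prior mass. The same step can be written with NumPy fancy indexing; the snippet below is a standalone illustration with made-up numbers, not a drop-in replacement for the method.

import numpy as np

board_size = 3
policy = np.array([0.05, 0.20, 0.05, 0.10, 0.30, 0.05, 0.05, 0.15, 0.05])
actions = [1, 4, 7]                       # hypothetical legal moves

prior_prob = np.zeros(board_size ** 2)
prior_prob[actions] = policy[actions]     # keep only the legal moves
prior_prob /= prior_prob.sum()            # re-normalize over the legal moves
print(prior_prob.round(3))                # roughly 0.308, 0.462, 0.231 at indices 1, 4, 7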