Example #1
 def reset(self):
     '''
     Reset the board state and the done flag, and return a copy of the
     new state
     '''
     self.state_ = gogame.init_state(self.size)
     self.done = False
     return np.copy(self.state_)
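
The reset method above only relies on gogame.init_state and numpy. A minimal standalone sketch of building and inspecting an initial state, assuming gogame and govars are the gym_go.gogame and gym_go.govars modules from the GymGo package, could look like this:

import numpy as np
from gym_go import gogame, govars   # assumption: GymGo package layout

size = 9
state = gogame.init_state(size)     # empty board as a multi-channel numpy array
print(state.shape)                  # (govars.NUM_CHNLS, 9, 9)
print(np.count_nonzero(state[:2]))  # 0 -- no black or white stones yet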
Example #2
 def __init__(self, size, komi=0, reward_method='real'):
     '''
     @param reward_method: either 'heuristic' or 'real'
     heuristic: the reward is the number of black pieces minus the number
         of white pieces
     real: the reward is 0 for an in-game move, 1 for winning, -1 for
         losing, and 0 for a draw, all from the black player's perspective
     '''
     self.size = size
     self.komi = komi
     self.state_ = gogame.init_state(size)
     self.reward_method = RewardMethod(reward_method)
     self.observation_space = gym.spaces.Box(np.float32(0),
                                             np.float32(govars.NUM_CHNLS),
                                             shape=(govars.NUM_CHNLS, size,
                                                    size))
     self.action_space = gym.spaces.Discrete(gogame.action_size(
         self.state_))
     self.done = False
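
The two spaces built in this constructor follow directly from the initial state: the observation is the raw NUM_CHNLS x size x size array, and the discrete action space has one entry per board point plus a pass move. A small sketch, again assuming the gym_go.gogame / gym_go.govars modules and that gogame.action_size counts board points plus pass:

from gym_go import gogame, govars   # assumption: GymGo package layout

size = 5
state = gogame.init_state(size)
print(state.shape)                  # (govars.NUM_CHNLS, 5, 5) -- the observation
print(gogame.action_size(state))    # 26 == 5 * 5 + 1, one action per point plus pass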
Example #3
    def execute_episode(self):

        train_examples = []
        current_player = 1
        state = gogame.init_state(self.args['boardSize'])

        while True:
            #print("while True")
            canonical_board = gogame.canonical_form(state)

            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            action_probs = [0] * (self.args['boardSize'] ** 2 + 1)
            for k, v in root.children.items():
                action_probs[k] = v.visit_count

            action_probs = action_probs / np.sum(action_probs)
            train_examples.append(
                (canonical_board, current_player, action_probs))

            action = root.select_action(temperature=1)
            state = gogame.next_state(state, action, canonical=False)
            current_player = -current_player
            # reward stays None until the game has ended
            reward = (gogame.winning(state) * current_player
                      if gogame.game_ended(state) else None)

            if reward is not None:
                ret = []
                for hist_state, hist_current_player, hist_action_probs in train_examples:
                    # (board planes, action probabilities, reward from
                    # hist_current_player's perspective)
                    tfBoard = np.array(
                        [hist_state[0], hist_state[1],
                         hist_state[3]]).transpose().tolist()
                    ret.append(
                        (tfBoard, hist_action_probs, reward *
                         ((-1)**(hist_current_player != current_player))))
                return ret
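
The episode loop above alternates gogame.next_state calls until gogame.game_ended reports a finished game and gogame.winning provides the result. Stripped of the MCTS policy, the same primitives support a plain random self-play rollout; the sketch below assumes gogame.valid_moves returns a 0/1 mask over all actions (including pass) and that gogame.winning returns +1, -1, or 0 from black's perspective:

import numpy as np
from gym_go import gogame           # assumption: GymGo package layout

def random_episode(board_size=5, seed=0):
    rng = np.random.default_rng(seed)
    state = gogame.init_state(board_size)
    while not gogame.game_ended(state):
        mask = gogame.valid_moves(state)            # assumed 0/1 mask incl. pass
        action = rng.choice(np.flatnonzero(mask))   # uniform random legal move
        state = gogame.next_state(state, action, canonical=False)
    return gogame.winning(state)                    # assumed +1 / -1 / 0 for black

print(random_episode())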
Example #4
    def getInitBoard(self):
        # return initial board (numpy board)

        return gogame.init_state(self.size)