Example 1
def get_board(game):
    board = Board()
    board.init_board(0)

    for move in game.moves:
        board.do_move(move.location)

    return board
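
A minimal usage sketch for this helper, assuming a stored game record whose moves each expose a location attribute (the GameRecord/Move names below are illustrative stand-ins, not the project's own models):

from collections import namedtuple

# Illustrative stand-ins for whatever objects the project stores games in.
Move = namedtuple('Move', ['location'])
GameRecord = namedtuple('GameRecord', ['moves'])

game = GameRecord(moves=[Move(location=112), Move(location=113)])
board = get_board(game)  # replays every stored move onto a fresh Board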
Example 2
class GameStrategy_MZhang():
    def __init__(self, startplayer=0):
        model_file = 'models/resnet/output318/current_policy.model'
        policy_param = None
        self.height = 15
        self.width = 15
        '''if model_file is not None:
            print('loading...', model_file)
            try:
                policy_param = pickle.load(open(model_file, 'rb'))
            except:
                policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')'''
        policy_value_net = PolicyValueNet(self.height,
                                          self.width,
                                          model_file=model_file,
                                          output='output/')
        self.mcts_player = MCTSPlayer(policy_value_net.policy_value_fn,
                                      c_puct=1,
                                      n_playout=1000)
        self.board = Board(width=self.width, height=self.height, n_in_row=5)
        self.board.init_board(startplayer)
        self.game = Game(self.board)
        p1, p2 = self.board.players
        print('players:', p1, p2)
        self.mcts_player.set_player_ind(p1)

    def play_one_piece(self, user, gameboard):
        print('user:', user, 'gameboard:', gameboard.move_history)
        lastm = gameboard.get_lastmove()
        if lastm[0] != -1:
            usr, n, row, col = lastm
            mv = (self.height - row - 1) * self.width + col  # flat move index, counting rows from the bottom
            # if not self.board.states.has_key(mv):
            self.board.do_move(mv)

        print('board:', self.board.states.items())
        move = self.mcts_player.get_action(self.board)
        self.board.do_move(move)
        self.game.graphic(self.board, *self.board.players)
        outmv = (self.height - move // self.width - 1, move % self.width)  # back to (row, col) counted from the top

        return outmv
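
The conversions in play_one_piece rely on one convention: the GUI counts rows from the top, while the engine's flat move index counts rows from the bottom. A self-contained sketch of the two mappings, assuming the 15x15 board used above:

HEIGHT = WIDTH = 15  # board size assumed in the example

def gui_to_move(row, col, height=HEIGHT, width=WIDTH):
    # (row, col) counted from the top -> flat index counted from the bottom
    return (height - row - 1) * width + col

def move_to_gui(move, height=HEIGHT, width=WIDTH):
    # flat index -> (row, col) counted from the top
    return height - move // width - 1, move % width

assert move_to_gui(gui_to_move(3, 7)) == (3, 7)  # the mapping round-trips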
Example 3
class RL_QG_agent(object):
    def __init__(self):
        self.temp = 1e-3  # the temperature param
        self.n_playout = 200  # num of simulations for each move
        self.c_puct = 5
        self.board_width = 8
        self.board_height = 8
        self.model_path = os.path.join("./models/curr_model_100rollout.pt")
        #self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params=None)
        #self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout)
        self.mcts_player = MCTS_Pure(c_puct=5, n_playout=self.n_playout)
        self.env = gym.make("Reversi8x8-v0")
        self.init_model()
        #self.load_model()

    def init_model(self):
        self.board = Board(env=self.env,
                           width=self.board_width,
                           height=self.board_height)
        self.board.init_board()
        self.game = Game(self.board)
        self.have_step = False

    def place(self, state, enables, player=None):
        # Occupancy mask of the internal board: 1 where a square is empty.
        curr_state = bit_to_board(self.board.black, self.board.white)
        curr_state = 1 - (curr_state[0] + curr_state[1])
        # Squares occupied internally but empty in the incoming state: more than
        # one such square means the environment started a new game, so reset.
        reverse_change = np.where((curr_state - state[2]) == -1)
        if not self.have_step:
            pass
        elif reverse_change[0].shape[0] > 1:
            self.board.init_board()
            self.have_step = False
        # Recompute the mask and look for the opponent's newly placed stone.
        curr_state = bit_to_board(self.board.black, self.board.white)
        curr_state = 1 - (curr_state[0] + curr_state[1])
        change = np.where((curr_state - state[2]) == 1)
        if change[0].shape[0] == 1:
            action = change[0][0] * self.board_width + change[1][0]
            self.board.do_move(action)
        else:
            if not self.have_step:
                pass
            else:
                # No new stone found: the opponent passed (65 is the pass move here).
                action = 65
                self.board.do_move(action)

        move = self.mcts_player.get_action(self.board)
        self.board.do_move(move)
        self.have_step = True

        return move

    def load_model(self):
        self.policy_value_net.policy_value_net.load_state_dict(
            torch.load(self.model_path))
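
place() never receives the opponent's move explicitly; it infers it by diffing the internal occupancy mask against the state handed over by the gym environment. A standalone sketch of that diffing idea on plain numpy arrays (function and variable names are illustrative):

import numpy as np

def find_new_stone(prev_empty, curr_empty):
    """Return (row, col) of the single square that became occupied,
    or None if nothing changed (i.e. the opponent passed)."""
    change = np.where((prev_empty - curr_empty) == 1)  # was empty, now occupied
    if change[0].shape[0] == 1:
        return int(change[0][0]), int(change[1][0])
    return None

prev = np.ones((8, 8), dtype=int)  # 1 marks an empty square
curr = prev.copy()
curr[2, 3] = 0                     # the opponent placed a stone at (2, 3)
print(find_new_stone(prev, curr))  # -> (2, 3)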
Example 4
def player_moved():
    receive_data = request.get_json()
    print(receive_data)

    board = Board(width=9, height=9, n_in_row=5)
    board.init_board(0)

    states_loc = receive_data['states_loc']
    if states_loc is not None:
        board.states_loc = states_loc
        board.states_loc_to_states()

    # receive the location of the stone the player just placed
    player_loc = receive_data['player_moved']
    player_move = board.location_to_move(player_loc)
    board.do_move(player_move)
    board.set_forbidden()  # update the forbidden (banned-move) positions

    print(np.array(board.states_loc))
    print(board.states)

    # check for game end (did the player win?)
    end, winner = board.game_end()
    if end:
        if winner == -1: message = "tie"
        else: message = winner

        data = {
            'ai_moved': None,
            'forbidden': board.forbidden_locations,
            'message': message
        }
        return jsonify(data)

    # Send back the position the AI will play.
    # Load the player matching the selected difficulty.
    hard_idx = receive_data['hard_idx']
    hards = [2500, 5000, 7500, 10000, 12500, 15000, 17500, 20000]
    model_file = f'./model/policy_9_{hards[hard_idx]}.model'
    policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')
    best_policy = PolicyValueNetNumpy(9, 9, policy_param)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5,
                             n_playout=400)

    ai_move = mcts_player.get_action(board)
    ai_loc = board.move_to_location(ai_move)
    board.do_move(ai_move)
    board.set_forbidden()  # update the forbidden (banned-move) positions

    print(np.array(board.states_loc))

    # check for game end (did the AI win?)
    message = None
    end, winner = board.game_end()
    if end:
        if winner == -1: message = "tie"
        else: message = winner

    data = {
        'ai_moved': list(map(int, ai_loc)),
        'states_loc': board.states_loc,
        'forbidden': board.forbidden_locations,
        'message': message
    }
    return jsonify(data)
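
A hedged sketch of a matching client call: the handler expects a JSON body with states_loc, player_moved and hard_idx, and answers with the AI move, the updated board and any forbidden points. The URL below assumes the route is mounted as /player_moved on a local dev server:

import requests

payload = {
    'states_loc': None,      # None on the first move, otherwise the board returned previously
    'player_moved': [4, 4],  # the location the human just played
    'hard_idx': 3,           # picks policy_9_10000.model from the hards list above
}
resp = requests.post('http://localhost:5000/player_moved', json=payload)
print(resp.json())           # ai_moved, states_loc, forbidden, message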
Example 5
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        self.manual = Manual(self.board)
        # training params
        self.learn_rate = 1e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 100  # num of simulations for each move
        self.c_puct = 1
        self.buffer_size = 100000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.episode_len = 0
        self.kl_targ = 0.02
        self.check_freq = 1
        self.game_batch_num = 5
        self.best_win_ratio = 0.55
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        self.lock = threading.Lock()
        if init_model:
            # start training from an initial policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file=init_model,
                                                       graph=self.g1,
                                                       output='/data/data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(self.board_width,
                                                             self.board_height,
                                                             model_file=init_model,
                                                             graph=self.g2,
                                                             output='/data/output/')
        else:
            # start training from a new policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       graph=self.g1,
                                                       output='./data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(self.board_width,
                                                             self.board_height,
                                                             graph=self.g2,
                                                             output='./output/')

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(np.flipud(
                    mcts_prob.reshape(self.board_height, self.board_width)), i)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            # self.lock.acquire()
            # print("game {}".format(i))
            with self.g1.as_default():
                '''mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout,
                                         is_selfplay=1)
                board = Board(width=self.board_width,
                              height=self.board_height,
                              n_in_row=self.n_in_row)
                game = Game(board)'''
                winner, play_data = self.game.start_self_play(self.mcts_player,
                                                              is_shown=0,
                                                              temp=self.temp)
            # self.lock.release()

            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

        # print("self play end...")

    def collect_manual_data(self, file):
        winner, play_data = self.manual.read_manual_data(file)
        # read the chess manual fail
        if winner == 0:
            return

        play_data = list(play_data)[:]
        self.episode_len = len(play_data)
        # augment the data
        play_data = self.get_equi_data(play_data)
        self.data_buffer.extend(play_data)

    def collect_test_data(self):
        self.board.init_board()
        states, mcts_probs, current_players = [], [], []
        move = 128
        self.board.do_move(112)
        states.append(self.board.current_state())
        probs = np.zeros(self.board.width * self.board.height)
        probs[[move]] = 1
        mcts_probs.append(probs)
        current_players.append(self.board.current_player)
        winners_z = np.array([1])
        play_data = zip(states, mcts_probs, winners_z)
        play_data = list(play_data)[:]
        self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        with self.g2.as_default():
            for i in range(self.epochs):
                loss, entropy = self.policy_value_net_train.train_step(
                        state_batch,
                        mcts_probs_batch,
                        winner_batch,
                        self.learn_rate*self.lr_multiplier)

        print((
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               ).format(
                        self.lr_multiplier,
                        loss,
                        entropy))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        print("evaluating...")
        current_mcts_player = MCTSPlayer(self.policy_value_net_train.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.pure_mcts_playout_num)
        best_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.pure_mcts_playout_num)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          best_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
                self.pure_mcts_playout_num,
                win_cnt[1], win_cnt[2], win_cnt[-1]))

        # save the current_model
        self.policy_value_net_train.save_model('/data/output/current_policy.model')
        if win_ratio > self.best_win_ratio:
            print("New best policy!!!!!!!!")
            # update the best_policy
            self.policy_value_net_train.save_model('/data/output/best_policy.model')
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file='/data/output/best_policy.model',
                                                       graph=self.g1,
                                                       output='/data/data/')

        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            '''coord = tf.train.Coordinator()
            self_play = [threading.Thread(target=self.collect_selfplay_data, args=(self.play_batch_size,)) for i in range(4)]
            for sp in self_play:
                sp.start()
            coord.join(self_play)
            while len(self.data_buffer) < self.batch_size:
                print(len(self.data_buffer))
                time.sleep(3)
                pass'''
            multiplier = [0.1, 0.1, 0.01, 0.01, 0.01]
            step = 0
            for n in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                # self.collect_test_data()
                self.policy_value_net.n_step += 1

                print("batch i:{}, episode_len:{}".format(
                   self.policy_value_net.n_step, self.episode_len))

                # optimisation
                if len(self.data_buffer) > self.batch_size:
                    for i in range(100):
                        self.policy_update()

                # evaluation
                if self.policy_value_net.n_step % self.check_freq == 0:
                    # self.lr_multiplier = multiplier[step]
                    # step += 1
                    self.mcts_player.mcts._discount = 1 - 0.98*(1 - self.mcts_player.mcts._discount)
                    print("current self-play batch: {}, discount: {}".format(
                        self.policy_value_net.n_step, self.mcts_player.mcts._discount))

                    # self.lock.acquire()
                    self.policy_evaluate(n_games=15)
                    # self.lock.release()
        except KeyboardInterrupt:
            print('\n\rquit')
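
The evaluation branch in run() gradually anneals the MCTS discount towards 1 with discount ← 1 − 0.98·(1 − discount), i.e. the gap to 1 shrinks by 2% per check step. A quick numeric sketch (the starting value is illustrative; the real value lives in mcts_player.mcts._discount):

discount = 0.5  # illustrative starting value
for step in range(1, 6):
    discount = 1 - 0.98 * (1 - discount)
    print(step, round(discount, 4))
# 1 0.51, 2 0.5198, 3 0.5294, ... the gap (1 - discount) decays as 0.98**step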
Example 6
class GoBang(QWidget):
    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        self.tup = (None, None)
        self.board = Board()  # board object
        self.board.init_board(1)
        palette1 = QPalette()  # set the board background
        palette1.setBrush(self.backgroundRole(),
                          QtGui.QBrush(QtGui.QPixmap('img/linesofaction.png')))
        self.setPalette(palette1)
        # self.setStyleSheet("board-image:url(img/chessboard.jpg)")  # not sure why this doesn't work
        self.setCursor(Qt.PointingHandCursor)  # change the cursor to a pointing hand
        # self.sound_piece = QSound("sound/luozi.wav")  # load the stone-placement sound
        # self.sound_win = QSound("sound/win.wav")  # load the victory sound
        # self.sound_defeated = QSound("sound/defeated.wav")  # load the defeat sound

        self.resize(WIDTH, HEIGHT)  # fixed size 540*540
        self.setMinimumSize(QtCore.QSize(WIDTH, HEIGHT))
        self.setMaximumSize(QtCore.QSize(WIDTH, HEIGHT))

        self.setWindowTitle("Lines-Of-Action")  # window title
        self.setWindowIcon(QIcon('img/black.png'))  # window icon

        # self.lb1 = QLabel('            ', self)
        # self.lb1.move(20, 10)

        self.black = QPixmap('img/black.png')
        self.white = QPixmap('img/white.png')

        self.piece_now = BLACK  # black moves first
        self.my_turn = True  # the player moves first
        self.step = 0  # move count
        self.x, self.y = 1000, 1000

        #self.mouse_point = LaBel(self)  # replace the mouse cursor image with a stone
        # self.mouse_point.setScaledContents(True)
        # self.mouse_point.setPixmap(self.black)  # load the black stone
        # self.mouse_point.setGeometry(270, 270, PIECE, PIECE)
        self.pieces = [[
            LaBel(self),
            LaBel(self),
            LaBel(self),
            LaBel(self),
            LaBel(self),
            LaBel(self),
            LaBel(self),
            LaBel(self)
        ] for _ in range(8)]  # create piece labels for drawing stones on the board
        # for piece in self.pieces:
        #     piece.setVisible(True)  # make the image visible
        #     piece.setScaledContents(True)  # image scales with the label size
        for i in range(8):
            for j in range(8):
                self.pieces[i][j].setVisible(True)
                self.pieces[i][j].setScaledContents(True)
        #self.mouse_point.raise_()  # keep the mouse cursor image on top
        self.ai_down = True  # AI has finished its move; acts as a lock: while False the AI is still thinking, so player clicks in mousePressEvent must be ignored

        self.setMouseTracking(True)

        self.DrawPieces()

        self.show()

    def DrawPieces(self):
        for i in range(8):
            for j in range(8):
                if self.board.map[i][j] == -1:
                    x, y = self.coordinate_transform_map2pixel(i, j)
                    self.pieces[i][j].setPixmap(self.black)
                    self.pieces[i][j].setGeometry(x, y, PIECE, PIECE)
                if self.board.map[i][j] == 1:
                    x, y = self.coordinate_transform_map2pixel(i, j)
                    self.pieces[i][j].setPixmap(self.white)
                    self.pieces[i][j].setGeometry(x, y, PIECE, PIECE)
                if self.board.map[i][j] == 0:
                    x, y = self.coordinate_transform_map2pixel(i, j)
                    self.pieces[i][j].setPixmap(QPixmap(""))
                    self.pieces[i][j].setGeometry(x, y, PIECE, PIECE)

    def paintEvent(self, event):  # draw the indicator arrow
        qp = QPainter()
        qp.begin(self)
        self.drawLines(qp)
        qp.end()

    def mouseMoveEvent(self, e):  # the black stone follows the mouse
        # self.lb1.setText(str(e.x()) + ' ' + str(e.y()))
        # self.mouse_point.move(e.x() - 16, e.y() - 16)
        e.accept()

    def mousePressEvent(self, e):  # the player makes a move
        if e.button() == Qt.LeftButton and self.ai_down:
            x, y = e.x(), e.y()  # mouse coordinates
            i, j = self.coordinate_transform_pixel2map(x, y)  # corresponding board coordinates
            if i is not None and j is not None:  # the click lands on the board, excluding the edges
                new_x, new_y = self.coordinate_transform_map2pixel(i, j)  # computed only for valid squares so an off-board click cannot crash
                if self.board.map[i][j] == -1:  # the human player plays black
                    self.board.current_player = 1
                    # self.draw(i, j)
                    print(self.tup)
                    self.tup = (i, j)
                else:
                    (old_i, old_j) = self.tup
                    if old_i is None or old_j is None:
                        return
                    moves = self.board.get_available(1)
                    my_move = str(old_i) + str(old_j) + str(i) + str(j)
                    print("人类当前走法")
                    print(moves)
                    print("人类当前棋盘")
                    print(self.board.map)
                    if my_move in moves:
                        self.board.do_move(my_move)
                        self.pieces[old_i][old_j].setPixmap(QPixmap(""))
                        self.pieces[i][j].setPixmap(self.black)
                        self.pieces[i][j].setGeometry(new_x, new_y, PIECE,
                                                      PIECE)
                        end, winner = self.board.game_end()
                        if end:
                            self.gameover(winner)
                        if self.board.current_player == 2:
                            self.ai_down = False
                            board = self.board
                            self.AI = AI(board)  # create a worker thread, passing in the board
                            self.AI.finishSignal.connect(
                                self.AI_draw)  # when the thread finishes, its signal carries the move back
                            self.AI.start()
                    else:
                        print("error move")
                    # run

    def AI_draw(self, i, j, nxt_i, nxt_j):
        print(i, j, nxt_i, nxt_j)
        self.pieces[i][j].setPixmap(QPixmap(""))
        self.pieces[nxt_i][nxt_j].setPixmap(self.white)  # AI
        x, y = self.coordinate_transform_map2pixel(nxt_i, nxt_j)
        self.pieces[nxt_i][nxt_j].setGeometry(x, y, PIECE, PIECE)
        end, winner = self.board.game_end()
        if end:
            self.gameover(winner)
        self.ai_down = True
        self.update()

    def drawLines(self, qp):  # marks the piece the AI just played
        if self.step != 0:
            pen = QtGui.QPen(QtCore.Qt.black, 2, QtCore.Qt.SolidLine)
            qp.setPen(pen)
            qp.drawLine(self.x - 5, self.y - 5, self.x + 3, self.y + 3)
            qp.drawLine(self.x + 3, self.y, self.x + 3, self.y + 3)
            qp.drawLine(self.x, self.y + 3, self.x + 3, self.y + 3)

    def coordinate_transform_map2pixel(self, i, j):
        # convert logical coordinates in chessMap to UI drawing coordinates
        return MARGIN + j * GRID - PIECE / 2, MARGIN + i * GRID - PIECE / 2

    def coordinate_transform_pixel2map(self, x, y):
        # convert UI drawing coordinates to logical coordinates in chessMap
        i, j = int(round((y - MARGIN) / GRID)), int(round((x - MARGIN) / GRID))
        # MARGIN keeps edge clicks from pushing i, j out of range on the 8x8 board
        if i < 0 or i >= 8 or j < 0 or j >= 8:
            return None, None
        else:
            return i, j

    def gameover(self, winner):
        if winner == 1:
            #self.sound_win.play()
            reply = QMessageBox.question(self, 'You Win!', 'Continue?',
                                         QMessageBox.Yes | QMessageBox.No,
                                         QMessageBox.No)
        else:
            if winner == 2:
                #self.sound_defeated.play()
                reply = QMessageBox.question(self, 'You Lost!', 'Continue?',
                                             QMessageBox.Yes | QMessageBox.No,
                                             QMessageBox.No)
            else:
                reply = QMessageBox.question(self, 'Tie', 'Continue?',
                                             QMessageBox.Yes | QMessageBox.No,
                                             QMessageBox.No)

        if reply == QMessageBox.Yes:  # reset
            # self.piece_now = BLACK
            # self.mouse_point.setPixmap(self.black)
            # self.step = 0
            # for piece in self.pieces:
            #     piece.clear()
            # self.chessboard.reset()
            self.board.init_board(1)
            self.ai_down = True
            self.board.current_player = 1
            self.DrawPieces()
            self.update()
        else:
            self.close()
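
For completeness, the standard PyQt5 entry point such a widget is normally launched with (not shown in the excerpt; WIDTH, HEIGHT, PIECE, the Board class and the image assets are assumed to come from the surrounding module):

import sys
from PyQt5.QtWidgets import QApplication

if __name__ == '__main__':
    app = QApplication(sys.argv)  # owns the Qt event loop
    window = GoBang()             # builds the UI in initUI() and already calls show()
    sys.exit(app.exec_())         # run until the window is closed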