コード例 #1
0
 def reset(self):
     """Reset all per-episode state for a fresh game."""
     self.tetromino = Tetromino(isRandomNextPiece=False)
     # Currently falling piece
     self.fallpiece = self.tetromino.getnewpiece()
     # Next piece queued to fall
     self.nextpiece = self.tetromino.getnewpiece()
     # Whether the episode has ended
     self.terminal = False
     # Total score
     self.score = 0
     # Reward earned by the current step
     self.reward = 0
     # Current level
     self.level = 0
     # Total number of steps taken
     self.steps = 0
     # Steps taken by the current piece
     self.piecesteps = 0
     # Number of pieces spawned so far
     self.piececount = 0
     # Board grid
     self.board = self.tetromino.getblankboard()
     # State: 0 = falling, 1 = piece changed, 2 = game over
     self.state = 0
     # Snapshot(s) of the previous falling piece
     self.prev_fallpiece_boards = None
     # Landing height recorded for each piece
     self.pieces_height = []
     # Rolling window of falling-piece frames: 9 zero frames then the
     # current frame, so the deque always holds 10 entries.
     self.fallpiece_status = deque(maxlen=10)
     for i in range(9):
         self.fallpiece_status.append(np.zeros((self.height, self.width)))
     self.fallpiece_status.append(self.get_fallpiece_board())
     # Actions available for the next step
     self.availables = self.get_availables()
コード例 #2
0
ファイル: 01.py プロジェクト: one-leaf/pytorch
                self.tetromino, self.board, self.fallpiece)  # 计算下一步最佳分值
        return reward, screen_image, is_terminal, shape, self.rewards

    def autoStep(self):
        """Advance one step toward the precomputed best placement.

        Issues exactly one action per call: rotate until the target
        rotation ``reward_r`` is reached, then shift horizontally toward
        the target column ``reward_x``, and finally pass ``None`` (let the
        piece fall) once aligned.
        """
        piece = self.fallpiece
        if piece['rotation'] != self.reward_r:
            action = KEY_ROTATION
        elif piece['x'] > self.reward_x:
            action = KEY_LEFT
        elif piece['x'] < self.reward_x:
            action = KEY_RIGHT
        else:
            action = None
        self.step(action)


def main(agent):
    """Run the game loop forever, driving *agent* one auto-step per frame."""
    while True:
        # time.sleep(1)
        # The event queue must be drained each frame, otherwise the
        # pygame window goes blank.
        for evt in pygame.event.get():
            if evt.type != QUIT:
                continue
            pygame.quit()
            sys.exit()

        agent.autoStep()


if __name__ == "__main__":
    # Build the game core, wrap it in an agent, and enter the event loop.
    tetromino = Tetromino()
    agent = Agent(tetromino)
    main(agent)
コード例 #3
0
class Agent(object):
    """Tetris environment wrapper used as an RL game agent.

    Tracks the falling piece, the board, scoring and step statistics, and
    exposes the action/observation interface (``step``, ``current_state``,
    ``get_availables``) consumed by an MCTS player.
    """

    def __init__(self, isRandomNextPiece=False):
        # Board dimensions: 10 columns x 20 rows.
        self.width = 10
        self.height = 20
        self.actions_num = len(ACTIONS)
        self.isRandomNextPiece = isRandomNextPiece
        self.reset()

    def reset(self):
        """Reset all per-episode state for a fresh game."""
        self.tetromino = Tetromino(isRandomNextPiece=self.isRandomNextPiece)
        # Currently falling piece
        self.fallpiece = self.tetromino.getnewpiece()
        # Next piece queued to fall
        self.nextpiece = self.tetromino.getnewpiece()
        # Whether the episode has ended
        self.terminal = False
        # Total score
        self.score = 0
        # Reward earned by the current step
        self.reward = 0
        # Current level
        self.level = 0
        # Total number of steps taken
        self.steps = 0
        # Steps taken by the current piece
        self.piecesteps = 0
        # Number of pieces spawned so far
        self.piececount = 0
        # Current maximum stack height
        self.pieceheight = 0
        # Board grid
        self.board = self.tetromino.getblankboard()
        # State: 0 = falling, 1 = piece changed, 2 = game over
        self.state = 0
        # Snapshot(s) of the previous falling piece
        self.prev_fallpiece_boards = None
        # Landing height recorded for each piece
        self.pieces_height = []
        # Rolling window of board frames: 9 zero frames then the current
        # frame, so the deque always holds 10 entries.
        self.status = deque(maxlen=10)
        for i in range(9):
            self.status.append(np.zeros((self.height, self.width)))
        self.status.append(self.get_fallpiece_board())
        # Actions available for the next step
        self.availables = self.get_availables()
        # Whether to print MCTS intermediate output
        self.show_mcts_process = False
        # Precomputed positional-encoding grid (sin/cos)
        self.pos_board = self.get_board_pos()
        # Hash key of the latest status frame
        self.key = 0

    # Convert a probability index position to an action.
    def position_to_action(self, position):
        return ACTIONS[position]

    def position_to_action_name(self, position):
        return ACTIONS_NAME[position]

    def positions_to_actions(self, positions):
        return [self.position_to_action(i) for i in positions]

    # Convert an action to its probability index position.
    # Actions are their own indices, so this is the identity mapping.
    def action_to_position(self, action):
        return action

    def actions_to_positions(self, actions):
        return [act for act in actions]

    # Get the available actions; rotation is kept whenever it is valid.
    # (Turns the single-player game into a two-player contest: one plays
    # normally, the other only drops.)
    def get_availables(self):
        acts = [KEY_ROTATION, KEY_LEFT, KEY_RIGHT, KEY_DOWN, KEY_NONE]

        # Drop any move that would leave the piece in an invalid position.
        if not self.tetromino.validposition(self.board, self.fallpiece, ax=-1):
            acts.remove(KEY_LEFT)
        if not self.tetromino.validposition(self.board, self.fallpiece, ax=1):
            acts.remove(KEY_RIGHT)
        if not self.tetromino.validposition(self.board, self.fallpiece, ay=1):
            acts.remove(KEY_DOWN)

        # The "o" piece never benefits from rotating.
        if self.fallpiece['shape'] == "o":
            acts.remove(KEY_ROTATION)
        else:
            # Temporarily apply the rotation to test validity, then restore.
            r = self.fallpiece['rotation']
            self.fallpiece['rotation'] = (self.fallpiece['rotation'] +
                                          1) % len(
                                              pieces[self.fallpiece['shape']])
            if not self.tetromino.validposition(self.board, self.fallpiece):
                acts.remove(KEY_ROTATION)
            self.fallpiece['rotation'] = r

    # if not KEY_DOWN in acts : acts.append(KEY_NONE)

        random.shuffle(acts)

        return acts

    def game_end(self):
        return self.terminal

    def step(self, action, env=None):
        """Apply one action; returns ``(state, reward)``.

        State: 0 = still falling, 1 = piece changed, 2 = game over.
        """

        self.reward = 0
        self.steps += 1
        self.piecesteps += 1
        self.level, self.fallfreq = self.tetromino.calculate(self.score)

        # self.actions.append(action)

        # Horizontal moves and rotation are applied only when valid.
        if action == KEY_LEFT and self.tetromino.validposition(
                self.board, self.fallpiece, ax=-1):
            self.fallpiece['x'] -= 1

        if action == KEY_RIGHT and self.tetromino.validposition(
                self.board, self.fallpiece, ax=1):
            self.fallpiece['x'] += 1

        if (action == KEY_DOWN) and self.tetromino.validposition(
                self.board, self.fallpiece, ay=1):
            self.fallpiece['y'] += 1

        if action == KEY_ROTATION:
            self.fallpiece['rotation'] = (self.fallpiece['rotation'] +
                                          1) % len(
                                              pieces[self.fallpiece['shape']])
            # Undo the rotation if it produced an invalid position.
            if not self.tetromino.validposition(self.board, self.fallpiece):
                self.fallpiece['rotation'] = (
                    self.fallpiece['rotation'] - 1) % len(
                        pieces[self.fallpiece['shape']])

        # Gravity: the piece falls one row per step until it cannot.
        isFalling = True
        if self.tetromino.validposition(self.board, self.fallpiece, ay=1):
            self.fallpiece['y'] += 1
        else:
            isFalling = False

        fallpiece_y = self.fallpiece['y']

        # Record the combined (piece + board) frame and refresh the hash key.
        self.status.append(self.get_fallpiece_board() + self.getBoard())
        self.set_key()

        if not isFalling:
            # Piece has landed: merge into the board and clear lines.
            self.tetromino.addtoboard(self.board, self.fallpiece)
            self.reward = self.tetromino.removecompleteline(self.board)

            self.score += self.reward
            self.pieceheight = self.getMaxHeight()
            self.pieces_height.append(20 - fallpiece_y - self.reward)
            self.fallpiece = None

        if env:
            env.checkforquit()
            env.render(self.board, self.score, self.level, self.fallpiece,
                       self.nextpiece)

        if not isFalling:
            # Spawn the next piece.
            self.fallpiece = self.nextpiece
            self.nextpiece = self.tetromino.getnewpiece()
            self.piecesteps = 0
            self.piececount += 1

            # Game over when the new piece cannot move down at all.
            if not self.tetromino.validposition(
                    self.board, self.fallpiece, ay=1):
                self.terminal = True
                self.state = 2
                self.availables = [KEY_NONE]
                return self.state, self.reward
            else:
                self.state = 1
        else:
            self.state = 0

        self.availables = self.get_availables()

        return self.state, self.reward

    def set_key(self):
        # Hash the most recent status frame's raw bytes as the state key.
        info = self.status[-1]
        self.key = hash(info.data.tobytes())

    def get_key(self):
        return self.key

    # Print the latest status frame as ASCII art.
    def print2(self):
        info = self.status[-1]
        for y in range(self.height):
            line = str(y % 10) + " "
            for x in range(self.width):
                if info[y][x] == 0:
                    line = line + "  "
                else:
                    line = line + "* "
            print(line)
        print(" " + " -" * self.width)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,
              "piececount:", self.piececount)

    # Print the board grid as ASCII art.
    # NOTE(review): shadows the builtin `print` inside this class namespace.
    def print(self):
        for y in range(self.height):
            line = "| "
            for x in range(self.width):
                if self.board[x][y] == blank:
                    line = line + "  "
                else:
                    line = line + str(self.board[x][y]) + " "
            print(line)
        print(" " + " -" * self.width)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,
              "piececount:", self.piececount)

    # Current maximum stack height (rows from the topmost filled cell
    # down to the bottom of the board).
    def getMaxHeight(self):
        c = -1
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    c = y
                    break
            if c != -1: break
        h = 0 if c == -1 else self.height - c
        return h

    # Count non-empty cells, weighted so lower rows count more
    # (each row's count is scaled by 0.9^(rows above the bottom)).
    def getNoEmptyCount(self):
        c = 0
        for y in range(self.height):
            line_c = 0
            for x in range(self.width):
                if self.board[x][y] != blank:
                    line_c += 1
            c += line_c * (0.9**(self.height - y - 1))

            # if line_c == 9: c += 1
        return c

    # Score the board, counting only covered holes: for each column,
    # empty cells below the first filled cell are penalized relative to
    # the number of filled cells. Returns a value in [-1, 0].
    def getScore(self):
        empty_count = 0
        fill_count = 0
        for x in range(self.width):
            line_f, line_e = 0, -1
            for y in range(self.height):
                if self.board[x][y] != blank:
                    line_f += 1
                    if line_e == -1: line_e = 0
                else:
                    if line_e != -1: line_e += 1
            empty_count += line_e
            fill_count += line_f

        if fill_count == 0: return 0
        return max(-1, -1 * (empty_count / fill_count))

    # Current board occupancy as a (height, width) 0/1 array.
    def getBoard(self):
        board = np.zeros((self.height, self.width))
        # Fill in the currently occupied cells.
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    board[y][x] = 1
        return board

    # Falling-piece occupancy as a (height, width) array (cells = -1).
    def get_fallpiece_board(self):
        board = np.zeros((self.height, self.width))
        # Stamp the current falling piece onto the frame.
        if self.fallpiece != None:
            piece = self.fallpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x] != blank:
                        px, py = x + piece['x'], y + piece['y']
                        # Skip cells still above/left of the visible board.
                        if px >= 0 and py >= 0:
                            board[y + piece['y']][x + piece['x']] = -1
        return board

    # Next-piece occupancy drawn at the board origin (cells = -1).
    def get_nextpiece_borad(self):
        board = np.zeros((self.height, self.width))
        if self.nextpiece != None:
            piece = self.nextpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x] != blank:
                        board[y][x] = -1
        return board

    # Positional-encoding grid: alternating sin/cos over cell index.
    def get_board_pos(self):
        pos = []
        size = self.width * self.height
        for i in range(size):
            if i % 2 == 0:
                pos.append(math.sin(i / size))
            else:
                pos.append(math.cos(i / size))
        pos = np.array(pos).reshape((self.height, self.width))
        return pos

    # Full feature set for the current position.
    # background + last 2 moves = 3 planes
    # Returns [3, height, width].
    def current_state(self):
        state = np.zeros((3, self.height, self.width))
        for i in range(3):
            state[-1 * (i + 1)] = self.status[-1 * (i + 1)]

        return state
コード例 #4
0
ファイル: agent.py プロジェクト: one-leaf/pytorch
 def __init__(self):
     """Build the game core and initialize a 10x20 board agent."""
     self.tetromino = Tetromino(isRandomNextPiece=False)
     # Board dimensions: 10 columns x 20 rows.
     self.width = 10
     self.height = 20
     self.actions_num = len(ACTIONS)
     self.reset()
コード例 #5
0
ファイル: agent.py プロジェクト: one-leaf/pytorch
class Agent(object):
    """Tetris environment framed as a two-player game for MCTS self-play.

    Player 0 chooses moves normally; player 1 may only drop. The agent
    tracks the board, the falling piece, scoring/step statistics, and a
    per-piece history of falling-piece frames used as network features.
    """

    def __init__(self):
        self.tetromino = Tetromino(isRandomNextPiece=False)
        # Board dimensions: 10 columns x 20 rows.
        self.width = 10
        self.height = 20
        self.actions_num = len(ACTIONS)
        self.reset()

    def reset(self):
        """Reset all per-episode state for a fresh game."""
        # Currently falling piece
        self.fallpiece = self.tetromino.getnewpiece()
        # Next piece queued to fall
        self.nextpiece = self.tetromino.getnewpiece()
        # Whether the episode has ended
        self.terminal = False
        # Total score
        self.score = 0
        # Reward earned by the current step
        self.reward = 0
        # Current level
        self.level = 0
        # Total number of steps taken
        self.steps = 0
        # Steps taken by the current piece
        self.piecesteps = 0
        # Number of pieces spawned so far
        self.piececount = 0
        # Board grid
        self.board = self.tetromino.getblankboard()
        # State: 0 = falling, 1 = piece changed, 2 = game over
        self.state = 0
        # Snapshot(s) of the previous falling piece
        self.prev_fallpiece_boards = None
        # Current player (0 = normal mover, 1 = drop-only)
        self.curr_player = 0
        # Maximum piece count limit (0 = unlimited)
        self.limit_piece_count = 0
        # Landing height recorded for each piece
        self.pieces_height = []
        # Frames of the current piece's fall, reset on each new piece
        self.fallpiece_status = [self.get_fallpiece_board()]
        # Action to exclude from the available set (None = exclude nothing)
        self.ig_action = None
        # Actions available for the next step
        self.availables = self.get_availables()
        # Maximum allowed stack height (-1 = not set yet)
        self.limit_max_height = -1
        # Game action log
        # self.actions=[]

    # Convert a probability index position to an action.
    def position_to_action(self, position):
        return ACTIONS[position]

    def position_to_action_name(self, position):
        return ACTIONS_NAME[position]

    def positions_to_actions(self, positions):
        return [self.position_to_action(i) for i in positions]

    # Convert an action to its probability index position.
    # Actions are their own indices, so this is the identity mapping.
    def action_to_position(self, action):
        return action

    def actions_to_positions(self, actions):
        return [act for act in actions]

    # Get the available actions; rotation is kept whenever it is valid.
    # (Turns the single-player game into a two-player contest: one plays
    # normally, the other only drops.)
    def get_availables(self):
        # The drop-only player always has exactly one choice.
        if self.curr_player == 1: return [
                KEY_DOWN,
        ]
        # if self.fallpiece['y']>10: return [KEY_NONE,]

        acts = [KEY_ROTATION, KEY_LEFT, KEY_RIGHT, KEY_NONE, KEY_DOWN]

        # Drop any move that would leave the piece in an invalid position.
        if not self.tetromino.validposition(self.board, self.fallpiece, ax=-1):
            acts.remove(KEY_LEFT)
        if not self.tetromino.validposition(self.board, self.fallpiece, ax=1):
            acts.remove(KEY_RIGHT)
        if not self.tetromino.validposition(self.board, self.fallpiece, ay=1):
            acts.remove(KEY_DOWN)

        # The "o" piece never benefits from rotating.
        if self.fallpiece['shape'] == "o":
            acts.remove(KEY_ROTATION)
        else:
            # Temporarily apply the rotation to test validity, then restore.
            r = self.fallpiece['rotation']
            self.fallpiece['rotation'] = (self.fallpiece['rotation'] +
                                          1) % len(
                                              pieces[self.fallpiece['shape']])
            if not self.tetromino.validposition(self.board, self.fallpiece):
                acts.remove(KEY_ROTATION)
            self.fallpiece['rotation'] = r

        random.shuffle(acts)
        # Remove the ignored action, but never leave fewer than one option.
        if self.ig_action != None and len(acts) >= 2:
            if self.ig_action in acts:
                acts.remove(self.ig_action)

        return acts

    def step(self, action, env=None):
        """Apply one action; returns ``(state, reward)``.

        State: 0 = still falling, 1 = piece changed, 2 = game over.
        Alternates ``curr_player`` between 0 and 1 on every call.
        """

        self.reward = 0
        self.steps += 1
        self.piecesteps += 1
        # self.curr_player = (self.curr_player+1)%2
        self.curr_player = 1 if self.curr_player == 0 else 0
        self.level, self.fallfreq = self.tetromino.calculate(self.score)
        # self.actions.append(action)

        # Horizontal moves and rotation are applied only when valid.
        if action == KEY_LEFT and self.tetromino.validposition(
                self.board, self.fallpiece, ax=-1):
            self.fallpiece['x'] -= 1

        if action == KEY_RIGHT and self.tetromino.validposition(
                self.board, self.fallpiece, ax=1):
            self.fallpiece['x'] += 1

        if (action == KEY_DOWN) and self.tetromino.validposition(
                self.board, self.fallpiece, ay=1):
            self.fallpiece['y'] += 1

        if action == KEY_ROTATION:
            self.fallpiece['rotation'] = (self.fallpiece['rotation'] +
                                          1) % len(
                                              pieces[self.fallpiece['shape']])
            # Undo the rotation if it produced an invalid position.
            if not self.tetromino.validposition(self.board, self.fallpiece):
                self.fallpiece['rotation'] = (
                    self.fallpiece['rotation'] - 1) % len(
                        pieces[self.fallpiece['shape']])

        # if self.tetromino.validposition(self.board,self.fallpiece,ay = 1):
        #     self.fallpiece['y'] +=1

        fallpiece_y = self.fallpiece['y']

        # Record this frame of the piece's fall.
        self.fallpiece_status.append(self.get_fallpiece_board())

        if not self.tetromino.validposition(self.board, self.fallpiece, ay=1):
            # Piece has landed: merge into the board and clear lines.
            self.tetromino.addtoboard(self.board, self.fallpiece)
            self.reward = self.tetromino.removecompleteline(self.board)
            self.score += self.reward
            self.level, self.fallfreq = self.tetromino.calculate(self.score)
            fallpiece_y += self.reward

            # Small landing-height bonus when lines were cleared.
            r = 0.5 if self.reward > 0 else 0

            self.pieces_height.append(20 - fallpiece_y - r)
            self.fallpiece = None

        if env:
            env.checkforquit()
            env.render(self.board, self.score, self.level, self.fallpiece,
                       self.nextpiece)

        if self.fallpiece == None:
            # Spawn the next piece and restart its frame history.
            self.fallpiece = self.nextpiece
            self.nextpiece = self.tetromino.getnewpiece()
            self.piecesteps = 0
            self.piececount += 1
            self.state = 1
            self.fallpiece_status = [self.get_fallpiece_board()]

            # print(self.limit_piece_count, self.piececount)
            # Game over: the new piece has no valid position, or the stack
            # exceeds the configured height limit.
            if (not self.tetromino.validposition(self.board,self.fallpiece)) or \
                (self.limit_max_height>0 and self.getMaxHeight()>self.limit_max_height):
                self.terminal = True
                # NOTE(review): state stays 1 here although the docstring
                # reserves 2 for game over — confirm callers rely on
                # `terminal` rather than `state` to detect the end.
                self.state = 1
                self.reward = -1
                # print(">>>>>", len(self.pieces_height), self.pieces_height, self.actions[-10:])
                return self.state, self.reward  #

            # if (not self.tetromino.validposition(self.board,self.fallpiece)):
            #     self.terminal = True
            #     self.state = 2
            #     return self.state, self.reward #
        else:
            self.state = 0

        # During early training, any score could end the game:
        # if reward>0: self.terminal=True

        self.availables = self.get_availables()

        return self.state, self.reward

    def get_key(self, is_include_player=True):
        """Hash of the current observation, optionally keyed by player."""
        info = self.getBoard() + self.fallpiece_status[-1]
        # Shift the whole frame by 1 for player 1 so the two players'
        # otherwise-identical positions hash differently.
        if is_include_player and self.curr_player == 1:
            info = info + np.ones((self.height, self.width))
        return hash(info.data.tobytes())

    # Print the board (optionally with the falling piece) as ASCII art.
    def print2(self, add_fallpiece=False):
        info = self.getBoard()
        if add_fallpiece:
            info += self.fallpiece_status[-1]
        for y in range(self.height):
            line = ""
            for x in range(self.width):
                if info[y][x] == 0:
                    line = line + "  "
                else:
                    line = line + "* "
            print(line)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,
              "piececount:", self.piececount)

    # Print the raw board grid as ASCII art.
    # NOTE(review): shadows the builtin `print` inside this class namespace.
    def print(self):
        for y in range(self.height):
            line = "| "
            for x in range(self.width):
                if self.board[x][y] == blank:
                    line = line + "  "
                else:
                    line = line + str(self.board[x][y]) + " "
            print(line)
        print(" " + " -" * self.width)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,
              "piececount:", self.piececount)

    # Current maximum stack height.
    # NOTE(review): a filled cell in row 0 leaves c == 0 and reports
    # height 0 — the sibling implementation uses a -1 sentinel instead;
    # confirm which behavior is intended.
    def getMaxHeight(self):
        c = 0
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    c = y
                    break
            if c != 0: break
        h = 0 if c == 0 else self.height - c
        return h

    # Current board occupancy as a (height, width) 0/1 array.
    def getBoard(self):
        board = np.zeros((self.height, self.width))
        # Fill in the currently occupied cells.
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    board[y][x] = 1
        return board

    # Falling-piece occupancy as a (height, width) 0/1 array.
    def get_fallpiece_board(self):
        board = np.zeros((self.height, self.width))
        # Stamp the current falling piece onto the frame.
        if self.fallpiece != None:
            piece = self.fallpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x] != blank:
                        px, py = x + piece['x'], y + piece['y']
                        # Skip cells still above/left of the visible board.
                        if px >= 0 and py >= 0:
                            board[y + piece['y']][x + piece['x']] = 1
        return board

    # Next-piece occupancy drawn at the board origin.
    def get_nextpiece_borad(self):
        board = np.zeros((self.height, self.width))
        if self.nextpiece != None:
            piece = self.nextpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x] != blank:
                        board[y][x] = 1
        return board

    # Full feature set for the current position.
    # background + falling piece positions (last 5 frames) = 6 planes
    # Returns [6, height, width].
    def current_state(self):
        board_background = self.getBoard()
        fallpiece_1 = self.fallpiece_status[-1]
        fallpiece_2 = np.zeros((self.height, self.width))
        fallpiece_3 = np.zeros((self.height, self.width))
        fallpiece_4 = np.zeros((self.height, self.width))
        fallpiece_5 = np.zeros((self.height, self.width))

        # Older frames default to zeros when the piece just spawned.
        if len(self.fallpiece_status) > 1:
            fallpiece_2 = self.fallpiece_status[-2]
        if len(self.fallpiece_status) > 2:
            fallpiece_3 = self.fallpiece_status[-3]
        if len(self.fallpiece_status) > 3:
            fallpiece_4 = self.fallpiece_status[-4]
        if len(self.fallpiece_status) > 4:
            fallpiece_5 = self.fallpiece_status[-5]

        # board_fallpiece = self.get_fallpiece_board()
        # board_nextpiece = self.get_nextpiece_borad()

        # if self.curr_player==0:
        #     step_state = np.ones([self.height, self.width])
        # else:
        #     step_state = np.zeros([self.height, self.width])

        state = np.stack([
            board_background, fallpiece_5, fallpiece_4, fallpiece_3,
            fallpiece_2, fallpiece_1
        ])
        return state

    # Transition count: empty-to-filled switches, borders count as filled.
    # Heights run from 20 at the bottom to 0 at the top.
    # Currently returns a weighted sum of four classic board features.
    def getTransCount(self, board=None):
        if board == None: board = self.board

        _rowTransitions = rowTransitions(board)
        _colTransitions = colTransitions(board)
        _emptyHoles = emptyHoles(board)
        _wellNums = wellNums(board)
        return  3.2178882868487753 * _rowTransitions \
                + 9.348695305445199 * _colTransitions \
                + 7.899265427351652 * _emptyHoles \
                + 3.3855972247263626 * _wellNums

        # NOTE(review): everything below the return above is unreachable
        # dead code (the original hand-rolled transition count).
        transCount = 0

        # Count transitions per column.
        for x in range(self.width):
            curr_state = 1
            for y in range(self.height)[::-1]:
                state = 0 if board[x][y] == blank else 1
                if curr_state != state:
                    transCount += self.height - y + 1
                    curr_state = state

        # Count transitions per row.
        for y in range(self.height):
            curr_state = 1
            for x in range(self.width):
                state = 0 if board[x][y] == blank else 1
                if curr_state != state:
                    transCount += self.height - y + 1
                    curr_state = state
            if curr_state == 0: transCount += self.height - y + 1

        return transCount

    # Evaluate whether this move was good: +1 good, -1 bad, 0 unrated.
    # def checkActionisBest(self, include_fallpiece=True):
    #     board = [[0]*self.width for i in range(self.height)]
    #     for y in range(self.height):
    #         for x in range(self.width):
    #             board[y][x]=self.board[x][y]

    #     if self.fallpiece != None and include_fallpiece:
    #         piece = self.fallpiece
    #         shapedraw = pieces[piece['shape']][piece['rotation']]
    #         offset_y = 0
    #         for t in range(self.height):
    #             find=False
    #             for y in range(templatenum):
    #                 for x in range(templatenum):
    #                     if shapedraw[y][x]!=blank:
    #                         px, py = x+piece['x'], y+piece['y']+t
    #                         if py>=self.height or board[py][px]!=blank:
    #                             find=True
    #                             break
    #                 if find: break
    #             if find:
    #                 offset_y=t-1
    #                 break

    #         for y in range(templatenum):
    #             for x in range(templatenum):
    #                 if shapedraw[y][x]!=blank:
    #                     px, py = x+piece['x'], y+piece['y']+offset_y
    #                     if px>=0 and py>=0:
    #                         board[py][px]=shapedraw[y][x]
    #     transCount = self.getTransCount(board)
    #     # v = self.transCount - transCount
    #     # if self.state != 0:
    #     #     self.transCount = transCount
    #     return transCount

    # Check whether the game is over; the mover wins on reward, else loses.
    def game_end(self):
        if self.terminal:
            # if self.score>0:
            #     return True, 0
            # else:
            return True, 1
        else:
            return False, -1

    # Here player one picks from [left, right, rotate, down, none];
    # player two may only pick [down].
    # def start_self_play(self, player, temp=1e-3):
    #     states, mcts_probs = [], []
    #     self.reset()
    #     player.reset_player()
    #     for i in count():
    #         # temp 权重 ,return_prob 是否返回概率数据
    #         action, move_probs = player.get_action(self, temp=temp/(self.piecesteps+1), return_prob=1)
    #         # 保存数据
    #         states.append(self.current_state())
    #         mcts_probs.append(move_probs)

    #         # 前几步是乱走的
    #         # if self.piecesteps<10-self.piececount and random.random()>0.5:
    #         # 有20%是乱走的
    #         # if random.random()>0.8:
    #         #     action = random.choice(self.availables())

    #         # 执行一步
    #         self.step(action)
    #         # 如果游戏结束
    #         if self.terminal: break
    #     self.print()

    #     picece_count = self.getTransCount()
    #     print("transCount:", picece_count)
    #     # 增加最大步骤
    #     if picece_count>maxstep:
    #         states1 = states
    #         mcts_probs1 = mcts_probs
    #         winners1 = [-1.0 for i in range(len(states))]
    #         maxstep = picece_count
    #     elif picece_count==maxstep:
    #         states1 = states1 + states
    #         mcts_probs1 = mcts_probs1 + mcts_probs
    #         winners1 = winners1+ [-1.0 for i in range(len(states))]
    #     # 增加最小步骤
    #     if picece_count<minstep:
    #         states0 = states
    #         mcts_probs0 = mcts_probs
    #         winners0 = [1.0 for i in range(len(states))]
    #         minstep = picece_count
    #     # 增加最小步骤
    #     elif picece_count==minstep:
    #         states0 = states0 + states
    #         mcts_probs0 = mcts_probs0 + mcts_probs
    #         winners0 = winners0 + [1.0 for i in range(len(states))]

    #     print("minstep",minstep,"maxstep",maxstep)
    #     states = states0 + states1
    #     mcts_probs = mcts_probs0 + mcts_probs1
    #     winners = winners0 + winners1

    #     assert len(states)==len(mcts_probs)==len(winners)
    #     return -1, zip(states, mcts_probs, winners)

    # Train with MCTS, reusing the search tree, and collect the data.
    def start_self_play(self, player, temp=1e-3):
        """Play several games from the current position and label the data.

        Games are compared by piece count: the longest-lasting game(s)
        win. Returns ``(reward, piececount, agentcount, samples)`` where
        samples is a zip of (state, mcts_probs, winner_z, mask).
        """
        # Play two (or more) games and compare by step count.

        # self.ig_action = random.choice([None,KEY_NONE,KEY_DOWN])

        # Choose a random stack-height limit the first time through.
        if self.limit_max_height > 0:
            limit_max_height = self.limit_max_height
        else:
            if random.random() < 0.2:
                limit_max_height = 10
            else:
                limit_max_height = random.randint(5, 25)

            self.limit_max_height = limit_max_height

        # Play several games to decide a winner: 5 games unless the
        # height limit is exactly 10, then only 2.
        if limit_max_height != 10:
            game_num = 5
        else:
            game_num = 2

        game_states, game_mcts_probs, game_masks = [], [], []
        game_piececount, game_score = [], []
        print("limit_max_height", self.limit_max_height)
        for j in range(game_num):
            _states, _mcts_probs, _masks = [], [], []
            # Each game starts from an identical copy of this agent.
            game = copy.deepcopy(self)
            game.limit_max_height = 5
            # ig_action=random.choice([None,KEY_NONE,KEY_DOWN])
            # game.ig_action = ig_action

            for i in count():
                action, move_probs = player.get_action(game,
                                                       temp=temp,
                                                       return_prob=1)
                _states.append(game.current_state())
                # Only the normal player's moves carry real MCTS
                # probabilities; the drop-only player's are masked out.
                if game.curr_player == 0:
                    _mcts_probs.append(move_probs)
                    _masks.append(1)
                else:
                    _mcts_probs.append(
                        np.ones([game.actions_num]) / game.actions_num)
                    _masks.append(0)

                game.step(action)
                if game.state != 0:
                    # Tighten the height limit to just above the tallest
                    # landing so far, capped by the chosen limit.
                    game.limit_max_height = max(game.pieces_height) + 3
                    if game.limit_max_height > limit_max_height:
                        game.limit_max_height = limit_max_height
                    print('reward:', game.reward, 'len:',
                          len(game.pieces_height), "limit_max_height:",
                          game.limit_max_height, "next:",
                          game.fallpiece['shape'], game.pieces_height)

                if game.terminal:
                    break

            game_states.append(_states)
            game_mcts_probs.append(_mcts_probs)
            game_masks.append(_masks)

            game_piececount.append(game.piececount)
            game_score.append(game.score)

            game.print()

            # Stop early once a single game has the unique best piece count.
            if j >= 2 and limit_max_height != 10:
                max_p = max(game_piececount)
                if game_piececount.count(max_p) == 1:
                    break

        game_num = len(game_states)
        max_piececount = max(game_piececount)
        max_score = max(game_score)

        # Games matching the best piece count win (+1); the rest lose (-1).
        game_win = [-1 for _ in range(game_num)]
        game_loss = [1 for _ in range(game_num)]
        for j in range(game_num):
            if game_piececount[j] == max_piececount:
                game_win[j] = 1
                # if game_score[j]>=limit_max_height//5 and game_score[j] == max_score:
                game_loss[j] = -1

        print("game_piececount", game_piececount, "game_score", game_score)
        print("win", game_win, "score", game_loss)

        # Flatten the per-game lists into training samples.
        states, mcts_probs, winers, masks = [], [], [], []
        for j in range(game_num):
            for o in game_states[j]:
                states.append(o)
            for o in game_masks[j]:
                masks.append(o)
            for o in game_mcts_probs[j]:
                mcts_probs.append(o)
            for m in game_masks[j]:
                if m == 1:
                    winers.append(game_win[j])
                else:
                    winers.append(game_loss[j])

        winners_z = np.array(winers)

        assert len(states) == len(mcts_probs)
        assert len(states) == len(winners_z)
        assert len(states) == len(masks)

        print("add %s to dataset" % len(winers))
        reward, piececount, agentcount = 0, 0, 0
        reward = sum(game_score)
        piececount = sum(game_piececount)
        agentcount = game_num

        return reward, piececount, agentcount, zip(states, mcts_probs,
                                                   winners_z, masks)
コード例 #6
0
class Agent(object):
    """Tetris framed as a two-player adversarial game for MCTS self-play.

    Players 0 and 1 alternate every step; when a piece lands, whichever
    player accumulated more shaped reward during that piece's fall is
    recorded as the winner. Training data is generated in
    start_self_play().
    """

    def __init__(self):
        self.width = 10
        self.height = 20
        self.actions_num = len(ACTIONS)    
        # self.lock = random.choice([0,1])  
        self.reset()        

    def reset(self):
        """Reset all per-episode state and start a fresh game."""
        self.tetromino = Tetromino(isRandomNextPiece=False)
        # the currently falling piece
        self.fallpiece = self.tetromino.getnewpiece()
        # the next piece waiting to fall
        self.nextpiece = self.tetromino.getnewpiece()
        # whether the episode has ended
        self.terminal = False
        # total score (lines cleared)
        self.score = 0
        # reward earned by the current step
        self.reward = 0
        # game level
        self.level = 0
        # total steps taken this episode
        self.steps = 0
        # steps taken by the current piece
        self.piecesteps = 0
        # number of pieces dropped so far
        self.piececount = 0
        # the playing board
        self.board = self.tetromino.getblankboard()
        # state: 0 while falling, 1 piece just landed, 2 game over
        self.state =0
        # snapshot of the previous falling piece
        self.prev_fallpiece_boards=None
        # current player (0 or 1); alternates every step
        self.curr_player = 0         
        # landing height recorded for each piece
        self.pieces_height = []     
        # rolling history (last 10 frames) of falling-piece boards
        self.fallpiece_status = deque(maxlen=10)
        for i in range(9):
            self.fallpiece_status.append(np.zeros((self.height, self.width)))
        self.fallpiece_status.append(self.get_fallpiece_board())
        # action to exclude when computing availables
        self.ig_action = None
        # next available actions
        self.availables=self.get_availables()
        # maximum allowed stack height (-1 = unlimited)
        self.limit_max_height = -1
        # per-player shaped reward accumulated for the current piece
        self.player_reward=[0, 0]
        # winner of the game (-1 = undecided)
        self.winner=-1


    # convert a probability index position into an action
    def position_to_action(self, position):
        return ACTIONS[position]

    def position_to_action_name(self, position):
        return ACTIONS_NAME[position]

    def positions_to_actions(self, positions):
        return [self.position_to_action(i) for i in positions]

    # convert an action into its probability index position
    def action_to_position(self, action):
        return action

    def actions_to_positions(self, actions):
        return [act for act in actions]

    # get the currently legal actions; rotation is kept whenever valid.
    # (turns the single-player game into a two-player contest: one plays
    # normally, the other only drops)
    def get_availables(self):
        """Return the shuffled list of legal actions for the current piece."""
        acts=[KEY_ROTATION, KEY_LEFT, KEY_RIGHT, KEY_DOWN]
        # if self.curr_player == self.lock:
        #     return [KEY_DOWN, KEY_NONE]
        # else:
        #     acts.remove(KEY_DOWN)

        if not self.tetromino.validposition(self.board,self.fallpiece,ax = -1):
            acts.remove(KEY_LEFT)
        if not self.tetromino.validposition(self.board,self.fallpiece,ax = 1):
            acts.remove(KEY_RIGHT)   
        if not self.tetromino.validposition(self.board,self.fallpiece,ay = 1):
            acts.remove(KEY_DOWN)

        # only allow rotation in the early steps of a piece (disabled)
        # if self.piecesteps>len(pieces[self.fallpiece['shape']]):
        #     acts.remove(KEY_ROTATION)
        # else:
        if self.fallpiece['shape']=="o":
            # the "o" piece has no meaningful rotation
            acts.remove(KEY_ROTATION)
        else:
            # temporarily rotate to test validity, then restore
            r = self.fallpiece['rotation']
            self.fallpiece['rotation'] =  (self.fallpiece['rotation'] + 1) % len(pieces[self.fallpiece['shape']])
            if not self.tetromino.validposition(self.board,self.fallpiece):
                acts.remove(KEY_ROTATION)
            self.fallpiece['rotation'] = r

        random.shuffle(acts)
        if self.ig_action!=None and len(acts)>=2:
            if self.ig_action in acts:
                acts.remove(self.ig_action)

        if len(acts)==0: acts=[KEY_NONE]
        return acts         


    def game_end(self):
        """Return (terminal, winner); winner is -1 while undecided."""
        # return self.terminal, self.lock
        # winer = self.curr_player  
        return self.terminal, self.winner
        # lastplayer = (self.curr_player+1) % 2
        # return self.terminal, lastplayer

    def step(self, action, env=None):
        """Apply one action and advance gravity by one row.

        Returns (state, reward) where state is 0 while falling,
        1 when the piece just landed, 2 on game over.
        """
        # state: 0 falling, 1 piece replaced, 2 game over
        
        self.reward = 0
        self.steps += 1
        self.piecesteps += 1
        self.level, self.fallfreq = self.tetromino.calculate(self.score)
        
        # self.actions.append(action)

        if action == KEY_LEFT and self.tetromino.validposition(self.board,self.fallpiece,ax = -1):
            self.fallpiece['x']-=1

        if action == KEY_RIGHT and self.tetromino.validposition(self.board,self.fallpiece,ax = 1):
            self.fallpiece['x']+=1  

        if (action == KEY_DOWN) and self.tetromino.validposition(self.board,self.fallpiece,ay = 1):
            self.fallpiece['y']+=1  

        if action == KEY_ROTATION:
            self.fallpiece['rotation'] =  (self.fallpiece['rotation'] + 1) % len(pieces[self.fallpiece['shape']])
            if not self.tetromino.validposition(self.board,self.fallpiece):
                # rotation blocked; revert
                self.fallpiece['rotation'] = (self.fallpiece['rotation'] - 1) % len(pieces[self.fallpiece['shape']])

        # gravity: the piece always falls one row if it can;
        # +1 shaped reward while falling, -5 when it lands
        isFalling=True
        if self.tetromino.validposition(self.board,self.fallpiece,ay = 1):
            self.fallpiece['y'] +=1
            self.player_reward[self.curr_player] += 1
        else:
            isFalling = False
            self.player_reward[self.curr_player] -= 5

        fallpiece_y = self.fallpiece['y']
        # self.player_reward[self.curr_player] = fallpiece_y

        self.fallpiece_status.append(self.get_fallpiece_board())

        if not isFalling:
            self.tetromino.addtoboard(self.board,self.fallpiece)
            self.reward = self.tetromino.removecompleteline(self.board) 
            
            # a cleared line is worth 10x in shaped reward
            self.player_reward[self.curr_player] += self.reward * 10
            # if self.reward >0:
            #     self.terminal = True 
            #     self.state = 2       
            #     return self.state, self.reward
            
            self.score += self.reward          
            # self.level, self.fallfreq = self.tetromino.calculate(self.score)   
            # self.fallpiece_height = landingHeight(self.fallpiece)
            r = 0.5 if self.reward>0 else 0
            self.pieces_height.append(20 - fallpiece_y - r)
            self.fallpiece = None

        # players alternate every step
        self.curr_player = self.steps%2

        if  env:
            env.checkforquit()
            env.render(self.board, self.score, self.level, self.fallpiece, self.nextpiece)

        if not isFalling:
            self.fallpiece = self.nextpiece
            self.nextpiece = self.tetromino.getnewpiece()
            self.piecesteps = 0
            self.piececount +=1 

            # whoever accumulated more shaped reward for this piece wins
            if self.player_reward[0]>self.player_reward[1]:
                self.winner = 0
            else:
                self.winner = 1
            self.player_reward_prev = self.player_reward               
            self.player_reward=[0, 0]

            # game over if the new piece cannot be placed or the stack
            # exceeds the configured height limit
            if (not self.tetromino.validposition(self.board,self.fallpiece)) or \
                (self.limit_max_height>0 and self.getMaxHeight()>self.limit_max_height):  
                
                self.terminal = True 
                self.state = 2
                self.reward = -1      
                self.availables = []
                # print(">>>>>", len(self.pieces_height), self.pieces_height, self.actions[-10:])
                return self.state, self.reward 
            else: 
                self.state = 1

            # self.fallpiece_status=[self.get_fallpiece_board()]          
        else:
            self.state = 0
        
        # in early training, any score could end the game (disabled)
        # if self.reward!=0: 
        #     self.terminal=True

        self.availables = self.get_availables()

        return self.state, self.reward

    def get_key(self):
        """Hash of board plus latest falling-piece frame, used as a state key."""
        info = self.getBoard() + self.fallpiece_status[-1]
        # if self.curr_player==1:
        #     info = info + np.ones((self.height, self.width))
        return hash(info.data.tobytes())        
        # key = [0 for v in range(self.height*self.width)]
        # for x in range(self.height):
        #     for y in range(self.width):
        #         if info[x][y]==0:
        #             key[x*self.width+y]='0'
        #         else:
        #             key[x*self.width+y]='1'
        # key3 = int("".join(key),2)
        # return hash(key3)

    # print the binary board, optionally overlaying the falling piece
    def print2(self, add_fallpiece=False):
        info = self.getBoard()
        if add_fallpiece:
            info += self.fallpiece_status[-1]
        for y in range(self.height):
            line=str(y%10)+" "
            for x in range(self.width):
                if info[y][x]==0:
                    line=line+"  "
                else:
                    line=line+"* "
            print(line)
        print(" "+" -"*self.width)            
        print("level:", self.level, "score:", self.score, "steps:", self.steps,"piececount:", self.piececount)

    def print(self):
        """Print the raw board (piece shape letters) plus game stats."""
        for y in range(self.height):
            line="| "
            for x in range(self.width):
                if self.board[x][y]==blank:
                    line=line+"  "
                else:
                    line=line+str(self.board[x][y])+" "
            print(line)
        print(" "+" -"*self.width)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,"piececount:", self.piececount)

    # current maximum stack height
    # NOTE(review): a filled cell in the top row (y==0) leaves c at 0, so a
    # full-height stack would report height 0 — verify intended behavior
    def getMaxHeight(self):
        c = 0
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y]!=blank:
                    c=y
                    break
            if c!=0:break  
        h = 0 if c == 0 else self.height - c                          
        return h

    # current board as a (height, width) binary matrix
    def getBoard(self):
        board=np.zeros((self.height, self.width))
        # fill 1 wherever the board cell is occupied
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y]!=blank:
                    board[y][x]=1
        return board

    # binary matrix of the currently falling piece
    def get_fallpiece_board(self):   
        board=np.zeros((self.height, self.width))
        # overlay the falling piece's occupied cells
        # NOTE(review): only lower bounds are checked; px/py beyond the board
        # edges would index out of range — presumably prevented by validposition
        if self.fallpiece != None:
            piece = self.fallpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x]!=blank:
                        px, py = x+piece['x'], y+piece['y']
                        if px>=0 and py>=0:
                            board[y+piece['y']][x+piece['x']]=1
        return board

    # binary matrix of the next piece (drawn at the origin)
    def get_nextpiece_borad(self):
        board=np.zeros((self.height, self.width))
        if self.nextpiece != None:
            piece = self.nextpiece  
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x]!=blank:
                        board[y][x]=1
        return board

    # full feature stack for the network:
    # background + last 8 falling-piece frames = 9 planes
    # returns [9, height, width]
    def current_state(self):
        state = np.zeros((9, self.height, self.width))
        state[0] = self.getBoard()
        # fallpiece_status = self.fallpiece_status[-8:]
        # fallpiece_status.reverse()
        # fallpiece_len=len(self.fallpiece_status)
        # first 4 frames are the opponent's, last 4 are our own
        for j in range(4): 
            idx = -2*j-1  #(-1,-3,-5,-7)
            # if idx>=fallpiece_len: break
            state[j+1]=self.fallpiece_status[idx]
        for j in range(4):
            idx = -2*j-2  #(-2,-4,-6,-8)
            # if idx>=fallpiece_len: break
            state[j+5]=self.fallpiece_status[idx]

        return state          


    # self-play training with MCTS, reusing the search tree and saving data
    def start_self_play(self, player, temp=1e-3):
        """ start a self-play game using a MCTS player, reuse the search tree,
        and store the self-play data: (state, mcts_probs, z) for training

        Returns (reward, piececount, agentcount, keys,
        zip(states, mcts_probs, winners_z)).
        """
        
        # if self.limit_max_height > 0:
        #     limit_max_height = self.limit_max_height
        # else:
        #     limit_max_height = random.randint(5,12)
        #     self.limit_max_height = limit_max_height

        # print("limit_max_height:", limit_max_height)

        game_num = 2
        # self.limit_max_height = random.randint(1,20)
        self.limit_max_height = 20
        # limit_max_height = self.limit_max_height
        game_keys, game_states, game_mcts_probs, game_current_players = [],[],[],[] 
        game_piececount, game_score, game_winer = [],[],[]
        for _ in range(game_num):
            # self.lock = (self.lock + 1)%2 
            # self.availables = self.get_availables()
            print("limit_max_height", self.limit_max_height)

            _states, _mcts_probs, _current_players, _keys=[],[],[],[]
            # each rollout plays on an independent copy of this environment
            game = copy.deepcopy(self)
            # game.limit_max_height = 5
            _piecestep=0
            for i in count():
                action, move_probs = player.get_action(game, temp=temp, return_prob=1) 

                _keys.append(game.get_key())
                _states.append(game.current_state())
                _mcts_probs.append(move_probs)
                #_current_players.append(game.curr_player)

                game.step(action)
                _piecestep+=1

                if game.state!=0:
                    # game.limit_max_height = max(game.pieces_height)+3
                    # if game.limit_max_height>limit_max_height: game.limit_max_height=limit_max_height
                    print('reward:',game.reward, 'len:', len(game.pieces_height), "max_height:", game.getMaxHeight(), "next:", game.fallpiece['shape'], game.pieces_height,"rewards:", game.player_reward_prev, "winner:", game.winner)
                    temp_winer = []
                    # the prediction targets the previous step: a score means the
                    # previous step was right, otherwise it was wrong
                    # the last step should be -winer, never 0 (0 is awkward to train)
                    winer = 1 if game.curr_player == game.winner else -1
                    for j in range(_piecestep):
                        if j==0:
                            temp_winer=[-1*winer]
                        else:
                            temp_winer.insert(0,winer)
                        winer = -1 * winer
                    _current_players.extend(temp_winer)
                    _piecestep = 0

                if game.terminal:
                    break
            
            game_keys.append(_keys)
            game_states.append(_states)
            game_mcts_probs.append(_mcts_probs)
            game_current_players.append(_current_players)

            game_piececount.append(game.piececount)
            game_score.append(game.score)
            _, winer = game.game_end()
            game_winer.append(winer)
            game.print()

        # max_score = max(game_score)
        # game_player_0 = [-1 for _ in range(game_num)] 
        # game_player_1 = [-1 for _ in range(game_num)] 

        # min_game = -1
        # max_game = -1

        # min_piececount = min(game_piececount)
        # max_piececount = max(game_piececount)

        # if game_piececount.count(min_piececount)==1:
        #     min_game = game_piececount.index(min_piececount)

        # if game_piececount.count(max_piececount)==1:
        #     max_game = game_piececount.index(max_piececount)

        # for j in range(game_num):
        #     game_player_0[j] = 1 if game_winer[j]==0 else -1
        #     game_player_1[j] = 1 if game_winer[j]==1 else -1

        # sort_index = sorted(range(len(game_piececount)), key=lambda k: game_piececount[k]+game_score[k])
        
        # split_index = -(game_num//2)
        # max_index = sort_index[split_index:]
        # min_index = sort_index[:split_index]

        # print("game_piececount",game_piececount,"game_score",game_score,"max",max_game,"min",min_game)
        # print("sort_index", sort_index, "max_index", max_index, "min_index", min_index)
        # print("game_player_0",game_player_0,"game_player_1",game_player_1)


        # flatten per-game lists into one training batch
        keys, states, mcts_probs, winers= [], [], [], []
        for j in range(game_num):
            for o in game_keys[j]: keys.append(o)
            for o in game_states[j]: states.append(o)
            for o in game_mcts_probs[j]: mcts_probs.append(o)
            for o in game_current_players[j]: winers.append(o)
            # if j in min_index:
            #     for p in game_current_players[j]:
            #         winers.append(-1)
            # elif j in max_index:
            #     for p in game_current_players[j]:
            #         winers.append(1)
            # else:
            # for p in game_current_players[j]:
            #     if p==0:
            #         winers.append(game_player_0[j])
            #     else:
            #         winers.append(game_player_1[j])

        winners_z = np.array(winers)

        assert len(states)==len(mcts_probs)
        assert len(states)==len(winners_z)
        assert len(states)==len(keys)

        _len = len(winers)
        # winers holds +1/-1 labels, so wins = (total + sum) / 2
        _win = (_len+sum(winners_z))/2
        _loss = _len - _win 
        print("add %s to dataset,winner: %s, loss: %s"%(_len,_win,_loss))
        reward, piececount, agentcount = 0, 0, 0
        reward = sum(game_score)  
        piececount = sum(game_piececount)
        agentcount = game_num
    
        return reward, piececount, agentcount, keys, zip(states, mcts_probs, winners_z)
                
コード例 #7
0
class Agent(object):
    """Single-player Tetris environment for policy/value-network training.

    start_self_play() rolls out several games with a network's actions and
    produces (state, Qval, action) training tuples with discounted,
    baseline-subtracted returns.
    """

    def __init__(self):
        self.width = 10
        self.height = 20
        self.actions_num = len(ACTIONS)
        self.reset()

    def reset(self):
        """Reset all per-episode state and start a fresh game."""
        self.tetromino = Tetromino(isRandomNextPiece=False)
        # the currently falling piece
        self.fallpiece = self.tetromino.getnewpiece()
        # the next piece waiting to fall
        self.nextpiece = self.tetromino.getnewpiece()
        # whether the episode has ended
        self.terminal = False
        # total score (lines cleared)
        self.score = 0
        # reward earned by the current step
        self.reward = 0
        # game level
        self.level = 0
        # total steps taken this episode
        self.steps = 0
        # steps taken by the current piece
        self.piecesteps = 0
        # number of pieces dropped so far
        self.piececount = 0
        # the playing board
        self.board = self.tetromino.getblankboard()
        # state: 0 while falling, 1 piece just landed, 2 game over
        self.state = 0
        # snapshot of the previous falling piece
        self.prev_fallpiece_boards = None
        # landing height recorded for each piece
        self.pieces_height = []
        # rolling history (last 10 frames) of falling-piece boards
        self.fallpiece_status = deque(maxlen=10)
        for i in range(9):
            self.fallpiece_status.append(np.zeros((self.height, self.width)))
        self.fallpiece_status.append(self.get_fallpiece_board())
        # next available actions
        self.availables = self.get_availables()

    # convert a probability index position into an action
    def position_to_action(self, position):
        return ACTIONS[position]

    def position_to_action_name(self, position):
        return ACTIONS_NAME[position]

    def positions_to_actions(self, positions):
        return [self.position_to_action(i) for i in positions]

    # convert an action into its probability index position
    def action_to_position(self, action):
        return action

    def actions_to_positions(self, actions):
        return [act for act in actions]

    # get the currently legal actions; rotation is kept whenever valid.
    # (comment kept from the two-player variant: one plays normally,
    # one only drops)
    def get_availables(self):
        """Return the shuffled list of legal actions for the current piece."""
        acts = [KEY_ROTATION, KEY_LEFT, KEY_RIGHT, KEY_DOWN]

        if not self.tetromino.validposition(self.board, self.fallpiece, ax=-1):
            acts.remove(KEY_LEFT)
        if not self.tetromino.validposition(self.board, self.fallpiece, ax=1):
            acts.remove(KEY_RIGHT)
        if not self.tetromino.validposition(self.board, self.fallpiece, ay=1):
            acts.remove(KEY_DOWN)

        if self.fallpiece['shape'] == "o":
            # the "o" piece has no meaningful rotation
            acts.remove(KEY_ROTATION)
        else:
            # temporarily rotate to test validity, then restore
            r = self.fallpiece['rotation']
            self.fallpiece['rotation'] = (self.fallpiece['rotation'] +
                                          1) % len(
                                              pieces[self.fallpiece['shape']])
            if not self.tetromino.validposition(self.board, self.fallpiece):
                acts.remove(KEY_ROTATION)
            self.fallpiece['rotation'] = r

        random.shuffle(acts)

        if len(acts) == 0: acts = [KEY_NONE]
        return acts

    def game_end(self):
        """Return whether the episode has ended."""
        return self.terminal

    def step(self, action, env=None):
        """Apply one action and advance gravity by one row.

        Returns (state, reward) where state is 0 while falling,
        1 when the piece just landed, 2 on game over.
        """
        # state: 0 falling, 1 piece replaced, 2 game over

        self.reward = 0
        self.steps += 1
        self.piecesteps += 1
        self.level, self.fallfreq = self.tetromino.calculate(self.score)

        # self.actions.append(action)

        if action == KEY_LEFT and self.tetromino.validposition(
                self.board, self.fallpiece, ax=-1):
            self.fallpiece['x'] -= 1

        if action == KEY_RIGHT and self.tetromino.validposition(
                self.board, self.fallpiece, ax=1):
            self.fallpiece['x'] += 1

        if (action == KEY_DOWN) and self.tetromino.validposition(
                self.board, self.fallpiece, ay=1):
            self.fallpiece['y'] += 1

        if action == KEY_ROTATION:
            self.fallpiece['rotation'] = (self.fallpiece['rotation'] +
                                          1) % len(
                                              pieces[self.fallpiece['shape']])
            if not self.tetromino.validposition(self.board, self.fallpiece):
                # rotation blocked; revert
                self.fallpiece['rotation'] = (
                    self.fallpiece['rotation'] - 1) % len(
                        pieces[self.fallpiece['shape']])

        # gravity: the piece always falls one row if it can
        isFalling = True
        if self.tetromino.validposition(self.board, self.fallpiece, ay=1):
            self.fallpiece['y'] += 1
        else:
            isFalling = False

        fallpiece_y = self.fallpiece['y']

        self.fallpiece_status.append(self.get_fallpiece_board())

        if not isFalling:
            self.tetromino.addtoboard(self.board, self.fallpiece)
            self.reward = self.tetromino.removecompleteline(self.board)

            self.score += self.reward
            r = 0.5 if self.reward > 0 else 0
            self.pieces_height.append(20 - fallpiece_y - r)
            self.fallpiece = None

        if env:
            env.checkforquit()
            env.render(self.board, self.score, self.level, self.fallpiece,
                       self.nextpiece)

        if not isFalling:
            self.fallpiece = self.nextpiece
            self.nextpiece = self.tetromino.getnewpiece()
            self.piecesteps = 0
            self.piececount += 1

            # game over if the new piece cannot be placed
            if (not self.tetromino.validposition(self.board, self.fallpiece)):
                self.terminal = True
                self.state = 2
                self.availables = [KEY_NONE]
                return self.state, self.reward
            else:
                self.state = 1
        else:
            self.state = 0

        self.availables = self.get_availables()

        return self.state, self.reward

    def get_key(self):
        """Hash of board plus latest falling-piece frame, used as a state key."""
        info = self.getBoard() + self.fallpiece_status[-1]
        return hash(info.data.tobytes())

    # print the binary board, optionally overlaying the falling piece
    def print2(self, add_fallpiece=False):
        info = self.getBoard()
        if add_fallpiece:
            info += self.fallpiece_status[-1]
        for y in range(self.height):
            line = str(y % 10) + " "
            for x in range(self.width):
                if info[y][x] == 0:
                    line = line + "  "
                else:
                    line = line + "* "
            print(line)
        print(" " + " -" * self.width)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,
              "piececount:", self.piececount)

    def print(self):
        """Print the raw board (piece shape letters) plus game stats."""
        for y in range(self.height):
            line = "| "
            for x in range(self.width):
                if self.board[x][y] == blank:
                    line = line + "  "
                else:
                    line = line + str(self.board[x][y]) + " "
            print(line)
        print(" " + " -" * self.width)
        print("level:", self.level, "score:", self.score, "steps:", self.steps,
              "piececount:", self.piececount)

    # current maximum stack height
    # NOTE(review): a filled cell in the top row (y==0) leaves c at 0, so a
    # full-height stack would report height 0 — verify intended behavior
    def getMaxHeight(self):
        c = 0
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    c = y
                    break
            if c != 0: break
        h = 0 if c == 0 else self.height - c
        return h

    # count non-empty board cells
    def getNoEmptyCount(self):
        c = 0
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    c += 1
        return c

    # current board as a (height, width) binary matrix
    def getBoard(self):
        board = np.zeros((self.height, self.width))
        # fill 1 wherever the board cell is occupied
        for y in range(self.height):
            for x in range(self.width):
                if self.board[x][y] != blank:
                    board[y][x] = 1
        return board

    # binary matrix of the currently falling piece
    def get_fallpiece_board(self):
        board = np.zeros((self.height, self.width))
        # overlay the falling piece's occupied cells
        # NOTE(review): only lower bounds are checked; px/py beyond the board
        # edges would index out of range — presumably prevented by validposition
        if self.fallpiece != None:
            piece = self.fallpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x] != blank:
                        px, py = x + piece['x'], y + piece['y']
                        if px >= 0 and py >= 0:
                            board[y + piece['y']][x + piece['x']] = 1
        return board

    # binary matrix of the next piece (drawn at the origin)
    def get_nextpiece_borad(self):
        board = np.zeros((self.height, self.width))
        if self.nextpiece != None:
            piece = self.nextpiece
            shapedraw = pieces[piece['shape']][piece['rotation']]
            for x in range(templatenum):
                for y in range(templatenum):
                    if shapedraw[y][x] != blank:
                        board[y][x] = 1
        return board

    # full feature stack for the network:
    # background + last 8 falling-piece frames = 9 planes
    # returns [9, height, width]
    def current_state(self):
        state = np.zeros((9, self.height, self.width))
        state[0] = self.getBoard()

        # most-recent frame first
        for i in range(8):
            state[i + 1] = self.fallpiece_status[-1 * (i + 1)]

        # first 4 frames are the opponent's, last 4 are our own (disabled)
        # for j in range(4):
        #     idx = -2*j-1  #(-1,-3,-5,-7)
        #     state[j+1]=self.fallpiece_status[idx]
        # for j in range(4):
        #     idx = -2*j-2  #(-2,-4,-6,-8)
        #     state[j+5]=self.fallpiece_status[idx]

        return state

    # generate training data by rolling out games with the network
    def start_self_play(self, net):
        """Play game_num rollouts with `net` and collect training tuples.

        The last rollout uses deterministic (greedy) actions; the rest
        sample. Returns (agentcount, agentreward, piececount, keys,
        zip(states, Qvals, actions)).
        """
        game_num = 10
        agentcount, agentreward, piececount = 0, 0, 0
        game_keys, game_states, game_Qvals, game_actions = [], [], [], []
        for game_idx in range(game_num):

            _states, _log_probs, _values, _keys, _masks, _rewards, _qvals, _actions=[],[],[],[],[],[],[],[]
            # each rollout plays on an independent copy of this environment
            game = copy.deepcopy(self)
            for i in count():

                _states.append(game.current_state())
                if game_idx == game_num - 1:
                    action, log_prob, value = net.get_action(
                        game, deterministic=True)
                else:
                    action, log_prob, value = net.get_action(
                        game, deterministic=False)

                _, reward = game.step(action)

                # reward here is the number of cleared lines
                if reward > 0:
                    _reward = reward * 10
                else:
                    _reward = 0

                # the more pieces placed before termination, the better
                if game.terminal:
                    _reward += game.getNoEmptyCount()

                _keys.append(game.get_key())
                _log_probs.append(log_prob)
                _values.append(value)
                _rewards.append(_reward)
                _masks.append(1 - game.terminal)
                _actions.append(action)

                if game.terminal:
                    #  _, _, Qval = net.get_action(game)
                    Qval = value
                    # Q-values are currently just the raw per-step rewards;
                    # discounting is applied after all rollouts finish below
                    for step in reversed(range(len(_states))):
                        # Qval = _rewards[step] + 0.999 * Qval * _masks[step]
                        Qval = _rewards[step]
                        _qvals.insert(0, Qval)

                    print('reward:', game.score, "Qval:", Qval, 'len:',
                          len(_qvals), "piececount:", game.piececount)
                    agentcount += 1
                    agentreward += _reward
                    piececount += game.piececount
                    break

            game_keys.append(_keys)
            game_states.append(_states)
            game_Qvals.append(_qvals)
            game_actions.append(_actions)

            game.print()

        avg_agentreward = agentreward / game_num

        # subtract the average final reward as a baseline, then
        # accumulate discounted returns backwards (gamma = 0.999)
        for game_idx in range(game_num):
            game_Qvals[game_idx][-1] -= avg_agentreward
            for i in reversed(range(len(game_keys[game_idx]) - 1)):
                game_Qvals[game_idx][i] += game_Qvals[game_idx][i + 1] * 0.999
            print(*game_Qvals[game_idx][:3], "...", *game_Qvals[game_idx][-3:])

        # flatten per-game lists into one training batch
        keys, states, Qvals, actions = [], [], [], []
        for j in range(game_num):
            for o in game_keys[j]:
                keys.append(o)
            for o in game_states[j]:
                states.append(o)
            for o in game_Qvals[j]:
                Qvals.append(o)
            for o in game_actions[j]:
                actions.append(o)

        assert len(states) == len(Qvals)
        assert len(states) == len(keys)
        assert len(states) == len(actions)

        print("add %s to dataset" % len(states))

        return agentcount, agentreward, piececount, keys, zip(
            states, Qvals, actions)