Example #1

for _ in range(n_episodes):      # n_episodes: assumed episode count, defined elsewhere
    obs = env.reset()
    agent.reset()

    # print('HAND:  ', end=''); print(env.player_hand)
    # print('STATE pp={0}, ha={1}, dc={2}'.format(obs[0], obs[1], obs[2]))

    while True:

        # agent selects an action for the current observation
        action = agent.pick_action(obs)

        # print('ACTION: {0}'.format(action.action))

        #   ---   time step rolls here   ---

        obs, reward, done = env.step(action)

        # print('   ------ t={0} -----'.format(env.t_step))
        # print('REWARD: {0}'.format(reward))
        # print('HAND:  ', end=''); print(env.player_hand)
        # print('STATE pp={0}, ha={1}, dc={2}'.format(obs[0], obs[1], obs[2]))
        # print('DONE', done)

        # store the transition for the end-of-episode Monte Carlo update
        agent.remember(obs, reward)

        if done:
            # terminal state reached: run the Monte Carlo learning update
            agent.learn_mc()
            break

    # print('  ==  GAME OVER  ==  t={0}'.format(env.t_step))
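
In the example above, `agent.remember()` and `agent.learn_mc()` are used as black boxes. The sketch below shows one way a tabular first-visit Monte Carlo agent could implement that interface; the `BlackjackMCAgent` class and its `gamma`/`epsilon` parameters are illustrative assumptions, not the source's actual implementation.

from collections import defaultdict
import random

class BlackjackMCAgent:
    # Hypothetical sketch of the agent interface used above (assumed, not the source's).

    def __init__(self, n_actions=2, gamma=1.0, epsilon=0.1):
        self.n_actions = n_actions
        self.gamma = gamma                  # discount factor
        self.epsilon = epsilon              # exploration rate
        self.Q = defaultdict(float)         # (state, action) -> action-value estimate
        self.visits = defaultdict(int)      # (state, action) -> count of first visits
        self.episode = []                   # [state, action, reward] triples, in order

    def reset(self):
        # start a fresh episode buffer
        self.episode = []

    def pick_action(self, obs):
        # epsilon-greedy over the tabular Q estimates
        state = tuple(obs)
        if random.random() < self.epsilon:
            action = random.randrange(self.n_actions)
        else:
            action = max(range(self.n_actions),
                         key=lambda a: self.Q[(state, a)])
        self.episode.append([state, action, 0.0])
        return action

    def remember(self, obs, reward):
        # attach the reward observed after the step to the most recent
        # (state, action) pair; obs is unused in this tabular sketch
        if self.episode:
            self.episode[-1][2] = reward

    def learn_mc(self):
        # walk the episode backwards accumulating the return G; overwriting
        # the dict entry leaves the return of the FIRST visit of each pair
        G = 0.0
        first_visit_return = {}
        for state, action, reward in reversed(self.episode):
            G = self.gamma * G + reward
            first_visit_return[(state, action)] = G
        # incremental sample-average update toward the first-visit return
        for (state, action), ret in first_visit_return.items():
            self.visits[(state, action)] += 1
            n = self.visits[(state, action)]
            self.Q[(state, action)] += (ret - self.Q[(state, action)]) / n

Walking the episode backwards keeps the update O(T) per episode: each return is accumulated once, and the overwrites ensure only the first visit's return survives for each (state, action) pair.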
Example #2

for episode in range(numEpisode):
    # reset the environment for a new episode
    observation = env.reset()

    print(
        "#######################################################################"
    )
    print("Episode {}".format(episode))
    # run each episode for at most 100 steps
    for step in range(100):
        # display the current observation
        displayObservation(observation)

        # pick an action for the current observation
        action = policy(observation)

        # apply the action to the environment to get the next observation,
        # the reward, the done flag, and the info dict
        observation, reward, done, _ = env.step(action)
        print("Under the current policy the player chose: {}, "
              "with an immediate reward of: {}".format(
                  ["stick", "hit"][action], reward))

        # the episode is over once done is True
        if done:
            displayObservation(observation)
            print("Game over. Return: {}\n".format(float(reward)))
            break

    # episode finished; pause a few seconds before the next one
    time.sleep(3)
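
This example calls names that must be defined elsewhere in the source: `env`, `numEpisode`, `policy`, and `displayObservation`. A minimal setup that would make the loop runnable is sketched below, assuming OpenAI Gym's `Blackjack-v0` environment and a simple threshold policy (stick on 20 or 21, otherwise hit); the helper bodies and the episode count are illustrative guesses, not the source's definitions.

import time
import gym

env = gym.make("Blackjack-v0")
numEpisode = 10  # assumed episode count

def displayObservation(observation):
    # Blackjack-v0 observations are (player_sum, dealer_showing_card, usable_ace)
    playerSum, dealerCard, usableAce = observation
    print("Player sum: {}, dealer shows: {}, usable ace: {}".format(
        playerSum, dealerCard, usableAce))

def policy(observation):
    # fixed threshold policy: stick (action 0) on 20 or 21, otherwise hit (action 1)
    playerSum, _, _ = observation
    return 0 if playerSum >= 20 else 1

With `Blackjack-v0`, `env.step()` returns the four-tuple `(observation, reward, done, info)`, which matches the unpacking used in the loop above.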