Example #1
0
        # Add this episode's reward to the accumulated return of every state
        # visited during the episode.
        # NOTE(review): fragment — the enclosing method's signature and class
        # are outside this view; presumably a Monte-Carlo return update called
        # once per episode with the terminal reward. Confirm against the class.
        for obs in self._visited_states_returns:
            self._visited_states_returns[obs] += reward


if __name__ == '__main__':
    # Training driver: run many blackjack episodes, letting the agent pick
    # actions and the environment produce (obs, reward, done) transitions.
    # NOTE(review): this fragment is truncated — the episode-termination
    # handling (break on `done`, agent learning step) lies outside this view.
    env = BlackjackEnv()
    agent = BlackjackAgent()

    total_ep = 100000
    for episode in range(total_ep):
        # Lightweight progress indicator every 1000 episodes.
        # Bug fix: the original passed the format arguments to print()
        # instead of calling str.format(), so it printed the raw template
        # string followed by the numbers.
        if episode % 1000 == 0:
            print('episode: {0} / {1}'.format(episode, total_ep))

        # Start a fresh episode: reset both environment and agent state.
        obs = env.reset()
        agent.reset()

        while True:

            # Ask the agent for an action given the current observation.
            action = agent.pick_action(obs)

            #   ---   time step rolls here   ---

            obs, reward, done = env.step(action)
    # Unpack the observation tuple.
    # NOTE(review): fragment — the enclosing function's `def` line is outside
    # this view; `observation` is presumably its parameter
    # (player score, dealer's showing card, usable ace).
    score, dealerScore, usableSce = observation

    # If the player's score is 20 or more, stop taking cards (stick): action 0;
    # otherwise keep taking cards (twist/hit): action 1.
    if score >= 20:
        return 0
    else:
        return 1


# Display information for 10 episodes.
# NOTE(review): fragment — `env`, `displayObservation`, and `policy` are
# defined outside this view, and the step loop's `done` handling is beyond
# this excerpt.
numEpisode = 10
for episode in range(numEpisode):
    # Reset the environment to start a new episode.
    observation = env.reset()

    print(
        "#######################################################################"
    )
    print("第{}回合".format(episode))
    # Run up to 100 steps per episode.
    for step in range(100):
        # Show the current observation.
        displayObservation(observation)

        # Ask the policy for the action at the current observation.
        action = policy(observation)

        # Step the environment with the chosen action to obtain the next
        # observation, the reward, and the done flag.
        observation, reward, done, _ = env.step(action)