# Play a single episode, then update the agent with a Monte Carlo target
obs = env.reset()
agent.reset()
# print('HAND: ', end=''); print(env.player_hand)
# print('STATE pp={0}, ha={1}, dc={2}'.format(obs[0], obs[1], obs[2]))

while True:
    action = agent.pick_action(obs)
    # print('ACTION: {0}'.format(action.action))

    # --- time step rolls here ---

    obs, reward, done = env.step(action)
    # print(' ------ t={0} -----'.format(env.t_step))
    # print('REWARD: {0}'.format(reward))
    # print('HAND: ', end=''); print(env.player_hand)
    # print('STATE pp={0}, ha={1}, dc={2}'.format(obs[0], obs[1], obs[2]))
    # print('DONE', done)

    agent.remember(obs, reward)

    if done:
        agent.learn_mc()
        break

# print(' == GAME OVER == t={0}'.format(env.t_step))
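The agent methods this loop relies on (reset, pick_action, remember, learn_mc) are defined elsewhere. As an illustrative sketch only, a minimal every-visit Monte Carlo agent consistent with those calls could look like the following; the Q table, the epsilon-greedy choice, and the plain-integer actions are assumptions made for this sketch (the commented ACTION print above hints that the original agent returns an action object), not the author's implementation.

import random
from collections import defaultdict

class MCAgent:
    """Minimal every-visit Monte Carlo control agent (illustrative sketch)."""

    def __init__(self, actions=(0, 1), epsilon=0.1, alpha=0.05):
        self.actions = actions          # e.g. 0 = stick, 1 = hit
        self.epsilon = epsilon          # exploration rate
        self.alpha = alpha              # step size for the running-mean update
        self.Q = defaultdict(float)     # action-value estimates keyed by (obs, action)
        self._trajectory = []           # (obs, action) pairs visited this episode
        self._last_reward = 0.0

    def reset(self):
        # Forget the previous episode's trajectory
        self._trajectory = []
        self._last_reward = 0.0

    def pick_action(self, obs):
        # Epsilon-greedy over the current Q estimates
        if random.random() < self.epsilon:
            action = random.choice(self.actions)
        else:
            action = max(self.actions, key=lambda a: self.Q[(obs, a)])
        self._trajectory.append((obs, action))
        return action

    def remember(self, obs, reward):
        # Blackjack pays out only on the terminal step, so keeping the
        # most recent reward is enough to recover the episode return
        self._last_reward = reward

    def learn_mc(self):
        # Every-visit MC update: nudge each visited (obs, action)
        # toward the episode return G (gamma = 1, single payout)
        G = self._last_reward
        for obs, action in self._trajectory:
            self.Q[(obs, action)] += self.alpha * (G - self.Q[(obs, action)])

With this sketch, the loop above would run unchanged against any environment whose step() returns an (obs, reward, done) triple.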
import time

for episode in range(numEpisode):
    # Re-initialize the environment
    observation = env.reset()
    print("#######################################################################")
    print("Episode {}".format(episode))

    # Run each episode for at most 100 steps
    for step in range(100):
        # Display the current observation
        displayObservation(observation)

        # Get an action for the current observation
        action = policy(observation)

        # Take the action in the environment to get the next
        # observation, the reward, and the done flag
        observation, reward, done, _ = env.step(action)
        print("Under the current policy the player chose: {}, "
              "immediate reward: {}".format(["stick", "hit"][action], reward))

        # The episode has terminated
        if done:
            displayObservation(observation)
            print("Game over. Return: {}\n".format(float(reward)))
            break

    # Episode finished; pause a few seconds before the next one
    time.sleep(3)
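displayObservation and policy are helpers assumed by this loop but not shown in this section. Assuming the standard gym Blackjack-v0 observation tuple (player sum, dealer's showing card, usable ace), minimal sketches might look like this; the stick-on-20 threshold is the classic fixed policy from Sutton and Barto's blackjack example, used here purely for illustration.

def displayObservation(observation):
    # Blackjack-v0 observations are (player_sum, dealer_card, usable_ace)
    playerSum, dealerCard, usableAce = observation
    print("Player sum: {}, dealer showing: {}, usable ace: {}".format(
        playerSum, dealerCard, usableAce))

def policy(observation):
    # Classic fixed policy: stick (0) on 20 or 21, otherwise hit (1)
    playerSum, _, _ = observation
    return 0 if playerSum >= 20 else 1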