# NOTE(review): this chunk was a whitespace-mangled paste (all newlines lost,
# making the original line a syntax error); the code below is the same
# statements re-flowed into valid Python.
# NOTE(review): this loop is the tail of an (unseen) agent method that folds
# the episode's final reward into the return accumulated for every state
# visited during the episode -- its `def` header lies outside this chunk, so
# `self` and `reward` are unresolved here. TODO: confirm against the full file.
for obs in self._visited_states_returns:
    self._visited_states_returns[obs] += reward


if __name__ == '__main__':
    # Monte-Carlo style training driver: play 100000 Blackjack episodes,
    # logging progress every 1000 episodes.
    env = BlackjackEnv()
    agent = BlackjackAgent()
    total_ep = 100000
    for episode in range(total_ep):
        if episode % 1000 == 0:
            # BUG FIX: the original passed the template and the values as
            # separate print() arguments without calling str.format(), so it
            # printed the literal "episode: {0} / {1}" followed by the numbers.
            print('episode: {0} / {1}'.format(episode, total_ep))
        # print()
        # print(' == EPISODE {0} START =='.format(episode))
        obs = env.reset()
        agent.reset()
        # print('HAND: ', end=''); print(env.player_hand)
        # print('STATE pp={0}, ha={1}, dc={2}'.format(obs[0], obs[1], obs[2]))
        while True:
            # One (state -> action -> next state, reward) transition.
            action = agent.pick_action(obs)
            # print('ACTION: {0}'.format(action.action))
            # --- time step rolls here ---
            obs, reward, done = env.step(action)
            # NOTE(review): the chunk is truncated here -- the `if done:`
            # handling / agent feedback that must terminate this loop lies
            # outside the visible source.
# NOTE(review): this chunk was a whitespace-mangled paste (all newlines lost);
# the code below is the same statements re-flowed into valid Python. The
# `def policy(observation):` header was lost in the mangling and is
# reconstructed here from the call site below (`action = policy(observation)`).
def policy(observation):
    """Fixed Blackjack policy: stick once the score reaches 20, else twist.

    Parameters:
        observation: tuple of (player score, dealer's showing card,
            usable-ace flag), as unpacked below.

    Returns:
        0 to stick (stop taking cards), 1 to twist (take another card).
    """
    # Decompose the observation tuple.
    # (typo fix: `usableSce` -> `usableAce`; the name is unused afterwards)
    score, dealerScore, usableAce = observation
    # If the player's score is 20 or more, stop taking cards (stick, action 0);
    # otherwise keep taking cards (twist, action 1).
    if score >= 20:
        return 0
    else:
        return 1


# Display the rollout of 10 episodes.
numEpisode = 10
for episode in range(numEpisode):
    # Re-initialise the environment for a fresh episode.
    observation = env.reset()
    print(
        "#######################################################################"
    )
    print("第{}回合".format(episode))
    # Run each episode for at most 100 steps.
    for step in range(100):
        # Show the current observation.
        displayObservation(observation)
        # Pick the action for the current observation.
        action = policy(observation)
        # Interact with the environment using the chosen action to obtain the
        # next observation, the reward, and the done flag.
        observation, reward, done, _ = env.step(action)
        # NOTE(review): the chunk is truncated here -- the `if done:` handling
        # that should end the episode early lies outside the visible source.