def main():
    """Train the policy on FrozenLake and periodically report progress.

    Runs 1,000,000 episodes on a 4x4 map. When the module-level flag
    ``use_random_map`` is truthy, a fresh map from ``random_map(HOLE_NUM)``
    replaces the environment every 10 episodes. After every episode
    ``finish_episode()`` is called and the episode return is appended to
    ``episode_durations``. A success rate is printed whenever
    ``i_episode % 1000 == 1`` and ``plot_durations()`` is called whenever
    ``i_episode % 5000 == 1``.
    """
    new_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
    env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
    env = env.unwrapped
    succeed_episode = 0
    # Number of episodes since the last report; the first report fires at
    # i_episode == 1 and therefore covers only 2 episodes, not 1000.
    window_episodes = 0
    try:
        for i_episode in range(1000000):
            if use_random_map and i_episode % 10 == 0:
                # Swap in a freshly generated map, releasing the old env first.
                env.close()
                new_map = random_map(HOLE_NUM)
                env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
                env = env.unwrapped

            pos = env.reset()
            state = encode_state(new_map, pos)
            ep_r = 0
            while True:
                a = select_action(state)
                pos_next, r, done, _ = env.step(a)
                ep_r += r
                # Re-encode the observation after every move so the next
                # action is chosen from the current position rather than
                # from the episode's starting state.
                state = encode_state(new_map, pos_next)
                if args.render:
                    env.render()
                model.rewards.append(r)
                if done:
                    break

            finish_episode()
            episode_durations.append(ep_r)
            window_episodes += 1
            if ep_r > 0:
                succeed_episode += 1

            if i_episode % 1000 == 1:
                print('EP: {:d} succeed rate {:4f}'.format(
                    i_episode, succeed_episode / window_episodes))
                succeed_episode = 0
                window_episodes = 0
            if i_episode % 5000 == 1:
                plot_durations()
    finally:
        # Release the last environment even if training is interrupted.
        env.close()
# Debug output left disabled:
# print(episode, qtable)
# print(total_rewards)

# Summarise training: mean of `rewards` over `total_episodes`, then the table.
print(f"Score over time: {sum(rewards) / total_episodes}")
print(qtable)
env.reset()

# Play a single episode, always taking the highest-valued action in the
# Q-table row for the current state.
for episode in range(1):
    state = env.reset()
    print("state", state)
    step = 0
    done = False
    print("*" * 52)
    print("EPISODE ", episode)

    for step in range(max_steps):
        env.render()
        # Pick the index of the largest entry in this state's Q-table row.
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        if not done:
            # Episode still running: move on to the resulting state.
            state = new_state
            continue
        print("done..........")
        break

env.close()