""" # MACROS LEFT = 0 DOWN = 1 RIGHT = 2 UP = 3 # Key mapping arrow_keys = {'\x1b[A': UP, '\x1b[B': DOWN, '\x1b[C': RIGHT, '\x1b[D': LEFT} # is_slippery True env = gym.make('FrozenLake-v0') env.reset() print_utils.clear_screen() env.render() # Show the initial board while True: # Choose an action from keyboard key = readchar.readkey() if key not in arrow_keys.keys(): print("Game aborted!") break action = arrow_keys[key] state, reward, done, info = env.step(action) # Show the board after action print_utils.clear_screen()
def main():
    """Train a Q-table on FrozenLake-v0, then replay 10 greedy episodes."""
    frozen_lake_env = gym.make("FrozenLake-v0")

    # Initialize table with all zeros
    Q = np.zeros([N_STATES, N_ACTIONS])

    # Learning parameters LEARNING_RATE and DISCOUNT_RATE are module-level constants.
    # List to contain the total reward per episode
    rewards = []

    for i in range(N_EPISODES):
        # Reset environment and get first new observation
        state = frozen_lake_env.reset()
        episode_reward = 0
        done = False

        # The Q-Table learning algorithm
        while not done:
            # Choose an action by greedily (with decaying noise) picking from Q table
            action = np.argmax(Q[state, :] + np.random.randn(1, N_ACTIONS) / (i + 1))

            # Get new state and reward from environment
            new_state, reward, done, _ = frozen_lake_env.step(action)

            # Update Q-Table with new knowledge using the learning rate
            Q[state, action] = (1 - LEARNING_RATE) * Q[state, action] \
                + LEARNING_RATE * (reward + DISCOUNT_RATE * np.max(Q[new_state, :]))

            episode_reward += reward
            state = new_state

        rewards.append(episode_reward)

    print("Score over time: " + str(sum(rewards) / N_EPISODES))
    print("Final Q-Table Values")
    print(Q)

    # Replay 10 episodes, rendering each step
    for i in range(10):
        # Reset environment and get first new observation
        state = frozen_lake_env.reset()
        episode_reward = 0
        done = False

        while not done:
            # Choose an action greedily from the Q table (no exploration noise)
            action = np.argmax(Q[state, :])

            # Get new state and reward from environment
            new_state, reward, done, _ = frozen_lake_env.step(action)

            print_utils.clear_screen()
            frozen_lake_env.render()
            time.sleep(0.1)

            # Keep updating the Q-Table with new knowledge using the learning rate
            Q[state, action] = (1 - LEARNING_RATE) * Q[state, action] \
                + LEARNING_RATE * (reward + DISCOUNT_RATE * np.max(Q[new_state, :]))

            episode_reward += reward
            state = new_state

            if done:
                print("Episode Reward: {}".format(episode_reward))
                print_utils.print_result(episode_reward)

        rewards.append(episode_reward)

    frozen_lake_env.close()
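
# Assumed entry point so the Q-learning demo runs when the file is executed directly.
if __name__ == "__main__":
    main()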