num_bins = [3, 20, 3, 6, 6, 6, 3, 3]
num_pos_actions = len(actions)

env = gym.make('LunarLander-v2')  # Create the environment before passing it in
q_learning = QLearning(env=env, num_bins=num_bins, num_pos_actions=num_pos_actions,
                       env_ranges=env_ranges, discount=0, episodes=0,
                       epsilon=None, lr=None, USE=True)
q_learning.q_table = np.load('./data_lunarlander/0_9000.npy')  # Load the trained q-table

for _ in range(10):
    obs = q_learning.reset_state()  # Reset the environment and get the initial observation
    done = False
    while not done:
        action = q_learning.action_to_maximise_q(obs)  # Greedy action from the q-table
        obs, reward, done = q_learning.perform_sim_step(action)
        print(obs, reward, done)
        q_learning.env.render()

env.close()
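For the q-table lookup above to work, the continuous LunarLander observation has to be mapped onto discrete bin indices. The QLearning class's internals aren't shown here, so the following is only a minimal sketch of how that binning could work, assuming each dimension is split linearly between the limits in env_ranges; the function name discretise is illustrative, not the class's actual method.

# Sketch only: assumes env_ranges is a list of (low, high) pairs, one per
# observation dimension, and that binning is linear. Not the verified internals.
import numpy as np

def discretise(obs, env_ranges, num_bins):
    """Map each continuous observation value to an integer bin index."""
    indices = []
    for value, (low, high), bins in zip(obs, env_ranges, num_bins):
        value = np.clip(value, low, high)  # Out-of-range values still get a valid bin
        fraction = (value - low) / (high - low)  # Scale to [0, 1]
        indices.append(min(int(fraction * bins), bins - 1))
    return tuple(indices)  # A tuple indexes directly into the q-table

With num_bins = [3, 20, 3, 6, 6, 6, 3, 3], an 8-dimensional observation would map to a tuple such as (1, 7, 0, 3, 2, 5, 1, 2), giving a q-table of shape (3, 20, 3, 6, 6, 6, 3, 3, num_pos_actions).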
epsilon = [0.5, 1.0, episodes // 2]  # Epsilon start value, start decay episode, stop decay episode
lr = [0.5, 1.0, episodes // 2]  # Learning rate start value, start decay episode, stop decay episode

q_learning = QLearning(env, num_bins, num_pos_actions, env_ranges, discount, episodes, epsilon, lr)
print('q-table shape', q_learning.q_table.shape)

obs = q_learning.reset_state()  # Reset the environment and get the initial observation
obs = [obs[i] for i in obs_to_use]  # Keep only the observation dimensions we train on
print('\nInitial observation:', obs)

action_to_maximise_q = q_learning.action_to_maximise_q(obs)  # Find the optimal action
action = q_learning.decide_on_action(action_to_maximise_q)  # Choose the optimal or a random action
observation, reward_current, done = q_learning.perform_sim_step(action)  # Wraps env.step(action); perform the first action

NUM_TO_SHOW = 5
rewards = []
while q_learning.episode < q_learning.episodes:
    reward_sum = 0
    if not q_learning.episode % (episodes // NUM_TO_SHOW):
        render = True
        print('episode, learning_rate, epsilon',
              q_learning.episode, q_learning.lr, q_learning.epsilon)  # Attribute names assumed
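The [start value, start decay episode, stop decay episode] convention above suggests a linear decay for both epsilon and the learning rate. The sketch below shows one plausible reading of that schedule; it is an assumption about what QLearning does internally, not its verified code.

# Sketch only: linear decay from the start value to zero across the decay
# window, held constant outside it. decayed_value is a hypothetical helper.
def decayed_value(schedule, episode):
    """Linearly decay a value toward zero over the decay window."""
    start_value, decay_start, decay_stop = schedule
    if episode <= decay_start:
        return start_value
    if episode >= decay_stop:
        return 0.0
    progress = (episode - decay_start) / (decay_stop - decay_start)
    return start_value * (1.0 - progress)

# e.g. with episodes = 10000, decayed_value([0.5, 1.0, 5000], 2500) is roughly
# 0.25: halfway through the decay window, half the starting value remains.

Decaying epsilon this way shifts the agent from exploration toward exploitation as training progresses, while the matching learning-rate decay stabilises the q-table values in the later episodes.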