Example #1
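# The snippet below assumes the QLearning class and the names env, env_ranges,
# actions, obs_to_use and episodes are defined earlier in the example; only the
# two library imports it clearly relies on are added here.
import gym
import numpy as np
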
num_bins = [3, 20, 3, 6, 6, 6, 3, 3]
num_pos_actions = len(actions)

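# Construct the QLearning helper for replay only: the training hyper-parameters
# are dummy values because a saved q-table is loaded from disk below
# (USE=True presumably switches the helper out of training mode).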
q_learning = QLearning(env=env,
                       num_bins=num_bins,
                       num_pos_actions=num_pos_actions,
                       env_ranges=env_ranges,
                       discount=0,
                       episodes=0,
                       epsilon=None,
                       lr=None,
                       USE=True)

env = gym.make('LunarLander-v2')
q_learning.q_table = np.load('./data_lunarlander/0_9000.npy')  # Load a saved q-table

for _ in range(10):  # Replay 10 episodes using the greedy (max-Q) action

    obs = q_learning.reset_state()  # Reset the environment and get the initial observation

    done = False
    while not done:

        action = q_learning.action_to_maximise_q(obs)
        obs, reward, done = q_learning.perform_sim_step(action)
        print(obs, reward, done)
        q_learning.env.render()

env.close()
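
# Training setup: epsilon and the learning rate share the same kind of schedule,
# a start value followed by the indices over which the value decays (see the
# inline comments on the two lists below).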
epsilon = [0.5, 1.0, episodes // 2]  # Epsilon start, start decay index, stop decay index
lr = [0.5, 1.0, episodes // 2]  # Learning rate start, start decay index, stop decay index

q_learning = QLearning(env, num_bins, num_pos_actions, env_ranges, discount,
                       episodes, epsilon, lr)

print('q-table shape', q_learning.q_table.shape)

obs = q_learning.reset_state()  # Reset the environment and get the initial observation
obs = [obs[i] for i in obs_to_use]  # Keep only the observation components whose indices are listed in obs_to_use
print('\nInitial observation:', obs)

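# Take one epsilon-greedy step before entering the training loop.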
action_to_maximise_q = q_learning.action_to_maximise_q(obs)  # Find optimal action
action = q_learning.decide_on_action(action_to_maximise_q)  # Decide whether to use optimal or random action
observation, reward_current, done = q_learning.perform_sim_step(action)  # Perform the first action (wraps env.step(action))

NUM_TO_SHOW = 5  # Render/log progress this many times over the course of training
rewards = []

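# Main training loop: render and log the episode, learning rate and epsilon
# every episodes // NUM_TO_SHOW episodes.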
while q_learning.episode < q_learning.episodes:

    reward_sum = 0

    if not q_learning.episode % (episodes // NUM_TO_SHOW):
        render = True
        print('episode, learning_rate, epsilon', q_learning.episode,