import gym
import numpy as np

import rl.utils as utils
# `mf` is the project's model-free RL module providing q_learning, sarsa,
# epsilon_greedy and softmax; the exact import path below is an assumption.
import rl.model_free as mf
from timeit import default_timer as timer

envname = "CliffWalking-v0"

print("\n----------------------------------------------------------------")
print("\tEnvironment: ", envname)
print("----------------------------------------------------------------\n")

env = gym.make(envname)
env.render()
print()

actions = {0: "U", 1: "R", 2: "D", 3: "L"}

# Learning parameters
episodes = 500
alpha = 0.3
gamma = 0.95
epsilon = 0.1

t = timer()

# Q-Learning epsilon greedy
policy, rews, lengths = mf.q_learning(env, episodes, alpha, gamma,
                                      mf.epsilon_greedy, epsilon)
print("Execution time: {0}s\nPolicy:\n{1}\n".format(
    round(timer() - t, 4),
    np.vectorize(actions.get)(policy.reshape(env.shape))))
utils.run_episode(env, policy, 20, True, 1)
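
# The `mf` module is not shown in this file. The block below is a minimal,
# illustrative sketch of a tabular Q-learning routine and an epsilon-greedy
# selector compatible with the call signature used above (assumptions: the
# pre-0.26 gym reset/step API used elsewhere in this file, and Q stored as an
# (n_states, n_actions) array). The real mf implementation may differ.
def epsilon_greedy_sketch(Q, state, epsilon):
    # With probability epsilon explore uniformly, otherwise act greedily.
    if np.random.rand() < epsilon:
        return np.random.randint(Q.shape[1])
    return int(np.argmax(Q[state]))

def q_learning_sketch(env, episodes, alpha, gamma, exploration, epsilon):
    # Off-policy TD control: the bootstrap target uses the greedy next action.
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    rewards, lengths = [], []
    for _ in range(episodes):
        state, done, total, steps = env.reset(), False, 0.0, 0
        while not done:
            action = exploration(Q, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            Q[state, action] += alpha * (
                reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            total += reward
            steps += 1
        rewards.append(total)
        lengths.append(steps)
    # Greedy policy, per-episode returns and episode lengths, as unpacked above.
    return np.argmax(Q, axis=1), rewards, lengths
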
# Example #2: SARSA with epsilon-greedy and softmax exploration
envname = "CliffWalking-v0"

print("\n----------------------------------------------------------------")
print("\tEnvironment: ", envname)
print("----------------------------------------------------------------\n")

env = gym.make(envname)
env.render()
print()

actions = {0: "U", 1: "R", 2: "D", 3: "L"}

# Learning parameters
episodes = 500
alpha = 0.3
gamma = 0.95
epsilon = 0.1

t = timer()
# SARSA epsilon greedy
policy, _, _ = mf.sarsa(env, episodes, alpha, gamma, mf.epsilon_greedy, epsilon)
print("Execution time: {0}s\nPolicy with epsilon_greedy:\n{1}\n"
      .format(round(timer() - t, 4), np.vectorize(actions.get)(policy.reshape(env.shape))))
utils.run_episode(env, policy, 20)
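
# For comparison with the Q-learning sketch above, this is an illustrative
# sketch of the on-policy SARSA update behind a call like mf.sarsa(...): the
# bootstrap target uses the action actually chosen by the exploration policy
# in the next state, not the greedy one. Names and details are assumptions.
def sarsa_sketch(env, episodes, alpha, gamma, exploration, epsilon):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    rewards, lengths = [], []
    for _ in range(episodes):
        state = env.reset()
        action = exploration(Q, state, epsilon)
        done, total, steps = False, 0.0, 0
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = exploration(Q, next_state, epsilon)
            # On-policy update: bootstrap from the action that will be taken.
            Q[state, action] += alpha * (
                reward + gamma * Q[next_state, next_action] - Q[state, action])
            state, action = next_state, next_action
            total += reward
            steps += 1
        rewards.append(total)
        lengths.append(steps)
    return np.argmax(Q, axis=1), rewards, lengths
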

t = timer()
# SARSA softmax
policy, _, _ = mf.sarsa(env, episodes, alpha, gamma, mf.softmax, epsilon)
print("Execution time: {0}s\nPolicy with softmax:\n{1}\n"
      .format(round(timer() - t, 4), np.vectorize(actions.get)(policy.reshape(env.shape))))
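
# Sketch of a softmax (Boltzmann) action-selection rule matching the
# exploration-function interface used above; here the last argument plays the
# role of a temperature (an assumption; mf.softmax may parameterize it
# differently).
def softmax_sketch(Q, state, temperature):
    prefs = Q[state] / max(temperature, 1e-8)
    prefs -= np.max(prefs)              # shift for numerical stability
    probs = np.exp(prefs)
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))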