def inner_execution(envDesc, a, g, ep, e):
    """Train a tabular Q-learning agent on `envDesc` and record the score.

    Trains with the supplied hyperparameters (alpha `a`, gamma `g`, initial
    epsilon `ep`, `e` episodes), then greedily replays the learned Q-table
    for 101 evaluation episodes (capped at 200 steps each), counting the
    episodes that yield a reward of 1, and writes the summary row
    [alpha, gamma, epsilon, episodes, wins] under grid_results/.
    """
    env = gym.make(envDesc).env
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    learner = QLearning(env, alpha=a, gamma=g, epsilon=ep, epsilon_min=0.001,
                        epsilon_dec=0.9999, episodes=e)
    table = learner.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e), None)
    wins = 0
    for _episode in range(101):
        obs = env.reset()
        finished = False
        steps = 0
        # Greedy rollout: always take the best-valued action for the state.
        while not finished and steps < 200:
            act = np.argmax(table[obs])
            obs, reward, finished, _ = env.step(act)
            steps += 1
            if reward == 1:
                wins += 1
    result_row = np.array([a, g, ep, e, wins])
    print(result_row)
    savetxt("grid_results/results_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
        envDesc, a, g, ep, e), result_row, delimiter=',', newline=" ",
        fmt="%10.5f")
def inner_execution(env, envDesc, a, g, ep, e):
    """Train a Q-learning agent on `env` and evaluate the learned policy.

    Trains with the supplied hyperparameters (alpha `a`, gamma `g`, initial
    epsilon `ep`, `e` episodes), writing the Q-table and action log under
    grid_data/ and grid_results/, then greedily replays the learned Q-table
    for 101 evaluation episodes (capped at 200 steps each), counting the
    episodes that yield a reward of 1.

    Returns the result row [a, g, ep, e, rewards].
    """
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env, alpha=a, gamma=g, epsilon=ep, epsilon_min=0.001,
                       epsilon_dec=0.9999, episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e),
        "grid_results/actions_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
            envDesc, a, g, ep, e))
    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1
    # BUG FIX: the original ended with `self.results.append([...])`, but this
    # is a module-level function with no `self` in scope, which raises a
    # NameError at runtime. Return the row instead so callers can collect it.
    return [a, g, ep, e, rewards]
# Implement Q-learning and use this to solve the cartpole-environment
import gym

# Source: https://github.com/JoeSnow7/Reinforcement-Learning/blob/master/Cartpole%20Q-learning.ipynb
# We define a class to contain the learning algorithm
from QLearning import QLearning

# Build the environment, train a Q-learning agent on it, then run the
# learned policy.
cartpole = gym.make("CartPole-v0")
agent = QLearning(cartpole)
agent.train()
agent.run()
# have a look at LearningPolicy.py for other policies epsilon_policy = LearningPolicy.exponentially_annealed_epsilon(1 / 10000, 0.0) epsilon_policy_2 = LearningPolicy.linear_annealed_epsilon(1., 0.1, 100) alpha1 = 0.2 alpha2 = 0.1 hyperparameters = {"alpha": alpha2, "discount": 0.99} # Please note: Numerous other settings can be adjusted in settings.py if training_mode: q = QLearning(epsilon_policy=epsilon_policy_2, map_name=map, hyperparameters=hyperparameters, save_name=save_name) while True: q.train() else: q = QLearning(epsilon_policy=LearningPolicy.constant_epsilon(0), map_name=map) if checkpoint_file is None: raise Exception("Please specify the checkpoint file path!") q_values = AgentManager.load_q_values(checkpoint_file) while True: q.test(q_values=q_values)
# Q-learning grid search on the Blackjack-v0 gym environment.
import gym  # BUG FIX: `gym` is used below (gym.make) but was never imported
import matplotlib.pyplot as plt
from QLearning import QLearning
from numpy import loadtxt


def stateNumber(state):
    """Flatten a Blackjack observation tuple into a single Q-table row index.

    The observation is a 3-tuple (x, y, z); the components are mixed-radix
    encoded as x + 32*y + 352*z so each distinct tuple maps to a unique index.
    """
    (x, y, z) = state
    y = y * 32
    z = z * 352
    return x + y + z


env = gym.make('Blackjack-v0')
# Grid search over gamma and episode count (alpha fixed at 0.01); each run
# overwrites the saved table and appends to the results file.
for i in [0.01]:
    for g in [0.000001, 0.00001, 0.0001, 0.001, 0.01]:
        for epi in [600000, 700000, 800000]:
            qlearn = QLearning(env, alpha=i, gamma=g, epsilon=0.9,
                               epsilon_min=0.01, epsilon_dec=0.99,
                               episodes=epi)
            q_table = qlearn.train('data/q-table-blackjack.csv',
                                   'results/blackjack')

# Greedy playback of a previously learned table (disabled; kept for reference).
#q_table = loadtxt('data/q-table-blackjack.csv', delimiter=',')
#state= env.reset()
#print(state)
#state = stateNumber(state)
#done = False
#
#
#while not done:
#    action = np.argmax(q_table[state])
#    state, reward, done, info = env.step(action)
#    print(action)
#    print(state)
#    state = stateNumber(state)
#
# #print(frame['frame'].getvalue())
#    print(f"Timestep: {i + 1}")
#    print(f"State: {frame['state']}")
#    print(f"Action: {frame['action']}")
#    print(f"Reward: {frame['reward']}")
#    sleep(.1)

# Train a Q-learning agent on the Roulette-v0 gym environment, then replay
# the learned policy greedily for a single episode, accumulating the total
# reward and the number of actions taken.
env = gym.make('Roulette-v0').env
#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

#2600loss - stable
qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9,
                   epsilon_min=0.001, epsilon_dec=0.9999, episodes=1000000)
# 500-1000loss - real player like
#qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.1, epsilon_dec=0.7, episodes=1000000)
q_table = qlearn.train('data/q-table-roulette.csv', None)
#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

state = env.reset()
done = False
rewards = 0
actions = 0
while not done:
    # BUG FIX: the original called np.argmax(q_table) — a flat argmax over
    # the whole table that ignores the current state (and for a 2-D table can
    # return an index outside the action space). Index the table by state,
    # as the other scripts in this project do.
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
    actions += 1
    rewards += reward