def experiment(test_game, num_experiments, file_name, num_episodes=500,
               alpha=.99, gamma=.9, epsilon=.9, decay_rate=.99):
    """
    Main experiment method: runs the Q-learning experiment num_experiments
    times, prints and draws the needed diagrams, and returns the results.

    Works by learning a model num_experiments times and compiling the number
    of steps per epoch for each experiment. These are averaged and used to
    create a graph. A policy is then chosen to give an average-number-of-steps-
    to-goal metric.
    """
    list_of_moves_per_experiment = []
    policies = []
    for x in range(num_experiments):
        # Learn a model.
        q_learning = QLearning(test_game, num_episodes=num_episodes,
                               alpha=alpha, gamma=gamma, epsilon=epsilon,
                               decay_rate=decay_rate)
        q = q_learning.learn()
        policies.append(q)
        num_moves = q_learning.num_moves_per_episode
        list_of_moves_per_experiment.append(num_moves)
    # Average the per-epoch step counts across experiments.
    moves_per_epoch_number = np.sum(list_of_moves_per_experiment, axis=0)
    moves_per_epoch_number = moves_per_epoch_number / num_experiments

    # Get the average number of steps when executing the last learned policy.
    q_learning = QLearning(test_game, num_episodes=num_episodes, alpha=alpha,
                           gamma=gamma, epsilon=epsilon, decay_rate=decay_rate)
    avg_num_steps = 0
    for _ in range(100):
        num_steps = q_learning.execute_policy(policies[num_experiments - 1])
        avg_num_steps += num_steps[1]
    avg_num_steps /= 100.0
    generate_validation_curves(np.arange(num_episodes), moves_per_epoch_number,
                               None, "Number of steps", None,
                               x_axis_label="Epoch Number",
                               y_axis_label="Average Path Length",
                               file_name=file_name)
    return avg_num_steps, policies[num_experiments - 1]
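For context, a minimal sketch of the tabular backup that q_learning.learn() above presumably performs; the QLearning internals are not shown in these snippets, so the function below is an illustrative assumption, not the actual class.

import numpy as np

# Sketch of one tabular Q-learning backup (assumed, not the project's QLearning):
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
def q_update(q_table, state, action, reward, next_state, alpha=.99, gamma=.9):
    td_target = reward + gamma * np.max(q_table[next_state])
    q_table[state, action] += alpha * (td_target - q_table[state, action])
    return q_table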
def inner_execution(env, envDesc, a, g, ep, e):
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".format(a, g, ep, e))
    qlearn = QLearning(env, alpha=a, gamma=g, epsilon=ep, epsilon_min=0.001,
                       epsilon_dec=0.9999, episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e),
        "grid_results/actions_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
            envDesc, a, g, ep, e))
    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1
    # `self` is undefined in this free function (the original called
    # self.results.append); return the result row so the caller can collect it.
    return [a, g, ep, e, rewards]
def main():
    env = GridWorld(MAP4)
    qlearning_policy = QLearning(env.get_num_states(), env.get_num_actions())
    num_episodes = 1000
    eps = 0.1
    qlearnt = qlearning_train(env, qlearning_policy, num_episodes, eps)

    state = env.reset()
    env.print()
    done = False
    eps_test = 0.0
    while not done:
        input("press enter:")
        action = tabular_epsilon_greedy_policy(qlearnt.Q, eps_test, state)
        state, reward, done = env.step(action)
        env.print()

    Qmatrix = np.max(qlearnt.Q, axis=1)
    Qmatrix = Qmatrix.reshape(6, 13)
    plt.imshow(Qmatrix)
    plt.colorbar()
    plt.title("Q Value Matrix plot trained for 1000 episodes (MAP 4)")
    plt.show()
def init():
    """
    Asks for a Gridworld file, initializes an MDP as the environment and a
    Q-learning object with it, then calls the menu.
    """
    print_headline("Gridworld Selection")
    gridworld = read_gridworld_file()
    environment = MDP(state_list=gridworld,
                      field_rewards=Default.FIELD_REWARDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      transition_probabilities=Default.TRANSITION_PROBABILITIES)
    q_learning = QLearning(env_perform_action=environment.perform_action,
                           state_list=gridworld,
                           goal_fields=Default.GOAL_FIELDS,
                           obstacle_fields=Default.OBSTACLE_FIELDS,
                           actions=Default.ACTIONS,
                           discount_factor=Default.DISCOUNT_FACTOR,
                           learning_rate=Default.LEARNING_RATE,
                           epsilon=Default.EPSILON,
                           convergence_threshold=Default.CONVERGENCE_THRESHOLD)
    print("Your input Gridworld:")
    print_gridworld(gridworld)
    while show_menu(q_learning):
        pass
    print_headline("See you later")
def inner_execution(envDesc, a, g, ep, e):
    env = gym.make(envDesc).env
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".format(a, g, ep, e))
    qlearn = QLearning(env, alpha=a, gamma=g, epsilon=ep, epsilon_min=0.001,
                       epsilon_dec=0.9999, episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e),
        None)
    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1
    r = np.array([a, g, ep, e, rewards])
    print(r)
    savetxt("grid_results/results_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
        envDesc, a, g, ep, e), r, delimiter=',', newline=" ", fmt="%10.5f")
def train(self):
    interactions = config.geti('trainInteractions')
    minEpsilon = config.getf('minTrainingEpsilon')
    epochSize = len(self.environment.db.images) / 2
    epsilon = 1.0
    self.controller.setEpsilonGreedy(epsilon)
    print 'Epoch 0: Exploration'
    self.runEpoch(interactions, len(self.environment.db.images))
    self.learner = QLearning()
    self.agent.learner = self.learner
    epoch = 1
    egEpochs = config.geti('epsilonGreedyEpochs')
    while epoch <= egEpochs:
        epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
        if epsilon < minEpsilon:
            epsilon = minEpsilon
        self.controller.setEpsilonGreedy(epsilon)
        print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        epoch += 1
    epoch = 1
    maxEpochs = config.geti('exploitLearningEpochs')
    while epoch <= maxEpochs:
        print 'Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        epoch += 1
def initialize_training(self):
    self.alpha = float(self.obj.var_alpha)
    self.gamma = float(self.obj.var_gamma)
    self.epsilon = float(self.obj.var_epsilon)
    self.neg_reward = float(self.obj.var_neg)
    self.positive_reward = float(self.obj.var_pos)
    for i in range(0, self.h, 40):
        for k in range(0, self.w, 40):
            self.states.append([k, i])
            if self.path.count([k, i]) == 1:
                self.rewards.append(0)
            else:
                self.rewards.append(self.neg_reward)
    goal_index = self.extract_index(
        [self.goal_sprite.x, self.goal_sprite.y], self.states)
    self.rewards[goal_index] = self.positive_reward
    self.n_states = len(self.states)
    self.label_batch = pyglet.graphics.Batch()
    for i in range(len(self.states)):
        self.reward_labels.append(
            pyglet.text.Label(str(int(self.rewards[i])),
                              font_name='Times New Roman',
                              font_size=10,
                              x=self.states[i][0] + 10,
                              y=self.states[i][1] + 15,
                              batch=self.label_batch))
    self.Qobj = QLearning(self.alpha, self.gamma, self.states, self.rewards,
                          self.n_states, self.n_actions)
def __init__(self, mark, board, game, player_type):
    self.mark = mark
    self.board = board
    self.game = game
    self.player_type = player_type
    self.action = Action(self.game)
    self.q_learning = QLearning()
    self.ordered_actions = []
def train(self):
    networkFile = config.get('networkDir') + config.get('snapshotPrefix') \
        + '_iter_' + config.get('trainingIterationsPerBatch') + '.caffemodel'
    interactions = config.geti('trainInteractions')
    minEpsilon = config.getf('minTrainingEpsilon')
    epochSize = len(self.environment.imageList) / 1
    epsilon = 1.0
    self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
    epoch = 1
    exEpochs = config.geti('explorationEpochs')
    while epoch <= exEpochs:
        s = cu.tic()
        print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
        self.runEpoch(interactions, len(self.environment.imageList))
        self.task.flushStats()
        self.doValidation(epoch)
        s = cu.toc('Epoch done in ', s)
        epoch += 1
    self.learner = QLearning()
    self.agent.learner = self.learner
    egEpochs = config.geti('epsilonGreedyEpochs')
    while epoch <= egEpochs + exEpochs:
        s = cu.tic()
        epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
        if epsilon < minEpsilon:
            epsilon = minEpsilon
        self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
        print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        self.task.flushStats()
        self.doValidation(epoch)
        s = cu.toc('Epoch done in ', s)
        epoch += 1
    maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
    while epoch <= maxEpochs:
        s = cu.tic()
        print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        self.task.flushStats()
        self.doValidation(epoch)
        s = cu.toc('Epoch done in ', s)
        shutil.copy(networkFile, networkFile + '.' + str(epoch))
        epoch += 1
def qLearningWithOptions(env, alpha, gamma, options_eps, epsilon, nSeeds,
                         maxLengthEp, nEpisodes, verbose, useNegation,
                         genericNumOptionsToEvaluate, loadedOptions=None):
    numSeeds = nSeeds
    numEpisodes = nEpisodes

    # We first discover all options.
    options = None
    actionSetPerOption = None
    if loadedOptions == None:
        options, actionSetPerOption = discoverOptions(env, options_eps,
                                                      verbose, useNegation,
                                                      plotGraphs=verbose)
    else:
        options = loadedOptions
        actionSetPerOption = []
        for i in xrange(len(loadedOptions)):
            tempActionSet = env.getActionSet()
            tempActionSet.append('terminate')
            actionSetPerOption.append(tempActionSet)

    returns_eval = []
    returns_learn = []

    # Now I add all options to my action set. Later we decide which ones to use.
    i = 0
    #genericNumOptionsToEvaluate = [1, 2, 4, 32, 64, 128, 256]
    totalOptionsToUse = []
    if useNegation and loadedOptions == None:
        maxNumOptions = int(len(options) / 2)
    else:
        maxNumOptions = len(options)
    while i < len(genericNumOptionsToEvaluate) \
            and genericNumOptionsToEvaluate[i] <= maxNumOptions:
        totalOptionsToUse.append(genericNumOptionsToEvaluate[i])
        i += 1

    for idx, numOptionsToUse in enumerate(totalOptionsToUse):
        returns_eval.append([])
        returns_learn.append([])
        if verbose:
            print 'Using', numOptionsToUse, 'options'
        for s in xrange(numSeeds):
            if verbose:
                print 'Seed: ', s + 1
            returns_eval[idx].append([])
            returns_learn[idx].append([])
            actionSet = env.getActionSet()
            for i in xrange(numOptionsToUse):
                actionSet.append(options[i])
            if useNegation and loadedOptions == None:
                numOptions = 2 * numOptionsToUse
            else:
                numOptions = numOptionsToUse
            learner = QLearning(alpha=alpha, gamma=gamma, epsilon=epsilon,
                                environment=env, seed=s,
                                useOnlyPrimActions=True, actionSet=actionSet,
                                actionSetPerOption=actionSetPerOption)
            for i in xrange(numEpisodes):
                returns_learn[idx][s].append(
                    learner.learnOneEpisode(timestepLimit=maxLengthEp))
                returns_eval[idx][s].append(
                    learner.evaluateOneEpisode(eps=0.01,
                                               timestepLimit=maxLengthEp))

    returns_learn_primitive = []
    returns_eval_primitive = []
    for s in xrange(numSeeds):
        returns_learn_primitive.append([])
        returns_eval_primitive.append([])
        learner = QLearning(alpha=alpha, gamma=gamma, epsilon=epsilon,
                            environment=env, seed=s)
        for i in xrange(numEpisodes):
            returns_learn_primitive[s].append(
                learner.learnOneEpisode(timestepLimit=maxLengthEp))
            returns_eval_primitive[s].append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=maxLengthEp))

    return returns_eval_primitive, returns_eval, totalOptionsToUse
# def print_frames(frames):
#     for i, frame in enumerate(frames):
#         clear_output(wait=True)
#         #print(frame['frame'])
#         #print(frame['frame'].getvalue())
#         print(f"Timestep: {i + 1}")
#         print(f"State: {frame['state']}")
#         print(f"Action: {frame['action']}")
#         print(f"Reward: {frame['reward']}")
#         sleep(.1)

import gym
import numpy as np
from QLearning import QLearning

env = gym.make('Roulette-v0').env

#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

# ~2600 loss - stable
qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9,
                   epsilon_min=0.001, epsilon_dec=0.9999, episodes=1000000)
# ~500-1000 loss - plays like a real player
#qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.1, epsilon_dec=0.7, episodes=1000000)

q_table = qlearn.train('data/q-table-roulette.csv', None)
#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

state = env.reset()
done = False
rewards = 0
actions = 0
while not done:
    # Index the Q-table by the current state; the original np.argmax(q_table)
    # would argmax over the flattened table.
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
# import the Q-learning function
from QLearning import QLearning

# define experiment parameters
gamma = 0.99
lr = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 2
step_number = 100
episode_length = 100

# run the experiment
QLearning(gamma, lr, epsilon, runs, step_number, episode_length)
elif taskToPerform == 4:
    # Compute the average number of time steps between any two states.
    gamma = 1.0
    env.useNegativeRewards = True  # I need this because I'm counting time steps
    stats = MDPStats(gamma=gamma, env=env, outputPath=outputPath)
    getExpectedNumberOfStepsFromOption(env=env, eps=epsilon, verbose=verbose,
                                       discoverNegation=bothDirections,
                                       loadedOptions=loadedOptions)
elif taskToPerform == 5:
    # Solve for a given goal (Q-learning).
    returns_learn = []
    returns_eval = []
    learner = QLearning(alpha=0.1, gamma=0.9, epsilon=1.00, environment=env)
    for i in xrange(num_episodes):
        returns_learn.append(
            learner.learnOneEpisode(timestepLimit=max_length_episode))
        returns_eval.append(
            learner.evaluateOneEpisode(eps=0.01,
                                       timestepLimit=max_length_episode))
    plt.plot(returns_eval)
    plt.show()
elif taskToPerform == 6:
    # Solve for a given goal w/ primitive actions (Q-learning) following options.
    returns_eval_primitive, returns_eval, totalOptionsToUse = qLearningWithOptions(
        env=env, alpha=0.1,
import Functions
from GridWorld import GridWorld
from QLearning import QLearning
from matplotlib import pylab
from pylab import *

if __name__ == "__main__":
    grid_world = GridWorld(10, 10)
    # Functions.create_grid_from_hex(grid_world)
    Functions.create_random_obstacles(grid_world, 0.105)
    grid_world.scan_grid_and_generate_graph()
    grid_world.print_graph()
    grid_world.create_grid_ui(grid_world.m, grid_world.n,
                              (grid_world.start_x, grid_world.start_y),
                              (grid_world.end_x, grid_world.end_y),
                              grid_world.obstacles)
    QL = QLearning(list(range(4)))
    scores, episodes = [], []
    number_of_episodes = 10
    for episode in range(number_of_episodes):
        score = 0
        state = grid_world.reset()
        grid_world.is_visited = [[0] * grid_world.m
                                 for temp in range(grid_world.n)]
        while True:
            grid_world.render()
            action = QL.get_action(str(state))
            next_state, reward, done = grid_world.step(action)
            QL.learn(str(state), action, reward, str(next_state))
from environment import Env
from QLearning import QLearning

if __name__ == "__main__":
    env = Env()
    QL = QLearning(list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()

        while True:
            env.render()

            # take an action and proceed one step in the environment
            action = QL.get_action(str(state))
            next_state, reward, done = env.step(action)

            # with sample <s,a,r,s'>, the agent learns a new q function
            QL.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(QL.q_table)

            # if the episode ends, break
            if done:
                break
def update(env, RL):
    for episode in range(MAX_EPISODES):
        state = env.reset()  # initial state

        while True:
            action = RL.choose_action(str(state))  # RL chooses an action based on the state
            state_, reward = env.step(action)  # RL takes the action and gets the next state and reward
            RL.learn(str(state), action, reward, str(state_))  # RL learns from this transition
            state = state_  # swap states

            if state == 'terminal':  # break the while loop at the end of this episode
                break

        if episode % 500 == 0:
            simulation(episode)

    # writer = pd.ExcelWriter('./file1.xlsx')
    # RL.q_table.to_excel(writer)
    # writer.save()
    print('game over')
    print(RL.q_table)


if __name__ == "__main__":
    env = Game()
    RL = QLearning(actions=list(range(env.n_actions)),
                   learning_rate=ALPHA,
                   reward_decay=GAMMA,
                   e_greedy=EPSILON)
    update(env, RL)
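RL.choose_action above is not shown in these snippets; agents of this style usually pick actions epsilon-greedily over a q_table keyed by the state string, so a plausible sketch (all names assumed, not this repo's actual code) is:

import random
import numpy as np

# Plausible epsilon-greedy action selection: with probability e_greedy exploit
# the best known action (breaking ties randomly), otherwise explore.
def choose_action(q_table, state, actions, e_greedy=0.9):
    if random.random() < e_greedy:
        values = np.array([q_table[state][a] for a in actions])
        best = np.flatnonzero(values == values.max())  # indices of maximal actions
        return actions[random.choice(best)]
    return random.choice(actions)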
tries = 100
episodes = 1000
results = np.zeros((tries, episodes))

#### run with QLearning
for t in range(tries):
    # define learning settings
    epsilon_decay = 1 - (1 / episodes) * 6
    learning_decay = 1 - (1 / episodes) * 3
    agent = QLearning(env.env,
                      learning_rate=0.5,
                      discount_factor=0.9,
                      exploration_rate=0,
                      epsilon_decay_func=lambda x: x * epsilon_decay,
                      alpha_decay_func=lambda x: x * learning_decay,
                      qtable_default=1)

    # fit and save results
    env.fit(agent, episodes)
    results[t, :] = agent.rewards_per_episode

# plot rewards
plot_rewards(np.mean(results, axis=0), smoothing=0.1, color='blue')

#### run with SARSA
# define learning settings
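The multiplicative decay lambdas above compound to a roughly exponential schedule; over a full run the rates all but vanish:

# With episodes = 1000, epsilon_decay = 1 - 6/1000 = 0.994; applying it once
# per episode gives (1 - 6/episodes)**episodes ~= exp(-6) ~= 0.0025, so a rate
# that starts at 1.0 ends the run near zero.
episodes = 1000
epsilon_decay = 1 - (1 / episodes) * 6
print(epsilon_decay ** episodes)  # ~0.0024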
def main():
    print('Cart Pole')
    env = gym.make('CartPole-v1')
    q_learn = QLearning(env, num_episodes=3000)
    q_learn.run()
import Gridworld
import DefaultConstants as Default

gridworld_list = Gridworld.make_list_from_file("3by4.grid")
mdp = MDP(state_list=gridworld_list,
          field_rewards=Default.FIELD_REWARDS,
          obstacle_fields=Default.OBSTACLE_FIELDS,
          actions=Default.ACTIONS,
          transition_probabilities=Default.TRANSITION_PROBABILITIES)
qlearning = QLearning(env_perform_action=mdp.perform_action,
                      state_list=gridworld_list,
                      goal_fields=Default.GOAL_FIELDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      discount_factor=Default.DISCOUNT_FACTOR,
                      learning_rate=0.1,
                      epsilon=0.5,
                      convergence_threshold=1000)

print("---Instance variables---")
print(qlearning.states)
print(qlearning.goal_states)
print(qlearning.actions)
print(qlearning.discount_factor)
print(qlearning.learning_rate)
print(qlearning.epsilon)
print(qlearning.q_function)
print()

"""current_state = (0, 2)
import gym
import numpy as np
import matplotlib.pyplot as plt
from QLearning import QLearning
from numpy import loadtxt


def stateNumber(state):
    # Flatten the Blackjack observation (player sum, dealer card, usable ace)
    # into a single table index.
    (x, y, z) = state
    y = y * 32
    z = z * 352
    return x + y + z


env = gym.make('Blackjack-v0')
for i in [0.01]:
    for g in [0.000001, 0.00001, 0.0001, 0.001, 0.01]:
        for epi in [600000, 700000, 800000]:
            qlearn = QLearning(env, alpha=i, gamma=g, epsilon=0.9,
                               epsilon_min=0.01, epsilon_dec=0.99,
                               episodes=epi)
            q_table = qlearn.train('data/q-table-blackjack.csv',
                                   'results/blackjack')

#q_table = loadtxt('data/q-table-blackjack.csv', delimiter=',')
#state = env.reset()
#print(state)
#state = stateNumber(state)
#done = False
#
#while not done:
#    action = np.argmax(q_table[state])
#    state, reward, done, info = env.step(action)
#    print(action)
#    print(state)
#    state = stateNumber(state)
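As a quick check of the stateNumber flattening above: the strides are 1 for the player sum, 32 for the dealer card, and 352 for the usable-ace flag, so for example:

# Exercise the helper above on two sample Blackjack observations.
assert stateNumber((14, 5, 0)) == 14 + 5 * 32          # -> 174
assert stateNumber((14, 5, 1)) == 14 + 5 * 32 + 352    # -> 526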
def render(self, Q_table):
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()
    self.display_surface.fill((0, 0, 0))
    self.render_table(Q_table)
    self.render_player()
    self.render_target()
    pygame.display.update()
    pygame.time.wait(250)


agent = QLearning(GridWorld(),
                  buckets=(10, 10, 10, 10),
                  lower_bounds=0,
                  upper_bounds=9,
                  num_episodes=1000,
                  min_lr=0.001,
                  min_epsilon=0.3)


def run():
    t = 0
    done = False
    current_state = agent.discretize_state(agent.env.reset())
    while not done:
        t += 1
        action = agent.choose_action(current_state)
        agent.env.render(agent.Q_table)
        obs, reward, done, _ = agent.env.step(action)
        new_state = agent.discretize_state(obs)
        current_state = new_state
    return t
                             get_size_except_dim(Input)])
    Output = tf.layers.dense(inputs=Reshape, units=10, activation=None)
    Labels = tf.cast(tf.reshape(Labels, shape=[BatchSize]), tf.int64)
    #OneHotLabels = tf.one_hot(Labels, depth=10, axis=-1)
    Loss = tf.losses.sparse_softmax_cross_entropy(labels=Labels, logits=Output)
    Acc = tf.reduce_mean(
        tf.cast(tf.equal(Labels, tf.argmax(Output, 1)), tf.float32))
    #print(Loss, Loss.shape.as_list())
    #exit()
    #Loss = tf.reshape(Loss, shape=[-1, 1])
    return Output, Loss, Acc


Mode = "Train"
RL_Exp = QLearning()
TaskSpec = {
    "LogHistory": True,
    "OperatorList": Op_List,
    "OperatorNum": OperatorLimit,
    "InputNum": 1,
    "OutputNum": 1,
    "TaskInput": images,
    "TaskLabel": labels,
    "Epochs": TrainEpochs,
    "NetworkDecor": NetworkDecor,
    "BatchSize": BatchSize,
    "ConcatOperator": ConcatOperatorDense,
    "InputOperator": ImageInput,
    "TrajectoryLength": OperatorLimit - 4,
    "RewardGamma": 0.9
#!/usr/bin/python3
from PK_Handler import PK_Handler
from PK_Game import PK_Game
from PK_Player_Greedy import PK_Player_Greedy
from QLearning import QLearning

game = PK_Game()
qplayer = PK_Player_Greedy(game)
qlearning = QLearning(game, qplayer, 0.05, 0.95, 0.8)
qplayer.set_optimizer(qlearning)
game.set_player1(qplayer)

handler = PK_Handler(game)
handler.train(1000000)

print(qplayer.probas)
def __init__(self, environment, location=(0, 0)):
    self.Environment = environment
    self.location = location
    self.QLearning = QLearning(self, 0.5, 0.7, 0.1)
import http.server
import socketserver
import gym
import gym_sample
from QLearning import QLearning
import json

PORT = 8080
env = gym.make('sample-v0')
qLearning = QLearning(env, 99)


class Handler(http.server.SimpleHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        if self.path == '/train':
            self.wfile.write(
                json.dumps(
                    qLearning.train(5000, 0.7, 0.618, 1, 1, 0.01, 0.01).tolist()
                ).encode('utf8'))
        elif self.path == '/ai_step':
            self.wfile.write(json.dumps(qLearning.ai_step()).encode('utf8'))
        elif self.path == '/reset':
            qLearning.reset()
            self.wfile.write('{"reseted": true}'.encode('utf8'))
        else:
            # Portuguese: {"rodando": "ta"} ~ {"running": "yep"}
            self.wfile.write('{"rodando": "ta"}'.encode('utf8'))
    opt_val = prob_env.still(prob_env.output_noiseless(opt_state))
    wall_time_limit = kwargs.wall_time_limit
    generation = call_counts  # for random search, generation means call counts
elif kwargs.method == 'rl_prtr':
    model_env_name = get_model_env_name(kwargs.prtr_model_dir)
    assert model_env_name == prob_env_name
    opt = QLearning(
        k=prob_env.k,
        d=prob_env.d,
        env_name=prob_env_name,
        env_dir=kwargs.prob_env_dir,  # env_dir will load the environment
        env_fixed_xo=False,
        n_hidden=0,
        save_and_load_path=kwargs.prtr_model_dir,  # model dir will load the model
        load=True,
        tensorboard_path=None,
        logger_path=None,
        memory_capacity=None,
        memory_capacity_start_learning=None,
        learn_wall_time_limit=None,
        prioritized=None,
        save_model_iter=None,
        trial_size=0,
    )
    # opt may have learned from a non-fixed xo environment,
    # but we will test it under a fixed xo environment.
    opt.set_env_fixed_xo(prob_env.x_o)
    assert opt.get_env_if_set_fixed_xo()
    opt_val, start_x_o, opt_x_p, start_x_p, duration = extract_rl_exp_result(
        opt, prob_env)
from variables3_3 import *
import random
from QLearning import QLearning
from write_results import write_results

print("Done with importing")
tb = TarockBasics()
qlearning = QLearning(0.1, 0.1, 0.1)


def play_one_game(p1, p2, p3, talon, ps, sp, duo, strats):
    # `solo` is presumably provided by the wildcard import above
    # (or was intended to be the `sp` argument).
    msa = MilestoneAgents(p1, p2, p3, [1, 2, 3], solo, duo)
    msa.update_state_talon(talon)
    points = {1: 0, 2: 0, 3: 0}
    while msa.player1_cards:
        first = msa.starting_player
        second = msa.second_player
        third = msa.third_player
        if strats[first] == "me":
            if first not in msa.duo:
                card1 = get_solo_first_card_3_3(msa, first, qlearning)
            else:
                card1 = get_duo_first_card_3_3(msa, first, qlearning)
        elif strats[first] == "LWW":
            card1 = msa.locally_worst_worst_agent(first)
        elif strats[first] == "LW":
            card1 = msa.locally_worst_agent(first)
game = Game()
# Initialize the window.
game.initialisationWindow()
game.initialisationBackground()
game.loadingPictures()
# Retrieve the window.
window = game.getterWindow()
# Retrieve wall and reward coordinates from the .txt file.
mapWallCoord = game.getterWallCoord()
rewardCoordB, rewardCoordM = game.getterReward()

"""---------------------- INITIALIZE QLEARNING ---------------------"""
# Create an object from the QLearning class (note: the instance shadows the class name).
QLearning = QLearning()
# Init the Q-table with rewards and '0' coords from the .txt file.
QLearning.intiQtable()

"""---------------------- INITIALIZE TRAINING KNN ---------------------"""
Trainning = Trainning()
epochs = 2000
dataX = []  # targets
dataY = []  # features
for epoch in range(epochs):
    # Simulate rays (right or top detection).
    rayons = Trainning.simulateRayon()
# Implement Q-learning and use it to solve the CartPole environment.
# Source: https://github.com/JoeSnow7/Reinforcement-Learning/blob/master/Cartpole%20Q-learning.ipynb
import gym

# We define a class to contain the learning algorithm.
from QLearning import QLearning

env = gym.make("CartPole-v0")
agent = QLearning(env)
agent.train()
agent.run()
save_name = "q_values_" # have a look at LearningPolicy.py for other policies epsilon_policy = LearningPolicy.exponentially_annealed_epsilon(1 / 10000, 0.0) epsilon_policy_2 = LearningPolicy.linear_annealed_epsilon(1., 0.1, 100) alpha1 = 0.2 alpha2 = 0.1 hyperparameters = {"alpha": alpha2, "discount": 0.99} # Please note: Numerous other settings can be adjusted in settings.py if training_mode: q = QLearning(epsilon_policy=epsilon_policy_2, map_name=map, hyperparameters=hyperparameters, save_name=save_name) while True: q.train() else: q = QLearning(epsilon_policy=LearningPolicy.constant_epsilon(0), map_name=map) if checkpoint_file is None: raise Exception("Please specify the checkpoint file path!") q_values = AgentManager.load_q_values(checkpoint_file) while True: q.test(q_values=q_values)