def initialize_training(self):
    self.alpha = float(self.obj.var_alpha)
    self.gamma = float(self.obj.var_gamma)
    self.epsilon = float(self.obj.var_epsilon)
    self.neg_reward = float(self.obj.var_neg)
    self.positive_reward = float(self.obj.var_pos)
    for i in range(0, self.h, 40):
        for k in range(0, self.w, 40):
            self.states.append([k, i])
            if self.path.count([k, i]) == 1:
                self.rewards.append(0)
            else:
                self.rewards.append(self.neg_reward)
    goal_index = self.extract_index([self.goal_sprite.x, self.goal_sprite.y],
                                    self.states)
    self.rewards[goal_index] = self.positive_reward
    self.n_states = len(self.states)
    self.label_batch = pyglet.graphics.Batch()
    for i in range(len(self.states)):
        self.reward_labels.append(
            pyglet.text.Label(str(int(self.rewards[i])),
                              font_name='Times New Roman',
                              font_size=10,
                              x=self.states[i][0] + 10,
                              y=self.states[i][1] + 15,
                              batch=self.label_batch))
    self.Qobj = QLearning(self.alpha, self.gamma, self.states, self.rewards,
                          self.n_states, self.n_actions)
def main():
    # resolve the parameters sent from the command line call
    params = obtainParameters()
    # resolve file issues regarding the execution of the algorithm
    prepareFolders(params['commandPath'], params['filePath'])
    myMDP = MDP(params['filePath'])
    myAgent = Agent(myMDP)
    Wacumulado = 0
    for i in range(int(params['numberOfExecutions'])):
        print 'Running experiment ' + str(i + 1) + ' of ' + str(params['numberOfExecutions'])
        myQLearning = QLearning(myMDP,
                                myAgent,
                                alpha=float(params['alpha']),
                                gamma=float(params['gamma']),
                                epsilon=float(params['epsilon']),
                                epsilonIncrement=float(params['epsilonIncrement']),
                                K=int(params['K']),
                                H=int(params['H']),
                                gammaPRQL=float(params['gammaPRQL']))
        W, Ws = myQLearning.execute()
        Wacumulado += array(Ws)
    Ws = Wacumulado / float(params['numberOfExecutions'])
    saveOutputFiles(myQLearning, params, Ws)
def inner_execution(envDesc, a, g, ep, e):
    env = gym.make(envDesc).env
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env, alpha=a, gamma=g, epsilon=ep, epsilon_min=0.001,
                       epsilon_dec=0.9999, episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e), None)
    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
        if reward == 1:
            rewards += 1
    r = np.array([a, g, ep, e, rewards])
    print(r)
    savetxt("grid_results/results_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
        envDesc, a, g, ep, e), r, delimiter=',', newline=" ", fmt="%10.5f")
def inner_execution(env, envDesc, a, g, ep, e):
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env, alpha=a, gamma=g, epsilon=ep, epsilon_min=0.001,
                       epsilon_dec=0.9999, episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e),
        "grid_results/actions_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
            envDesc, a, g, ep, e))
    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
        if reward == 1:
            rewards += 1
    # return the outcome row so the caller can aggregate results across
    # configurations
    return [a, g, ep, e, rewards]
def main():
    env = GridWorld(MAP4)
    qlearning_policy = QLearning(env.get_num_states(), env.get_num_actions())
    num_episodes = 1000
    eps = 0.1
    qlearnt = qlearning_train(env, qlearning_policy, num_episodes, eps)
    state = env.reset()
    env.print()
    done = False
    eps_test = 0.0
    while not done:
        input("press enter:")
        action = tabular_epsilon_greedy_policy(qlearnt.Q, eps_test, state)
        state, reward, done = env.step(action)
        env.print()
    Qmatrix = np.max(qlearnt.Q, axis=1)
    Qmatrix = Qmatrix.reshape(6, 13)
    plt.imshow(Qmatrix)
    plt.colorbar()
    plt.title("Q Value Matrix plot trained for 1000 episodes (MAP 4)")
    plt.show()
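The snippet above relies on a `tabular_epsilon_greedy_policy` helper that is not shown; a minimal sketch, assuming `Q` is a 2-D numpy array indexed by state and action:

import numpy as np

def tabular_epsilon_greedy_policy(Q, eps, state):
    # With probability eps take a random action, otherwise act greedily.
    if np.random.random() < eps:
        return np.random.randint(Q.shape[1])
    return int(np.argmax(Q[state]))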
def init():
    """
    Asks for a Gridworld file, initializes an MDP as environment and a
    Q-learning object with it, then calls the menu.
    """
    print_headline("Gridworld Selection")
    gridworld = read_gridworld_file()
    environment = MDP(state_list=gridworld,
                      field_rewards=Default.FIELD_REWARDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      transition_probabilities=Default.TRANSITION_PROBABILITIES)
    q_learning = QLearning(env_perform_action=environment.perform_action,
                           state_list=gridworld,
                           goal_fields=Default.GOAL_FIELDS,
                           obstacle_fields=Default.OBSTACLE_FIELDS,
                           actions=Default.ACTIONS,
                           discount_factor=Default.DISCOUNT_FACTOR,
                           learning_rate=Default.LEARNING_RATE,
                           epsilon=Default.EPSILON,
                           convergence_threshold=Default.CONVERGENCE_THRESHOLD)
    print("Your input Gridworld:")
    print_gridworld(gridworld)
    while show_menu(q_learning):
        pass
    print_headline("See you later")
def train(self):
    interactions = config.geti('trainInteractions')
    minEpsilon = config.getf('minTrainingEpsilon')
    epochSize = len(self.environment.db.images) / 2
    epsilon = 1.0
    self.controller.setEpsilonGreedy(epsilon)
    print 'Epoch 0: Exploration'
    self.runEpoch(interactions, len(self.environment.db.images))
    self.learner = QLearning()
    self.agent.learner = self.learner
    epoch = 1
    egEpochs = config.geti('epsilonGreedyEpochs')
    while epoch <= egEpochs:
        epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
        if epsilon < minEpsilon:
            epsilon = minEpsilon
        self.controller.setEpsilonGreedy(epsilon)
        print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        epoch += 1
    epoch = 1
    maxEpochs = config.geti('exploitLearningEpochs')
    while epoch <= maxEpochs:
        print 'Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        epoch += 1
def experiment(test_game, num_experiments, file_name, num_episodes=500,
               alpha=.99, gamma=.9, epsilon=.9, decay_rate=.99):
    """
    Main experiment method that runs the Q-learning experiments, prints the
    results, and draws the needed diagrams.

    Works by learning a model num_experiments times and compiling the number
    of steps per episode for each experiment. These are then averaged and
    used to create a graph. A policy is then also chosen to give an average
    number of steps needed to reach the goal as a metric.
    """
    list_of_moves_per_experiment = []
    policies = []
    for x in range(num_experiments):
        # Learn model
        q_learning = QLearning(test_game, num_episodes=num_episodes,
                               alpha=alpha, gamma=gamma, epsilon=epsilon,
                               decay_rate=decay_rate)
        q = q_learning.learn()
        policies.append(q)
        num_moves = q_learning.num_moves_per_episode
        list_of_moves_per_experiment.append(num_moves)
    list_of_moves_per_experiment = np.array(list_of_moves_per_experiment)
    moves_per_epoc_number = np.sum(list_of_moves_per_experiment, axis=0)
    moves_per_epoc_number = moves_per_epoc_number / num_experiments
    # Get the average number of steps when executing the learned policy.
    q_learning = QLearning(test_game, num_episodes=num_episodes, alpha=alpha,
                           gamma=gamma, epsilon=epsilon, decay_rate=decay_rate)
    avg_num_steps = 0
    for itter in range(100):
        num_steps = q_learning.execute_policy(policies[num_experiments - 1])
        avg_num_steps += num_steps[1]
    avg_num_steps /= 100.0
    generate_validation_curves(np.arange(num_episodes), moves_per_epoc_number,
                               None, "Number of steps", None,
                               x_axis_label="Epoch Number",
                               y_axis_label="Average Path Length",
                               file_name=file_name)
    return avg_num_steps, policies[num_experiments - 1]
def train(self):
    networkFile = (config.get('networkDir') + config.get('snapshotPrefix')
                   + '_iter_' + config.get('trainingIterationsPerBatch')
                   + '.caffemodel')
    interactions = config.geti('trainInteractions')
    minEpsilon = config.getf('minTrainingEpsilon')
    epochSize = len(self.environment.imageList) / 1
    epsilon = 1.0
    self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
    epoch = 1
    exEpochs = config.geti('explorationEpochs')
    while epoch <= exEpochs:
        s = cu.tic()
        print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
        self.runEpoch(interactions, len(self.environment.imageList))
        self.task.flushStats()
        self.doValidation(epoch)
        s = cu.toc('Epoch done in ', s)
        epoch += 1
    self.learner = QLearning()
    self.agent.learner = self.learner
    egEpochs = config.geti('epsilonGreedyEpochs')
    while epoch <= egEpochs + exEpochs:
        s = cu.tic()
        epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
        if epsilon < minEpsilon:
            epsilon = minEpsilon
        self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
        print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        self.task.flushStats()
        self.doValidation(epoch)
        s = cu.toc('Epoch done in ', s)
        epoch += 1
    maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
    while epoch <= maxEpochs:
        s = cu.tic()
        print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
        self.runEpoch(interactions, epochSize)
        self.task.flushStats()
        self.doValidation(epoch)
        s = cu.toc('Epoch done in ', s)
        shutil.copy(networkFile, networkFile + '.' + str(epoch))
        epoch += 1
def qLearningWithOptions(env, alpha, gamma, options_eps, epsilon, nSeeds,
                         maxLengthEp, nEpisodes, verbose, useNegation,
                         genericNumOptionsToEvaluate, loadedOptions=None):
    numSeeds = nSeeds
    numEpisodes = nEpisodes
    # We first discover all options
    options = None
    actionSetPerOption = None
    if loadedOptions is None:
        if verbose:
            options, actionSetPerOption = discoverOptions(
                env, options_eps, verbose, useNegation, plotGraphs=True)
        else:
            options, actionSetPerOption = discoverOptions(
                env, options_eps, verbose, useNegation, plotGraphs=False)
    else:
        options = loadedOptions
        actionSetPerOption = []
        for i in xrange(len(loadedOptions)):
            tempActionSet = env.getActionSet()
            tempActionSet.append('terminate')
            actionSetPerOption.append(tempActionSet)

    returns_eval = []
    returns_learn = []
    # Now I add all options to my action set. Later we decide which ones to use.
    i = 0
    #genericNumOptionsToEvaluate = [1, 2, 4, 32, 64, 128, 256]
    totalOptionsToUse = []
    maxNumOptions = 0
    if useNegation and loadedOptions is None:
        maxNumOptions = int(len(options) / 2)
    else:
        maxNumOptions = len(options)
    while i < len(genericNumOptionsToEvaluate) and genericNumOptionsToEvaluate[i] <= maxNumOptions:
        totalOptionsToUse.append(genericNumOptionsToEvaluate[i])
        i += 1

    for idx, numOptionsToUse in enumerate(totalOptionsToUse):
        returns_eval.append([])
        returns_learn.append([])
        if verbose:
            print 'Using', numOptionsToUse, 'options'
        for s in xrange(numSeeds):
            if verbose:
                print 'Seed: ', s + 1
            returns_eval[idx].append([])
            returns_learn[idx].append([])
            actionSet = env.getActionSet()
            for i in xrange(numOptionsToUse):
                actionSet.append(options[i])
            if useNegation and loadedOptions is None:
                numOptions = 2 * numOptionsToUse
            else:
                numOptions = numOptionsToUse
            learner = QLearning(alpha=alpha, gamma=gamma, epsilon=epsilon,
                                environment=env, seed=s,
                                useOnlyPrimActions=True, actionSet=actionSet,
                                actionSetPerOption=actionSetPerOption)
            for i in xrange(numEpisodes):
                returns_learn[idx][s].append(
                    learner.learnOneEpisode(timestepLimit=maxLengthEp))
                returns_eval[idx][s].append(
                    learner.evaluateOneEpisode(eps=0.01,
                                               timestepLimit=maxLengthEp))

    returns_learn_primitive = []
    returns_eval_primitive = []
    for s in xrange(numSeeds):
        returns_learn_primitive.append([])
        returns_eval_primitive.append([])
        learner = QLearning(alpha=alpha, gamma=gamma, epsilon=epsilon,
                            environment=env, seed=s)
        for i in xrange(numEpisodes):
            returns_learn_primitive[s].append(
                learner.learnOneEpisode(timestepLimit=maxLengthEp))
            returns_eval_primitive[s].append(
                learner.evaluateOneEpisode(eps=0.01, timestepLimit=maxLengthEp))

    return returns_eval_primitive, returns_eval, totalOptionsToUse
elif taskToPerform == 4:
    # Compute the average number of time steps between any two states
    gamma = 1.0
    env.useNegativeRewards = True  # I need this because I'm counting time steps
    stats = MDPStats(gamma=gamma, env=env, outputPath=outputPath)
    getExpectedNumberOfStepsFromOption(env=env, eps=epsilon, verbose=verbose,
                                       discoverNegation=bothDirections,
                                       loadedOptions=loadedOptions)
elif taskToPerform == 5:
    # Solve for a given goal (q-learning)
    returns_learn = []
    returns_eval = []
    learner = QLearning(alpha=0.1, gamma=0.9, epsilon=1.00, environment=env)
    for i in xrange(num_episodes):
        returns_learn.append(
            learner.learnOneEpisode(timestepLimit=max_length_episode))
        returns_eval.append(
            learner.evaluateOneEpisode(eps=0.01,
                                       timestepLimit=max_length_episode))
    plt.plot(returns_eval)
    plt.show()
elif taskToPerform == 6:
    # Solve for a given goal w/ primitive actions (q-learning) following options
    returns_eval_primitive, returns_eval, totalOptionsToUse = qLearningWithOptions(
        env=env, alpha=0.1,
class Agent():
    '''
    Creates an agent which contains a policy. Inputs are an environment
    object and the location of the agent, which is (0, 0) by default.
    '''
    ACTION_UP = (-1, 0)
    ACTION_DOWN = (1, 0)
    ACTION_RIGHT = (0, 1)
    ACTION_LEFT = (0, -1)
    ACTION_STAY = (0, 0)
    actions = set([ACTION_UP, ACTION_DOWN, ACTION_RIGHT, ACTION_LEFT, ACTION_STAY])
    policy = dict()
    location = None

    def __init__(self, environment, location=(0, 0)):
        self.Environment = environment
        self.location = location
        self.QLearning = QLearning(self, 0.5, 0.7, 0.1)

    def getActionEpsilonGreedy(self, s):
        '''
        a <- getActionEpsilonGreedy(s)
        Find an action for the current state s, in an epsilon-greedy fashion.
        '''
        # Spread epsilon uniformly over all actions, then give the greedy
        # action the remaining probability mass.
        prob_actions = dict()
        uniform_epsilon = self.QLearning.epsilon / len(self.actions)
        for possible_a in self.actions:
            prob_actions[possible_a] = uniform_epsilon
        best_a = argmax(self.QLearning.Q[s])
        prob_actions[best_a] += 1 - self.QLearning.epsilon
        # For every action, check if the cumulative probability exceeds a
        # random number.
        random_number = random.random()
        cumulative_prob = 0.0
        for a in self.actions:
            cumulative_prob += prob_actions[a]
            if cumulative_prob >= random_number:
                return a
        # Guard against floating-point rounding leaving the loop without a pick.
        return best_a

    def getAction(self, s):
        '''
        a <- getAction(s)
        Get the optimal action given the current state s, using Q[s].
        '''
        if s not in self.QLearning.Q:
            self.QLearning.initQ(s)
        best_a = argmax(self.QLearning.Q[s])
        return best_a

    def performAction(self, a):
        raise NotImplementedError

    def updateQ(self, s, a, s_prime, r):
        raise NotImplementedError
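Both methods above call a free function `argmax` on `Q[s]`, which is not shown; assuming `Q[s]` is a dict mapping actions to values, a minimal sketch of such a helper:

import random

def argmax(action_values):
    # Return the key with the highest value, breaking ties randomly so the
    # agent does not always favor the same action.
    best_value = max(action_values.values())
    best_actions = [a for a, v in action_values.items() if v == best_value]
    return random.choice(best_actions)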
def main():
    print('Cart Pole')
    env = gym.make('CartPole-v1')
    q_learn = QLearning(env, num_episodes=3000)
    q_learn.run()
base_memory = 'res/memory1-0.bson'
gamma = 0.99
learning_rate = 0.8
epoch_min = 3500000
epoch_max = 5000000
eps_min = 0.1
eps_max = 0.1
while eps_min <= eps_max:
    while epoch_min <= epoch_max:
        bp = BoardPossitionParams()
        q = QLearning(bp, gamma=gamma, learning_rate=learning_rate,
                      epochs=epoch_min, eps=eps_min, name=base_memory)
        q.learning()
        q.save()
        fp = (base_memory.split('.')[0] + '_Q_trained_ep' + str(epoch_min)
              + '_g' + str(int(gamma * 100))
              + '_l' + str(int(learning_rate * 10))
              + '_e' + str(int(eps_min * 100)) + '.bson')
        play = Play(fp, False)
        wins, rounds = play.play_stats(games_to_play)
        with open("res/eps_" + str(int(eps_min * 100)), 'a') as outfile:
            outfile.write(str(epoch_min) + "-" + str(wins) + "-" + str(rounds) + "\n")
        print('Win perc:', wins, 'Average Rounds:', rounds)
        epoch_min += 500000
from variables3_3 import *
import random
from QLearning import QLearning
from write_results import write_results

print("Done with importing")
tb = TarockBasics()
qlearning = QLearning(0.1, 0.1, 0.1)


def play_one_game(p1, p2, p3, talon, ps, sp, duo, strats):
    msa = MilestoneAgents(p1, p2, p3, [1, 2, 3], solo, duo)
    msa.update_state_talon(talon)
    points = {1: 0, 2: 0, 3: 0}
    while msa.player1_cards:
        first = msa.starting_player
        second = msa.second_player
        third = msa.third_player
        if strats[first] == "me":
            if first not in msa.duo:
                card1 = get_solo_first_card_3_3(msa, first, qlearning)
            else:
                card1 = get_duo_first_card_3_3(msa, first, qlearning)
        elif strats[first] == "LWW":
            card1 = msa.locally_worst_worst_agent(first)
        elif strats[first] == "LW":
            card1 = msa.locally_worst_agent(first)
import http.server
import socketserver
import gym
import gym_sample
from QLearning import QLearning
import json

PORT = 8080
env = gym.make('sample-v0')
qLearning = QLearning(env, 99)


class Handler(http.server.SimpleHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        if self.path == '/train':
            self.wfile.write(
                json.dumps(
                    qLearning.train(5000, 0.7, 0.618, 1, 1, 0.01, 0.01).tolist()).encode('utf8'))
        elif self.path == '/ai_step':
            self.wfile.write(json.dumps(qLearning.ai_step()).encode('utf8'))
        elif self.path == '/reset':
            qLearning.reset()
            self.wfile.write('{"reseted": true}'.encode('utf8'))
        else:
            self.wfile.write('{"rodando": "ta"}'.encode('utf8'))
save_name = "q_values_" # have a look at LearningPolicy.py for other policies epsilon_policy = LearningPolicy.exponentially_annealed_epsilon(1 / 10000, 0.0) epsilon_policy_2 = LearningPolicy.linear_annealed_epsilon(1., 0.1, 100) alpha1 = 0.2 alpha2 = 0.1 hyperparameters = {"alpha": alpha2, "discount": 0.99} # Please note: Numerous other settings can be adjusted in settings.py if training_mode: q = QLearning(epsilon_policy=epsilon_policy_2, map_name=map, hyperparameters=hyperparameters, save_name=save_name) while True: q.train() else: q = QLearning(epsilon_policy=LearningPolicy.constant_epsilon(0), map_name=map) if checkpoint_file is None: raise Exception("Please specify the checkpoint file path!") q_values = AgentManager.load_q_values(checkpoint_file) while True: q.test(q_values=q_values)
import gym
import numpy as np
import matplotlib.pyplot as plt
from QLearning import QLearning
from numpy import loadtxt


def stateNumber(state):
    # Pack the Blackjack observation (player sum, dealer card, usable ace)
    # into a single Q-table index.
    (x, y, z) = state
    y = y * 32
    z = z * 352
    return x + y + z


env = gym.make('Blackjack-v0')
for i in [0.01]:
    for g in [0.000001, 0.00001, 0.0001, 0.001, 0.01]:
        for epi in [600000, 700000, 800000]:
            qlearn = QLearning(env, alpha=i, gamma=g, epsilon=0.9,
                               epsilon_min=0.01, epsilon_dec=0.99,
                               episodes=epi)
            q_table = qlearn.train('data/q-table-blackjack.csv',
                                   'results/blackjack')

#q_table = loadtxt('data/q-table-blackjack.csv', delimiter=',')
#state = env.reset()
#print(state)
#state = stateNumber(state)
#done = False
#
#while not done:
#    action = np.argmax(q_table[state])
#    state, reward, done, info = env.step(action)
#    print(action)
#    print(state)
#    state = stateNumber(state)
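For reference, `stateNumber` gives every Blackjack observation a distinct Q-table row; a worked example:

# The observation (player sum, dealer card, usable ace) = (12, 5, True)
# maps to 12 + 5 * 32 + 1 * 352 = 524.
assert stateNumber((12, 5, True)) == 524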
def render(self, Q_table):
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()
    self.display_surface.fill((0, 0, 0))
    self.render_table(Q_table)
    self.render_player()
    self.render_target()
    pygame.display.update()
    pygame.time.wait(250)


agent = QLearning(GridWorld(), buckets=(10, 10, 10, 10), lower_bounds=0,
                  upper_bounds=9, num_episodes=1000, min_lr=0.001,
                  min_epsilon=0.3)


def run():
    t = 0
    done = False
    current_state = agent.discretize_state(agent.env.reset())
    while not done:
        t += 1
        action = agent.choose_action(current_state)
        agent.env.render(agent.Q_table)
        obs, reward, done, _ = agent.env.step(action)
        new_state = agent.discretize_state(obs)
        current_state = new_state
    return t
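The `discretize_state` method used above is not shown; a minimal sketch of a common bucketing implementation, assuming scalar `lower_bounds`/`upper_bounds` and one bucket count per observation dimension:

def discretize_state(self, obs):
    # Normalize each dimension into [0, 1] using the configured bounds,
    # then map it to one of the available buckets.
    scaled = [(obs[i] - self.lower_bounds) / (self.upper_bounds - self.lower_bounds)
              for i in range(len(obs))]
    return tuple(
        min(self.buckets[i] - 1, max(0, int(round(scaled[i] * (self.buckets[i] - 1)))))
        for i in range(len(obs)))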
                                 get_size_except_dim(Input)])
    Output = tf.layers.dense(inputs=Reshape, units=10, activation=None)
    Labels = tf.cast(tf.reshape(Labels, shape=[BatchSize]), tf.int64)
    #OneHotLabels = tf.one_hot(Labels, depth=10, axis=-1)
    Loss = tf.losses.sparse_softmax_cross_entropy(labels=Labels, logits=Output)
    Acc = tf.reduce_mean(
        tf.cast(tf.equal(Labels, tf.argmax(Output, 1)), tf.float32))
    #print(Loss, Loss.shape.as_list())
    #exit()
    #Loss = tf.reshape(Loss, shape=[-1, 1])
    return Output, Loss, Acc


Mode = "Train"
RL_Exp = QLearning()
TaskSpec = {
    "LogHistory": True,
    "OperatorList": Op_List,
    "OperatorNum": OperatorLimit,
    "InputNum": 1,
    "OutputNum": 1,
    "TaskInput": images,
    "TaskLabel": labels,
    "Epochs": TrainEpochs,
    "NetworkDecor": NetworkDecor,
    "BatchSize": BatchSize,
    "ConcatOperator": ConcatOperatorDense,
    "InputOperator": ImageInput,
    "TrajectoryLength": OperatorLimit - 4,
    "RewardGamma": 0.9
# def print_frames(frames):
#     for i, frame in enumerate(frames):
#         clear_output(wait=True)
#         #print(frame['frame'])
#         #print(frame['frame'].getvalue())
#         print(f"Timestep: {i + 1}")
#         print(f"State: {frame['state']}")
#         print(f"Action: {frame['action']}")
#         print(f"Reward: {frame['reward']}")
#         sleep(.1)

env = gym.make('Roulette-v0').env

#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

# 2600 loss - stable
qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9,
                   epsilon_min=0.001, epsilon_dec=0.9999, episodes=1000000)
# 500-1000 loss - real player like
#qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.1, epsilon_dec=0.7, episodes=1000000)

q_table = qlearn.train('data/q-table-roulette.csv', None)
#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

state = env.reset()
done = False
rewards = 0
actions = 0
while not done:
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
#!/usr/bin/python3
from PK_Handler import PK_Handler
from PK_Game import PK_Game
from PK_Player_Greedy import PK_Player_Greedy
from QLearning import QLearning

game = PK_Game()
qplayer = PK_Player_Greedy(game)
qlearning = QLearning(game, qplayer, 0.05, 0.95, 0.8)
qplayer.set_optimizer(qlearning)
game.set_player1(qplayer)
handler = PK_Handler(game)
handler.train(1000000)
print(qplayer.probas)
def update(env, RL):
    for episode in range(MAX_EPISODES):
        state = env.reset()  # initial state
        while True:
            action = RL.choose_action(str(state))  # RL chooses an action based on the state
            state_, reward = env.step(action)  # RL takes the action and gets the next state and reward
            RL.learn(str(state), action, reward, str(state_))  # RL learns from this transition
            state = state_  # swap state
            if state == 'terminal':  # break the while loop at the end of this episode
                break
        if episode % 500 == 0:
            simulation(episode)
    # writer = pd.ExcelWriter('./file1.xlsx')
    # RL.q_table.to_excel(writer)
    # writer.save()
    print('game over')
    print(RL.q_table)


if __name__ == "__main__":
    env = Game()
    RL = QLearning(actions=list(range(env.n_actions)),
                   learning_rate=ALPHA,
                   reward_decay=GAMMA,
                   e_greedy=EPSILON)
    update(env, RL)
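The `QLearning` class used above is not shown (judging by `RL.q_table.to_excel`, it is pandas-backed); a minimal sketch, under that assumption, of what such a tabular learner might look like, applying the standard update Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)):

import numpy as np
import pandas as pd

class QLearning(object):
    """Minimal sketch of a pandas-backed tabular Q-learner (an assumption,
    not the original class)."""

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9,
                 e_greedy=0.9):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # Lazily add a zero-initialized row for unseen states.
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, state):
        self.check_state_exist(state)
        # Epsilon-greedy: exploit with probability epsilon, else explore.
        if np.random.uniform() < self.epsilon:
            return self.q_table.loc[state, :].astype(float).idxmax()
        return np.random.choice(self.actions)

    def learn(self, s, a, r, s_):
        self.check_state_exist(s)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            self.check_state_exist(s_)
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # terminal state carries no future value
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)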
tries = 100
episodes = 1000
results = np.zeros((tries, episodes))

#### run with QLearning
for t in range(tries):
    # define learning settings
    epsilon_decay = 1 - (1 / episodes) * 6
    learning_decay = 1 - (1 / episodes) * 3
    agent = QLearning(env.env,
                      learning_rate=0.5,
                      discount_factor=0.9,
                      exploration_rate=0,
                      epsilon_decay_func=lambda x: x * epsilon_decay,
                      alpha_decay_func=lambda x: x * learning_decay,
                      qtable_default=1)
    # fit and save results
    env.fit(agent, episodes)
    results[t, :] = agent.rewards_per_episode

# plot rewards
plot_rewards(np.mean(results, axis=0), smoothing=0.1, color='blue')

#### run with SARSA
# define learning settings
class Agent(object):
    def __init__(self, mark, board, game, player_type):
        self.mark = mark
        self.board = board
        self.game = game
        self.player_type = player_type
        self.action = Action(self.game)
        self.q_learning = QLearning()
        self.ordered_actions = []

    def reset(self):
        """Resets the list of actions taken"""
        self.ordered_actions = []

    def store_action(self, action, state, game):
        """
        Store the action taken in an ordered list of actions taken during
        the game
        """
        self.ordered_actions.append({
            "action": action,
            "state": state,
            "game_layout": list(game)
        })

    def select_action(self, epsilon=0.0, state=None):
        """
        Select the action based on the player's type

        @param epsilon: the E-greedy action value. Determines if a random
            action (exploration) should be chosen over learned Q-values
            (exploitation).
        @param state: the current state of the game, used to determine the
            available actions
        @return the index of where the marker will be placed (the action)
        """
        if self.player_type == "human":
            action = self.get_input()
        elif self.player_type == "random":
            action = self.action.get_random_action()
        elif self.player_type == "qlearning":
            action = self.action.get_egreedy_action(epsilon, self.q_learning,
                                                    state)
        else:
            print "Undefined player type"
            raise NotImplementedError
        return action

    def learn_from_game(self, alpha, reward, gamma):
        """
        Update the Q-table if the player's type is qlearning.

        Updating the Q-table happens by going back through the actions the
        player took until the game ended. Every action will go through the
        Q-learning update rule/equation.
        """
        if not self.player_type == "qlearning":
            # the other types of agents do not learn
            return
        for index, action in enumerate(self.ordered_actions):
            if index + 1 < len(self.ordered_actions):
                next_state = self.ordered_actions[index + 1]["state"]
            else:
                # This is the last action before the game ended (no further states)
                next_state = None
            # get the state before the action was executed
            # (string representation of the game layout)
            state = action["state"]
            selected_action = action["action"]  # get the action
            game = action["game_layout"]  # get the current game layout as a list
            # Only the last state receives a reward
            if index == len(self.ordered_actions) - 1:
                state_reward = reward
            else:
                state_reward = 0
            possible_actions = self.action.get_valid_actions(game)
            self.q_learning.update_q(state, selected_action, next_state,
                                     possible_actions, alpha, state_reward,
                                     gamma)

    def do_action(self, action):
        """
        Perform an action based on the player type.

        An action is defined as placing a mark in one of the empty cells.
        Currently the game supports three types of players:
        - human: us
        - random: a bot that just places marks randomly in cells
        - qlearning: a smarter bot (hopefully) that learns what the best options are
        """
        pos = [action // 3, action % 3]
        self.game.set_cell(pos, self.mark)
        self.board.update(pos, self.mark)

    def get_input(self):
        """
        Get input from the user via keyboard.
        Input refers to the index of one of the fields
        @return a position on the board where the user wants their mark to be placed
        """
        self.board.print_game()
        while True:
            user_input = raw_input("Your turn " + str(self.get_mark()) +
                                   " (0-8 or 'q' to quit): ")
            valid_actions = self.action.get_valid_actions()
            if user_input.isalpha() and user_input == "q":
                return user_input
            elif user_input.isalpha():
                print "Invalid character, please try again"
                continue
            if (int(user_input) >= 0 and int(user_input) < 9 and
                    valid_actions[int(user_input)] == 1):
                return int(user_input)
            print "Invalid turn, please try another cell"

    def get_type(self):
        """
        Return the type of user: human, random, qlearning
        """
        return self.player_type

    def get_mark(self):
        """
        Get the mark the agent is using
        @return a string representing the mark. E.g. 'X' or 'O'
        """
        return self.mark
class FenetrePrincipale(QMainWindow):
    apprentissage_fini = pyqtSignal()

    def __init__(self):
        super(FenetrePrincipale, self).__init__()
        # Load an empty maze
        self.labyrinthe = Labyrinthe()
        # Load the images matching each cell type
        self.ICONE_VIDE = QPixmap("Icones/floor.png")
        self.ICONE_ENTREE = QPixmap("Icones/entry.png")
        self.ICONE_SORTIE = QPixmap("Icones/exit.png")
        self.ICONE_MUR = QPixmap("Icones/wall.png")
        self.ICONE_PIEGE = QPixmap("Icones/trap.png")
        self.ICONE_PERSO = QPixmap("Icones/perso_front.png")
        self.case_perso = None              # Current position of the character
        self.nb_coups = 0                   # Number of the current iteration
        self.nb_coups_max = 0               # Maximum number of iterations
        self.nb_coups_en_suivant = 0        # Number of iterations entered in the text box
        self.toggleAlgo = False             # Switches between exploration and exploitation
        self.case_init = None               # Initial position of the character
        self.politique_choisie = 0          # Learning policy chosen in the settings
        self.apprentissage_termine = False  # Signals the end of learning
        self.lecture_en_cours = False       # Indicates whether playback (play button) is running
        self.deplacement_en_cours = False
        self.last_move = None
        # Stores the average reinforcement of each policy for a maze
        self.moyenne_renforcement = [0, 0, 0, 0, 0, 0]
        self.chargerParametresQLearning()
        # Signal emitted when learning finishes
        self.apprentissage_fini.connect(self.reinitialiserApprentissage)
        # Initialize the window's widgets
        self.initUI()

    ''' Initializes the window's interface '''
    def initUI(self):
        # Create the toolbar buttons
        self.newAction = QAction(QIcon('Icones/new.png'), u'&Nouveau labyrinthe', self)
        self.newAction.setShortcut('Ctrl+N')
        self.newAction.triggered.connect(self.nouveauLabyrinthe)
        self.openAction = QAction(QIcon('Icones/open.png'), u'&Ouvrir un labyrinthe', self)
        self.openAction.setShortcut('Ctrl+O')
        self.openAction.triggered.connect(self.chargerLabyrinthe)
        self.saveAction = QAction(QIcon('Icones/save.png'), u'&Enregistrer le labyrinthe', self)
        self.saveAction.setEnabled(False)
        self.saveAction.setShortcut('Ctrl+S')
        self.saveAction.triggered.connect(self.enregistrerLabyrinthe)
        self.playAction = QAction(QIcon('Icones/play.png'), u'&Lancer l\'apprentissage jusqu\'à atteindre le nb d\'itérations max', self)
        self.playAction.setEnabled(False)
        self.playAction.setShortcut('Ctrl+P')
        self.playAction.triggered.connect(self.runPlayPause)
        self.nextAction = QAction(QIcon('Icones/next.png'), u"&Exécuter une étape d'apprentissage", self)
        self.nextAction.setEnabled(False)
        self.nextAction.setShortcut('Ctrl+E')
        self.nextAction.triggered.connect(self.runPolitique1Coup)
        self.nb_coups_simules = QLineEdit(self)
        self.nb_coups_simules.setFixedWidth(90)
        self.nb_coups_simules.setEnabled(False)
        self.nb_coups_simules.setValidator(QIntValidator(self))
        self.nb_coups_simules.setToolTip(u"Entrez un nb d'itération à effectuer puis valider")
        self.nb_coups_simules.returnPressed.connect(self.runPolitiqueNbCoups)
        self.stopAction = QAction(QIcon('Icones/stop.png'), '&Arreter', self)
        self.stopAction.setEnabled(False)
        self.stopAction.setShortcut('Ctrl+S')
        self.stopAction.triggered.connect(self.stopPolitique)
        self.moyAction = QAction(QIcon('Icones/moy.png'), '&Afficher les moyennes de renforcement', self)
        self.moyAction.setShortcut('Ctrl+M')
        self.moyAction.triggered.connect(self.showMoy)
        self.maxQAction = QAction(QIcon('Icones/maxQ.png'), '&Afficher/Masquer les max(Q)', self)
        self.maxQAction.setCheckable(True)
        self.maxQAction.setShortcut('Ctrl+Q')
        self.maxQAction.triggered.connect(self.triggerMaxQ)
        self.settingsAction = QAction(QIcon('Icones/settings.png'), u'&Modifier les paramètres d\'apprentissage', self)
        self.settingsAction.setShortcut('Ctrl+U')
        self.settingsAction.triggered.connect(self.modifierParametres)
        # Add a spacer
        spacer = QWidget()
        spacer.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
        # Add the buttons to the toolbar
        self.toolbar = self.addToolBar('toolbar')
        self.toolbar.setAllowedAreas(Qt.TopToolBarArea)
        self.toolbar.setFloatable(False)
        self.toolbar.addAction(self.newAction)
        self.toolbar.addAction(self.openAction)
        self.toolbar.addAction(self.saveAction)
        self.toolbar.addAction(self.playAction)
        self.toolbar.addAction(self.nextAction)
        self.toolbar.addAction(self.stopAction)
        self.toolbar.addWidget(self.nb_coups_simules)
        self.toolbar.addWidget(spacer)
        self.toolbar.addAction(self.moyAction)
        self.toolbar.addAction(self.maxQAction)
        self.toolbar.addAction(self.settingsAction)
        self.toolbar.setContextMenuPolicy(Qt.CustomContextMenu)
        # Add a status bar
        self.lbl_status = QLabel(u"Politique " + str(self.politique_choisie + 1) + u" | Nb passes max = 0" + u" | Itération n° " + str(self.nb_coups))
        self.statusBar().addPermanentWidget(self.lbl_status, 1)
        # Add a grid for the maze
        self.grille = QTableWidget()
        self.grille.setFrameShape(QFrame.NoFrame)
        self.grille.setShowGrid(False)
        self.grille.horizontalHeader().hide()
        self.grille.verticalHeader().hide()
        self.grille.setSelectionMode(QAbstractItemView.NoSelection)
        self.grille.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.grille.cellDoubleClicked.connect(self.drawCase)
        self.grille.setFocusPolicy(Qt.NoFocus)
        self.grille.verticalHeader().setResizeMode(QHeaderView.Fixed)
        self.grille.horizontalHeader().setResizeMode(QHeaderView.Fixed)
        self.grille.verticalHeader().setDefaultSectionSize(32)
        self.grille.horizontalHeader().setDefaultSectionSize(32)
        self.entreeAction = QAction(QIcon(self.ICONE_ENTREE), u'Entrée', self)
        self.entreeAction.setIconVisibleInMenu(True)
        self.entreeAction.setCheckable(True)
        self.sortieAction = QAction(QIcon(self.ICONE_SORTIE), u'Sortie', self)
        self.sortieAction.setIconVisibleInMenu(True)
        self.sortieAction.setCheckable(True)
        self.murAction = QAction(QIcon(self.ICONE_MUR), u'Mur', self)
        self.murAction.setIconVisibleInMenu(True)
        self.murAction.setCheckable(True)
        self.piegeAction = QAction(QIcon(self.ICONE_PIEGE), u'Piège', self)
        self.piegeAction.setIconVisibleInMenu(True)
        self.piegeAction.setCheckable(True)
        self.videAction = QAction(QIcon(self.ICONE_VIDE), u'Vide', self)
        self.videAction.setIconVisibleInMenu(True)
        self.videAction.setCheckable(True)
        self.persoAction = QAction(QIcon(self.ICONE_PERSO), u'Placer le personnage', self)
        self.persoAction.setIconVisibleInMenu(True)
        self.persoAction.setCheckable(True)
        self.groupe_action = QActionGroup(self)
        self.groupe_action.addAction(self.persoAction)
        self.groupe_action.addAction(self.entreeAction)
        self.groupe_action.addAction(self.sortieAction)
        self.groupe_action.addAction(self.murAction)
        self.groupe_action.addAction(self.piegeAction)
        self.groupe_action.addAction(self.videAction)
        self.persoAction.setChecked(True)
        # Add the buttons to the second toolbar
        self.toolbar2 = QToolBar(self)
        self.addToolBar(Qt.RightToolBarArea, self.toolbar2)
        self.toolbar2.setFloatable(False)
        self.toolbar2.addAction(self.persoAction)
        self.toolbar2.addAction(self.entreeAction)
        self.toolbar2.addAction(self.sortieAction)
        self.toolbar2.addAction(self.murAction)
        self.toolbar2.addAction(self.piegeAction)
self.toolbar2.addAction(self.videAction) self.toolbar2.setContextMenuPolicy(Qt.CustomContextMenu) # Initialise et centre la fenêtre bureau = QDesktopWidget(); largeur = self.sizeHint().width() self.setGeometry((bureau.screen().width()/2)-(largeur/2),(bureau.screen().height()/2)-(largeur/2),largeur,largeur); self.setCentralWidget(self.grille) self.setWindowIcon(QIcon('Icones/maze.png')) self.setWindowTitle("Labyrinthe QLearning") self.show() ''' Actualise l'affichage de la barre de status ''' def changeStatus(self): self.lbl_status.setText(u"Politique " + str(self.politique_choisie+1) + u" | Nb passes max " + str(self.nb_coups_max) + u" | Itération n° " + str(self.nb_coups)) ''' Affiche/Masque les q de chaque case ''' def triggerMaxQ(self): # Affiche les q de chaque case if(self.maxQAction.isChecked()): for ligne in range(self.labyrinthe.nb_lignes): for colonne in range(self.labyrinthe.nb_colonnes): maxq = max(self.qlearning.table_q[ligne,colonne]) self.grille.item(ligne,colonne).setText(str(round(maxq,1))) liste_Q = [] self.grille.item(ligne,colonne).setToolTip("G:"+str(self.qlearning.table_q[ligne,colonne][0])+" D:"+str(self.qlearning.table_q[ligne,colonne][1])+" H:"+str(self.qlearning.table_q[ligne,colonne][2])+" B:"+str(self.qlearning.table_q[ligne,colonne][3])) # Masque les q de chaque case else: for ligne in range(self.labyrinthe.nb_lignes): for colonne in range(self.labyrinthe.nb_colonnes): self.grille.item(ligne,colonne).setText('') ''' Colorie la case double cliquée suivant le type de case choisi ''' def drawCase(self,row,column): brush = QBrush() brush.setTexture(self.ICONE_VIDE) if(self.persoAction.isChecked()): # Positionne le personnage self.positionnerPerso(row,column) self.case_perso = self.labyrinthe.grille[row][column] self.case_init = self.case_perso self.qlearning.case_init = self.case_perso # Active les actions self.playAction.setEnabled(True) self.nextAction.setEnabled(True) self.nb_coups_simules.setEnabled(True) elif(self.entreeAction.isChecked()): brush.setTexture(self.ICONE_ENTREE) type_case = ENTREE elif(self.sortieAction.isChecked()): brush.setTexture(self.ICONE_SORTIE) type_case = SORTIE elif(self.murAction.isChecked()): brush.setTexture(self.ICONE_MUR) type_case = MUR elif(self.piegeAction.isChecked()): brush.setTexture(self.ICONE_PIEGE) type_case = PIEGE elif(self.videAction.isChecked()): brush.setTexture(self.ICONE_VIDE) type_case = VIDE # Le labyrinthe a été modifié, on autorise donc l'enregistrement if(self.persoAction.isChecked()==False): if(self.labyrinthe.grille[row][column].type!=type_case): self.saveAction.setEnabled(True) # Met à jour le labyrinthe self.labyrinthe.grille[row][column].type=type_case # Change la couleur de la case suivant son nouveau type self.grille.item(row, column).setBackground(brush) ''' Récupère les paramètres enregistrés du QLearning ''' def chargerParametresQLearning(self): my_settings = QSettings("Bazerque-Vigie", "Labyrinthe"); # Charge les paramètres généraux my_settings.beginGroup("General") if (my_settings.contains("politique")): self.politique_choisie = my_settings.value("politique").toInt()[0] if (my_settings.contains("nb_coups")): self.nb_coups_max = my_settings.value("nb_coups").toInt()[0] if (my_settings.contains("vitesse")): self.vitesse = my_settings.value("vitesse").toInt()[0] my_settings.endGroup() # Charge les coûts des déplacements my_settings.beginGroup("Couts") if (my_settings.contains("deplacement_normal")): deplacement_normal = my_settings.value("deplacement_normal").toFloat()[0] if 
(my_settings.contains("deplacement_piege")): deplacement_piege = my_settings.value("deplacement_piege").toFloat()[0] if (my_settings.contains("deplacement_sortie")): deplacement_sortie = my_settings.value("deplacement_sortie").toFloat()[0] self.qlearning = QLearning(self.labyrinthe, deplacement_normal,deplacement_piege,deplacement_sortie,self.case_init) ''' Modifie les paramètres ''' def modifierParametres(self): f = FenetreParametres(self) if (f.exec_()): # Récupère les choix dès que la FenetreParametres est validée self.politique_choisie,self.nb_coups_max, self.vitesse, deplacement_normal,deplacement_piege,deplacement_sortie = f.getChoix() # Modifie les paramètres initiaux du QLearning self.qlearning = QLearning(self.labyrinthe, deplacement_normal,deplacement_piege,deplacement_sortie,self.case_init) # Actualise la barre de status self.changeStatus() ''' Affiche les moyennes d'apprentissage calculées pour le labyrinthe actuel ''' def showMoy(self): # Lance une fenêtre pour créer un nouveau labyrinthe f = FenetreMoyennes(self,self.moyenne_renforcement[0],self.moyenne_renforcement[1],self.moyenne_renforcement[2],self.moyenne_renforcement[3],self.moyenne_renforcement[4],self.moyenne_renforcement[5]) f.exec_() ''' Initialise le labyrinthe à partir d'un fichier ''' def chargerLabyrinthe(self): # Instancie un labyrinthe à partir d'un fichier path_labyrinthe = QFileDialog.getOpenFileName(self,'Charger un labyrinthe','./') if (path_labyrinthe!=""): self.labyrinthe = Labyrinthe() self.labyrinthe.loadMaze(path_labyrinthe) # Charge les paramètres du QLearning self.chargerParametresQLearning() # Estime un nombre de coups maximum pour le labrinthe self.nb_coups_max = self.labyrinthe.nb_lignes*self.labyrinthe.nb_colonnes*20 my_settings = QSettings("Bazerque-Vigie", "Labyrinthe"); my_settings.beginGroup("General"); my_settings.setValue("nb_coups", self.nb_coups_max) my_settings.endGroup() # Initialise la grille du labyrinthe selon le labyrinthe chargé self.grille.setRowCount(self.labyrinthe.nb_lignes) self.grille.setColumnCount(self.labyrinthe.nb_colonnes) brush = QBrush() for ligne in range(self.labyrinthe.nb_lignes): for colonne in range(self.labyrinthe.nb_colonnes): # Ajoute un item vide à la grille self.grille.setItem(ligne,colonne,QTableWidgetItem()) self.grille.item(ligne,colonne).setTextAlignment(Qt.AlignCenter) self.grille.item(ligne,colonne).setForeground(Qt.black) # Suivant le type de case modifie l'image de la case if(self.labyrinthe.grille[ligne][colonne].type==ENTREE): brush.setTexture(self.ICONE_ENTREE) self.grille.item(ligne,colonne).setBackground(brush) elif(self.labyrinthe.grille[ligne][colonne].type==SORTIE): brush.setTexture(self.ICONE_SORTIE) self.grille.item(ligne,colonne).setBackground(brush) elif(self.labyrinthe.grille[ligne][colonne].type==MUR): brush.setTexture(self.ICONE_MUR) self.grille.item(ligne,colonne).setBackground(brush) elif(self.labyrinthe.grille[ligne][colonne].type==PIEGE): brush.setTexture(self.ICONE_PIEGE) self.grille.item(ligne,colonne).setBackground(brush) elif(self.labyrinthe.grille[ligne][colonne].type==VIDE): brush.setTexture(self.ICONE_VIDE) self.grille.item(ligne,colonne).setBackground(brush) self.saveAction.setEnabled(False) # Affiche ou masque les MaxQ selon l'état du bouton MaxQ self.triggerMaxQ() # Reinitialise le perso si un labyrinthe avait déjà été utilisé self.case_perso = None self.changeStatus() self.moyenne_renforcement=[0,0,0,0,0,0] ''' Crée un nouveau labyrinthe ''' def nouveauLabyrinthe(self): # Lance une fenêtre pour créer un nouveau 
        # Open a dialog to create a new maze
        f = FenetreNewLabyrinthe(self)
        aleatoire = False
        if f.exec_():
            # Retrieve the choices once FenetreNewLabyrinthe is validated
            nb_lignes, nb_colonnes, aleatoire = f.getChoix()
            self.labyrinthe = Labyrinthe(nb_lignes, nb_colonnes, aleatoire)
            # Load the QLearning settings
            self.chargerParametresQLearning()
            # Estimate a maximum number of moves for the maze
            self.nb_coups_max = self.labyrinthe.nb_lignes * self.labyrinthe.nb_colonnes * 20
            my_settings = QSettings("Bazerque-Vigie", "Labyrinthe")
            my_settings.beginGroup("General")
            my_settings.setValue("nb_coups", self.nb_coups_max)
            my_settings.endGroup()
            # Initialize the grid according to the chosen options
            self.grille.setRowCount(nb_lignes)
            self.grille.setColumnCount(nb_colonnes)
            brush = QBrush()
            brush.setTexture(self.ICONE_VIDE)
            for ligne in range(nb_lignes):
                for colonne in range(nb_colonnes):
                    # Add an empty item to the grid
                    self.grille.setItem(ligne, colonne, QTableWidgetItem())
                    self.grille.item(ligne, colonne).setTextAlignment(Qt.AlignCenter)
                    self.grille.item(ligne, colonne).setForeground(Qt.black)
                    # Change the cell's image according to its type
                    if self.labyrinthe.grille[ligne][colonne].type == MUR:
                        brush.setTexture(self.ICONE_MUR)
                        self.grille.item(ligne, colonne).setBackground(brush)
                    elif self.labyrinthe.grille[ligne][colonne].type == VIDE:
                        brush.setTexture(self.ICONE_VIDE)
                        self.grille.item(ligne, colonne).setBackground(brush)
            # Prevent the user from clicking the play button
            self.playAction.setEnabled(False)
            self.nextAction.setEnabled(False)
            self.saveAction.setEnabled(True)
            self.nb_coups_simules.setEnabled(False)
            # Show or hide the MaxQ values according to the MaxQ button state
            self.triggerMaxQ()
            # Reset the character if a maze had already been used
            self.case_perso = None
            self.changeStatus()
            self.moyenne_renforcement = [0, 0, 0, 0, 0, 0]

    ''' Saves the drawn maze to a file '''
    def enregistrerLabyrinthe(self):
        # Open a dialog to get the location where the maze should be saved
        path_labyrinthe = QFileDialog.getSaveFileName(self, 'Enregistrer le labyrinthe', './')
        if path_labyrinthe != "":
            try:
                with open(path_labyrinthe, 'w') as f:
                    # Write the maze character by character according to the
                    # color (and therefore the type) of each cell
                    for ligne in range(self.grille.rowCount()):
                        for colonne in range(self.grille.columnCount()):
                            if self.labyrinthe.grille[ligne][colonne].type == ENTREE:
                                f.write('E ')
                            elif self.labyrinthe.grille[ligne][colonne].type == SORTIE:
                                f.write('S ')
                            elif self.labyrinthe.grille[ligne][colonne].type == MUR:
                                f.write('M ')
                            elif self.labyrinthe.grille[ligne][colonne].type == PIEGE:
                                f.write('P ')
                            elif self.labyrinthe.grille[ligne][colonne].type == VIDE:
                                f.write('. ')
                        f.write('\n')
                    f.close()
                # Disable the save button and enable the play button
                self.saveAction.setEnabled(False)
            except IOError:
                # If the file cannot be written
                print "Impossible de sauvegarder le labyrinthe dans ce fichier"

    ''' Erases the character from its old cell and draws it on the new one '''
    def deplacerPerso(self, ligne, colonne):
        brush = QBrush()
        # Erase the character from its current cell:
        # change the cell's image according to its type
        if self.labyrinthe.grille[ligne][colonne].type == ENTREE:
            brush.setTexture(self.ICONE_ENTREE)
            self.grille.item(ligne, colonne).setBackground(brush)
        elif self.labyrinthe.grille[ligne][colonne].type == SORTIE:
            brush.setTexture(self.ICONE_SORTIE)
            self.grille.item(ligne, colonne).setBackground(brush)
        elif self.labyrinthe.grille[ligne][colonne].type == PIEGE:
            brush.setTexture(self.ICONE_PIEGE)
            self.grille.item(ligne, colonne).setBackground(brush)
        elif self.labyrinthe.grille[ligne][colonne].type == VIDE:
            brush.setTexture(self.ICONE_VIDE)
            self.grille.item(ligne, colonne).setBackground(brush)
        # Draw the character on its new cell
        brush.setTexture(self.ICONE_PERSO)
        # self.position_perso = (ligne, colonne)
        self.grille.item(self.case_perso.position[0], self.case_perso.position[1]).setBackground(brush)

    ''' Erases the character from its last cell '''
    def effacerPerso(self):
        brush = QBrush()
        if self.case_perso != None:
            # Erase the character from its current cell:
            # change the cell's image according to its type
            if self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type == ENTREE:
                brush.setTexture(self.ICONE_ENTREE)
                self.grille.item(self.case_perso.position[0], self.case_perso.position[1]).setBackground(brush)
            elif self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type == SORTIE:
                brush.setTexture(self.ICONE_SORTIE)
                self.grille.item(self.case_perso.position[0], self.case_perso.position[1]).setBackground(brush)
            elif self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type == PIEGE:
                brush.setTexture(self.ICONE_PIEGE)
                self.grille.item(self.case_perso.position[0], self.case_perso.position[1]).setBackground(brush)
            elif self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type == VIDE:
                brush.setTexture(self.ICONE_VIDE)
                self.grille.item(self.case_perso.position[0], self.case_perso.position[1]).setBackground(brush)

    ''' Places the character at the location chosen by the user '''
    def positionnerPerso(self, ligne, colonne):
        brush = QBrush()
        if self.labyrinthe.grille[ligne][colonne].type == ENTREE:
            self.effacerPerso()
            # Draw the character on its new cell
            brush.setTexture(self.ICONE_PERSO)
            self.grille.item(ligne, colonne).setBackground(brush)
        else:
            msg_warning = QMessageBox()
            msg_warning.setIcon(QMessageBox.Warning)
            msg_warning.setText(u"Le personnage doit être placé sur une entrée du labyrinthe !")
            msg_warning.exec_()

    ''' Stops the learning in progress '''
    def stopPolitique(self):
        self.apprentissage_termine = True
        self.deplacement_en_cours = False
        self.lecture_en_cours = False
        # Take one more step to stop the algorithm if "next" is clicked
        self.runPolitique1Coup()

    ''' Starts the learning or pauses it, depending on the button state '''
    def runPlayPause(self):
        # If the button is in play mode and clicked, and no move is in
        # progress, start the algorithm
        if self.lecture_en_cours == False and self.deplacement_en_cours == False:
            self.deplacement_en_cours = True
            # Prevent the user from changing the settings or creating/opening a maze
            self.newAction.setEnabled(False)
            self.openAction.setEnabled(False)
            self.settingsAction.setEnabled(False)
            # Allow the user to click the stop button
            self.stopAction.setEnabled(True)
            self.lecture_en_cours = True
            self.playAction.setIcon(QIcon('Icones/pause.png'))
            # Start learning with the chosen policy
            self.runPolitiqueNbCoupsMax()
        # Otherwise pause
        else:
            self.lecture_en_cours = False
            self.deplacement_en_cours = False
            self.playAction.setIcon(QIcon('Icones/play.png'))

    ''' Runs the chosen policy '''
    def runPolitiqueChoisie(self):
        if self.politique_choisie == 0:
            self.politique1()
        elif self.politique_choisie == 1:
            self.politique2()
        elif self.politique_choisie == 2:
            self.politique3()
        elif self.politique_choisie == 3:
            self.politique4()
        elif self.politique_choisie == 4:
            self.politique5()
        elif self.politique_choisie == 5:
            self.politique6()

    ''' Runs the chosen policy once '''
    def runPolitique1Coup(self):
        if self.apprentissage_termine == False:
            self.stopAction.setEnabled(True)
            self.newAction.setEnabled(False)
            self.openAction.setEnabled(False)
            self.settingsAction.setEnabled(False)
            # Execute the chosen policy
            self.runPolitiqueChoisie()
        else:
            self.apprentissage_fini.emit()
        # Update the QMAX values
        self.triggerMaxQ()
        # Update the status bar
        self.changeStatus()

    ''' Runs the policy until the maximum number of moves is reached '''
    ''' or the user clicks pause or stop '''
    def runPolitiqueNbCoupsMax(self):
        if self.apprentissage_termine == False:
            if self.lecture_en_cours == True:
                # Execute the chosen policy
                self.runPolitiqueChoisie()
                # Run the next step after a delay defined by the character's speed
                QTimer.singleShot(int(1000 / self.vitesse), self.runPolitiqueNbCoupsMax)
        else:
            self.apprentissage_fini.emit()
        # Update the QMAX values
        self.triggerMaxQ()
        # Update the status bar
        self.changeStatus()

    ''' Runs the policy until the number of moves entered in the toolbar '''
    ''' is reached or the user clicks the stop button '''
    def runPolitiqueNbCoups(self):
        # Read the number of iterations to perform
        nb_iterations = int(self.nb_coups_simules.text())
        self.stopAction.setEnabled(True)
        if self.apprentissage_termine == False:
            if self.lecture_en_cours == False:
                if self.nb_coups_en_suivant < nb_iterations:
                    self.deplacement_en_cours = True
                    # Execute the chosen policy
                    self.runPolitiqueChoisie()
                    self.nb_coups_en_suivant += 1
                    QTimer.singleShot(int(1000 / self.vitesse), self.runPolitiqueNbCoups)
                else:
                    self.deplacement_en_cours = False
                    self.nb_coups_en_suivant = 0
                    nb_iterations = 0
        else:
            self.apprentissage_fini.emit()
            self.deplacement_en_cours = False
        # Update the QMAX values
        self.triggerMaxQ()
        # Update the status bar
        self.changeStatus()

    ''' Policy 1 performs pure exploration. The explored cells are chosen randomly. '''
    def politique1(self):
        print "nbcoups", self.nb_coups
        # if(self.lecture_en_cours==False and self.deplacement_en_cours==False):
        if self.nb_coups == self.nb_coups_max - 1:
            self.apprentissage_termine = True
            self.moyenne_renforcement[0] = self.qlearning.somme_recompenses / self.nb_coups_max
            print "Moyenne renforcement = ", self.moyenne_renforcement[0]
        else:
            case_precedente = self.case_perso
            self.case_perso = self.qlearning.exploration_pure(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            self.deplacerPerso(case_precedente.position[0], case_precedente.position[1])

    ''' Policy 2 performs "demanding" exploration, which penalizes '''
    ''' returning to the last explored cell. '''
    def politique2(self):
        print "nbcoups", self.nb_coups
        # if(self.lecture_en_cours==False and self.deplacement_en_cours==False):
        if self.nb_coups == self.nb_coups_max - 1:
            self.apprentissage_termine = True
            self.moyenne_renforcement[1] = self.qlearning.somme_recompenses / self.nb_coups_max
            print "Moyenne renforcement = ", self.moyenne_renforcement[1]
        else:
            case_precedente = self.case_perso
            self.case_perso, self.last_move = self.qlearning.exploration_exigente(self.case_perso, self.last_move)
            self.nb_coups = self.nb_coups + 1
            self.deplacerPerso(case_precedente.position[0], case_precedente.position[1])

    ''' Policy 3 performs "infallible" exploration, which forbids '''
    ''' returning to the last explored cell (except in a dead end). '''
    def politique3(self):
        print "nbcoups", self.nb_coups
        if self.nb_coups == self.nb_coups_max - 1:
            self.apprentissage_termine = True
            self.moyenne_renforcement[2] = self.qlearning.somme_recompenses / self.nb_coups_max
            print "Moyenne renforcement = ", self.moyenne_renforcement[2]
        else:
            case_precedente = self.case_perso
            self.case_perso, self.last_move = self.qlearning.exploration_infaillible(self.case_perso, self.last_move)
            self.nb_coups = self.nb_coups + 1
            self.deplacerPerso(case_precedente.position[0], case_precedente.position[1])

    ''' Policy 4 runs all the passes in pure exploration, then starts over '''
    ''' with exploitation. Pure exploration picks cells randomly. '''
    def politique4(self):
        print "nbcoups", self.nb_coups
        case_precedente = self.case_perso
        if self.toggleAlgo == False:
            self.case_perso = self.qlearning.exploration_pure(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if self.nb_coups == self.nb_coups_max:
                self.toggleAlgo = True
                self.nb_coups = 0
                self.case_perso = self.case_init
        else:
            self.case_perso = self.qlearning.exploitation(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if self.nb_coups == self.nb_coups_max - 1:
                self.apprentissage_termine = True
                self.moyenne_renforcement[3] = self.qlearning.somme_recompenses / (self.nb_coups_max * 2)
                print "Moyenne renforcement = ", self.moyenne_renforcement[3]
        self.deplacerPerso(case_precedente.position[0], case_precedente.position[1])
        if self.nb_coups == self.nb_coups_max and self.toggleAlgo == True:
            self.apprentissage_fini.emit()

    ''' Policy 5 runs all the passes in "demanding" exploration, then starts over with exploitation. '''
    ''' "Demanding" exploration penalizes returning to the last explored cell. '''
    def politique5(self):
        print "nbcoups", self.nb_coups
        case_precedente = self.case_perso
        if self.toggleAlgo == False:
            self.case_perso, self.last_move = self.qlearning.exploration_exigente(self.case_perso, self.last_move)
            self.nb_coups = self.nb_coups + 1
            if self.nb_coups == self.nb_coups_max:
                self.toggleAlgo = True
                self.nb_coups = 0
                self.case_perso = self.case_init
        else:
            self.case_perso = self.qlearning.exploitation(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if self.nb_coups == self.nb_coups_max - 1:
                self.apprentissage_termine = True
                self.moyenne_renforcement[4] = self.qlearning.somme_recompenses / (self.nb_coups_max * 2)
                print "Moyenne renforcement = ", self.moyenne_renforcement[4]
        self.deplacerPerso(case_precedente.position[0], case_precedente.position[1])

    ''' Policy 6 runs all the passes in "infallible" exploration, then starts over with exploitation. '''
    ''' "Infallible" exploration forbids returning to the last explored cell (except in a dead end). '''
    def politique6(self):
        print "nbcoups", self.nb_coups
        case_precedente = self.case_perso
        if self.toggleAlgo == False:
            self.case_perso, self.last_move = self.qlearning.exploration_infaillible(self.case_perso, self.last_move)
            self.nb_coups = self.nb_coups + 1
            if self.nb_coups == self.nb_coups_max:
                self.toggleAlgo = True
                self.nb_coups = 0
                self.case_perso = self.case_init
        else:
            self.case_perso = self.qlearning.exploitation(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if self.nb_coups == self.nb_coups_max - 1:
                self.apprentissage_termine = True
                self.moyenne_renforcement[5] = self.qlearning.somme_recompenses / (self.nb_coups_max * 2)
                print "Moyenne renforcement = ", self.moyenne_renforcement[5]
        self.deplacerPerso(case_precedente.position[0], case_precedente.position[1])

    ''' Resets the learning data when stop is clicked or when learning finishes '''
    def reinitialiserApprentissage(self):
        # Reset the counters to 0
        self.nb_coups_en_suivant = 0
        self.nb_coups = 0
        # Reset the playback control buttons
        self.apprentissage_termine = False
        self.lecture_en_cours = False
        self.toggleAlgo = False
        self.playAction.setIcon(QIcon('Icones/play.png'))
        self.nb_coups_simules.setEnabled(False)
        self.nextAction.setEnabled(False)
        self.playAction.setEnabled(False)
        self.stopAction.setEnabled(False)
        # Erase the character
        self.effacerPerso()
        self.case_perso = None
        # Tell the user that learning has finished
        msg_warning = QMessageBox()
        msg_warning.setStandardButtons(QMessageBox.Ok | QMessageBox.No)
        msg_warning.setDefaultButton(QMessageBox.Ok)
        msg_warning.setIcon(QMessageBox.Information)
        msg_warning.setText(u"Apprentissage terminé !\nVoulez-vous effacer les données d'apprentissage (table des q) ?")
        res = msg_warning.exec_()
        if res == QMessageBox.Ok:
            # Reset qlearning
            self.chargerParametresQLearning()
        # Allow the user to change the settings and to create or open a maze
        self.newAction.setEnabled(True)
        self.openAction.setEnabled(True)
        self.settingsAction.setEnabled(True)
    opt_val = prob_env.still(prob_env.output_noiseless(opt_state))
    wall_time_limit = kwargs.wall_time_limit
    generation = call_counts  # for random search, generation means call counts
elif kwargs.method == 'rl_prtr':
    model_env_name = get_model_env_name(kwargs.prtr_model_dir)
    assert model_env_name == prob_env_name
    opt = QLearning(
        k=prob_env.k,
        d=prob_env.d,
        env_name=prob_env_name,
        env_dir=kwargs.prob_env_dir,  # env_dir will load the environment
        env_fixed_xo=False,
        n_hidden=0,
        save_and_load_path=kwargs.prtr_model_dir,  # model dir will load the model
        load=True,
        tensorboard_path=None,
        logger_path=None,
        memory_capacity=None,
        memory_capacity_start_learning=None,
        learn_wall_time_limit=None,
        prioritized=None,
        save_model_iter=None,
        trial_size=0,
    )
    # opt may have learned from a non-fixed xo environment,
    # but we will test it under a fixed xo environment
    opt.set_env_fixed_xo(prob_env.x_o)
    assert opt.get_env_if_set_fixed_xo()
    opt_val, start_x_o, opt_x_p, start_x_p, duration = extract_rl_exp_result(
        opt, prob_env)
from environment import Env
from QLearning import QLearning

if __name__ == "__main__":
    env = Env()
    QL = QLearning(list(range(env.n_actions)))
    for episode in range(1000):
        state = env.reset()
        while True:
            env.render()
            # take an action and proceed one step in the environment
            action = QL.get_action(str(state))
            next_state, reward, done = env.step(action)
            # with the sample <s,a,r,s'>, the agent learns a new q function
            QL.learn(str(state), action, reward, str(next_state))
            state = next_state
            env.print_value_all(QL.q_table)
            # if the episode ends, break
            if done:
                break
# import the Q-learning function
from QLearning import QLearning

# define experiment parameters
gamma = 0.99
lr = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 2
step_number = 100
episode_length = 100

# run the experiment
QLearning(gamma, lr, epsilon, runs, step_number, episode_length)
# Implement Q-learning and use it to solve the cartpole environment
import gym
# Source: https://github.com/JoeSnow7/Reinforcement-Learning/blob/master/Cartpole%20Q-learning.ipynb
# We define a class to contain the learning algorithm
from QLearning import QLearning

env = gym.make("CartPole-v0")
agent = QLearning(env)
agent.train()
agent.run()
game = Game()
# initialize the window
game.initialisationWindow()
game.initialisationBackground()
game.loadingPictures()
# retrieve the window
window = game.getterWindow()
# retrieve wall and reward coords from the .txt file
mapWallCoord = game.getterWallCoord()
rewardCoordB, rewardCoordM = game.getterReward()

"""---------------------- INITIALIZE QLEARNING ---------------------"""
# Create an object from the qlearning class; the lowercase name avoids
# shadowing the QLearning class itself.
qlearning = QLearning()
# Init the qTable with the rewards and '0' coords from the .txt file
qlearning.intiQtable()

"""---------------------- INITIALIZE TRAINING KNN---------------------"""
trainning = Trainning()
epochs = 2000
dataX = []  # targets
dataY = []  # features
for epoch in range(epochs):
    # Simulate rayon (right or top detection)
    rayons = trainning.simulateRayon()