Example #1
    def initialize_training(self):

        self.alpha = float(self.obj.var_alpha)
        self.gamma = float(self.obj.var_gamma)
        self.epsilon = float(self.obj.var_epsilon)
        self.neg_reward = float(self.obj.var_neg)
        self.positive_reward = float(self.obj.var_pos)

        for i in range(0, self.h, 40):
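            # 40-pixel grid: cells on the path get reward 0, every other cell gets the negative reward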
            for k in range(0, self.w, 40):
                self.states.append([k, i])
                if self.path.count([k, i]) == 1:
                    self.rewards.append(0)
                else:
                    self.rewards.append(self.neg_reward)

        goal_index = self.extract_index(
            [self.goal_sprite.x, self.goal_sprite.y], self.states)
        self.rewards[goal_index] = self.positive_reward

        self.n_states = len(self.states)

        self.label_batch = pyglet.graphics.Batch()

        for i in range(len(self.states)):
            self.reward_labels.append(
                pyglet.text.Label(str(int(self.rewards[i])),
                                  font_name='Times New Roman',
                                  font_size=10,
                                  x=self.states[i][0] + 10,
                                  y=self.states[i][1] + 15,
                                  batch=self.label_batch))

        self.Qobj = QLearning(self.alpha, self.gamma, self.states,
                              self.rewards, self.n_states, self.n_actions)
Example #2
def main():
    # resolve the parameters sent from the command line call
    params = obtainParameters()

    # resolve file issues regarding the execution of the algorithm
    prepareFolders(params['commandPath'], params['filePath'])

    myMDP = MDP(params['filePath'])
    myAgent = Agent(myMDP)

    Wacumulado = 0
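    # Wacumulado accumulates the per-execution returns (Ws); they are averaged after the loop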
    for i in range(int(params['numberOfExecutions'])):
        print 'Running experiment ' + str(i + 1) + ' of ' + str(params['numberOfExecutions'])

        myQLearning = QLearning(myMDP,
                                myAgent,
                                alpha=float(params['alpha']),
                                gamma=float(params['gamma']),
                                epsilon=float(params['epsilon']),
                                epsilonIncrement=float(params['epsilonIncrement']),
                                K=int(params['K']),
                                H=int(params['H']),
                                gammaPRQL=float(params['gammaPRQL']))

        W, Ws = myQLearning.execute()
        Wacumulado += array(Ws)
        
    Ws = Wacumulado / float(params['numberOfExecutions'])

    saveOutputFiles(myQLearning, params, Ws)
Example #3
def inner_execution(envDesc, a, g, ep, e):
    env = gym.make(envDesc).env
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env,
                       alpha=a,
                       gamma=g,
                       epsilon=ep,
                       epsilon_min=0.001,
                       epsilon_dec=0.9999,
                       episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e), None)

    rewards = 0
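    # Evaluate the learned Q-table greedily over 101 test episodes, each capped at 200 steps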
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1

    r = np.array([a, g, ep, e, rewards])
    print(r)
    savetxt("grid_results/results_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
        envDesc, a, g, ep, e),
            r,
            delimiter=',',
            newline="  ",
            fmt="%10.5f")
Example #4
def inner_execution(env, envDesc, a, g, ep, e):
    print("current alpha -> {}, gamma -> {}, epsilon -> {}, episodes -> {}".
          format(a, g, ep, e))
    qlearn = QLearning(env,
                       alpha=a,
                       gamma=g,
                       epsilon=ep,
                       epsilon_min=0.001,
                       epsilon_dec=0.9999,
                       episodes=e)
    q_table = qlearn.train(
        "grid_data/q_table_{}_alpha_{}_gamma_{}_ep{}_e{}.csv".format(
            envDesc, a, g, ep, e),
        "grid_results/actions_{}_alpha_{}_gamma_{}_ep{}_e{}".format(
            envDesc, a, g, ep, e))

    rewards = 0
    for i in range(101):
        state = env.reset()
        train_done = False
        count = 0
        while (not train_done) and (count < 200):
            action = np.argmax(q_table[state])
            state, reward, train_done, _ = env.step(action)
            count += 1
            if reward == 1:
                rewards += 1

    return [a, g, ep, e, rewards]
Example #5
 def __init__(self, mark, board, game, player_type):
     self.mark = mark
     self.board = board
     self.game = game
     self.player_type = player_type
     self.action = Action(self.game)
     self.q_learning = QLearning()
     self.ordered_actions = []
def main():

    env = GridWorld(MAP4)
    qlearning_policy = QLearning(env.get_num_states(), env.get_num_actions())

    num_episodes = 1000
    eps = 0.1
    qlearnt = qlearning_train(env, qlearning_policy, num_episodes, eps)

    state = env.reset()
    env.print()
    done = False
    eps_test = 0.0
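    # Roll out the learned policy greedily (eps_test = 0), rendering the grid after each step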
    while not done:
        input("press enter:")
        action = tabular_epsilon_greedy_policy(qlearnt.Q, eps_test, state)
        state, reward, done = env.step(action)
        env.print()

    Qmatrix = np.max(qlearnt.Q, axis=1)
    Qmatrix = Qmatrix.reshape(6, 13)
    plt.imshow(Qmatrix)
    plt.colorbar()
    plt.title("Q Value Matrix plot trained for 1000 episodes (MAP 4)")
    plt.show()
Example #7
def init():
    """
    Asks for a Gridworld file, initializes an MDP environment and a Q-learning object with it, then calls the menu.
    """
    print_headline("Gridworld Selection")
    gridworld = read_gridworld_file()

    environment = MDP(state_list=gridworld,
                      field_rewards=Default.FIELD_REWARDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      transition_probabilities=Default.TRANSITION_PROBABILITIES)

    q_learning = QLearning(env_perform_action=environment.perform_action,
                           state_list=gridworld,
                           goal_fields=Default.GOAL_FIELDS,
                           obstacle_fields=Default.OBSTACLE_FIELDS,
                           actions=Default.ACTIONS,
                           discount_factor=Default.DISCOUNT_FACTOR,
                           learning_rate=Default.LEARNING_RATE,
                           epsilon=Default.EPSILON,
                           convergence_threshold=Default.CONVERGENCE_THRESHOLD)

    print("Your input Gridworld:")
    print_gridworld(gridworld)

    while show_menu(q_learning):
        pass

    print_headline("See you later")
    def chargerParametresQLearning(self):
        my_settings = QSettings("Bazerque-Vigie", "Labyrinthe")

        # Load the general settings
        my_settings.beginGroup("General")
        if (my_settings.contains("politique")):
            self.politique_choisie = my_settings.value("politique").toInt()[0]

        if (my_settings.contains("nb_coups")):
            self.nb_coups_max = my_settings.value("nb_coups").toInt()[0]

        if (my_settings.contains("vitesse")):
            self.vitesse = my_settings.value("vitesse").toInt()[0]

        my_settings.endGroup()

        # Load the movement costs
        my_settings.beginGroup("Couts")
        if (my_settings.contains("deplacement_normal")):
            deplacement_normal = my_settings.value("deplacement_normal").toFloat()[0]
        if (my_settings.contains("deplacement_piege")):
            deplacement_piege = my_settings.value("deplacement_piege").toFloat()[0]
        if (my_settings.contains("deplacement_sortie")):
            deplacement_sortie = my_settings.value("deplacement_sortie").toFloat()[0]

        self.qlearning = QLearning(self.labyrinthe, deplacement_normal,deplacement_piege,deplacement_sortie,self.case_init)
Example #9
 def train(self):
     interactions = config.geti('trainInteractions')
     minEpsilon = config.getf('minTrainingEpsilon')
     epochSize = len(self.environment.db.images) / 2
     epsilon = 1.0
     self.controller.setEpsilonGreedy(epsilon)
     print 'Epoch 0: Exploration'
     self.runEpoch(interactions, len(self.environment.db.images))
     self.learner = QLearning()
     self.agent.learner = self.learner
     epoch = 1
     egEpochs = config.geti('epsilonGreedyEpochs')
     while epoch <= egEpochs:
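          # Linearly anneal epsilon from 1.0 down to minEpsilon over the epsilon-greedy epochs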
         epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
         if epsilon < minEpsilon: epsilon = minEpsilon
         self.controller.setEpsilonGreedy(epsilon)
         print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
         self.runEpoch(interactions, epochSize)
         epoch += 1
     epoch = 1
     maxEpochs = config.geti('exploitLearningEpochs')
     while epoch <= maxEpochs:
         print 'Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(
             epsilon)
         self.runEpoch(interactions, epochSize)
         epoch += 1
    def modifierParametres(self):
        f = FenetreParametres(self)
        if (f.exec_()):

            # Retrieve the choices once the FenetreParametres dialog is accepted
            self.politique_choisie, self.nb_coups_max, self.vitesse, deplacement_normal, deplacement_piege, deplacement_sortie = f.getChoix()

            # Update the initial QLearning parameters
            self.qlearning = QLearning(self.labyrinthe, deplacement_normal, deplacement_piege, deplacement_sortie, self.case_init)

            # Refresh the status bar
            self.changeStatus()
Example #11
def experiment(test_game,
               num_experiments,
               file_name,
               num_episodes=500,
               alpha=.99,
               gamma=.9,
               epsilon=.9,
               decay_rate=.99):
    """
    Main experiment method: runs the Q-learning experiments, prints results, and draws the needed diagrams.
    It works by learning a model a given number of times and compiling the number of steps per epoch for each
    experiment. These are then averaged and used to create a graph.

    A policy is then also chosen to give the average number of steps needed to reach the goal as a metric.
    """

    list_of_moves_per_experiment = []
    policies = []
    for x in range(num_experiments):
        # Learn model
        q_learning = QLearning(test_game,
                               num_episodes=num_episodes,
                               alpha=alpha,
                               gamma=gamma,
                               epsilon=epsilon,
                               decay_rate=decay_rate)
        q = q_learning.learn()
        policies.append(q)

        num_moves = q_learning.num_moves_per_episode
        list_of_moves_per_experiment.append(num_moves)

    list_of_moves_per_experiment = np.array(list_of_moves_per_experiment)
    moves_per_epoc_number = np.sum(list_of_moves_per_experiment, axis=0)
    moves_per_epoc_number = moves_per_epoc_number / num_experiments

    # get Average number of steps when executing.
    q_learning = QLearning(test_game,
                           num_episodes=num_episodes,
                           alpha=alpha,
                           gamma=gamma,
                           epsilon=epsilon,
                           decay_rate=decay_rate)
    avg_num_steps = 0
    for itter in range(100):
        num_steps = q_learning.execute_policy(policies[num_experiments - 1])
        avg_num_steps += num_steps[1]

    avg_num_steps /= 100.0

    generate_validation_curves(np.arange(num_episodes),
                               moves_per_epoc_number,
                               None,
                               "Number of steps",
                               None,
                               x_axis_label="Epoch Number",
                               y_axis_label="Average Path Length",
                               file_name=file_name)

    return avg_num_steps, policies[num_experiments - 1]
Example #12
 def train(self):
     networkFile = config.get('networkDir') + config.get(
         'snapshotPrefix') + '_iter_' + config.get(
             'trainingIterationsPerBatch') + '.caffemodel'
     interactions = config.geti('trainInteractions')
     minEpsilon = config.getf('minTrainingEpsilon')
     epochSize = len(self.environment.imageList) / 1
     epsilon = 1.0
     self.controller.setEpsilonGreedy(epsilon,
                                      self.environment.sampleAction)
     epoch = 1
     exEpochs = config.geti('explorationEpochs')
     while epoch <= exEpochs:
         s = cu.tic()
         print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
         self.runEpoch(interactions, len(self.environment.imageList))
         self.task.flushStats()
         self.doValidation(epoch)
         s = cu.toc('Epoch done in ', s)
         epoch += 1
     self.learner = QLearning()
     self.agent.learner = self.learner
     egEpochs = config.geti('epsilonGreedyEpochs')
     while epoch <= egEpochs + exEpochs:
         s = cu.tic()
         epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
         if epsilon < minEpsilon: epsilon = minEpsilon
         self.controller.setEpsilonGreedy(epsilon,
                                          self.environment.sampleAction)
         print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
         self.runEpoch(interactions, epochSize)
         self.task.flushStats()
         self.doValidation(epoch)
         s = cu.toc('Epoch done in ', s)
         epoch += 1
     maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
     while epoch <= maxEpochs:
         s = cu.tic()
         print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(
             epsilon)
         self.runEpoch(interactions, epochSize)
         self.task.flushStats()
         self.doValidation(epoch)
         s = cu.toc('Epoch done in ', s)
         shutil.copy(networkFile, networkFile + '.' + str(epoch))
         epoch += 1
Example #13
def qLearningWithOptions(env,
                         alpha,
                         gamma,
                         options_eps,
                         epsilon,
                         nSeeds,
                         maxLengthEp,
                         nEpisodes,
                         verbose,
                         useNegation,
                         genericNumOptionsToEvaluate,
                         loadedOptions=None):

    numSeeds = nSeeds
    numEpisodes = nEpisodes
    # We first discover all options
    options = None
    actionSetPerOption = None

    if loadedOptions is None:
        if verbose:
            options, actionSetPerOption = discoverOptions(env,
                                                          options_eps,
                                                          verbose,
                                                          useNegation,
                                                          plotGraphs=True)
        else:
            options, actionSetPerOption = discoverOptions(env,
                                                          options_eps,
                                                          verbose,
                                                          useNegation,
                                                          plotGraphs=False)
    else:
        options = loadedOptions
        actionSetPerOption = []

        for i in xrange(len(loadedOptions)):
            tempActionSet = env.getActionSet()
            tempActionSet.append('terminate')
            actionSetPerOption.append(tempActionSet)

    returns_eval = []
    returns_learn = []
    # Now I add all options to my action set. Later we decide which ones to use.
    i = 0
    #genericNumOptionsToEvaluate = [1, 2, 4, 32, 64, 128, 256]
    totalOptionsToUse = []
    maxNumOptions = 0
    if useNegation and loadedOptions is None:
        maxNumOptions = int(len(options) / 2)
    else:
        maxNumOptions = len(options)
    while (i < len(genericNumOptionsToEvaluate)
           and genericNumOptionsToEvaluate[i] <= maxNumOptions):
        totalOptionsToUse.append(genericNumOptionsToEvaluate[i])
        i += 1

    for idx, numOptionsToUse in enumerate(totalOptionsToUse):
        returns_eval.append([])
        returns_learn.append([])

        if verbose:
            print 'Using', numOptionsToUse, 'options'

        for s in xrange(numSeeds):
            if verbose:
                print 'Seed: ', s + 1

            returns_eval[idx].append([])
            returns_learn[idx].append([])
            actionSet = env.getActionSet()

            for i in xrange(numOptionsToUse):
                actionSet.append(options[i])

            if useNegation and loadedOptions is None:
                numOptions = 2 * numOptionsToUse
            else:
                numOptions = numOptionsToUse

            learner = QLearning(alpha=alpha,
                                gamma=gamma,
                                epsilon=epsilon,
                                environment=env,
                                seed=s,
                                useOnlyPrimActions=True,
                                actionSet=actionSet,
                                actionSetPerOption=actionSetPerOption)

            for i in xrange(numEpisodes):
                returns_learn[idx][s].append(
                    learner.learnOneEpisode(timestepLimit=maxLengthEp))
                returns_eval[idx][s].append(
                    learner.evaluateOneEpisode(eps=0.01,
                                               timestepLimit=maxLengthEp))

    returns_learn_primitive = []
    returns_eval_primitive = []
    for s in xrange(numSeeds):
        returns_learn_primitive.append([])
        returns_eval_primitive.append([])
        learner = QLearning(alpha=alpha,
                            gamma=gamma,
                            epsilon=epsilon,
                            environment=env,
                            seed=s)
        for i in xrange(numEpisodes):
            returns_learn_primitive[s].append(
                learner.learnOneEpisode(timestepLimit=maxLengthEp))
            returns_eval_primitive[s].append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=maxLengthEp))

    return returns_eval_primitive, returns_eval, totalOptionsToUse
Example #14
    elif taskToPerform == 4:  #Compute the average number of time steps between any two states
        gamma = 1.0
        env.useNegativeRewards = True  #I need this because I'm counting time steps
        stats = MDPStats(gamma=gamma, env=env, outputPath=outputPath)
        getExpectedNumberOfStepsFromOption(env=env,
                                           eps=epsilon,
                                           verbose=verbose,
                                           discoverNegation=bothDirections,
                                           loadedOptions=loadedOptions)

    elif taskToPerform == 5:  #Solve for a given goal (q-learning)
        returns_learn = []
        returns_eval = []
        learner = QLearning(alpha=0.1,
                            gamma=0.9,
                            epsilon=1.00,
                            environment=env)
        for i in xrange(num_episodes):
            returns_learn.append(
                learner.learnOneEpisode(timestepLimit=max_length_episode))
            returns_eval.append(
                learner.evaluateOneEpisode(eps=0.01,
                                           timestepLimit=max_length_episode))

        plt.plot(returns_eval)
        plt.show()

    elif taskToPerform == 6:  #Solve for a given goal w/ primitive actions (q-learning) following options
        returns_eval_primitive, returns_eval, totalOptionsToUse = qLearningWithOptions(
            env=env,
            alpha=0.1,
Example #15
class Agent():
    '''
    Creates an agent which contains a policy. Inputs are an environment object
    and the location of the agent, which is (0,0) by default.    
    '''
    ACTION_UP    = (-1, 0)
    ACTION_DOWN  = ( 1, 0)
    ACTION_RIGHT = ( 0, 1)
    ACTION_LEFT  = ( 0,-1)
    ACTION_STAY  = ( 0, 0)   
    
    actions = set([ACTION_UP, ACTION_DOWN, ACTION_RIGHT, ACTION_LEFT, ACTION_STAY])
    policy = dict()
    location = None

    def __init__( self, environment, location=(0,0) ):        
        
        self.Environment = environment
        self.location = location
        self.QLearning = QLearning( self, 0.5, 0.7, 0.1)

    def getActionEpsilonGreedy( self, s ):
        '''
        a <- getActionEpsilonGreedy(s)
        
        Find an action using the current state s, in an epsilon-greedy fashion. 
        '''
        # Find the action that maximizes Q[(s, a)]                
        prob_actions = dict()        
        uniform_epsilon = self.QLearning.epsilon / (len(self.actions))
        
        for possible_a in self.actions:
            # Set probabilities of all actions uniformly
            prob_actions[possible_a] = uniform_epsilon
            
        best_a = argmax( self.QLearning.Q[s] )
        prob_actions[best_a] += 1 - self.QLearning.epsilon
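        # Resulting distribution: the greedy action gets (1 - epsilon) + epsilon/|actions|,
        # every other action gets epsilon/|actions|, so the probabilities sum to 1.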
                    
        # For every action, check if the cumulative probability exceeds a 
        # random number. 
        random_number = random.random()
        cumulative_prob = 0.0
        
        for a in self.actions:
            cumulative_prob += prob_actions[a]
            if cumulative_prob >= random_number:                
                return a
    
    def getAction( self, s ):
        '''
        a <- getAction(s)        
        
        Get the optimal action given the current state s, using Q[s]. 
        '''
        if s not in self.QLearning.Q:
            self.QLearning.initQ(s)
        best_a = argmax( self.QLearning.Q[s] )        
        
        return best_a

    def performAction( self, a ):
        raise NotImplementedError
        
    def updateQ(self, s, a, s_prime, r):
        raise NotImplementedError
Example #16
def main():
    print('Cart Pole')
    env = gym.make('CartPole-v1')

    q_learn = QLearning(env, num_episodes=3000)
    q_learn.run()
    base_memory = 'res/memory1-0.bson'
    gamma = 0.99
    learning_rate = 0.8

    epoch_min = 3500000
    epoch_max = 5000000

    eps_min = 0.1
    eps_max = 0.1
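    # Grid-search style sweep: train a Q-table for each (eps, epoch-count) pair,
    # then play games and record the win/round statistics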
    
    while eps_min <= eps_max:

        while epoch_min <= epoch_max:

            bp = BoardPossitionParams()
            q = QLearning(bp, gamma=gamma, learning_rate=learning_rate, epochs=epoch_min, eps=eps_min, name=base_memory)
            q.learning()
            q.save()

            fp = base_memory.split('.')[0] + '_Q_trained_ep' + str(epoch_min) + '_g' + str(int(gamma * 100)) + \
                   '_l' + str(int(learning_rate * 10)) + '_e' + str(int(eps_min * 100)) + '.bson'

            play = Play(fp, False)
            wins, rounds = play.play_stats(games_to_play)
            
            with open(("res/eps_"+str(int(eps_min * 100))), 'a') as outfile:
                outfile.write(str(epoch_min)+"-"+str(wins)+"-"+str(rounds)+"\n")
            
            print('Win perc:', wins,'Average Rounds:', rounds)
            
            epoch_min += 500000
Example #18
from variables3_3 import *
import random
from QLearning import QLearning
from write_results import write_results

print("Done with importing")

tb = TarockBasics()
qlearning = QLearning(0.1, 0.1, 0.1)

def play_one_game(p1, p2, p3, talon, ps, sp, duo, strats):
    
    msa = MilestoneAgents(p1, p2, p3, [1,2,3], solo, duo)
    msa.update_state_talon(talon)
    points = {1: 0, 2: 0, 3: 0}
    
    while msa.player1_cards:
        
        first = msa.starting_player
        second = msa.second_player
        third = msa.third_player

        if strats[first] == "me":
            if first not in msa.duo:
                card1 = get_solo_first_card_3_3(msa, first, qlearning)
            else:
                card1 = get_duo_first_card_3_3(msa, first, qlearning)
        elif strats[first] == "LWW":
            card1 = msa.locally_worst_worst_agent(first)
        elif strats[first] == "LW":
            card1 = msa.locally_worst_agent(first)
Example #19
import http.server
import socketserver
import gym
import gym_sample
from QLearning import QLearning
import json

PORT = 8080

env = gym.make('sample-v0')
qLearning = QLearning(env, 99)


class Handler(http.server.SimpleHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        if (self.path == '/train'):
            self.wfile.write(
                json.dumps(
                    qLearning.train(5000, 0.7, 0.618, 1, 1, 0.01,
                                    0.01).tolist()).encode('utf8'))
        elif (self.path == '/ai_step'):
            self.wfile.write(json.dumps(qLearning.ai_step()).encode('utf8'))
        elif (self.path == '/reset'):
            qLearning.reset()
            self.wfile.write('{"reseted": true}'.encode('utf8'))
        else:
            self.wfile.write('{"rodando": "ta"}'.encode('utf8'))
Example #20
save_name = "q_values_"

# have a look at LearningPolicy.py for other policies
epsilon_policy = LearningPolicy.exponentially_annealed_epsilon(1 / 10000, 0.0)
epsilon_policy_2 = LearningPolicy.linear_annealed_epsilon(1., 0.1, 100)

alpha1 = 0.2
alpha2 = 0.1

hyperparameters = {"alpha": alpha2, "discount": 0.99}

# Please note: Numerous other settings can be adjusted in settings.py

if training_mode:
    q = QLearning(epsilon_policy=epsilon_policy_2,
                  map_name=map,
                  hyperparameters=hyperparameters,
                  save_name=save_name)
    while True:
        q.train()

else:
    q = QLearning(epsilon_policy=LearningPolicy.constant_epsilon(0),
                  map_name=map)

    if checkpoint_file is None:
        raise Exception("Please specify the checkpoint file path!")

    q_values = AgentManager.load_q_values(checkpoint_file)

    while True:
        q.test(q_values=q_values)
Example #22
import gym
import numpy as np
import matplotlib.pyplot as plt
from QLearning import QLearning
from numpy import loadtxt

def stateNumber(state):
        (x,y,z) = state
        y = y * 32
        z = z * 352
        return x+y+z

env = gym.make('Blackjack-v0')
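# Hyperparameter sweep: train a Blackjack Q-table for each (alpha, gamma, episodes) combination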
for i in [0.01]:
    for g in [0.000001,0.00001,0.0001,0.001,0.01]:
        for epi in [600000,700000,800000]:
            qlearn = QLearning(env, alpha=i, gamma=g, epsilon=0.9,epsilon_min=0.01, epsilon_dec=0.99, episodes=epi)
            q_table = qlearn.train('data/q-table-blackjack.csv', 'results/blackjack')
#q_table = loadtxt('data/q-table-blackjack.csv', delimiter=',')

#state= env.reset()
#print(state) 
#state = stateNumber(state)
#done = False
#
#
#while not done:
#    action = np.argmax(q_table[state])
#    state, reward, done, info = env.step(action)
#    print(action)
#    print(state)
#    state = stateNumber(state)
Example #23
 def __init__( self, environment, location=(0,0) ):        
         
     self.Environment = environment
     self.location = location
     self.QLearning = QLearning( self, 0.5, 0.7, 0.1)
Example #24
    def render(self, Q_table):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
        self.display_surface.fill((0, 0, 0))

        self.render_table(Q_table)
        self.render_player()
        self.render_target()

        pygame.display.update()
        pygame.time.wait(250)


agent = QLearning(GridWorld(), buckets=(10, 10, 10, 10), lower_bounds=0, upper_bounds=9, num_episodes=1000,
                  min_lr=0.001, min_epsilon=0.3)


def run():
    t = 0
    done = False
    current_state = agent.discretize_state(agent.env.reset())
    while not done:
        t += 1
        action = agent.choose_action(current_state)
        agent.env.render(agent.Q_table)
        obs, reward, done, _ = agent.env.step(action)
        new_state = agent.discretize_state(obs)
        current_state = new_state
    return t
Example #25
                                    get_size_except_dim(Input)])
    Output = tf.layers.dense(inputs=Reshape, units=10, activation=None)
    Labels = tf.cast(tf.reshape(Labels, shape=[BatchSize]), tf.int64)
    #OneHotLabels=tf.one_hot(Labels,depth=10,axis=-1)
    Loss = tf.losses.sparse_softmax_cross_entropy(labels=Labels, logits=Output)
    Acc = tf.reduce_mean(
        tf.cast(tf.equal(Labels, tf.argmax(Output, 1)), tf.float32))
    #print(Loss,Loss.shape.as_list())
    #exit()
    #Loss=tf.reshape(Loss,shape=[-1,1])
    return Output, Loss, Acc


Mode = "Train"

RL_Exp = QLearning()
TaskSpec = {
    "LogHistory": True,
    "OperatorList": Op_List,
    "OperatorNum": OperatorLimit,
    "InputNum": 1,
    "OutputNum": 1,
    "TaskInput": images,
    "TaskLabel": labels,
    "Epochs": TrainEpochs,
    "NetworkDecor": NetworkDecor,
    "BatchSize": BatchSize,
    "ConcatOperator": ConcatOperatorDense,
    "InputOperator": ImageInput,
    "TrajectoryLength": OperatorLimit - 4,
    "RewardGamma": 0.9
Example #27
# def print_frames(frames):
#     for i, frame in enumerate(frames):
#         clear_output(wait=True)
#         #print(frame['frame'])
#         #print(frame['frame'].getvalue())
#         print(f"Timestep: {i + 1}")
#         print(f"State: {frame['state']}")
#         print(f"Action: {frame['action']}")
#         print(f"Reward: {frame['reward']}")
#         sleep(.1)

env = gym.make('Roulette-v0').env
#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

#2600loss - stable
qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.001, epsilon_dec=0.9999, episodes=1000000)

# 500-1000loss - real player like
#qlearn = QLearning(env, alpha=0.001, gamma=0.001, epsilon=0.9, epsilon_min=0.1, epsilon_dec=0.7, episodes=1000000)
q_table = qlearn.train('data/q-table-roulette.csv', None)

#q_table = loadtxt('data/q-table-roulette.csv', delimiter=',')

state = env.reset()
done = False
rewards = 0
actions = 0

while not done:
    action = np.argmax(q_table)
    state, reward, done, info = env.step(action)
Example #28
#!/usr/bin/python3

from PK_Handler import PK_Handler
from PK_Game import PK_Game
from PK_Player_Greedy import PK_Player_Greedy
from QLearning import QLearning

game = PK_Game()

qplayer = PK_Player_Greedy(game)
qlearning = QLearning(game, qplayer, 0.05, 0.95, 0.8)
qplayer.set_optimizer(qlearning)
game.set_player1(qplayer)

handler = PK_Handler(game)

handler.train(1000000)
print(qplayer.probas)
def update(env, RL):
    for episode in range(MAX_EPISODES):
        state = env.reset()                                     # initial state
        while True:            
            action = RL.choose_action(str(state))               # RL choose action based on state
            state_, reward = env.step(action)                   # RL take action and get next state and reward
            RL.learn(str(state), action, reward, str(state_))   # RL learn from this transition
            state = state_                                      # swap state
            if state == 'terminal':                             # break while loop when end of this episode
                break
        
        if episode % 500 == 0:
            simulation(episode)
        # writer = pd.ExcelWriter('./file1.xlsx')
        # RL.q_table.to_excel(writer)
        # writer.save()
        
    print('game over')
    print(RL.q_table)


if __name__ == "__main__":
    env = Game()
    RL = QLearning(
        actions = list(range(env.n_actions)), 
        learning_rate = ALPHA,
        reward_decay = GAMMA,
        e_greedy = EPSILON
    )
    update(env, RL)
Example #30
tries = 100
episodes = 1000
results = np.zeros((tries, episodes))

#### run with QLearning

for t in range(tries):

    # define learning settings
    epsilon_decay = 1 - (1 / episodes) * 6
    learning_decay = 1 - (1 / episodes) * 3
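    # Multiplicative decay factors, presumably applied once per episode via the
    # epsilon_decay_func / alpha_decay_func lambdas below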
    agent = QLearning(env.env,
                      learning_rate=0.5,
                      discount_factor=0.9,
                      exploration_rate=0,
                      epsilon_decay_func=lambda x: x * epsilon_decay,
                      alpha_decay_func=lambda x: x * learning_decay,
                      qtable_default=1)

    # fit and save results
    env.fit(agent, episodes)
    results[t, :] = agent.rewards_per_episode

# plot rewards
plot_rewards(np.mean(results, axis=0), smoothing=0.1, color='blue')

#### run with SARSA

# define learning settings
Example #31
class Agent(object):
    def __init__(self, mark, board, game, player_type):
        self.mark = mark
        self.board = board
        self.game = game
        self.player_type = player_type
        self.action = Action(self.game)
        self.q_learning = QLearning()
        self.ordered_actions = []

    def reset(self):
        """Resets the list of actions taken"""
        self.ordered_actions = []

    def store_action(self, action, state, game):
        """
        Store the action taken in an ordered list of actions taken during the game
        """
        self.ordered_actions.append({
            "action": action,
            "state": state,
            "game_layout": list(game)
        })

    def select_action(self, epsilon=0.0, state=None):
        """
        Select the action based on the players types

        @param epsilon: the E-greedy action value. Determines if a random action
                        (exploration) should be chosen over learned Q-values
                        (exploitation).
        @param state: the current state of the game, used to determine the available
                      actions

        @return the index of where the marker will be placed (the action)
        """
        if self.player_type == "human":
            action = self.get_input()
        elif self.player_type == "random":
            action = self.action.get_random_action()
        elif self.player_type == "qlearning":
            action = self.action.get_egreedy_action(epsilon, self.q_learning,
                                                    state)
        else:
            print "Undefined player type"
            raise NotImplementedError
        return action

    def learn_from_game(self, alpha, reward, gamma):
        """
        Update the Q-table if the players type is qlearning. Updating the Q-table
        happens by going back through the actions the player took until the game
        ended. Every action will go through the Q-learning update rule/equation.
        """
        if not self.player_type == "qlearning":  # the other type of agents do not learn
            return

        for index, action in enumerate(self.ordered_actions):
            if index + 1 < len(self.ordered_actions):
                next_state = self.ordered_actions[index + 1]["state"]
            else:  # This is the last action before the game ended (no further states)
                next_state = None

            # get the state (a string representation of the game layout) before the action was executed
            state = action["state"]
            selected_action = action["action"]  # get the action
            game = action["game_layout"]  # get the current game layout as a list

            # Only the last state receives a reward
            if index == len(self.ordered_actions) - 1:
                state_reward = reward
            else:
                state_reward = 0

            possible_actions = self.action.get_valid_actions(game)
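            # update_q presumably applies the standard Q-learning rule, roughly:
            #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            # with the max taken over the possible_actions passed in.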
            self.q_learning.update_q(state, selected_action, next_state,
                                     possible_actions, alpha, state_reward,
                                     gamma)

    def do_action(self, action):
        """
        Perform an action based on the player type. An action is defined as
        placing a mark in one of the empty cells. Currently the game will support
        three types of players:
        - human: us
        - random: a bot that just places marks randomly in cells
        - qlearning: a smarter bot (hopefully) that learns what the best options are
        """
        pos = [action // 3, action % 3]
        self.game.set_cell(pos, self.mark)
        self.board.update(pos, self.mark)

    def get_input(self):
        """
        Get input from user via keyboard. Input refers to the index of one of the
        fields

        @return a position in the board where the user wants their mark to be placed
        """
        self.board.print_game()

        while True:
            user_input = raw_input("Your turn " + str(self.get_mark()) +
                                   " (0-8 or 'q' to quit): ")
            valid_actions = self.action.get_valid_actions()
            if user_input.isalpha() and user_input == "q":
                return user_input
            elif user_input.isalpha():
                print "Invallid character, please try again"
                continue

            if (int(user_input) >= 0 and int(user_input) < 9 and \
                    valid_actions[int(user_input)] == 1):
                return int(user_input)
            print "Invallid turn, please try another cell"

    def get_type(self):
        """
        Return the type of user: human, random, qlearning
        """
        return self.player_type

    def get_mark(self):
        """
        Get the mark the agent is using

        @return a string representing the mark. E.g. 'X' or 'O'
        """
        return self.mark
class FenetrePrincipale(QMainWindow):


    apprentissage_fini = pyqtSignal()

    def __init__(self):
        super(FenetrePrincipale,self).__init__()

        # Load an empty maze
        self.labyrinthe = Labyrinthe()

        # Load the images for each cell type
        self.ICONE_VIDE = QPixmap("Icones/floor.png")
        self.ICONE_ENTREE = QPixmap("Icones/entry.png")
        self.ICONE_SORTIE = QPixmap("Icones/exit.png")
        self.ICONE_MUR = QPixmap("Icones/wall.png")
        self.ICONE_PIEGE = QPixmap("Icones/trap.png")
        self.ICONE_PERSO = QPixmap("Icones/perso_front.png")

        self.case_perso = None # Current position of the character
        self.nb_coups = 0 # Current iteration number
        self.nb_coups_max = 0 # Maximum number of iterations
        self.nb_coups_en_suivant = 0 # Number of iterations entered in the text box
        self.toggleAlgo = False # Toggles between exploration and exploitation
        self.case_init = None # Initial position of the character
        self.politique_choisie = 0 # Learning policy selected in the settings
        self.apprentissage_termine = False # Indicates that learning is finished
        self.lecture_en_cours = False # Indicates whether a run (play button) is in progress
        self.deplacement_en_cours = False
        self.last_move = None
        self.moyenne_renforcement = [0,0,0,0,0,0] # Average reinforcement recorded for this maze, one entry per policy

        self.chargerParametresQLearning()

        self.apprentissage_fini.connect(self.reinitialiserApprentissage) # Signal emitted when learning finishes

        # Initialize the window widgets
        self.initUI()


    ''' Initialize the window interface '''
    def initUI(self):

        # Create the toolbar buttons
        self.newAction = QAction(QIcon('Icones/new.png'),u'&Nouveau labyrinthe',self)
        self.newAction.setShortcut('Ctrl+N')
        self.newAction.triggered.connect(self.nouveauLabyrinthe)

        self.openAction = QAction(QIcon('Icones/open.png'),u'&Ouvrir un labyrinthe',self)
        self.openAction.setShortcut('Ctrl+O')
        self.openAction.triggered.connect(self.chargerLabyrinthe)

        self.saveAction = QAction(QIcon('Icones/save.png'),u'&Enregistrer le labyrinthe',self)
        self.saveAction.setEnabled(False)
        self.saveAction.setShortcut('Ctrl+S')
        self.saveAction.triggered.connect(self.enregistrerLabyrinthe)

        self.playAction = QAction(QIcon('Icones/play.png'),u'&Lancer l\'apprentissage jusqu\'à atteindre le nb d\'itérations max',self)
        self.playAction.setEnabled(False)
        self.playAction.setShortcut('Ctrl+P')
        self.playAction.triggered.connect(self.runPlayPause)

        self.nextAction = QAction(QIcon('Icones/next.png'),u"&Exécuter une étape d'apprentissage",self)
        self.nextAction.setEnabled(False)
        self.nextAction.setShortcut('Ctrl+E')
        self.nextAction.triggered.connect(self.runPolitique1Coup)

        self.nb_coups_simules = QLineEdit(self)
        self.nb_coups_simules.setFixedWidth(90)
        self.nb_coups_simules.setEnabled(False)
        self.nb_coups_simules.setValidator( QIntValidator(self) )
        self.nb_coups_simules.setToolTip(u"Entrez un nb d'itération à effectuer puis valider")
        self.nb_coups_simules.returnPressed.connect(self.runPolitiqueNbCoups)

        self.stopAction = QAction(QIcon('Icones/stop.png'),'&Arreter',self)
        self.stopAction.setEnabled(False)
        self.stopAction.setShortcut('Ctrl+S')
        self.stopAction.triggered.connect(self.stopPolitique)

        self.moyAction = QAction(QIcon('Icones/moy.png'),'&Afficher les moyennes de renforcement',self)
        self.moyAction.setShortcut('Ctrl+M')
        self.moyAction.triggered.connect(self.showMoy)

        self.maxQAction = QAction(QIcon('Icones/maxQ.png'),'&Afficher/Masquer les max(Q)',self)
        self.maxQAction.setCheckable(True)
        self.maxQAction.setShortcut('Ctrl+Q')
        self.maxQAction.triggered.connect(self.triggerMaxQ)

        self.settingsAction = QAction(QIcon('Icones/settings.png'),u'&Modifier les paramètres d\'apprentissage',self)
        self.settingsAction.setShortcut('Ctrl+U')
        self.settingsAction.triggered.connect(self.modifierParametres)

        # Add a spacer
        spacer = QWidget()
        spacer.setSizePolicy(QSizePolicy.Expanding,QSizePolicy.Expanding)

        # Add the buttons to the toolbar
        self.toolbar = self.addToolBar('toolbar')
        self.toolbar.setAllowedAreas(Qt.TopToolBarArea)
        self.toolbar.setFloatable(False)
        self.toolbar.addAction(self.newAction)
        self.toolbar.addAction(self.openAction)
        self.toolbar.addAction(self.saveAction)
        self.toolbar.addAction(self.playAction)
        self.toolbar.addAction(self.nextAction)
        self.toolbar.addAction(self.stopAction)
        self.toolbar.addWidget(self.nb_coups_simules)
        self.toolbar.addWidget(spacer)
        self.toolbar.addAction(self.moyAction)
        self.toolbar.addAction(self.maxQAction)
        self.toolbar.addAction(self.settingsAction)
        self.toolbar.setContextMenuPolicy(Qt.CustomContextMenu)

        # Add a status bar
        self.lbl_status = QLabel(u"Politique " + str(self.politique_choisie+1) + u" | Nb passes max = 0" + u" | Itération n° " + str(self.nb_coups))
        self.statusBar().addPermanentWidget(self.lbl_status,1)

        # Add a grid for the maze
        self.grille = QTableWidget()
        self.grille.setFrameShape(QFrame.NoFrame)
        self.grille.setShowGrid(False)
        self.grille.horizontalHeader().hide()
        self.grille.verticalHeader().hide()
        self.grille.setSelectionMode(QAbstractItemView.NoSelection)
        self.grille.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.grille.cellDoubleClicked.connect(self.drawCase)
        self.grille.setFocusPolicy(Qt.NoFocus)
        self.grille.verticalHeader().setResizeMode(QHeaderView.Fixed)
        self.grille.horizontalHeader().setResizeMode(QHeaderView.Fixed)
        self.grille.verticalHeader().setDefaultSectionSize(32)
        self.grille.horizontalHeader().setDefaultSectionSize(32)

        self.entreeAction = QAction(QIcon(self.ICONE_ENTREE),u'Entrée',self)
        self.entreeAction.setIconVisibleInMenu(True)
        self.entreeAction.setCheckable(True)

        self.sortieAction = QAction(QIcon(self.ICONE_SORTIE),u'Sortie',self)
        self.sortieAction.setIconVisibleInMenu(True)
        self.sortieAction.setCheckable(True)

        self.murAction = QAction(QIcon(self.ICONE_MUR),u'Mur',self)
        self.murAction.setIconVisibleInMenu(True)
        self.murAction.setCheckable(True)

        self.piegeAction = QAction(QIcon(self.ICONE_PIEGE),u'Piège',self)
        self.piegeAction.setIconVisibleInMenu(True)
        self.piegeAction.setCheckable(True)

        self.videAction = QAction(QIcon(self.ICONE_VIDE),u'Vide',self)
        self.videAction.setIconVisibleInMenu(True)
        self.videAction.setCheckable(True)

        self.persoAction = QAction(QIcon(self.ICONE_PERSO),u'Placer le personnage',self)
        self.persoAction.setIconVisibleInMenu(True)
        self.persoAction.setCheckable(True)

        self.groupe_action = QActionGroup(self)
        self.groupe_action.addAction(self.persoAction)
        self.groupe_action.addAction(self.entreeAction)
        self.groupe_action.addAction(self.sortieAction)
        self.groupe_action.addAction(self.murAction)
        self.groupe_action.addAction(self.piegeAction)
        self.groupe_action.addAction(self.videAction)
        self.persoAction.setChecked(True)

        # Add the buttons to the second toolbar
        self.toolbar2 = QToolBar(self)
        self.addToolBar(Qt.RightToolBarArea,self.toolbar2)
        self.toolbar2.setFloatable(False)
        self.toolbar2.addAction(self.persoAction)
        self.toolbar2.addAction(self.entreeAction)
        self.toolbar2.addAction(self.sortieAction)
        self.toolbar2.addAction(self.murAction)
        self.toolbar2.addAction(self.piegeAction)
        self.toolbar2.addAction(self.videAction)
        self.toolbar2.setContextMenuPolicy(Qt.CustomContextMenu)

        # Initialize and center the window
        bureau = QDesktopWidget()
        largeur = self.sizeHint().width()
        self.setGeometry((bureau.screen().width()/2)-(largeur/2),(bureau.screen().height()/2)-(largeur/2),largeur,largeur)
        self.setCentralWidget(self.grille)
        self.setWindowIcon(QIcon('Icones/maze.png'))
        self.setWindowTitle("Labyrinthe QLearning")
        self.show()


    ''' Refresh the status bar display '''
    def changeStatus(self):
        self.lbl_status.setText(u"Politique " + str(self.politique_choisie+1) + u" | Nb passes max " + str(self.nb_coups_max) + u" | Itération n° " + str(self.nb_coups))


    ''' Show or hide the Q values of each cell '''
    def triggerMaxQ(self):
        # Show the Q values of each cell
        if(self.maxQAction.isChecked()):
            for ligne in range(self.labyrinthe.nb_lignes):
                for colonne in range(self.labyrinthe.nb_colonnes):
                    maxq = max(self.qlearning.table_q[ligne,colonne])
                    self.grille.item(ligne,colonne).setText(str(round(maxq,1)))
                    liste_Q = []
                    self.grille.item(ligne,colonne).setToolTip("G:"+str(self.qlearning.table_q[ligne,colonne][0])+" D:"+str(self.qlearning.table_q[ligne,colonne][1])+" H:"+str(self.qlearning.table_q[ligne,colonne][2])+" B:"+str(self.qlearning.table_q[ligne,colonne][3]))
        # Hide the Q values of each cell
        else:
            for ligne in range(self.labyrinthe.nb_lignes):
                for colonne in range(self.labyrinthe.nb_colonnes):
                    self.grille.item(ligne,colonne).setText('')


    ''' Paint the double-clicked cell according to the selected cell type '''
    def drawCase(self,row,column):
        brush = QBrush()
        brush.setTexture(self.ICONE_VIDE)

        if(self.persoAction.isChecked()):
            # Place the character
            self.positionnerPerso(row,column)
            self.case_perso = self.labyrinthe.grille[row][column]
            self.case_init = self.case_perso
            self.qlearning.case_init = self.case_perso

            # Enable the actions
            self.playAction.setEnabled(True)
            self.nextAction.setEnabled(True)
            self.nb_coups_simules.setEnabled(True)

        elif(self.entreeAction.isChecked()):
            brush.setTexture(self.ICONE_ENTREE)
            type_case = ENTREE
        elif(self.sortieAction.isChecked()):
            brush.setTexture(self.ICONE_SORTIE)
            type_case = SORTIE
        elif(self.murAction.isChecked()):
            brush.setTexture(self.ICONE_MUR)
            type_case = MUR
        elif(self.piegeAction.isChecked()):
            brush.setTexture(self.ICONE_PIEGE)
            type_case = PIEGE
        elif(self.videAction.isChecked()):
            brush.setTexture(self.ICONE_VIDE)
            type_case = VIDE

        # The maze has been modified, so allow saving
        if(self.persoAction.isChecked()==False):
            if(self.labyrinthe.grille[row][column].type!=type_case):
                self.saveAction.setEnabled(True)

            # Update the maze
            self.labyrinthe.grille[row][column].type=type_case

            # Change the cell color according to its new type
            self.grille.item(row, column).setBackground(brush)


    ''' Load the saved QLearning settings '''
    def chargerParametresQLearning(self):
        my_settings = QSettings("Bazerque-Vigie", "Labyrinthe")

        # Load the general settings
        my_settings.beginGroup("General")
        if (my_settings.contains("politique")):
            self.politique_choisie = my_settings.value("politique").toInt()[0]

        if (my_settings.contains("nb_coups")):
            self.nb_coups_max = my_settings.value("nb_coups").toInt()[0]

        if (my_settings.contains("vitesse")):
            self.vitesse = my_settings.value("vitesse").toInt()[0]

        my_settings.endGroup()

        # Load the movement costs
        my_settings.beginGroup("Couts")
        if (my_settings.contains("deplacement_normal")):
            deplacement_normal = my_settings.value("deplacement_normal").toFloat()[0]
        if (my_settings.contains("deplacement_piege")):
            deplacement_piege = my_settings.value("deplacement_piege").toFloat()[0]
        if (my_settings.contains("deplacement_sortie")):
            deplacement_sortie = my_settings.value("deplacement_sortie").toFloat()[0]

        self.qlearning = QLearning(self.labyrinthe, deplacement_normal,deplacement_piege,deplacement_sortie,self.case_init)


    ''' Edit the settings '''
    def modifierParametres(self):
        f = FenetreParametres(self)
        if (f.exec_()):

            # Retrieve the choices once the FenetreParametres dialog is accepted
            self.politique_choisie, self.nb_coups_max, self.vitesse, deplacement_normal, deplacement_piege, deplacement_sortie = f.getChoix()

            # Update the initial QLearning parameters
            self.qlearning = QLearning(self.labyrinthe, deplacement_normal, deplacement_piege, deplacement_sortie, self.case_init)

            # Refresh the status bar
            self.changeStatus()

    ''' Display the average reinforcement values computed for the current maze '''
    def showMoy(self):
        # Open a window showing the reinforcement averages for each policy
        f = FenetreMoyennes(self,self.moyenne_renforcement[0],self.moyenne_renforcement[1],self.moyenne_renforcement[2],self.moyenne_renforcement[3],self.moyenne_renforcement[4],self.moyenne_renforcement[5])
        f.exec_()


    ''' Initialize the maze from a file '''
    def chargerLabyrinthe(self):

        # Instantiate a maze from a file
        path_labyrinthe = QFileDialog.getOpenFileName(self,'Charger un labyrinthe','./')
        if (path_labyrinthe!=""):
            self.labyrinthe = Labyrinthe()
            self.labyrinthe.loadMaze(path_labyrinthe)

            # Load the QLearning settings
            self.chargerParametresQLearning()

            # Estimate a maximum number of moves for the maze
            self.nb_coups_max = self.labyrinthe.nb_lignes*self.labyrinthe.nb_colonnes*20
            my_settings = QSettings("Bazerque-Vigie", "Labyrinthe")
            my_settings.beginGroup("General")
            my_settings.setValue("nb_coups", self.nb_coups_max)
            my_settings.endGroup()

            # Initialize the maze grid from the loaded maze
            self.grille.setRowCount(self.labyrinthe.nb_lignes)
            self.grille.setColumnCount(self.labyrinthe.nb_colonnes)

            brush = QBrush()
            for ligne in range(self.labyrinthe.nb_lignes):
                for colonne in range(self.labyrinthe.nb_colonnes):

                    # Add an empty item to the grid
                    self.grille.setItem(ligne,colonne,QTableWidgetItem())
                    self.grille.item(ligne,colonne).setTextAlignment(Qt.AlignCenter)
                    self.grille.item(ligne,colonne).setForeground(Qt.black)

                    # Set the cell image according to its type
                    if(self.labyrinthe.grille[ligne][colonne].type==ENTREE):
                        brush.setTexture(self.ICONE_ENTREE)
                        self.grille.item(ligne,colonne).setBackground(brush)

                    elif(self.labyrinthe.grille[ligne][colonne].type==SORTIE):
                        brush.setTexture(self.ICONE_SORTIE)
                        self.grille.item(ligne,colonne).setBackground(brush)

                    elif(self.labyrinthe.grille[ligne][colonne].type==MUR):
                        brush.setTexture(self.ICONE_MUR)
                        self.grille.item(ligne,colonne).setBackground(brush)

                    elif(self.labyrinthe.grille[ligne][colonne].type==PIEGE):
                        brush.setTexture(self.ICONE_PIEGE)
                        self.grille.item(ligne,colonne).setBackground(brush)

                    elif(self.labyrinthe.grille[ligne][colonne].type==VIDE):
                        brush.setTexture(self.ICONE_VIDE)
                        self.grille.item(ligne,colonne).setBackground(brush)


            self.saveAction.setEnabled(False)

            # Show or hide the MaxQ values depending on the state of the MaxQ button
            self.triggerMaxQ()

            # Reset the character if a maze was already in use
            self.case_perso = None

            self.changeStatus()
            self.moyenne_renforcement=[0,0,0,0,0,0]


    ''' Create a new maze '''
    def nouveauLabyrinthe(self):

        # Open a window to create a new maze
        f = FenetreNewLabyrinthe(self)
        aleatoire = False
        if (f.exec_()):

            # Retrieve the choices once the FenetreNewLabyrinthe dialog is accepted
            nb_lignes,nb_colonnes, aleatoire = f.getChoix()
            self.labyrinthe = Labyrinthe(nb_lignes,nb_colonnes,aleatoire)

            # Load the QLearning parameters
            self.chargerParametresQLearning()

            # Estimate a maximum number of moves for the maze
            self.nb_coups_max = self.labyrinthe.nb_lignes*self.labyrinthe.nb_colonnes*20
            my_settings = QSettings("Bazerque-Vigie", "Labyrinthe")
            my_settings.beginGroup("General")
            my_settings.setValue("nb_coups", self.nb_coups_max)
            my_settings.endGroup()

            # Initialize the grid according to the chosen dimensions
            self.grille.setRowCount(nb_lignes)
            self.grille.setColumnCount(nb_colonnes)

            brush = QBrush()
            brush.setTexture(self.ICONE_VIDE)
            for ligne in range(nb_lignes):
                for colonne in range(nb_colonnes):

                    # Add an empty item to the grid
                    self.grille.setItem(ligne,colonne,QTableWidgetItem())
                    self.grille.item(ligne,colonne).setTextAlignment(Qt.AlignCenter)
                    self.grille.item(ligne,colonne).setForeground(Qt.black)

                    # Set the cell image according to the cell type
                    if(self.labyrinthe.grille[ligne][colonne].type==MUR):
                        brush.setTexture(self.ICONE_MUR)
                        self.grille.item(ligne,colonne).setBackground(brush)

                    elif(self.labyrinthe.grille[ligne][colonne].type==VIDE):
                        brush.setTexture(self.ICONE_VIDE)
                        self.grille.item(ligne,colonne).setBackground(brush)

            # Prevent the user from clicking the play button
            self.playAction.setEnabled(False)
            self.nextAction.setEnabled(False)
            self.saveAction.setEnabled(True)
            self.nb_coups_simules.setEnabled(False)

            # Show or hide the MaxQ values depending on the state of the MaxQ button
            self.triggerMaxQ()

            # Reset the character if a maze was already in use
            self.case_perso = None

            self.changeStatus()

            self.moyenne_renforcement=[0,0,0,0,0,0]


    ''' Save the drawn maze to a file '''
    def enregistrerLabyrinthe(self):

        # Open a dialog to choose where to save the maze
        path_labyrinthe = QFileDialog.getSaveFileName(self,'Enregistrer le labyrinthe','./')
        if(path_labyrinthe!=""):
            try:
                with open(path_labyrinthe,'w') as f:

                    # Write the maze character by character according to the type of each cell
                    for ligne in range(self.grille.rowCount()):
                        for colonne in range(self.grille.columnCount()):
                            if (self.labyrinthe.grille[ligne][colonne].type==ENTREE):
                                f.write('E ')
                            elif (self.labyrinthe.grille[ligne][colonne].type==SORTIE):
                                f.write('S ')
                            elif (self.labyrinthe.grille[ligne][colonne].type==MUR):
                                f.write('M ')
                            elif (self.labyrinthe.grille[ligne][colonne].type==PIEGE):
                                f.write('P ')
                            elif (self.labyrinthe.grille[ligne][colonne].type==VIDE):
                                f.write('. ')
                        f.write('\n')

                    # Disable the save button once the maze has been written
                    self.saveAction.setEnabled(False)

            except IOError:
                # If the file cannot be written to
                print "Unable to save the maze to this file"


    ''' Erase the character from its previous cell and draw it on the new one '''
    def deplacerPerso(self,ligne,colonne):
        brush = QBrush()

        # Erase the character from its current cell
        # Set the cell image according to the cell type
        if(self.labyrinthe.grille[ligne][colonne].type==ENTREE):
            brush.setTexture(self.ICONE_ENTREE)
            self.grille.item(ligne,colonne).setBackground(brush)

        elif(self.labyrinthe.grille[ligne][colonne].type==SORTIE):
            brush.setTexture(self.ICONE_SORTIE)
            self.grille.item(ligne,colonne).setBackground(brush)

        elif(self.labyrinthe.grille[ligne][colonne].type==PIEGE):
            brush.setTexture(self.ICONE_PIEGE)
            self.grille.item(ligne,colonne).setBackground(brush)

        elif(self.labyrinthe.grille[ligne][colonne].type==VIDE):
            brush.setTexture(self.ICONE_VIDE)
            self.grille.item(ligne,colonne).setBackground(brush)

        # Draw the character on its new cell
        brush.setTexture(self.ICONE_PERSO)
        # self.position_perso = (ligne,colonne)
        self.grille.item(self.case_perso.position[0],self.case_perso.position[1]).setBackground(brush)


    ''' Erase the character from its last cell '''
    def effacerPerso(self):
        brush = QBrush()

        if(self.case_perso!=None):
            # Erase the character from its current cell
            # Set the cell image according to the cell type
            if(self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type==ENTREE):
                brush.setTexture(self.ICONE_ENTREE)
                self.grille.item(self.case_perso.position[0],self.case_perso.position[1]).setBackground(brush)

            elif(self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type==SORTIE):
                brush.setTexture(self.ICONE_SORTIE)
                self.grille.item(self.case_perso.position[0],self.case_perso.position[1]).setBackground(brush)

            elif(self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type==PIEGE):
                brush.setTexture(self.ICONE_PIEGE)
                self.grille.item(self.case_perso.position[0],self.case_perso.position[1]).setBackground(brush)

            elif(self.labyrinthe.grille[self.case_perso.position[0]][self.case_perso.position[1]].type==VIDE):
                brush.setTexture(self.ICONE_VIDE)
                self.grille.item(self.case_perso.position[0],self.case_perso.position[1]).setBackground(brush)


    ''' Place the character on the cell chosen by the user '''
    def positionnerPerso(self,ligne,colonne):
        brush = QBrush()

        if (self.labyrinthe.grille[ligne][colonne].type==ENTREE):

            self.effacerPerso()

            # Draw the character on its new cell
            brush.setTexture(self.ICONE_PERSO)
            self.grille.item(ligne,colonne).setBackground(brush)
        else:
            msg_warning = QMessageBox()
            msg_warning.setIcon(QMessageBox.Warning)
            msg_warning.setText(u"The character must be placed on an entrance of the maze!")
            msg_warning.exec_()


    ''' Stop the learning in progress '''
    def stopPolitique(self):
        self.apprentissage_termine = True
        self.deplacement_en_cours = False
        self.lecture_en_cours = False

        # Run one more step so the algorithm stops cleanly if "next" was clicked
        self.runPolitique1Coup()


    ''' Start the learning or pause it depending on the state of the button '''
    def runPlayPause(self):
        # If the button is in play mode when clicked and no move is in progress, start the algorithm
        if(self.lecture_en_cours==False and self.deplacement_en_cours==False):
            self.deplacement_en_cours = True

            # Prevent the user from modifying the parameters or creating/opening a maze
            self.newAction.setEnabled(False)
            self.openAction.setEnabled(False)
            self.settingsAction.setEnabled(False)

            # Allow the user to click the stop button
            self.stopAction.setEnabled(True)
            self.lecture_en_cours = True
            self.playAction.setIcon(QIcon('Icones/pause.png'))

            # Start the learning with the chosen policy
            self.runPolitiqueNbCoupsMax()

        # Otherwise pause
        else:
            self.lecture_en_cours = False
            self.deplacement_en_cours = False
            self.playAction.setIcon(QIcon('Icones/play.png'))


    ''' Run the chosen policy '''
    def runPolitiqueChoisie(self):
        if(self.politique_choisie==0):
            self.politique1()
        elif(self.politique_choisie==1):
            self.politique2()
        elif(self.politique_choisie==2):
            self.politique3()
        elif(self.politique_choisie==3):
            self.politique4()
        elif(self.politique_choisie==4):
            self.politique5()
        elif(self.politique_choisie==5):
            self.politique6()


    ''' Run the chosen policy once '''
    def runPolitique1Coup(self):

        if(self.apprentissage_termine==False):
            self.stopAction.setEnabled(True)
            self.newAction.setEnabled(False)
            self.openAction.setEnabled(False)
            self.settingsAction.setEnabled(False)

            # Perform the chosen policy
            self.runPolitiqueChoisie()

        else:
            self.apprentissage_fini.emit()

        # Update the MaxQ display
        self.triggerMaxQ()

        # Update the status bar
        self.changeStatus()


    ''' Run the policy until the maximum number of moves is reached '''
    ''' or until the user clicks pause or stop                      '''
    def runPolitiqueNbCoupsMax(self):

        if(self.apprentissage_termine==False):
            if(self.lecture_en_cours==True):

                # Perform the chosen policy
                self.runPolitiqueChoisie()


                # Schedule the next step after a delay set by the character speed
                QTimer.singleShot(int(1000/self.vitesse), self.runPolitiqueNbCoupsMax)
        else:
            self.apprentissage_fini.emit()

        # Update the MaxQ display
        self.triggerMaxQ()

        # Update the status bar
        self.changeStatus()


    ''' Run the policy until the number of moves entered in the toolbar '''
    ''' is reached or until the user clicks the stop button             '''
    def runPolitiqueNbCoups(self):

        # Read the number of iterations to perform
        nb_iterations = int(self.nb_coups_simules.text())
        self.stopAction.setEnabled(True)

        if(self.apprentissage_termine==False):
            if (self.lecture_en_cours==False):
                if(self.nb_coups_en_suivant<nb_iterations):
                    self.deplacement_en_cours = True

                    # Perform the chosen policy
                    self.runPolitiqueChoisie()


                    self.nb_coups_en_suivant +=1
                    QTimer.singleShot(int(1000/self.vitesse), self.runPolitiqueNbCoups)
                else:
                    self.deplacement_en_cours = False
                    self.nb_coups_en_suivant = 0
                    nb_iterations = 0
        else:
            self.apprentissage_fini.emit()
            self.deplacement_en_cours = False


        # Update the MaxQ display
        self.triggerMaxQ()

        # Update the status bar
        self.changeStatus()


    ''' Policy 1 performs pure exploration: the visited cells are chosen at random.        '''
    ''' (A standalone sketch of these exploration steps follows at the end of this example.) '''
    def politique1(self):
        print "nbcoups",self.nb_coups

        # if(self.lecture_en_cours==False and self.deplacement_en_cours==False):
        if(self.nb_coups==self.nb_coups_max-1):
            self.apprentissage_termine=True
            self.moyenne_renforcement[0] = self.qlearning.somme_recompenses/self.nb_coups_max
            print "Average reinforcement = ",self.moyenne_renforcement[0]
        else:
            case_precedente = self.case_perso
            self.case_perso = self.qlearning.exploration_pure(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            self.deplacerPerso(case_precedente.position[0],case_precedente.position[1])


    ''' Policy 2 performs a "demanding" exploration, which penalises moving back to the last visited cell. '''
    def politique2(self):
        print "nbcoups",self.nb_coups

        # if(self.lecture_en_cours==False and self.deplacement_en_cours==False):
        if(self.nb_coups==self.nb_coups_max-1):
            self.apprentissage_termine=True
            self.moyenne_renforcement[1] = self.qlearning.somme_recompenses/self.nb_coups_max
            print "Average reinforcement = ",self.moyenne_renforcement[1]
        else:
            case_precedente = self.case_perso
            self.case_perso,self.last_move = self.qlearning.exploration_exigente(self.case_perso,self.last_move)
            self.nb_coups = self.nb_coups + 1
            self.deplacerPerso(case_precedente.position[0],case_precedente.position[1])


    ''' Policy 3 performs an "infallible" exploration, which forbids moving back to the '''
    ''' last visited cell (except in a dead end).                                       '''
    def politique3(self):
        print "nbcoups",self.nb_coups

        if(self.nb_coups==self.nb_coups_max-1):
            self.apprentissage_termine=True
            self.moyenne_renforcement[2] = self.qlearning.somme_recompenses/self.nb_coups_max
            print "Average reinforcement = ",self.moyenne_renforcement[2]
        else:
            case_precedente = self.case_perso
            self.case_perso,self.last_move = self.qlearning.exploration_infaillible(self.case_perso,self.last_move)
            self.nb_coups = self.nb_coups + 1
            self.deplacerPerso(case_precedente.position[0],case_precedente.position[1])


    ''' Policy 4 runs all the passes in pure exploration, then starts over in exploitation. '''
    ''' Pure exploration picks the cells at random.                                         '''
    def politique4(self):
        print "nbcoups",self.nb_coups
        case_precedente = self.case_perso

        if(self.toggleAlgo==False):
            self.case_perso = self.qlearning.exploration_pure(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if(self.nb_coups==self.nb_coups_max):
                self.toggleAlgo = True
                self.nb_coups = 0
                self.case_perso = self.case_init
        else:
            self.case_perso = self.qlearning.exploitation(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if(self.nb_coups==self.nb_coups_max-1):
                self.apprentissage_termine=True
                self.moyenne_renforcement[3] = self.qlearning.somme_recompenses/(self.nb_coups_max*2)
                print "Average reinforcement = ",self.moyenne_renforcement[3]

        self.deplacerPerso(case_precedente.position[0],case_precedente.position[1])

        if(self.nb_coups==self.nb_coups_max and self.toggleAlgo==True):
            self.apprentissage_fini.emit()


    ''' Policy 5 runs all the passes in "demanding" exploration, then starts over in exploitation. '''
    ''' Demanding exploration penalises moving back to the last visited cell.                      '''
    def politique5(self):
        print "nbcoups",self.nb_coups
        case_precedente = self.case_perso

        if(self.toggleAlgo==False):
            self.case_perso,self.last_move = self.qlearning.exploration_exigente(self.case_perso,self.last_move)
            self.nb_coups = self.nb_coups + 1
            if(self.nb_coups==self.nb_coups_max):
                self.toggleAlgo = True
                self.nb_coups = 0
                self.case_perso = self.case_init
        else:
            self.case_perso = self.qlearning.exploitation(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if(self.nb_coups==self.nb_coups_max-1):
                self.apprentissage_termine=True
                self.moyenne_renforcement[4] = self.qlearning.somme_recompenses/(self.nb_coups_max*2)
                print "Average reinforcement = ",self.moyenne_renforcement[4]
        self.deplacerPerso(case_precedente.position[0],case_precedente.position[1])


    ''' Policy 6 runs all the passes in "infallible" exploration, then starts over in exploitation. '''
    ''' Infallible exploration forbids moving back to the last visited cell (except in a dead end). '''
    def politique6(self):
        print "nbcoups",self.nb_coups
        case_precedente = self.case_perso

        if(self.toggleAlgo==False):
            self.case_perso,self.last_move = self.qlearning.exploration_infaillible(self.case_perso,self.last_move)
            self.nb_coups = self.nb_coups + 1
            if(self.nb_coups==self.nb_coups_max):
                self.toggleAlgo = True
                self.nb_coups = 0
                self.case_perso = self.case_init

        else:
            self.case_perso = self.qlearning.exploitation(self.case_perso)
            self.nb_coups = self.nb_coups + 1
            if(self.nb_coups==self.nb_coups_max-1):
                self.apprentissage_termine=True
                self.moyenne_renforcement[5] = self.qlearning.somme_recompenses/(self.nb_coups_max*2)
                print "Average reinforcement = ",self.moyenne_renforcement[5]

        self.deplacerPerso(case_precedente.position[0],case_precedente.position[1])


    ''' Reset the learning data after a click on stop or at the end of the learning '''
    def reinitialiserApprentissage(self):

        # Reset the counters to 0
        self.nb_coups_en_suivant=0
        self.nb_coups=0

        # Reset the display of the playback control buttons
        self.apprentissage_termine=False
        self.lecture_en_cours = False
        self.toggleAlgo = False
        self.playAction.setIcon(QIcon('Icones/play.png'))

        self.nb_coups_simules.setEnabled(False)
        self.nextAction.setEnabled(False)
        self.playAction.setEnabled(False)
        self.stopAction.setEnabled(False)

        # Erase the character
        self.effacerPerso()
        self.case_perso = None


        # Inform the user that the learning is over
        msg_warning = QMessageBox()
        msg_warning.setStandardButtons(QMessageBox.Ok | QMessageBox.No)
        msg_warning.setDefaultButton(QMessageBox.Ok)
        msg_warning.setIcon(QMessageBox.Information)
        msg_warning.setText(u"Learning finished!\nDo you want to clear the learning data (Q table)?")
        res = msg_warning.exec_()

        if(res==QMessageBox.Ok):
            # Reset the Q-learning data
            self.chargerParametresQLearning()

        # Allow the user to modify the parameters and to create or open a maze
        self.newAction.setEnabled(True)
        self.openAction.setEnabled(True)
        self.settingsAction.setEnabled(True)
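
# ---------------------------------------------------------------------------
# A minimal, standalone sketch of the exploration and exploitation steps the
# policies above rely on.  The QLearning and Labyrinthe classes used in this
# example are not shown here, so every name below (the grid encoding, the
# reward values, the helper functions) is an assumption, not the original API.
# ---------------------------------------------------------------------------
import random

# Hypothetical rewards per cell type ('.' empty, 'P' trap, 'S' exit); in the
# example above the real values come from the deplacement_* settings read from
# QSettings.
REWARDS = {'.': -1.0, 'P': -10.0, 'S': 10.0}

def neighbours(cell, grid):
    """Return the reachable neighbouring cells (inside the grid, not a wall)."""
    r, c = cell
    candidates = [(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)]
    return [(i, j) for i, j in candidates
            if 0 <= i < len(grid) and 0 <= j < len(grid[0]) and grid[i][j] != 'M']

def exploration_pure_step(q, cell, grid, alpha=0.5, gamma=0.9):
    """Pure exploration: move to a uniformly random reachable neighbour and
    apply the tabular Q-learning update.  The Q-table is keyed on
    (cell, next_cell) pairs, since each move targets one neighbouring cell."""
    nxt = random.choice(neighbours(cell, grid))
    reward = REWARDS.get(grid[nxt[0]][nxt[1]], -1.0)
    best_next = max([q.get((nxt, n), 0.0) for n in neighbours(nxt, grid)])
    q[(cell, nxt)] = (q.get((cell, nxt), 0.0)
                      + alpha * (reward + gamma * best_next - q.get((cell, nxt), 0.0)))
    return nxt

def exploitation_step(q, cell, grid):
    """Exploitation: move to the neighbour with the highest learned Q-value."""
    return max(neighbours(cell, grid), key=lambda n: q.get((cell, n), 0.0))

# Example usage on a tiny maze (E = entrance, M = wall, S = exit):
#     grid = ["E.M", "..S"]; q = {}; cell = (0, 0)
#     for _ in range(100): cell = exploration_pure_step(q, cell, grid)
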
Example #33
0
     opt_val = prob_env.still(prob_env.output_noiseless(opt_state))
     wall_time_limit = kwargs.wall_time_limit
     generation = call_counts  # for random search, generation means call counts
 elif kwargs.method == 'rl_prtr':
     model_env_name = get_model_env_name(kwargs.prtr_model_dir)
     assert model_env_name == prob_env_name
     opt = QLearning(
         k=prob_env.k,
         d=prob_env.d,
         env_name=prob_env_name,
         env_dir=kwargs.prob_env_dir,  # env_dir will load environment
         env_fixed_xo=False,
         n_hidden=0,
         save_and_load_path=kwargs.prtr_model_dir,  # model dir will load model
         load=True,
         tensorboard_path=None,
         logger_path=None,
         memory_capacity=None,
         memory_capacity_start_learning=None,
         learn_wall_time_limit=None,
         prioritized=None,
         save_model_iter=None,
         trial_size=0,
     )
     # opt may have been learned in a non-fixed x_o environment,
     # but we will test it in a fixed x_o environment
     opt.set_env_fixed_xo(prob_env.x_o)
     assert opt.get_env_if_set_fixed_xo()
     opt_val, start_x_o, opt_x_p, start_x_p, duration = extract_rl_exp_result(
         opt, prob_env)
Example #34
0
from environment import Env
from QLearning import QLearning

if __name__ == "__main__":
    env = Env()
    QL = QLearning(list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()
        while True:
            env.render()

            # take action and proceed one step in the environment
            action = QL.get_action(str(state))
            next_state, reward, done = env.step(action)

            # with sample <s,a,r,s'>, agent learns new q function
            QL.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(QL.q_table)

            # if episode ends, then break
            if done:
                break
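
# A minimal sketch of what get_action() and learn() typically do for a tabular
# agent like the one used above.  The imported QLearning module itself is not
# shown in this listing, so the names below (q_table, learning_rate, ...) are
# assumptions rather than the real implementation.
import random
from collections import defaultdict

class TabularQ(object):
    def __init__(self, actions, learning_rate=0.1, discount=0.9, epsilon=0.1):
        self.actions = actions            # assumed to be the integers 0..n-1
        self.lr = learning_rate
        self.gamma = discount
        self.epsilon = epsilon
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def get_action(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        values = self.q_table[state]
        return values.index(max(values))

    def learn(self, state, action, reward, next_state):
        # Standard update: Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        target = reward + self.gamma * max(self.q_table[next_state])
        self.q_table[state][action] += self.lr * (target - self.q_table[state][action])
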
Example #35
0
# import Q Learning function
from QLearning import QLearning

# define experiment parameters
gamma = 0.99
lr = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 2
step_number = 100
episode_length = 100

# run experiment
QLearning(gamma, lr, epsilon, runs, step_number, episode_length)
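
# For reference, the tabular Q-learning update these parameters feed into
# (assuming the imported QLearning entry point follows the usual convention) is
#     Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
# with gamma = 0.99 and lr = 0.1 as defined above, while each epsilon in the
# list sets the probability of taking a random action during training.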
Example #36
0
# Implement Q-learning and use this to solve the cartpole-environment
import gym

# Source: https://github.com/JoeSnow7/Reinforcement-Learning/blob/master/Cartpole%20Q-learning.ipynb

# We define a class to contain the learning algorithm
from QLearning import QLearning

env = gym.make("CartPole-v0")
agent = QLearning(env)
agent.train()
agent.run()
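
# CartPole's observations are continuous, so a tabular agent like the QLearning
# class imported above has to discretise them before indexing a Q-table.  That
# class is not shown in this listing; the helper below is only an illustrative
# sketch of one common binning scheme (the bin edges are assumptions).
import numpy as np

# One set of bin edges per observation dimension:
# cart position, cart velocity, pole angle, pole angular velocity.
BINS = [
    np.linspace(-2.4, 2.4, 9),
    np.linspace(-3.0, 3.0, 9),
    np.linspace(-0.21, 0.21, 9),
    np.linspace(-3.0, 3.0, 9),
]

def discretise(observation):
    """Map a continuous CartPole observation to a tuple of bin indices."""
    return tuple(int(np.digitize(x, edges)) for x, edges in zip(observation, BINS))
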
Example #37
0
game = Game()

# Initialize the window.
game.initialisationWindow()
game.initialisationBackground()
game.loadingPictures()

# Retrieve the window.
window = game.getterWindow()

# Retrieve the wall and reward coordinates from the .txt file
mapWallCoord = game.getterWallCoord()
rewardCoordB, rewardCoordM = game.getterReward()
"""---------------------- INITIALIZE QLEARNING ---------------------"""
# Create an object from the QLearning class.
QLearning = QLearning()
# Initialize the Q-table with the rewards and '0' coordinates from the .txt file
QLearning.intiQtable()
"""---------------------- INITIALIZE TRAINING KNN---------------------"""
Trainning = Trainning()

epochs = 2000

dataX = []  #targets
dataY = []  #features

for epoch in range(epochs):

    # Simulate a ray (right or top detection)
    rayons = Trainning.simulateRayon()