Example #1
    def __init__(self,
                 alpha=0.001,
                 gamma=0.95,
                 marker=None,
                 batchSize=10000,
                 miniBatchSize=100,
                 updateFrequency=100):
        """
            gamma: discount factor
            alpha: learning rate
            marker: The marker that the agent will use in the tic tac toe environment 
                   (if None, it should be set before starting learning)
        """
        super().__init__(alpha=alpha, gamma=gamma, marker=marker)

        self.rnd = Seeds().DQN_AGENT_SEED
        from numpy.random import seed
        seed(self.rnd.randint(1, 10000))
        from tensorflow import set_random_seed
        set_random_seed(self.rnd.randint(1, 10000))

        self.batch = []
        self.batchSize = batchSize
        self.miniBatchSize = miniBatchSize
        self.updateFrequency = updateFrequency
        self.currentStep = 0

        self.init_network()
Example #2
    def __init__(self, gamma=0.99, alpha=0.2, marker=None, model=None, seed=None):
        """
            If model is not None, the agent follows the model and never
            performs exploration
        """
        #This should be executed before the constructor of the super class
        if model is not None:
            self.initialModel = model
        super().__init__(gamma, alpha, marker)

        self.rnd = Seeds().NEURALNET_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        np.random.seed(self.rnd.randint(1,10000))
        if model is not None:   
            self.followModel = True
            self.exploring = False 
Example #3
    def __init__(self, alpha=0.01, gamma=0.1, marker=None):
        """
            gamma: discount factor
            alpha: learning rate
            marker: The marker that the agent will use in the tic tac toe environment 
                   (if None, it should be set before starting learning)
        """
        self.gamma = gamma
        self.alpha = alpha
        self.marker = marker

        self.rnd = Seeds().Q_AGENT_SEED

        self.initQ = 0.001
        #Initializing all weights to the same small random value scaled by self.initQ
        self.qWeights = np.multiply([1] * CHECKERS_FEATURE_COUNT,
                                    self.rnd.random() * self.initQ)
        #self.qBias = 0.001

        self.T = self.tempInit
Example #4
    def __init__(self,
                 gamma=0.99,
                 alpha=0.2,
                 marker=None,
                 qTableFollow=None,
                 seed=None):
        """
            If the QTableFollow is not None, the agent follows the Q-table and never
            performs exploration
        """
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().LENO_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        self.qTables = []

        if qTableFollow is not None:
            self.qTable = qTableFollow
            self.fixedPolicy = True
        self.expertStorage = {}
Example #5
    def __init__(self,
                 gamma=0.99,
                 alpha=0.2,
                 marker=None,
                 model=None,
                 seed=None):
        """
            If model is not None, the agent follows the model and never
            performs exploration
        """
        super().__init__(gamma, alpha, marker)

        self.rnd = Seeds().PROBMOD_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        np.random.seed(self.rnd.randint(1, 10000))

        if model is not None:
            self.model = model
            self.followModel = True
            self.exploring = False
Example #6
class RandomAgent(Agent):
    gamma = 0.99

    def __init__(self):
        self.rnd = Seeds().RANDOM_AGENT_SEED

    def observe_reward(self, state, action, statePrime, reward):
        pass

    def select_action(self, state):
        return self.rnd.choice(self.environment.get_actions(state))

    def set_environment(self, environment, marker):
        """Connects to the domain environment"""
        self.environment = environment
        self.marker = marker
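RandomAgent also shows the small agent/environment contract the other examples rely on: set_environment wires the agent to an environment that exposes get_actions(state), and select_action simply samples from that list. Below is a self-contained re-creation of that contract with a stub environment; StubEnv and TinyRandomAgent are illustrative names only, and random.Random stands in for the project's Seeds helper.

import random

class StubEnv:
    #Minimal stand-in for the tic-tac-toe environment: only get_actions is needed
    def get_actions(self, state=None):
        return [(0, 0), (1, 1), (2, 2)]

class TinyRandomAgent:
    def __init__(self, seed=0):
        self.rnd = random.Random(seed)

    def set_environment(self, environment, marker):
        self.environment, self.marker = environment, marker

    def select_action(self, state):
        #Pick uniformly among the actions the environment reports as legal
        return self.rnd.choice(self.environment.get_actions(state))

agent = TinyRandomAgent()
agent.set_environment(StubEnv(), marker="X")
print(agent.select_action(state=None))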
Example #7
class QLearningAgent(Agent):
    gamma = None
    alpha = None

    qWeights = None
    #qBias = None
    initQ = None  #Value used to initialize the weights

    USE_EPSILON_GREEDY = True  #If false, uses Boltzmann exploration
    epsilon = 0.5

    T = None  #Current temperature for Boltzmann exploration
    tempInit = 1.0 / 2  #Initial temperature for Boltzmann exploration
    tempFinal = 1.0 / 50  #Final temperature for Boltzmann exploration

    rnd = None

    #gamma=0.99, alpha=0.2
    def __init__(self, alpha=0.01, gamma=0.1, marker=None):
        """
            gamma: discount factor
            alpha: learning rate
            marker: The marker that the agent will use in the tic tac toe environment 
                   (if None, it should be set before starting learning)
        """
        self.gamma = gamma
        self.alpha = alpha
        self.marker = marker

        self.rnd = Seeds().Q_AGENT_SEED

        self.initQ = 0.001
        #Initializing all weights to the same small random value scaled by self.initQ
        self.qWeights = np.multiply([1] * CHECKERS_FEATURE_COUNT,
                                    self.rnd.random() * self.initQ)
        #self.qBias = 0.001

        self.T = self.tempInit

    def observe_reward(self, state, action, statePrime, reward):
        """
            Updates the Q-table (only if the agent is exploring)
        """
        if self.exploring:
            allActionsPrime = self.environment.get_actions(statePrime)

            qValue, features = self.calcQTable(state,
                                               action,
                                               returnFeatures=True)
            V = self.get_max_Q_value(statePrime, allActionsPrime)
            expected = reward + self.gamma * V

            temporal_difference = expected - qValue

            for i in range(len(self.qWeights)):
                self.qWeights[i] += self.alpha * temporal_difference * features[i]
            #self.qBias += self.alpha * temporal_difference * self.qBias
            #print(str(self.qWeights))#+ " - " + str(self.qBias))
            if self.epsilon > 0.05:
                self.epsilon /= 1.0001
            if self.alpha > 0.001:
                self.alpha /= 1.00001
            #print(self.alpha)

    def calcQTable(self, state, action, returnFeatures=False):
        """Returns one value from the Qtable"""
        features = self.process_state(state, action)
        qValue = np.dot(self.qWeights, features)  #+ self.qBias

        if returnFeatures:
            return qValue, features

        return qValue

    def best_action_deterministic(self, state):
        allActions = self.environment.get_actions(state)
        maxVal = -float('inf')
        bestAct = None
        for act in allActions:
            q = self.calcQTable(state, act)
            if q > maxVal:
                bestAct = [act]
                maxVal = q
            elif maxVal == q:
                bestAct.append(act)
        return self.rnd.choice(bestAct)

    def select_action(self, state):
        """ When this method is called, the agent executes an action based on its Q-table 
            Boltzmann Exploration is used    def create_fictitious(self,expertSteps):
        """
        if self.USE_EPSILON_GREEDY:
            return self.select_action_epsilon_greedy(state)
        else:
            return self.select_action_boltzmann(state)

    def select_action_boltzmann(self, state):
        #Boltzmann (softmax) exploration strategy
        allActions = self.environment.get_actions(state)
        valueActions = []
        sumActions = 0

        for action in allActions:
            qValue = self.calcQTable(state, action)
            vBoltz = math.exp(qValue / self.T) + 0.00001
            valueActions.append(vBoltz)
            sumActions += vBoltz

        probAct = [x / sumActions for x in valueActions]

        rndVal = self.rnd.random()

        sumProbs = 0
        i = -1

        while sumProbs <= rndVal:
            i = i + 1
            sumProbs += probAct[i]

        #Apply decay
        if self.T > self.tempFinal and self.exploring:
            self.T -= (self.tempInit - self.tempFinal) / (100000)

        return allActions[i]

    def select_action_epsilon_greedy(self, state):
        """
            Applies epsilon-greedy exploration when the agent is exploring,
            and chooses the greedy action deterministically otherwise
        """

        randv = self.rnd.random()
        if self.exploring and randv < self.epsilon:
            return self.rnd.choice(self.environment.get_actions(state))

        #If not exploring randomly, the action with the best value is returned
        return self.best_action_deterministic(state)

    def get_max_Q_value(self, state, allActions):
        """
            returns max(Q) for all actions given a state
        """

        values = [self.calcQTable(state, action) for action in allActions]
        #Terminal states don't have applicable actions
        if len(values) == 0:
            return 0
        #Maximum Q value
        v = max(values)
        return v

    def create_fictitious(self, expertSteps, marker=None):
        """
            Regular Q-learning always plays against the expert
        """
        #This function is called before the agent is set to the new Marker.
        if self.marker == "X":
            return self.environment.agentX

        return self.environment.agentO
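QLearningAgent combines a linear Q-function (the dot product of qWeights and the state-action features in calcQTable) with the usual temporal-difference update in observe_reward. The following minimal sketch reproduces that update on toy feature vectors; it is self-contained and all values are illustrative, standing in for the project's CHECKERS_FEATURE_COUNT features.

import numpy as np

alpha, gamma = 0.01, 0.1
qWeights = np.full(5, 0.001)                       #one weight per state-action feature

features = np.array([1.0, 0.0, 0.5, 0.0, 1.0])     #features of the taken (state, action)
reward = 1.0
maxQPrime = 0.002                                  #max over a' of Q(s', a'), from the same weights

qValue = np.dot(qWeights, features)                #current estimate of Q(s, a)
td_error = reward + gamma * maxQPrime - qValue     #temporal-difference error
qWeights += alpha * td_error * features            #one gradient step per weight
print(qWeights)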
Example #8
    def __init__(self):
        self.rnd = Seeds().RANDOM_AGENT_SEED
Example #9
    def __init__(self, gamma=0.99, alpha=0.2, marker=None):
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().SELFPLAY_AGENT_SEED
Example #10
class NeuralNetModQLearning(ModelQLearningAgent):

    followModel = False

    modelAlpha = 0.2  #Learning rate of the model
    initOp = None  #Initializer for the tf session
    y_hat = None  #Used for predictions
    X = None
    y = None
    predict = None
    initialModel = None
    W1 = None
    W2 = None
    b1 = None
    b2 = None
    session = None

    examplesChache = []
    maxExamples = 500

    def __init__(self, gamma=0.99, alpha=0.2, marker=None, model=None, seed=None):
        """
            If model is not None, the agent follows the model and never
            performs exploration
        """
        #This should be executed before the constructor of the super class
        if model is not None:
            self.initialModel = model
        super().__init__(gamma, alpha, marker)

        self.rnd = Seeds().NEURALNET_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        np.random.seed(self.rnd.randint(1,10000))
        if model is not None:   
            self.followModel = True
            self.exploring = False 
            
    def init_model(self):
        """
            The model here is a tensorflow neural network. Everything is prepared here for later use.
        """
        num_features = 9
        num_actions = 9
        num_hidden_neurons = 20

        with tf.Graph().as_default() as g:
            #X holds the state variables (what is inside each of the 9 positions)
            self.X = tf.placeholder(tf.float32, [None, num_features], name='X')
            #y is the action, 9 positions because of the one-hot encoding
            self.y = tf.placeholder(tf.float32, [None, num_actions], name='y')

    #        if self.initModel is not None:
    #            initW1 = self.initModel.W1
    #            initb1 = self.initModel.b1
    #            initW2 = self.initModel.W2
    #            initb2 = self.initModel.b2
                #weights and biases
    #            self.W1 = tf.Variable(initW1)
    #            self.b1 = tf.Variable(initb1)
    #            self.W2 = tf.Variable(initW2)
    #            self.b2 = tf.Variable(initb2)
    #        else:
            self.W1 = tf.Variable(tf.random_uniform([num_features,num_hidden_neurons],seed = self.rnd.randint(0,1000), 
                                                   minval = 0.0001, maxval=0.1), name='W1')
            self.b1 = tf.Variable(tf.random_uniform([num_hidden_neurons], seed = self.rnd.randint(0,1000)), name='b1')
            self.W2 = tf.Variable(tf.random_uniform([num_hidden_neurons,num_actions],seed = self.rnd.randint(0,1000), 
                                                   minval = 0.0001, maxval=0.1), name='W2')
            self.b2 = tf.Variable(tf.random_uniform([num_actions], seed = self.rnd.randint(0,1000)), name='b2')
            
            #Calculating the output of hidden layers
            hidden_out = tf.add(tf.matmul(self.X,self.W1),self.b1)
            hidden_out = tf.nn.sigmoid(hidden_out)
            
            self.y_hat = tf.nn.softmax(tf.add(tf.matmul(hidden_out, self.W2), self.b2))
            
            #self.predict = tf.argmax(self.y_hat,axis=1)
            self.predict = tf.multinomial(self.y_hat, seed = self.rnd.randint(0,1000), num_samples=1)
            y_clipped = tf.clip_by_value(self.y_hat, 1e-10, 0.9999999)
            #Cost function (cross-entropy)
            self.cost = -tf.reduce_mean(tf.reduce_sum(self.y * tf.log(y_clipped)
                             + (1 - self.y) * tf.log(1 - y_clipped), axis=1))
                   
            # add an optimizer
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.modelAlpha).minimize(self.cost)
            
            self.initOp = tf.global_variables_initializer()
            
            self.saver = tf.train.Saver()
        
        self.session = tf.Session(graph=g)
            
        self.session.run(self.initOp)
        
        if self.initialModel is not None:
            self.saver.restore(self.session,self.initialModel)
        
    def update_model(self, expertSteps, marker):
        """
            The neural network is retrained here on a cache of recent expert steps
        """
        if len(self.examplesChache) + len(expertSteps) > self.maxExamples:
            #Open space in the batch for new samples
            del self.examplesChache[0:len(expertSteps) - (self.maxExamples - len(self.examplesChache))]
        self.examplesChache.extend(expertSteps)
        expertSteps = self.examplesChache 

        X = self.states_to_float(np.array(expertSteps)[:,0])
        
        #print(X)
        y = self.convert_actions(np.array(expertSteps)[:,1])       

        #Trains with the data for 10 epochs
        epochs = 10
        # start the session
        sess = self.session
        
        #print(len(expertSteps))
        for epoch in range(epochs):
            avg_cost = 0
            
            for i in range(len(expertSteps)):
                _, c = sess.run([self.optimizer,self.cost],feed_dict={self.X: X, self.y: y})
                avg_cost += c / len(expertSteps)
            #print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost))
        #print(sess.run(self.W1))
                
        
    def states_to_float(self, states):
        """
            Converts each state string to floats: '.' -> 0., 'N' -> 1., 'S' -> 2.
        """
        procStates = np.zeros((len(states), len(states[0])), dtype=np.float32)
        i = 0
        for state in states:
            procStates[i, :] = [0. if x == '.' else 1. if x == 'N' else 2. for x in state]
            i += 1

        return procStates

    def convert_actions(self, actions):
        """
            Converts each (col, row) action into a flat index in [0, 8] and then one-hot encodes it
        """
        convertedActions = []
        for act in actions:
            convertedActions.append(act[1]*3 + act[0])
        convertedActions = np.asarray(convertedActions)
        #Convert to one-hot
        num_actions = 9
        convertedActions = convertedActions.reshape(-1)
        convertedActions = np.eye(num_actions, dtype=np.float32)[convertedActions]
        return convertedActions
                
    def agent_from_model(self,marker):
        """
            Creates an agent from the model
        """
        model = "/tmp/model.save"
        #print("Will save")
        self.saver.save(self.session,model)
        
        #print("Saved")
        return NeuralNetModQLearning(model=model, seed = self.rnd.randint(1,10000))
    
    def action_from_model(self, state):
        """
            Selects an action according to the observed decisions of the expert agent
        """
        import math

        state = self.process_state(state)
        X = np.array(self.states_to_float([np.array(state)]), dtype=np.float32)

        acts = self.environment.get_actions(state)
        sess = self.session

        #Samples actions from the network until a legal one is drawn
        act = float('inf')
        while act not in acts:
            act = sess.run([self.predict], feed_dict={self.X: X})
            #Converts the sampled flat index back into a (col, row) pair
            act = (math.floor(act[0][0] % 3), math.floor(act[0][0] / 3))

        return act
    def select_action(self, state):
        if self.followModel:
            return self.action_from_model(state)
        return super().select_action(state)
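One detail worth isolating from the class above: convert_actions flattens each (col, row) board position into the index act[1]*3 + act[0] before one-hot encoding it as the network target, and action_from_model inverts that mapping with modulo and integer division. A tiny standalone sketch of the encoding follows; to_one_hot is an illustrative helper, not part of the project.

import numpy as np

def to_one_hot(action, num_actions=9):
    idx = action[1] * 3 + action[0]        #(col, row) -> flat board index
    oneHot = np.zeros(num_actions, dtype=np.float32)
    oneHot[idx] = 1.0
    return oneHot

print(to_one_hot((2, 1)))                  #index 5 -> [0. 0. 0. 0. 0. 1. 0. 0. 0.]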
Example #11
class ProbModelQLearning(ModelQLearningAgent):

    followModel = False

    def __init__(self,
                 gamma=0.99,
                 alpha=0.2,
                 marker=None,
                 model=None,
                 seed=None):
        """
            If model is not None, the agent follows the model and never
            performs exploration
        """
        super().__init__(gamma, alpha, marker)

        self.rnd = Seeds().PROBMOD_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        np.random.seed(self.rnd.randint(1, 10000))

        if model is not None:
            self.model = model
            self.followModel = True
            self.exploring = False

    def init_model(self):
        """
            The model is a simple dictionary that will count the chosen actions
        """
        return {}

    def update_model(self, expertSteps, marker):
        """
            The model keeps counters of chosen actions, which are updated here
        """
        #Updates the model
        for tupStAct in expertSteps:
            state = tupStAct[0]
            if (state, tupStAct[1]) not in self.model:
                self.model[(state, tupStAct[1])] = 0
            self.model[(state, tupStAct[1])] += 1

    def agent_from_model(self, marker):
        """
            Creates an agent from the model
        """
        model = cp.deepcopy(self.model)
        return ProbModelQLearning(model=model, seed=self.rnd.randint(1, 10000))

    def action_from_model(self, state):
        """
            selects an action according to the observed decisions of the expert agent
        """
        state = self.process_state(state)
        actions = self.environment.get_actions(state)
        visits = []

        #Searches for the number of times that each action was chosen the current state
        for act in actions:
            v = self.model.get((state, act), 0)
            visits.append(v)

        totalVals = sum(visits)
        if totalVals == 0:
            return self.rnd.choice(actions)

        actProbs = [x / totalVals for x in visits]
        chosenIdx = np.random.choice(len(actions), p=actProbs)

        return actions[chosenIdx]

    def select_action(self, state):
        if self.followModel:
            return self.action_from_model(state)
        return super().select_action(state)
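The "model" in ProbModelQLearning is just a visit counter over expert (state, action) pairs. The sketch below shows, in isolation, how action_from_model turns those counters into a sampling distribution; the data is illustrative, and the real method falls back to a uniform random choice when every count is zero.

import numpy as np

model = {('s1', 'a'): 3, ('s1', 'b'): 1}   #toy counts of expert choices
actions = ['a', 'b', 'c']

visits = [model.get(('s1', act), 0) for act in actions]
total = sum(visits)
actProbs = [v / total for v in visits]     #[0.75, 0.25, 0.0]
chosen = actions[np.random.choice(len(actions), p=actProbs)]
print(chosen)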
Example #12
class LenoSelfPlayAgent(QLearningAgent):

    fixedPolicy = False  #If true, the agent never explores
    qTables = None  #Library of previous Q-tables
    rnd = None

    #Maximum number of stored policies, if this number is exceeded, the less similar one is excluded
    maxCacheSize = 10

    #Storage from previously seen expert steps
    expertStorage = None

    def __init__(self,
                 gamma=0.99,
                 alpha=0.2,
                 marker=None,
                 qTableFollow=None,
                 seed=None):
        """
            If the QTableFollow is not None, the agent follows the Q-table and never
            performs exploration
        """
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().LENO_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        self.qTables = []

        if qTableFollow is not None:
            self.qTable = qTableFollow
            self.fixedPolicy = True
        self.expertStorage = {}

    def score_policy(self, qTable, expertSteps):
        """
            Returns the number of expert-visited states in which this Q-table
            would choose the same action as the expert
        """
        score = 0
        stTuples = expertSteps.keys()
        for stActTuple in stTuples:
            #Deterministic selection of actions
            state = stActTuple
            #state = self.process_state(state,marker)
            allActions = self.environment.get_actions(state)
            maxVal = -float('inf')
            bestAct = None
            for act in allActions:
                q = qTable.get((state, act), -float('inf'))
                if q > maxVal:
                    bestAct = [act]
                    maxVal = q
                elif maxVal == q:
                    if bestAct is None:  #If the value does not exist in the q-table
                        bestAct = []
                    bestAct.append(act)
            chosen = self.rnd.choice(bestAct)
            if chosen in expertSteps[stActTuple]:
                score += 1
        return score

    def create_fictitious(self, expertSteps, marker):
        """
           Returns a fictitious agent with fixed policy
        """
        #Keeps a set of actions that have been already chosen by the expert
        for tupStAct in expertSteps:
            if tupStAct[0] not in self.expertStorage:
                self.expertStorage[(tupStAct[0])] = set()
            self.expertStorage[(tupStAct[0])].add(tupStAct[1])

        #Performs a copy of the current Qtable
        self.qTables.append(copy.deepcopy(self.qTable))
        #Calculates which of the previous policies are more similar to the expert policy
        policiesScore = [
            self.score_policy(qTab, self.expertStorage)
            for qTab in self.qTables
        ]

        #Selects the newest policy with highest score
        idx = len(policiesScore) - policiesScore[::-1].index(
            max(policiesScore)) - 1

        #Creates a new fictitious agent with random seed
        agent = LenoSelfPlayAgent(qTableFollow=self.qTables[idx],
                                  seed=self.rnd.randint(0, 1000))

        #If the maximum size of stored Qtables is exceeded, the one with lowest score is eliminated
        if len(self.qTables) > self.maxCacheSize:
            del self.qTables[policiesScore.index(min(policiesScore))]
        return agent

    """
        The other functions are the same as Q-Learning, but turning off exploration if a copy agent is used
    """

    def observe_reward(self, state, action, statePrime, reward):
        if self.fixedPolicy:
            self.exploring = False
        return super().observe_reward(state, action, statePrime, reward)

    def select_action(self, state):
        if self.fixedPolicy:
            self.exploring = False
        return super().select_action(state)
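score_policy drives the reuse of cached Q-tables: each stored table is scored by how many expert-visited states its greedy action agrees with, and create_fictitious keeps the newest table with the highest score. A toy, self-contained sketch of that scoring idea follows; the data is illustrative and, unlike the real method, ties are broken by max rather than rnd.choice.

qTable = {('s1', 'a'): 0.4, ('s1', 'b'): 0.9, ('s2', 'a'): 0.1}
expertStorage = {'s1': {'b'}, 's2': {'b'}}
legalActions = {'s1': ['a', 'b'], 's2': ['a', 'b']}   #stand-in for environment.get_actions

score = 0
for state, expertActs in expertStorage.items():
    #Greedy action of the stored table; missing entries count as -inf
    greedy = max(legalActions[state],
                 key=lambda a: qTable.get((state, a), -float('inf')))
    if greedy in expertActs:
        score += 1
print(score)   #1: the table agrees with the expert only in s1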
Example #13
class DeepQLearningAgent(QLearningAgent):

    batch = None
    miniBatchSize = None

    updateFrequency = None

    batchSize = None
    currentStep = None

    #gamma=0.99, alpha=0.2
    def __init__(self,
                 alpha=0.001,
                 gamma=0.95,
                 marker=None,
                 batchSize=10000,
                 miniBatchSize=100,
                 updateFrequency=100):
        """
            gamma: discount factor
            alpha: learning rate
            marker: The marker that the agent will use in the tic tac toe environment 
                   (if None, it should be set before starting learning)
        """
        super().__init__(alpha=alpha, gamma=gamma, marker=marker)

        self.rnd = Seeds().DQN_AGENT_SEED
        from numpy.random import seed
        seed(self.rnd.randint(1, 10000))
        from tensorflow import set_random_seed
        set_random_seed(self.rnd.randint(1, 10000))

        self.batch = []
        self.batchSize = batchSize
        self.miniBatchSize = miniBatchSize
        self.updateFrequency = updateFrequency
        self.currentStep = 0

        self.init_network()

    def init_network(self):
        """
            Builds the neural network for the agent
        """
        hiddenNeurons = 20

        inp = Input(shape=(CHECKERS_FEATURE_COUNT, ))

        net = layers.Dense(hiddenNeurons, activation="relu")(inp)
        net = layers.Dropout(0.2, seed=self.rnd.randint(1, 10000))(net)
        net = layers.Dense(hiddenNeurons, activation="sigmoid")(net)
        net = layers.Dense(1, activation="sigmoid")(net)

        self.network = Model(inputs=inp, outputs=net)
        self.target = keras.models.clone_model(self.network)

        self.cost = keras.losses.mean_squared_error
        self.network.compile(optimizer=keras.optimizers.Adam(lr=self.alpha),
                             loss=self.cost)

        #self.network.summary()

    def update_network(self, miniBatch):
        featuresOnBatch = np.array([x[0] for x in miniBatch])

        #Targets on target network
        targets = []
        for (features, statePrime, reward) in miniBatch:
            if statePrime.is_first_agent_win(
            ) or statePrime.is_second_agent_win():
                targets.append(reward)
            else:
                actions = self.environment.get_actions(statePrime)
                maxVal = self.get_max_Q_value(statePrime,
                                              allActions=actions,
                                              network=self.target)
                targets.append(reward + self.gamma * maxVal)
        targets = np.array(targets)

        #Q values on network being updated
        #q_values = self.network.predict_on_batch(featuresOnBatch)

        #Optimization process
        self.network.fit(featuresOnBatch, targets, verbose=0)
        #deltas = self.cost.get_errors(targets,q_values)
        #self.network.bprop(deltas)
        #self.optimizer.optimize(self.network.layers_to_optimize)

    def update_target_network(self):
        self.target.set_weights(self.network.get_weights())

    def observe_reward(self, state, action, statePrime, reward):
        """
            Updates the Q-table (only if the agent is exploring
        """
        if self.exploring:
            #Updating batch
            features = self.process_state(state, action)
            #Rescaling the reward linearly into [-1, 1]
            reward = (reward - REWARD_LOSE) / (REWARD_WIN - REWARD_LOSE) * 2 - 1
            self.batch.append((features, statePrime, reward))
            if len(self.batch) > self.batchSize:
                del self.batch[0]
            if len(self.batch) > self.miniBatchSize:
                miniBatch = self.rnd.sample(self.batch, self.miniBatchSize)
            else:
                miniBatch = self.batch
            self.update_network(miniBatch)

            if self.epsilon > 0.05:
                self.epsilon /= 1.0001
            if self.alpha > 0.001:
                self.alpha /= 1.00001

            if self.currentStep % self.updateFrequency == 0:
                self.update_target_network()
            self.currentStep += 1
            #print(self.alpha)

    def calcQTable(self, state, action, returnFeatures=False, network=None):
        """Returns the Q-value predicted by the given network (the target network by default)"""
        if network is None:
            network = self.target
        features = self.process_state(state, action)
        qValue = float(network.predict(np.array([features])))  #+ self.qBias

        if returnFeatures:
            return qValue, features

        return qValue

    def get_max_Q_value(self, state, allActions, network=None):
        """
            returns max(Q) for all actions given a state
        """
        if network is None:
            network = self.target

        values = [
            self.calcQTable(state, action, network=network)
            for action in allActions
        ]
        #Terminal states don't have applicable actions
        if len(values) == 0:
            return 0
        #Maximum Q value
        v = max(values)
        return v

    def create_fictitious(self, expertSteps, marker=None):
        """
            Regular Q-learning always plays against the expert
        """
        #This function is called before the agent is set to the new Marker.
        if self.marker == "X":
            return self.environment.agentX

        return self.environment.agentO
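Two details of DeepQLearningAgent are easy to miss: observe_reward rescales the raw reward linearly into [-1, 1] before storing the transition, and update_network builds its targets from the separate target network, bootstrapping only for non-terminal successor states. A standalone arithmetic sketch of both, with assumed REWARD_WIN and REWARD_LOSE values:

REWARD_WIN, REWARD_LOSE = 1.0, -1.0        #assumed values for illustration
gamma = 0.95

reward = 1.0
#observe_reward: rescale the raw reward linearly into [-1, 1]
scaled = (reward - REWARD_LOSE) / (REWARD_WIN - REWARD_LOSE) * 2 - 1

#update_network: terminal transitions take the reward directly, others
#bootstrap from the target network's best Q-value on the successor state
terminal, max_q_target = False, 0.3
target = scaled if terminal else scaled + gamma * max_q_target
print(scaled, target)                      #1.0 1.285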