class RandomAgent(Agent):

    gamma = 0.99

    def __init__(self):
        self.rnd = Seeds().RANDOM_AGENT_SEED

    def observe_reward(self, state, action, statePrime, reward):
        pass

    def select_action(self, state):
        return self.rnd.choice(self.environment.get_actions(state))

    def set_environment(self, environment, marker):
        """Connects to the domain environment"""
        self.environment = environment
        self.marker = marker
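# Illustration only (not part of the original code): a rough sketch of how an agent is
# wired to the environment and queried for a move. The `environment` object is assumed
# to expose get_actions(state) as used throughout this file; its exact class is not
# shown here, so this helper and its name are hypothetical.
def _example_random_move(environment, state):
    agent = RandomAgent()
    agent.set_environment(environment, marker="X")
    # The agent samples uniformly among the legal actions for the given state.
    return agent.select_action(state)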
class QLearningAgent(Agent):

    gamma = None
    alpha = None
    qWeights = None
    #qBias = None
    initQ = None  #Value used to initialize the weights

    USE_EPSILON_GREEDY = True  #If False, uses Boltzmann exploration
    epsilon = 0.5
    T = None  #Current temperature for Boltzmann exploration
    tempInit = 1.0 / 2  #Initial temperature for Boltzmann exploration
    tempFinal = 1.0 / 50  #Final temperature for Boltzmann exploration

    rnd = None

    #gamma=0.99, alpha=0.2
    def __init__(self, alpha=0.01, gamma=0.1, marker=None):
        """
        gamma: discount factor
        alpha: learning rate
        marker: the marker that the agent will use in the tic-tac-toe environment
                (if None, it must be set before learning starts)
        """
        self.gamma = gamma
        self.alpha = alpha
        self.marker = marker
        self.rnd = Seeds().Q_AGENT_SEED
        self.initQ = 0.001
        #Initializing weights with value = self.initQ
        self.qWeights = np.multiply([1] * CHECKERS_FEATURE_COUNT,
                                    self.rnd.random() * self.initQ)
        #self.qBias = 0.001
        self.T = self.tempInit

    def observe_reward(self, state, action, statePrime, reward):
        """Updates the Q-function (only if the agent is exploring)"""
        if self.exploring:
            allActionsPrime = self.environment.get_actions(statePrime)
            qValue, features = self.calcQTable(state, action, returnFeatures=True)
            V = self.get_max_Q_value(statePrime, allActionsPrime)
            expected = reward + self.gamma * V
            temporal_difference = expected - qValue
            for i in range(len(self.qWeights)):
                self.qWeights[i] = self.qWeights[i] + self.alpha * temporal_difference * features[i]
            #self.qBias += self.alpha * temporal_difference * self.qBias
            #print(str(self.qWeights))  #+ " - " + str(self.qBias))
            if self.epsilon > 0.05:
                self.epsilon /= 1.0001
            if self.alpha > 0.001:
                self.alpha /= 1.00001
            #print(self.alpha)

    def calcQTable(self, state, action, returnFeatures=False):
        """Returns one Q-value for the given state-action pair"""
        features = self.process_state(state, action)
        qValue = np.dot(self.qWeights, features)  #+ self.qBias
        if returnFeatures:
            return qValue, features
        return qValue

    def best_action_deterministic(self, state):
        allActions = self.environment.get_actions(state)
        maxVal = -float('inf')
        bestAct = None
        for act in allActions:
            q = self.calcQTable(state, act)
            if q > maxVal:
                bestAct = [act]
                maxVal = q
            elif maxVal == q:
                bestAct.append(act)
        return self.rnd.choice(bestAct)

    def select_action(self, state):
        """
        When this method is called, the agent executes an action based on its Q-function,
        using either epsilon-greedy or Boltzmann exploration.
        """
        if self.USE_EPSILON_GREEDY:
            return self.select_action_epsilon_greedy(state)
        return self.select_action_boltzmann(state)

    def select_action_boltzmann(self, state):
        #Boltzmann exploration strategy
        allActions = self.environment.get_actions(state)
        valueActions = []
        sumActions = 0
        for action in allActions:
            qValue = self.calcQTable(state, action)
            vBoltz = math.pow(math.e, qValue / self.T) + 0.00001
            valueActions.append(vBoltz)
            sumActions += vBoltz
        probAct = [x / sumActions for x in valueActions]
        rndVal = self.rnd.random()
        sumProbs = 0
        i = -1
        while sumProbs <= rndVal:
            i = i + 1
            sumProbs += probAct[i]
        #Apply temperature decay
        if self.T > self.tempFinal and self.exploring:
            self.T -= (self.tempInit - self.tempFinal) / 100000
        return allActions[i]

    def select_action_epsilon_greedy(self, state):
        """
        Applies epsilon-greedy exploration when the agent is exploring,
        and chooses deterministically otherwise.
        """
        randv = self.rnd.random()
        if self.exploring and randv < self.epsilon:
            return self.rnd.choice(self.environment.get_actions(state))
        #If not exploring randomly, the action with the best value is returned
        return self.best_action_deterministic(state)

    def get_max_Q_value(self, state, allActions):
        """Returns max(Q) over all actions for the given state"""
        values = [self.calcQTable(state, action) for action in allActions]
        #Terminal states don't have applicable actions
        if len(values) == 0:
            return 0
        #Maximum Q-value
        return max(values)

    def create_fictitious(self, expertSteps, marker=None):
        """Regular Q-learning always plays against the expert"""
        #This function is called before the agent is set to the new marker.
        if self.marker == "X":
            return self.environment.agentX
        return self.environment.agentO
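# Illustration only (not from the original code): a self-contained sketch of the
# Boltzmann (softmax) action selection used in select_action_boltzmann above.
# The Q-values and temperatures below are made-up numbers, chosen to show how the
# distribution sharpens as the temperature decays from tempInit towards tempFinal.
import math

def boltzmann_probabilities(q_values, T):
    #exp(Q/T) for every action, plus the same small constant used above to
    #avoid an all-zero distribution
    prefs = [math.exp(q / T) + 0.00001 for q in q_values]
    total = sum(prefs)
    return [p / total for p in prefs]

#With T = tempInit = 0.5 the distribution is fairly flat:
#  boltzmann_probabilities([0.10, 0.05, -0.20], T=0.5)  -> roughly [0.41, 0.37, 0.22]
#With T = tempFinal = 0.02 it concentrates heavily on the best action:
#  boltzmann_probabilities([0.10, 0.05, -0.20], T=0.02) -> roughly [0.92, 0.08, 0.00]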
    def __init__(self, gamma=0.99, alpha=0.2, marker=None):
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().SELFPLAY_AGENT_SEED
class NeuralNetModQLearning(ModelQLearningAgent):

    followModel = False
    modelAlpha = 0.2  #Learning rate of the model

    initOp = None  #Initializer for the tf session
    y_hat = None   #Used for predictions
    X = None
    y = None
    predict = None
    initialModel = None

    W1 = None
    W2 = None
    b1 = None
    b2 = None

    session = None

    examplesCache = []
    maxExamples = 500

    def __init__(self, gamma=0.99, alpha=0.2, marker=None, model=None, seed=None):
        """
        If model is not None, the agent follows the model and never performs exploration
        """
        #This should be executed before the constructor of the superclass
        if model is not None:
            self.initialModel = model
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().NEURALNET_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        np.random.seed(self.rnd.randint(1, 10000))
        if model is not None:
            self.followModel = True
            self.exploring = False

    def init_model(self):
        """
        The model here is a TensorFlow neural network.
        Everything is prepared for later use.
        """
        num_features = 9
        num_actions = 9
        num_hidden_neurons = 20

        with tf.Graph().as_default() as g:
            #X holds the state variables (what is inside each of the 9 positions)
            self.X = tf.placeholder(tf.float32, [None, 9], name='X')
            #y is the action, 9 positions because of the one-hot encoding
            self.y = tf.placeholder(tf.float32, [None, 9], name='y')

            # if self.initModel is not None:
            #     initW1 = self.initModel.W1
            #     initb1 = self.initModel.b1
            #     initW2 = self.initModel.W2
            #     initb2 = self.initModel.b2
            #     #weights and biases
            #     self.W1 = tf.Variable(initW1)
            #     self.b1 = tf.Variable(initb1)
            #     self.W2 = tf.Variable(initW2)
            #     self.b2 = tf.Variable(initb2)
            # else:
            self.W1 = tf.Variable(tf.random_uniform([num_features, num_hidden_neurons],
                                                    seed=self.rnd.randint(0, 1000),
                                                    minval=0.0001, maxval=0.1), name='W1')
            self.b1 = tf.Variable(tf.random_uniform([num_hidden_neurons],
                                                    seed=self.rnd.randint(0, 1000)), name='b1')
            self.W2 = tf.Variable(tf.random_uniform([num_hidden_neurons, num_actions],
                                                    seed=self.rnd.randint(0, 1000),
                                                    minval=0.0001, maxval=0.1), name='W2')
            self.b2 = tf.Variable(tf.random_uniform([num_actions],
                                                    seed=self.rnd.randint(0, 1000)), name='b2')

            #Calculating the output of the hidden layer
            hidden_out = tf.add(tf.matmul(self.X, self.W1), self.b1)
            hidden_out = tf.nn.sigmoid(hidden_out)

            self.y_hat = tf.nn.softmax(tf.add(tf.matmul(hidden_out, self.W2), self.b2))
            #self.predict = tf.argmax(self.y_hat, axis=1)
            self.predict = tf.multinomial(self.y_hat, seed=self.rnd.randint(0, 1000), num_samples=1)

            y_clipped = tf.clip_by_value(self.y_hat, 1e-10, 0.9999999)
            #Cost function (cross-entropy)
            self.cost = -tf.reduce_mean(tf.reduce_sum(self.y * tf.log(y_clipped)
                                                      + (1 - self.y) * tf.log(1 - y_clipped), axis=1))
            #Add an optimizer
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.modelAlpha).minimize(self.cost)

            self.initOp = tf.global_variables_initializer()
            self.saver = tf.train.Saver()
            self.session = tf.Session(graph=g)
            self.session.run(self.initOp)
            if self.initialModel is not None:
                self.saver.restore(self.session, self.initialModel)

    def update_model(self, expertSteps, marker):
        """
        The model is a neural network trained on the expert's observed (state, action)
        pairs, which are accumulated in a cache and used here as supervised training data
        """
        if len(self.examplesCache) + len(expertSteps) > self.maxExamples:
            #Open space in the cache for new samples
            del self.examplesCache[0:len(expertSteps) - (self.maxExamples - len(self.examplesCache))]
        self.examplesCache.extend(expertSteps)
        expertSteps = self.examplesCache

        X = self.states_to_float(np.array(expertSteps)[:, 0])
        #print(X)
        y = self.convert_actions(np.array(expertSteps)[:, 1])

        #Trains with the data for 10 epochs
        epochs = 10
        #Start the session
        sess = self.session
        #print(len(expertSteps))
        for epoch in range(epochs):
            avg_cost = 0
            for i in range(len(expertSteps)):
                _, c = sess.run([self.optimizer, self.cost], feed_dict={self.X: X, self.y: y})
                avg_cost += c / len(expertSteps)
            #print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost))
        #print(sess.run(self.W1))

    def states_to_float(self, states):
        """
        Converts the state from string to float: '.' == 0, 'N' == 1, 'S' == 2
        """
        procStates = np.zeros((len(states), len(states[0])), dtype=np.float32)
        i = 0
        for state in states:
            procStates[i, :] = [0. if x == '.' else 1. if x == 'N' else 2. for x in state]
            i += 1
        return procStates

    def convert_actions(self, actions):
        """
        Converts each action into an index in the 0-8 interval and then into a one-hot encoding
        """
        convertedActions = []
        for act in actions:
            convertedActions.append(act[1] * 3 + act[0])
        convertedActions = np.asarray(convertedActions)
        #Convert to one-hot
        num_actions = 9
        convertedActions = convertedActions.reshape(-1)
        convertedActions = np.eye(num_actions, dtype=np.float32)[convertedActions]
        return convertedActions

    def agent_from_model(self, marker):
        """Creates an agent from the model"""
        model = "/tmp/model.save"
        #print("Will save")
        self.saver.save(self.session, model)
        #print("Saved")
        return NeuralNetModQLearning(model=model, seed=self.rnd.randint(1, 10000))

    def action_from_model(self, state):
        """Selects an action according to the observed decisions of the expert agent"""
        state = self.process_state(state)
        X = np.array(self.states_to_float([np.array(state)]), dtype=np.float32)
        #print(X.dtype)
        #print(X)
        #print(X.shape)
        import math
        acts = self.environment.get_actions(state)
        sess = self.session
        act = float('inf')
        while act not in acts:
            act = sess.run([self.predict], feed_dict={self.X: X})
            #print(act[0][0])
            act = (math.floor(act[0][0] % 3), math.floor(act[0][0] / 3))
            #print(act)
        return act

    def select_action(self, state):
        if self.followModel:
            return self.action_from_model(state)
        return super().select_action(state)
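# Illustration only (not from the original code): a small worked example of the action
# encoding performed by convert_actions above. Assuming the first coordinate of an
# action tuple is the column and the second the row (as act[1]*3 + act[0] suggests),
# each action is flattened to the index row*3 + column and then one-hot encoded so it
# can serve as a supervised target for the network.
import numpy as np

example_actions = [(0, 0), (2, 1), (1, 2)]                    #(column, row) pairs
indices = [row * 3 + col for (col, row) in example_actions]   #-> [0, 5, 7]
one_hot = np.eye(9, dtype=np.float32)[indices]                #shape (3, 9), a single 1.0 per row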
class ProbModelQLearning(ModelQLearningAgent):

    followModel = False

    def __init__(self, gamma=0.99, alpha=0.2, marker=None, model=None, seed=None):
        """
        If model is not None, the agent follows the model and never performs exploration
        """
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().PROBMOD_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        np.random.seed(self.rnd.randint(1, 10000))
        if model is not None:
            self.model = model
            self.followModel = True
            self.exploring = False

    def init_model(self):
        """The model is a simple dictionary that counts the chosen actions"""
        return {}

    def update_model(self, expertSteps, marker):
        """The model keeps counters of chosen actions, which are updated here"""
        #Updates the model
        for tupStAct in expertSteps:
            state = tupStAct[0]
            if (state, tupStAct[1]) not in self.model:
                self.model[(state, tupStAct[1])] = 0
            self.model[(state, tupStAct[1])] += 1

    def agent_from_model(self, marker):
        """Creates an agent from the model"""
        model = cp.deepcopy(self.model)
        return ProbModelQLearning(model=model, seed=self.rnd.randint(1, 10000))

    def action_from_model(self, state):
        """Selects an action according to the observed decisions of the expert agent"""
        state = self.process_state(state)
        actions = self.environment.get_actions(state)
        visits = []
        #Searches for the number of times each action was chosen in the current state
        for act in actions:
            v = self.model.get((state, act), 0)
            visits.append(v)
        totalVals = sum(visits)
        if totalVals == 0:
            return self.rnd.choice(actions)
        actProbs = [x / totalVals for x in visits]
        chosenIdx = np.random.choice(len(actions), p=actProbs)
        return actions[chosenIdx]

    def select_action(self, state):
        if self.followModel:
            return self.action_from_model(state)
        return super().select_action(state)
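# Illustration only (not from the original code): how the visit counters kept by
# ProbModelQLearning become a sampling distribution in action_from_model. The state
# label and counts below are made up; in the real model the key is
# (processed_state, action) and the value is how often the expert picked that action.
import numpy as np

example_model = {("board-A", (0, 0)): 6, ("board-A", (1, 1)): 3, ("board-A", (2, 2)): 1}
example_acts = [(0, 0), (1, 1), (2, 2)]
example_visits = [example_model.get(("board-A", a), 0) for a in example_acts]  #[6, 3, 1]
example_probs = [v / sum(example_visits) for v in example_visits]              #[0.6, 0.3, 0.1]
example_choice = example_acts[np.random.choice(len(example_acts), p=example_probs)]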
class LenoSelfPlayAgent(QLearningAgent):

    fixedPolicy = False  #If True, the agent never explores
    qTables = None  #Library of previous Q-tables
    rnd = None

    #Maximum number of stored policies; if this number is exceeded, the least similar one is excluded
    maxCacheSize = 10

    #Storage of previously seen expert steps
    expertStorage = None

    def __init__(self, gamma=0.99, alpha=0.2, marker=None, qTableFollow=None, seed=None):
        """
        If qTableFollow is not None, the agent follows that Q-table and never performs exploration
        """
        super().__init__(gamma, alpha, marker)
        self.rnd = Seeds().LENO_AGENT_SEED
        if seed is not None:
            self.rnd.seed(seed)
        self.qTables = []
        if qTableFollow is not None:
            self.qTable = qTableFollow
            self.fixedPolicy = True
        self.expertStorage = {}

    def score_policy(self, qTable, expertSteps):
        """
        Returns the number of states in which this Q-table would choose the same action as the expert
        """
        score = 0
        stTuples = expertSteps.keys()
        for stActTuple in stTuples:
            #Deterministic selection of actions
            state = stActTuple
            #state = self.process_state(state, marker)
            allActions = self.environment.get_actions(state)
            maxVal = -float('inf')
            bestAct = None
            for act in allActions:
                q = qTable.get((state, act), -float('inf'))
                if q > maxVal:
                    bestAct = [act]
                    maxVal = q
                elif maxVal == q:
                    if bestAct is None:
                        #If the value does not exist in the Q-table
                        bestAct = []
                    bestAct.append(act)
            chosen = self.rnd.choice(bestAct)
            if chosen in expertSteps[stActTuple]:
                score += 1
        return score

    def create_fictitious(self, expertSteps, marker):
        """Returns a fictitious agent with a fixed policy"""
        #Keeps a set of actions that have already been chosen by the expert
        for tupStAct in expertSteps:
            if tupStAct[0] not in self.expertStorage:
                self.expertStorage[tupStAct[0]] = set()
            self.expertStorage[tupStAct[0]].add(tupStAct[1])
        #Performs a copy of the current Q-table
        self.qTables.append(copy.deepcopy(self.qTable))
        #Calculates which of the previous policies is most similar to the expert policy
        policiesScore = [self.score_policy(qTab, self.expertStorage) for qTab in self.qTables]
        #Selects the newest policy with the highest score
        idx = len(policiesScore) - policiesScore[::-1].index(max(policiesScore)) - 1
        #Creates a new fictitious agent with a random seed
        agent = LenoSelfPlayAgent(qTableFollow=self.qTables[idx], seed=self.rnd.randint(0, 1000))
        #If the maximum number of stored Q-tables is exceeded, the one with the lowest score is eliminated
        if len(self.qTables) > self.maxCacheSize:
            del self.qTables[policiesScore.index(min(policiesScore))]
        return agent

    """
    The other functions are the same as Q-learning, but exploration is turned off if a copy agent is used
    """

    def observe_reward(self, state, action, statePrime, reward):
        if self.fixedPolicy:
            self.exploring = False
        return super().observe_reward(state, action, statePrime, reward)

    def select_action(self, state):
        if self.fixedPolicy:
            self.exploring = False
        return super().select_action(state)
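# Illustration only (not from the original code): the index arithmetic used in
# create_fictitious to pick the *newest* stored Q-table among those with the highest
# expert-agreement score. The scores below are made up. The list is reversed, the
# first occurrence of the maximum is found, and that position is mapped back to the
# original (non-reversed) index: indices 1 and 2 both score 7 here, so idx == 2.
example_scores = [3, 7, 7, 5]
example_idx = len(example_scores) - example_scores[::-1].index(max(example_scores)) - 1
assert example_idx == 2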
class DeepQLearningAgent(QLearningAgent):

    batch = None
    miniBatchSize = None
    updateFrequency = None
    batchSize = None
    currentStep = None

    #gamma=0.99, alpha=0.2
    def __init__(self, alpha=0.001, gamma=0.95, marker=None, batchSize=10000,
                 miniBatchSize=100, updateFrequency=100):
        """
        gamma: discount factor
        alpha: learning rate
        marker: the marker that the agent will use in the tic-tac-toe environment
                (if None, it must be set before learning starts)
        """
        super().__init__(alpha=alpha, gamma=gamma, marker=marker)
        self.rnd = Seeds().DQN_AGENT_SEED
        from numpy.random import seed
        seed(self.rnd.randint(1, 10000))
        from tensorflow import set_random_seed
        set_random_seed(self.rnd.randint(1, 10000))
        self.batch = []
        self.batchSize = batchSize
        self.miniBatchSize = miniBatchSize
        self.updateFrequency = updateFrequency
        self.currentStep = 0
        self.init_network()

    def init_network(self):
        """Builds the neural network for the agent"""
        hiddenNeurons = 20
        inp = Input(shape=(CHECKERS_FEATURE_COUNT, ))
        net = layers.Dense(hiddenNeurons, activation="relu")(inp)
        net = layers.Dropout(0.2, seed=self.rnd.randint(1, 10000))(net)
        net = layers.Dense(hiddenNeurons, activation="sigmoid")(net)
        net = layers.Dense(1, activation="sigmoid")(net)
        self.network = Model(inputs=inp, outputs=net)
        self.target = keras.models.clone_model(self.network)
        self.cost = keras.losses.mean_squared_error
        self.network.compile(optimizer=keras.optimizers.Adam(lr=self.alpha), loss=self.cost)
        #self.network.summary()

    def update_network(self, miniBatch):
        featuresOnBatch = np.array([x[0] for x in miniBatch])
        #Targets computed on the target network
        targets = []
        for (features, statePrime, reward) in miniBatch:
            if statePrime.is_first_agent_win() or statePrime.is_second_agent_win():
                targets.append(reward)
            else:
                actions = self.environment.get_actions(statePrime)
                maxVal = self.get_max_Q_value(statePrime, allActions=actions, network=self.target)
                targets.append(reward + self.gamma * maxVal)
        targets = np.array(targets)
        #Q-values on the network being updated
        #q_values = self.network.predict_on_batch(featuresOnBatch)
        #Optimization process
        self.network.fit(featuresOnBatch, targets, verbose=0)
        #deltas = self.cost.get_errors(targets, q_values)
        #self.network.bprop(deltas)
        #self.optimizer.optimize(self.network.layers_to_optimize)

    def update_target_network(self):
        self.target.set_weights(self.network.get_weights())

    def observe_reward(self, state, action, statePrime, reward):
        """Updates the network (only if the agent is exploring)"""
        if self.exploring:
            #Updating the batch
            features = self.process_state(state, action)
            #Rescaling the reward to the [-1, 1] interval
            reward = (reward - REWARD_LOSE) / (REWARD_WIN - REWARD_LOSE) * 2 - 1
            self.batch.append((features, statePrime, reward))
            if len(self.batch) > self.batchSize:
                del self.batch[0]
            if len(self.batch) > self.miniBatchSize:
                miniBatch = self.rnd.sample(self.batch, self.miniBatchSize)
            else:
                miniBatch = self.batch
            self.update_network(miniBatch)
            if self.epsilon > 0.05:
                self.epsilon /= 1.0001
            if self.alpha > 0.001:
                self.alpha /= 1.00001
            if self.currentStep % self.updateFrequency == 0:
                self.update_target_network()
            self.currentStep += 1
            #print(self.alpha)

    def calcQTable(self, state, action, returnFeatures=False, network=None):
        """Returns one Q-value estimate for the given state-action pair"""
        if network is None:
            network = self.target
        features = self.process_state(state, action)
        qValue = float(network.predict(np.array([features])))  #+ self.qBias
        if returnFeatures:
            return qValue, features
        return qValue

    def get_max_Q_value(self, state, allActions, network=None):
        """Returns max(Q) over all actions for the given state"""
        if network is None:
            network = self.target
        values = [self.calcQTable(state, action, network=network) for action in allActions]
        #Terminal states don't have applicable actions
        if len(values) == 0:
            return 0
        #Maximum Q-value
        return max(values)

    def create_fictitious(self, expertSteps, marker=None):
        """Regular Q-learning always plays against the expert"""
        #This function is called before the agent is set to the new marker.
        if self.marker == "X":
            return self.environment.agentX
        return self.environment.agentO
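# Illustration only (not from the original code): the reward rescaling used in
# observe_reward above, written with made-up values reward_lose = -100 and
# reward_win = 100 (the actual REWARD_LOSE / REWARD_WIN constants are defined
# elsewhere in the project). It linearly maps [reward_lose, reward_win] onto [-1, 1].
reward_lose, reward_win = -100, 100

def _rescale_reward(reward):
    return (reward - reward_lose) / (reward_win - reward_lose) * 2 - 1

assert _rescale_reward(reward_lose) == -1.0
assert _rescale_reward(reward_win) == 1.0
assert _rescale_reward(0) == 0.0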