class DeepQ: def __init__(self, environment, inputs): self.environment = environment self.state_size = inputs self.nr_actions = environment.action_space.n self.memory = Memory(30000) self.discountFactor = 0.975 self.predictionModels = [] def initImaginationNetworks(self): for t in xrange(self.nr_actions): self.predictionModels.insert(t, self.createModel(self.state_size, self.state_size, [self.state_size, self.state_size, self.state_size], "relu", 0.01)) def initRewardNetwork(self): self.rewardModel = self.createModel(self.state_size, 1, [self.state_size, self.state_size, self.state_size], "relu", 0.01) def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate): model = Sequential() if len(hiddenLayers) == 0: model.add(Dense(outputs, input_shape=(inputs,), init='lecun_uniform')) model.add(Activation("linear")) else : model.add(Dense(hiddenLayers[0], input_shape=(inputs,), init='lecun_uniform')) if (activationType == "LeakyReLU") : model.add(LeakyReLU(alpha=0.01)) else : model.add(Activation(activationType)) for index in range(1, len(hiddenLayers)-1): layerSize = hiddenLayers[index] model.add(Dense(layerSize, init='lecun_uniform')) if (activationType == "LeakyReLU") : model.add(LeakyReLU(alpha=0.01)) else : model.add(Activation(activationType)) model.add(Dense(outputs, init='lecun_uniform')) model.add(Activation("linear")) optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06) model.compile(loss="mse", optimizer=optimizer) return model def backupNetwork(self, model, backup): weightMatrix = [] for layer in self.model.layers: weights = layer.get_weights() weightMatrix.append(weights) i = 0 for layer in self.secondBrain.layers: weights = weightMatrix[i] layer.set_weights(weights) i += 1 def getStatePrediction(self, state, action): predicted = self.predictionModels[action].predict(state.reshape(1,len(state))) return predicted[0] def getPredictedStates(self, state): predictedStates = [] for a in xrange(self.nr_actions): predictedStates.insert(a, self.getStatePrediction(state, a)) return predictedStates def getStateValuePrediction(self, state): predictedReward = self.rewardModel.predict(state.reshape(1,len(state))) return predictedReward[0][0] def getPredictedActionValues(self, state): predictedActionValues = [] for a in xrange(self.nr_actions): predictedActionValues.insert(a, self.getStateValuePrediction(self.getStatePrediction(state, a))) return predictedActionValues def getMaxValue(self, array): return np.max(array) def getMaxIndex(self, array): return np.argmax(array) def getTarget(self, state, reward, isFinal): if isFinal: return reward else: predictedActionValues = self.getPredictedActionValues(state) # return reward + self.discountFactor * (sum(predictedActionValues)/len(predictedActionValues)) return reward + self.discountFactor * np.max(predictedActionValues) def printStatePredictionTree(self, state): root = Tree() # first layer predicted1 = self.getPredictedStates(state) root.data = state root.left = Tree() root.left.data = predicted1[0] root.right = Tree() root.right.data = predicted1[1] # second layer predicted2left = self.getPredictedStates(predicted1[0]) root.left.left = Tree() root.left.left.data = predicted2left[0] root.left.right = Tree() root.left.right.data = predicted2left[1] predicted2right = self.getPredictedStates(predicted1[1]) root.right.left = Tree() root.right.left.data = predicted2right[0] root.right.right = Tree() root.right.right.data = predicted2right[1] print "" print "\t\t\t\t\t\t\t\t\t\t",root.data print "\t\t\t\t",root.left.data,"\t\t\t\t\t\t\t",root.right.data print root.left.left.data,"\t",root.left.right.data,"\t",root.right.left.data,"\t",root.right.right.data def printStateValueTree(self, state): root = Tree() # first layer predicted1 = self.getPredictedStates(state) root.data = state root.left = Tree() root.left.data = predicted1[0] root.right = Tree() root.right.data = predicted1[1] # second layer predicted2left = self.getPredictedStates(predicted1[0]) root.left.left = Tree() root.left.left.data = predicted2left[0] root.left.right = Tree() root.left.right.data = predicted2left[1] predicted2right = self.getPredictedStates(predicted1[1]) root.right.left = Tree() root.right.left.data = predicted2right[0] root.right.right = Tree() root.right.right.data = predicted2right[1] print "" print "\t\t\t\t\t\t\t\t\t\t",self.getStateValuePrediction(root.data) print "\t\t\t\t",self.getStateValuePrediction(root.left.data),"\t\t\t\t\t\t\t\t\t\t\t",self.getStateValuePrediction(root.right.data) print self.getStateValuePrediction(root.left.left.data),"\t\t\t\t\t",self.getStateValuePrediction(root.left.right.data),"\t\t\t\t\t",self.getStateValuePrediction(root.right.left.data),"\t\t\t\t\t",self.getStateValuePrediction(root.right.right.data) # select the action with the highest Q value def selectAction(self, state, explorationRate): rand = random.random() if rand < explorationRate : action = np.random.randint(0, self.nr_actions) else : action = self.getMaxIndex(self.getPredictedActionValues(state)) return action def selectActionStepsForward(self, state, depth): root = Tree() # first layer predicted1 = self.getPredictedStates(state) leftMax = self.getStateValuePrediction(predicted1[0]) rightMax = self.getStateValuePrediction(predicted1[1]) predicted2left = self.getPredictedActionValues(predicted1[0]) leftMax = max(leftMax, np.max(self.getStateValuePrediction(predicted1[0]))) predicted2right = self.getPredictedStates(predicted1[1]) rightMax = max(rightMax, np.max(self.getStateValuePrediction(predicted1[1]))) if rightMax > leftMax: return 1 else: return 0 def addMemory(self, state, action, reward, newState, isFinal): self.memory.addMemory(state, action, reward, newState, isFinal) def trainStatePredictionOnLastState(self): X_batch = np.empty((0,self.state_size), dtype = np.float64) Y_batch = np.empty((0,self.state_size), dtype = np.float64) lastMemory = self.memory.getLastMemory() isFinal = lastMemory['isFinal'] state = lastMemory['state'] action = lastMemory['action'] reward = lastMemory['reward'] newState = lastMemory['newState'] X_batch = np.append(X_batch, [state], axis=0) Y_batch = np.append(Y_batch, [newState], axis=0) self.predictionModels[action].fit(X_batch, Y_batch, batch_size = len(X_batch), verbose = 0) def trainStatePreditions(self, miniBatchSize): X_batches = [] Y_batches = [] for t in xrange(self.nr_actions): X_batches.append(np.empty((0,self.state_size), dtype = np.float64)) Y_batches.append(np.empty((0,self.state_size), dtype = np.float64)) miniBatch = self.memory.getMiniBatch(miniBatchSize) for sample in miniBatch: isFinal = sample['isFinal'] state = sample['state'] action = sample['action'] reward = sample['reward'] newState = sample['newState'] inputValues = state.copy() targetValues = newState.copy() X_batches[action] = np.append(X_batches[action], np.array([inputValues]), axis=0) Y_batches[action] = np.append(Y_batches[action], np.array([targetValues]), axis=0) for a in xrange(self.nr_actions): if len(X_batches[action]) > 0: self.predictionModels[action].fit(X_batches[action].reshape(len(X_batches[action]),4), Y_batches[action], batch_size = len(X_batches[action]), verbose = 0) def trainRewardModel(self, miniBatchSize): miniBatch = self.memory.getMiniBatch(miniBatchSize) X_batch = np.empty((0,self.state_size), dtype = np.float64) Y_batch = np.empty((0,1), dtype = np.float64) for sample in miniBatch: isFinal = sample['isFinal'] state = sample['state'] action = sample['action'] reward = sample['reward'] newState = sample['newState'] inputValues = newState.copy() targetValue = [self.getTarget(newState, reward, isFinal)] X_batch = np.append(X_batch, np.array([inputValues]), axis=0) Y_batch = np.append(Y_batch, [targetValue], axis=0) self.rewardModel.fit(X_batch, Y_batch, batch_size = len(miniBatch), verbose = 0)