def policyIteration(env):
    ''' Simple test for policy iteration '''
    polIter = Learning(0.9, env, augmentActionSet=False)
    V, pi = polIter.solvePolicyIteration()

    # I'll assign the goal as the termination action
    pi[env.getGoalState()] = 4

    # Now we just plot the learned value function and the obtained policy
    plot = Plotter(outputPath, env)
    plot.plotValueFunction(V[0:numStates], 'goal_')
    plot.plotPolicy(pi[0:numStates], 'goal_')
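
# A minimal sketch of the evaluate/improve loop that solvePolicyIteration is
# assumed to implement (this is NOT the repo's Learning class; P, r, and all
# other names below are hypothetical). P[a] is an |S| x |S| transition matrix
# for action a and r is an |S|-dimensional, state-based reward vector.
def policyIterationSketch(P, r, gamma=0.9, maxIter=1000):
    import numpy as np
    numS = len(r)
    numA = len(P)
    pi = np.zeros(numS, dtype=int)
    for _ in xrange(maxIter):
        # Policy evaluation: solve (I - gamma * P_pi) V = r exactly
        Ppi = np.array([P[pi[s]][s] for s in xrange(numS)])
        V = np.linalg.solve(np.eye(numS) - gamma * Ppi, r)
        # Policy improvement: act greedily w.r.t. a one-step lookahead
        Q = np.array([r + gamma * P[a].dot(V) for a in xrange(numA)])
        newPi = Q.argmax(axis=0)
        if np.array_equal(newPi, pi):
            break  # policy is stable, so it is optimal
        pi = newPi
    return V, pi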
def getAvgNumStepsBetweenEveryPoint(self, fullActionSet, optionsActionSet,
                                    verbose, initOption=0, numOptionsToConsider=0):
    ''' Average, over all goal states, of the expected number of steps a
        random policy takes between any two states, as options are added. '''
    toPlot = []
    numPrimitiveActions = 4

    actionSetToUse = fullActionSet[:numPrimitiveActions]

    for i in xrange(numOptionsToConsider + 1):
        avgs = []

        # I'm going to use a matrix encoding the random policy. For each
        # state I encode the equiprobable policy over primitive actions and
        # options. However, I need to add the condition that, if the
        # option's policy says terminate, that option should have
        # probability zero in the equiprobable policy.
        pi = []
        for j in xrange(self.numStates - 1):
            pi.append([])
            for k in xrange(numPrimitiveActions):
                pi[j].append(1.0)

            if i > 0:
                for k in xrange(i):  # current number of options to consider
                    # Both indices must refer to the same option k: its
                    # action set and its policy in the full action set.
                    idx1 = k + initOption
                    idx2 = numPrimitiveActions + k + initOption
                    nAction = optionsActionSet[idx1][fullActionSet[idx2][j]]
                    if nAction == 'terminate':
                        pi[j].append(0.0)
                    else:
                        pi[j].append(1.0)

            denominator = sum(pi[j])
            for k in xrange(len(pi[j])):
                pi[j][k] = pi[j][k] / denominator

        if i > 0:
            actionSetToUse.append(
                fullActionSet[numPrimitiveActions + i - 1 + initOption])

        if verbose:
            print 'Obtaining shortest paths for ' + str(numPrimitiveActions) \
                + ' primitive actions and ' + str(i) + ' options.'

        for s in xrange(self.environment.getNumStates()):
            goalChanged = self.environment.defineGoalState(s)
            if goalChanged:
                bellman = Learning(self.gamma, self.environment,
                                   augmentActionSet=False)
                expectation = bellman.solveBellmanEquations(
                    pi, actionSetToUse, optionsActionSet)
                avgs.append(self._computeAvgOnMDP(-1.0 * expectation))

        toPlot.append(sum(avgs) / float(len(avgs)))

    if numOptionsToConsider > 0:
        plt = Plotter(self.outputPath, self.environment)
        plt.plotLine(xrange(len(toPlot)), toPlot, '# options',
                     'Avg. # steps', 'Avg. # steps between any two points',
                     'avg_num_steps.pdf')

    return toPlot
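
# A minimal sketch of the linear system that solveBellmanEquations is assumed
# to solve above: under a fixed policy, with a cost of one per step and an
# absorbing goal state, the expected number of steps n satisfies n = 1 + Q n,
# where Q is the transition matrix restricted to non-goal states. All names
# below are hypothetical.
def expectedStepsSketch(Ppi, goal):
    import numpy as np
    # Ppi: |S| x |S| transition matrix induced by the (random) policy; the
    # goal must be reachable from every state for (I - Q) to be invertible.
    numS = Ppi.shape[0]
    nonGoal = [s for s in xrange(numS) if s != goal]
    A = np.eye(len(nonGoal)) - Ppi[np.ix_(nonGoal, nonGoal)]
    # Expected number of steps to the goal from each non-goal state
    return np.linalg.solve(A, np.ones(len(nonGoal)))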
def discoverOptions(env, epsilon, verbose, discoverNegation, plotGraphs=False):
    # I'll need these when computing the expected number of steps:
    options = []
    actionSetPerOption = []

    # Computing the Combinatorial Laplacian
    W = env.getAdjacencyMatrix()
    D = np.zeros((numStates, numStates))

    # Obtaining the Valency Matrix
    for i in xrange(numStates):
        D[i][i] = np.sum(W[i])

    # Making sure our final matrix will be full rank
    for i in xrange(numStates):
        if D[i][i] == 0.0:
            D[i][i] = 1.0

    # Normalized Laplacian
    L = D - W
    expD = Utils.exponentiate(D, -0.5)
    normalizedL = expD.dot(L).dot(expD)

    # Eigendecomposition
    # IMPORTANT: The eigenvectors are in columns
    eigenvalues, eigenvectors = np.linalg.eig(normalizedL)

    # I need to sort the eigenvalues and eigenvectors
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # If I decide to use both directions of the eigenvectors, I do it here.
    # It is easier to just change the list of eigenvectors, even though it
    # may not be the most efficient solution. The rest of the code remains
    # the same.
    if discoverNegation:
        oldEigenvalues = eigenvalues
        oldEigenvectors = eigenvectors.T
        eigenvalues = []
        eigenvectors = []
        for i in xrange(len(oldEigenvectors)):
            eigenvalues.append(oldEigenvalues[i])
            eigenvalues.append(oldEigenvalues[i])
            eigenvectors.append(oldEigenvectors[i])
            eigenvectors.append(-1 * oldEigenvectors[i])
        eigenvalues = np.asarray(eigenvalues)
        eigenvectors = np.asarray(eigenvectors).T

    if plotGraphs:
        # Plotting all the basis functions
        plot = Plotter(outputPath, env)
        plot.plotBasisFunctions(eigenvalues, eigenvectors)

    # Now I will define a reward function and solve the MDP for it.
    # I iterate over the columns, not rows. I can index by 0 here.
    guard = len(eigenvectors[0])
    for i in xrange(guard):
        idx = guard - i - 1
        if verbose:
            print 'Solving for eigenvector #' + str(idx)
        polIter = Learning(0.9, env, augmentActionSet=True)
        env.defineRewardFunction(eigenvectors[:, idx])
        V, pi = polIter.solvePolicyIteration()

        # Now I will eliminate any actions that give us only a small
        # improvement. This is where the epsilon parameter is important:
        # if it is not set, this pruning never kicks in, since it defaults
        # to a very small value.
        for j in xrange(len(V)):
            if V[j] < epsilon:
                pi[j] = len(env.getActionSet())

        if plotGraphs:
            plot.plotValueFunction(V[0:numStates], str(idx) + '_')
            plot.plotPolicy(pi[0:numStates], str(idx) + '_')

        options.append(pi[0:numStates])
        optionsActionSet = env.getActionSet()
        optionsActionSet.append('terminate')
        actionSetPerOption.append(optionsActionSet)

    # I need to do this after I'm done with the PVFs:
    env.defineRewardFunction(None)
    env.reset()

    return options, actionSetPerOption
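
# A tiny self-contained illustration of the spectral step above, on a 3-state
# path graph; it mirrors the Laplacian computation without the GridWorld and
# Utils classes (the adjacency matrix below is made up for the example).
def laplacianSketch():
    import numpy as np
    W = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
    D = np.diag(W.sum(axis=1))               # valency (degree) matrix
    expD = np.diag(np.diag(D) ** -0.5)       # D^{-1/2}
    normalizedL = expD.dot(D - W).dot(expD)  # D^{-1/2} (D - W) D^{-1/2}
    eigenvalues, eigenvectors = np.linalg.eig(normalizedL)
    idx = eigenvalues.argsort()[::-1]        # eigenvectors are in columns
    # The smoothest eigenvector (eigenvalue ~ 0) is constant up to degree
    # scaling; the following ones capture increasingly fine structure and
    # play the role of the intrinsic reward functions solved for above.
    return eigenvalues[idx], eigenvectors[:, idx]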
if not verbose:
    warnings.filterwarnings('ignore')

# Create environment
env = GridWorld(path=inputMDP, useNegativeRewards=False)
numStates = env.getNumStates()
numRows, numCols = env.getGridDimensions()

# I may load options if I'm told so:
loadedOptions = None
if optionsToLoad is not None:
    loadedOptions = []
    plot = Plotter(outputPath, env)
    for i in xrange(len(optionsToLoad)):
        loadedOptions.append(Utils.loadOption(optionsToLoad[i]))
        plot.plotPolicy(loadedOptions[i], str(i + 1) + '_')

if taskToPerform == 1:    # Discover options
    optionDiscoveryThroughPVFs(env=env, epsilon=epsilon, verbose=verbose,
                               discoverNegation=bothDirections)
elif taskToPerform == 2:  # Solve for a given goal (policy iteration)
    policyIteration(env)
elif taskToPerform == 3:  # Evaluate random policy (policy evaluation)
    # TODO: I should allow one to evaluate a loaded policy
    policyEvaluation(env)