Example #1
def policyIteration(env):
    ''' Simple test for policy iteration '''

    polIter = Learning(0.9, env, augmentActionSet=False)
    V, pi = polIter.solvePolicyIteration()

    # Assign the termination action (index 4) to the goal state
    pi[env.getGoalState()] = 4

    # Now we just plot the learned value function and the obtained policy
    plot = Plotter(outputPath, env)
    plot.plotValueFunction(V[0:numStates], 'goal_')
    plot.plotPolicy(pi[0:numStates], 'goal_')
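
The Learning.solvePolicyIteration call above is defined elsewhere in the project. As a point of reference, a minimal, self-contained sketch of what standard policy iteration computes, using NumPy on a made-up 4-state chain MDP (the transition model and rewards below are purely illustrative, not part of the project), could look like this:

import numpy as np

# Hypothetical toy MDP: 4 states in a chain, 2 actions (left/right),
# deterministic transitions, reward 1.0 for stepping into the last state.
numStates, numActions, gamma = 4, 2, 0.9
P = np.zeros((numActions, numStates, numStates))
R = np.zeros((numStates, numActions))
for s in range(numStates):
    P[0, s, max(s - 1, 0)] = 1.0              # action 0: move left
    P[1, s, min(s + 1, numStates - 1)] = 1.0  # action 1: move right
R[numStates - 2, 1] = 1.0                     # moving right into the goal pays 1

pi = np.zeros(numStates, dtype=int)           # start from an arbitrary policy
while True:
    # Policy evaluation: solve (I - gamma * P_pi) V = R_pi exactly
    P_pi = P[pi, np.arange(numStates)]
    R_pi = R[np.arange(numStates), pi]
    V = np.linalg.solve(np.eye(numStates) - gamma * P_pi, R_pi)
    # Policy improvement: act greedily w.r.t. the one-step lookahead
    Q = R + gamma * np.einsum('ast,t->sa', P, V)
    newPi = Q.argmax(axis=1)
    if np.array_equal(newPi, pi):
        break                                 # policy is stable: V, pi are optimal
    pi = newPi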
Example #2
    def getAvgNumStepsBetweenEveryPoint(self,
                                        fullActionSet,
                                        optionsActionSet,
                                        verbose,
                                        initOption=0,
                                        numOptionsToConsider=0):
        ''' Average number of steps between every pair of states, for an
            increasing number of options added to the primitive action set. '''
        toPlot = []
        numPrimitiveActions = 4

        actionSetToUse = fullActionSet[:numPrimitiveActions]

        for i in xrange(numOptionsToConsider + 1):
            avgs = []

            # I use a matrix encoding the random policy: for each state I encode
            # the equiprobable policy over primitive actions and options, with the
            # condition that an option whose policy says 'terminate' in that state
            # gets probability zero. (A standalone sketch of this construction
            # follows this example.)
            pi = []
            for j in xrange(self.numStates - 1):
                pi.append([])

                for k in xrange(numPrimitiveActions):
                    pi[j].append(1.0)

                if i > 0:
                    for k in xrange(i):  # current number of options to consider
                        idx1 = i + initOption - 1
                        idx2 = numPrimitiveActions + k + initOption
                        nAction = optionsActionSet[idx1][fullActionSet[idx2][j]]
                        if nAction == "terminate":
                            pi[j].append(0.0)
                        else:
                            pi[j].append(1.0)

                denominator = sum(pi[j])
                for k in xrange(len(pi[j])):
                    pi[j][k] = pi[j][k] / denominator

            if i > 0:
                actionSetToUse.append(fullActionSet[numPrimitiveActions + i -
                                                    1 + initOption])

            if verbose:
                print 'Obtaining shortest paths for ' + str(numPrimitiveActions) \
                 + ' primitive actions and ' + str(i) + ' options.'

            for s in xrange(self.environment.getNumStates()):
                goalChanged = self.environment.defineGoalState(s)

                if goalChanged:
                    bellman = Learning(self.gamma,
                                       self.environment,
                                       augmentActionSet=False)
                    expectation = bellman.solveBellmanEquations(
                        pi, actionSetToUse, optionsActionSet)

                    avgs.append(self._computeAvgOnMDP((-1.0 * expectation)))

            toPlot.append(sum(avgs) / float(len(avgs)))

        if numOptionsToConsider > 0:
            plt = Plotter(self.outputPath, self.environment)
            plt.plotLine(xrange(len(toPlot)), toPlot, '# options',
                         'Avg. # steps', 'Avg. # steps between any two points',
                         'avg_num_steps.pdf')

        return toPlot
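
As noted in the comment inside the method, the random policy gives probability zero to an option in any state where that option would terminate immediately. Below is a standalone sketch of just that construction, with made-up toy data (optionBehaviour is an assumption for illustration, not the project's data structure):

numStates = 3
numPrimitiveActions = 4

# Hypothetical per-state behaviour of two options: the action each option's
# internal policy takes in each state.
optionBehaviour = [
    ['right', 'terminate', 'right'],   # option 0
    ['terminate', 'up', 'up'],         # option 1
]

pi = []
for s in range(numStates):
    # Every primitive action starts with weight 1.0.
    probs = [1.0] * numPrimitiveActions
    # An option only gets weight if it would not terminate in this state.
    for option in optionBehaviour:
        probs.append(0.0 if option[s] == 'terminate' else 1.0)
    # Normalize so each row is a proper probability distribution.
    total = sum(probs)
    pi.append([p / total for p in probs])

# Each pi[s] now sums to 1.0, with zeros exactly where an option terminates.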
Example #3
def discoverOptions(env, epsilon, verbose, discoverNegation, plotGraphs=False):
    # I'll need these later, when computing the expected number of steps:
    options = []
    actionSetPerOption = []

    # Computing the Combinatorial Laplacian
    W = env.getAdjacencyMatrix()
    D = np.zeros((numStates, numStates))

    # Obtaining the valency (degree) matrix
    for i in xrange(numStates):
        D[i][i] = np.sum(W[i])
    # Making sure our final matrix will be full rank
    for i in xrange(numStates):
        if D[i][i] == 0.0:
            D[i][i] = 1.0

    # Normalized Laplacian
    L = D - W
    expD = Utils.exponentiate(D, -0.5)
    normalizedL = expD.dot(L).dot(expD)

    # Eigendecomposition
    # IMPORTANT: The eigenvectors are in columns
    eigenvalues, eigenvectors = np.linalg.eig(normalizedL)
    # I need to sort the eigenvalues and eigenvectors
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # If I decide to use both directions of the eigenvector, I do it here.
    # It is easier to just change the list eigenvector, even though it may
    # not be the most efficient solution. The rest of the code remains the same.
    if discoverNegation:
        oldEigenvalues = eigenvalues
        oldEigenvectors = eigenvectors.T
        eigenvalues = []
        eigenvectors = []
        for i in xrange(len(oldEigenvectors)):
            eigenvalues.append(oldEigenvalues[i])
            eigenvalues.append(oldEigenvalues[i])
            eigenvectors.append(oldEigenvectors[i])
            eigenvectors.append(-1 * oldEigenvectors[i])

        eigenvalues = np.asarray(eigenvalues)
        eigenvectors = np.asarray(eigenvectors).T

    if plotGraphs:
        # Plotting all the basis
        plot = Plotter(outputPath, env)
        plot.plotBasisFunctions(eigenvalues, eigenvectors)

    # Now I define a reward function from each eigenvector and solve the MDP for it.
    # I iterate over the columns (the eigenvectors), so the number of columns,
    # read off any row, gives the loop bound.
    guard = len(eigenvectors[0])
    for i in xrange(guard):
        idx = guard - i - 1
        if verbose:
            print 'Solving for eigenvector #' + str(idx)
        polIter = Learning(0.9, env, augmentActionSet=True)
        env.defineRewardFunction(eigenvectors[:, idx])
        V, pi = polIter.solvePolicyIteration()

        # Now I eliminate any action that gives only a small improvement.
        # This is where the epsilon parameter matters: if it is not set, this
        # pruning has almost no effect, since its default is a very small value.
        for j in xrange(len(V)):
            if V[j] < epsilon:
                pi[j] = len(env.getActionSet())

        if plotGraphs:
            plot.plotValueFunction(V[0:numStates], str(idx) + '_')
            plot.plotPolicy(pi[0:numStates], str(idx) + '_')

        options.append(pi[0:numStates])
        optionsActionSet = env.getActionSet()
        optionsActionSet.append('terminate')
        actionSetPerOption.append(optionsActionSet)

    # I need to restore the environment's original reward function now that
    # I'm done with the PVFs:
    env.defineRewardFunction(None)
    env.reset()

    return options, actionSetPerOption
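
The graph-Laplacian part of discoverOptions depends only on the adjacency matrix, so it can be reproduced in isolation. Here is a minimal sketch with NumPy on a hand-written 4-node path graph (the adjacency matrix below is made up for illustration):

import numpy as np

# Hypothetical 4-node graph: the path 0-1-2-3.
W = np.array([[0., 1., 0., 0.],
              [1., 0., 1., 0.],
              [0., 1., 0., 1.],
              [0., 0., 1., 0.]])

# Valency (degree) matrix; zero-degree nodes are bumped to 1 to keep it invertible.
D = np.diag(np.maximum(W.sum(axis=1), 1.0))

# Combinatorial and normalized Laplacians: L = D - W, L_norm = D^{-1/2} L D^{-1/2}.
L = D - W
expD = np.diag(np.diag(D) ** -0.5)
normalizedL = expD.dot(L).dot(expD)

# Eigendecomposition; the eigenvectors come back as columns and are sorted
# here by decreasing eigenvalue, matching the ordering used above.
eigenvalues, eigenvectors = np.linalg.eig(normalizedL)
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]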
Example #4
    if not verbose:
        warnings.filterwarnings('ignore')

    # Create environment
    env = GridWorld(path=inputMDP, useNegativeRewards=False)
    numStates = env.getNumStates()
    numRows, numCols = env.getGridDimensions()

    # Load any options the caller asked for:
    loadedOptions = None
    if optionsToLoad is not None:
        loadedOptions = []
        for i in xrange(len(optionsToLoad)):
            loadedOptions.append(Utils.loadOption(optionsToLoad[i]))
            plot = Plotter(outputPath, env)
            plot.plotPolicy(loadedOptions[i], str(i + 1) + '_')

    if taskToPerform == 1:  #Discover options
        optionDiscoveryThroughPVFs(env=env,
                                   epsilon=epsilon,
                                   verbose=verbose,
                                   discoverNegation=bothDirections)

    elif taskToPerform == 2:  #Solve for a given goal (policy iteration)
        policyIteration(env)

    elif taskToPerform == 3:  #Evaluate random policy (policy evaluation)
        #TODO: I should allow one to evaluate a loaded policy
        policyEvaluation(env)
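
The variables consumed above (inputMDP, outputPath, taskToPerform, optionsToLoad, epsilon, bothDirections, verbose) are presumably parsed before this block runs. The argparse sketch below shows one way they might be collected; the flag names and defaults are assumptions for illustration, not the project's actual command line:

import argparse

# Hypothetical flag names; only the destination variable names come from the example.
parser = argparse.ArgumentParser(description='Option discovery experiments.')
parser.add_argument('--input', dest='inputMDP', required=True,
                    help='Path to the grid/MDP description file.')
parser.add_argument('--output', dest='outputPath', default='output/',
                    help='Directory where plots are written.')
parser.add_argument('--task', dest='taskToPerform', type=int, default=1,
                    help='1: discover options, 2: policy iteration, 3: policy evaluation.')
parser.add_argument('--load', dest='optionsToLoad', nargs='*', default=None,
                    help='Previously discovered options to load and plot.')
parser.add_argument('--epsilon', type=float, default=0.0,
                    help='Threshold used when pruning low-value actions.')
parser.add_argument('--both', dest='bothDirections', action='store_true',
                    help='Also use the negation of each eigenvector.')
parser.add_argument('--verbose', action='store_true')
args = parser.parse_args()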