Example #1
def pIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    policy_converged = defaultdict(list)
    last_policy = []  # previous iteration's action distributions, for the convergence check

    allStates = getAllStates(domain, rf, tf, initialState)

    print("*** {} Policy Iteration Analysis".format(world))

    iterations = range(1, MAX_ITERATIONS + 1)
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, 1)
    pi.setDebugCode(0)
    for nIter in iterations:
        startTime = clock()
        #pi = PolicyIteration(domain,rf,tf,discount,hashingFactory,-1,1, nIter); 
        #pi.setDebugCode(0)
        # run planning from our initial state
        p = pi.planFromState(initialState)
        endTime = clock()
        timing['Policy'].append((endTime - startTime) * 1000)

        convergence['Policy'].append(pi.lastPIDelta)         
        # evaluate the policy with one roll out visualize the trajectory
        runEvals(initialState, p, rewards['Policy'], steps['Policy'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
 
        policy = pi.getComputedPolicy()
        allStates = pi.getAllStates()
        current_policy = [[(action.ga, action.pSelection) 
            for action in policy.getActionDistributionForState(state)] 
            for state in allStates]
        policy_converged['Policy'].append(current_policy == last_policy)
        last_policy = current_policy
 
    simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
            world + ' Policy Iteration Policy Map.pkl')
    dumpCSVp(iterations, timing['Policy'], rewards['Policy'], steps['Policy'], convergence['Policy'],
            world, 'Policy', policy_converged['Policy'])
        if (vi.latestDelta < 1e-6) and flag:
            flag = False
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Value Iteration {}".format(nIter))
            dumpPolicyMap(
                MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                'Value {} Iter {} Policy Map.pkl'.format(world, nIter))
        # if vi.latestDelta <1e-6:
        #     break
    print "\n\n\n"
    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                           "Value Iteration {}".format(nIter))
    dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'],
            convergence['Value'], world, 'Value')

    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-3, 10, 1)
    pi.toggleUseCachedTransitionDynamics(False)
    print "//{} Policy Iteration Analysis//".format(world)
    flag = True
    timing['Policy'].append(0)
    for nIter in iterations:
        startTime = clock()
        p = pi.planFromState(initialState)
        #timing['Policy'].append((clock()-startTime) * 1000)
        timing['Policy'].append(timing['Policy'][-1] + clock() - startTime)
        policy = pi.getComputedPolicy()
        current_policy = {
            state: policy.getAction(state).toString()
            for state in allStates
        }
        convergence['Policy2'].append(pi.lastPIDelta)
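
# --- Sketch of an assumed helper (imports and other helpers for these snippets
# live elsewhere in their original files) ---
# runEvals is called throughout the examples but never defined here. This is a
# minimal guess, assuming it mirrors the commented-out evaluateBehavior /
# calcRewardInEpisode / numTimeSteps lines visible in Example #3 below; the real
# helper may differ, and the examples that omit rf/tf presumably pick them up
# from module-level globals.
def runEvals(initialState, policy, rewardL, stepL, rf=None, tf=None, evalTrials=1):
    # Roll the policy out from the initial state and record each episode's
    # total reward and length.
    for _ in range(evalTrials):
        ea = policy.evaluateBehavior(initialState, rf, tf)
        rewardL.append(calcRewardInEpisode(ea))
        stepL.append(ea.numTimeSteps())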
Example #3
        runEvals(initialState, p, rewards['Value'], steps['Value'])
        if nIter == 1:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Value Iteration {}".format(nIter))
    MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
    print "\n\n\n"
    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                           "Value Iteration {}".format(nIter))
    #input('x')
    dumpCSV(iterations, timing['Value'], rewards['Value'], steps['Value'],
            convergence['Value'], world, 'Value')

    print "//Easy Policy Iteration Analysis//"
    for nIter in iterations:
        startTime = clock()
        pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1,
                             nIter)
        # A negative convergence delta forces planning to run for the full number
        # of iterations rather than stopping early, for comparison with the other
        # algorithms.
        # run planning from our initial state
        pi.setDebugCode(0)
        p = pi.planFromState(initialState)
        timing['Policy'].append((clock() - startTime) * 1000)
        convergence['Policy'].append(pi.lastPIDelta)
        # evaluate the policy with one roll out visualize the trajectory
        runEvals(initialState, p, rewards['Policy'], steps['Policy'])
        #ea = p.evaluateBehavior(initialState, rf, tf);
        #rewards['Policy'].append(calcRewardInEpisode(ea));
        #steps['Policy'].append(ea.numTimeSteps());
        if nIter == 1:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                                   "Policy Iteration {}".format(nIter))
        p = vi.planFromState(initialState)
        timing['Value'].append(clock() - startTime)
        iterations['Value'].append(vi.numIterations)
        # evaluate the policy with one roll out visualize the trajectory
        runEvals(initialState, p, rewards['Value'], steps['Value'])

        MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
        print "\n\n"
        #     simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration")

        dumpCSV(iterations['Value'], timing['Value'], rewards['Value'],
                steps['Value'], [n], 'Value', n == 2)

        #
        print "//Size Policy Iteration Analysis//"
        pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-6,
                             20, MAX_ITERATIONS)
        # Unlike the runs above, this uses a small convergence delta (1e-6)
        # rather than forcing the maximum number of iterations.
        pi.toggleUseCachedTransitionDynamics(False)
        # run planning from our initial state
        pi.setDebugCode(0)
        pi.setPolicyToEvaluate(p)
        startTime = clock()
        p = pi.planFromState(initialState)
        timing['Policy'].append(clock() - startTime)
        iterations['Policy'].append(pi.totalPolicyIterations)
        # evaluate the policy with one roll out visualize the trajectory
        runEvals(initialState, p, rewards['Policy'], steps['Policy'])
        MapPrinter.printPolicyMap(pi.getAllStates(), p, gen.getMap())
        print "\n\n"
        #     simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration")
        dumpCSV(iterations['Policy'], timing['Policy'], rewards['Policy'],
Example #5
        runEvals(initialState, p, rewards['Value'], steps['Value'])
        if nIter == 1:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
        if nIter == MAX_ITERATIONS/2:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
    MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
    print "\n\n\n"
    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
    dumpCSV(iterations, timing['Value'], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value')


#   Policy Iteration starts
    print "//hard Policy Iteration Analysis//"
    for nIter in iterations:
        startTime = clock()
        # A negative convergence delta forces planning to run for the full number
        # of iterations rather than stopping early, for comparison with the other
        # algorithms.
        pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, nIter)
        # run planning from our initial state
        pi.setDebugCode(0)
        p = pi.planFromState(initialState)
        timing['Policy'].append((clock() - startTime) * 1000)
        convergence['Policy'].append(pi.lastPIDelta)
        # evaluate the policy
        runEvals(initialState, p, rewards['Policy'], steps['Policy'])
        if nIter == 1:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
        if nIter == MAX_ITERATIONS/2:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
        policy = pi.getComputedPolicy()
        allStates = pi.getAllStates()
        current_policy = [[(action.ga, action.pSelection) for action in policy.getActionDistributionForState(state)] for state in allStates]
        policy_converged['Policy'].append(current_policy == last_policy)
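
# --- Sketches of the dump helpers assumed above ---
# Neither dumpPolicyMap nor dumpCSV is defined in these snippets, and their
# argument lists even vary between examples; the versions below are guesses
# based only on the call sites (the '.pkl' filenames and the
# iteration/time/reward/steps/convergence columns), not the original helpers.
import csv
import pickle

def dumpPolicyMap(policyMap, filename):
    # Persist the printed policy map so it can be reloaded for later analysis.
    with open(filename, 'wb') as f:
        pickle.dump(policyMap, f)

def dumpCSV(iters, timing, rewards, steps, convergence, world, method):
    # One row per planning iteration: wall-clock time, rollout reward and
    # length, and the planner's last reported delta.
    with open('{} {}.csv'.format(world, method), 'wb') as f:  # Python 2 csv mode
        writer = csv.writer(f)
        writer.writerow(['iteration', 'time', 'reward', 'steps', 'convergence'])
        for row in zip(iters, timing, rewards, steps, convergence):
            writer.writerow(row)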