"Value Iteration {}".format(nIter)) if vi.latestDelta < 1e-6 and flag: simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter)) flag = False # dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()), # 'Value {} Iter {} Policy Map.pkl'.format(world, nIter)) # if vi.latestDelta < 1e-6: # break print "\n\n\n" simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter)) dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value') pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-3, 10, 1) pi.toggleUseCachedTransitionDynamics(False) print "//{} Policy Iteration Analysis//".format(world) timing['Policy'].append(0) flag = True for nIter in iterations: startTime = clock() p = pi.planFromState(initialState) #timing['Policy'].append((clock() - startTime) * 1000) timing['Policy'].append(timing['Policy'][-1] + clock() - startTime) policy = pi.getComputedPolicy() current_policy = { state: policy.getAction(state).toString() for state in allStates } convergence['Policy2'].append(pi.lastPIDelta)
    # evaluate the policy with evalTrials roll outs
    runEvals(initialState, p, rewards['Value'], steps['Value'])
    if nIter == 1:
        simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                               "Value Iteration {}".format(nIter))
    if nIter == MAX_ITERATIONS / 2:
        simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                               "Value Iteration {}".format(nIter))
MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
print "\n\n\n"
simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                       "Value Iteration {}".format(nIter))
# dumpCSV(iterations, timing['Value'], rewards['Value'], steps['Value'],
#         convergence['Value'], world, 'Value')

# Policy Iteration starts
print "//Easy Policy Iteration Analysis//"
for nIter in iterations:
    startTime = clock()
    # maxDelta = -1 disables the convergence check, guaranteeing that policy
    # iteration runs the full nIter iterations for comparison with the other
    # algorithms.
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, nIter)
    # run planning from our initial state
    pi.setDebugCode(0)
    p = pi.planFromState(initialState)
    timing['Policy'].append((clock() - startTime) * 1000)
    convergence['Policy'].append(pi.lastPIDelta)
    # evaluate the policy
    runEvals(initialState, p, rewards['Policy'], steps['Policy'])
    if nIter == 1:
        simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                               "Policy Iteration {}".format(nIter))
    if nIter == MAX_ITERATIONS / 2:
        simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                               "Policy Iteration {}".format(nIter))
    policy = pi.getComputedPolicy()
    allStates = pi.getAllStates()
    current_policy = [[(action.ga, action.pSelection)
                       for action in policy.getActionDistributionForState(state)]
                      for state in allStates]
    policy_converged['Policy'].append(current_policy == last_policy)
    last_policy = current_policy  # remember this policy for the next comparison
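# NOTE: runEvals is called throughout but not defined in these excerpts. A
# minimal sketch, assuming it averages evalTrials greedy roll-outs of the
# computed policy using the evaluateBehavior/calcRewardInEpisode pattern that
# appears commented out further down; rf, tf, evalTrials, and
# calcRewardInEpisode (sketched below) are assumed to be in scope.
def runEvals(initialState, policy, rewardsList, stepsList):
    totalReward, totalSteps = 0.0, 0.0
    for _ in range(evalTrials):
        ea = policy.evaluateBehavior(initialState, rf, tf)  # one episode under the policy
        totalReward += calcRewardInEpisode(ea)
        totalSteps += ea.numTimeSteps()
    rewardsList.append(totalReward / evalTrials)
    stepsList.append(totalSteps / evalTrials)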
    MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
    print "\n\n\n"
    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, Vname)
    # input('x')
    dumpCSV(iterations, timing[Vname], rewards[Vname], steps[Vname],
            convergence[Vname], world, Vname)

discount = 0.99
print "//Easy Policy Iteration Analysis//"
for discount in [0.99, 0.90, 0.7, 0.4]:
    Pname = 'Policy gamma{:0.2f}'.format(discount)
    for nIter in iterations:
        startTime = clock()
        # maxDelta = -1 disables the convergence check, guaranteeing that policy
        # iteration runs the full nIter iterations for comparison with the other
        # algorithms.
        pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, nIter)
        # run planning from our initial state
        pi.setDebugCode(0)
        p = pi.planFromState(initialState)
        timing[Pname].append((clock() - startTime) * 1000)
        convergence[Pname].append(pi.lastPIDelta)
        # evaluate the policy with one roll out and visualize the trajectory
        runEvals(initialState, p, rewards[Pname], steps[Pname])
        # ea = p.evaluateBehavior(initialState, rf, tf)
        # rewards['Policy'].append(calcRewardInEpisode(ea))
        # steps['Policy'].append(ea.numTimeSteps())
        if nIter == 1:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                                   "Policy Iteration {}".format(nIter))
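# NOTE: calcRewardInEpisode (referenced in the commented-out block above) is
# not defined in this excerpt. A plausible sketch: BURLAP's EpisodeAnalysis
# stores the reward received on entering state t in getReward(t), with no
# reward attached to the initial state, hence the index starting at 1.
def calcRewardInEpisode(ea):
    return sum(ea.getReward(i) for i in range(1, ea.numTimeSteps()))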
    vi.setDebugCode(0)
    p = vi.planFromState(initialState)
    timing['Value'].append(clock() - startTime)
    iterations['Value'].append(vi.numIterations)
    # evaluate the policy with one roll out and visualize the trajectory
    runEvals(initialState, p, rewards['Value'], steps['Value'])
    MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
    print("\n\n")
    # simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration")
    dumpCSV(iterations['Value'], timing['Value'], rewards['Value'], steps['Value'],
            [n], 'Value', n == 2)

    print("//Size Policy Iteration Analysis//")
    # A tight delta (1e-6) with up to 20 evaluation sweeps per step lets policy
    # iteration run to convergence within at most MAX_ITERATIONS iterations.
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-6, 20, MAX_ITERATIONS)
    pi.toggleUseCachedTransitionDynamics(False)
    # run planning from our initial state, seeding evaluation with the
    # value-iteration policy computed above
    pi.setDebugCode(0)
    pi.setPolicyToEvaluate(p)
    startTime = clock()
    p = pi.planFromState(initialState)
    timing['Policy'].append(clock() - startTime)
    iterations['Policy'].append(pi.totalPolicyIterations)
    # evaluate the policy with one roll out and visualize the trajectory
    runEvals(initialState, p, rewards['Policy'], steps['Policy'])
    MapPrinter.printPolicyMap(pi.getAllStates(), p, gen.getMap())
    print("\n\n")
    # simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration")
    dumpCSV(iterations['Policy'], timing['Policy'], rewards['Policy'],