# Standard-library imports used in this section (harmless no-ops if the script
# already imports them at the top of the file).
from collections import defaultdict, deque
from time import clock


def pIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)
    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)
    hashingFactory = SimpleHashableStateFactory()

    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    policy_converged = defaultdict(list)
    last_policy = []
    allStates = getAllStates(domain, rf, tf, initialState)

    print("*** {} Policy Iteration Analysis".format(world))
    iterations = range(1, MAX_ITERATIONS + 1)

    # The planner is constructed once and reused across the loop below; the
    # commented lines inside the loop show the alternative of rebuilding it
    # with a growing iteration budget each time.
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, 1)
    pi.setDebugCode(0)

    flag = True  # guards the one-time snapshot taken when the delta first converges
    for nIter in iterations:
        startTime = clock()
        # pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, nIter)
        # pi.setDebugCode(0)
        # Run planning from our initial state.
        p = pi.planFromState(initialState)
        endTime = clock()
        timing['Policy'].append((endTime - startTime) * 1000)

        # Value-function delta reported by the planner for this iteration.
        convergence['Policy2'].append(pi.lastPIDelta)

        # Evaluate the policy with one rollout and visualize the trajectory.
        runEvals(initialState, p, rewards['Policy'], steps['Policy'], rf, tf,
                 evalTrials=1)

        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                                   "Policy Iteration {}".format(nIter))

        # Track how much the computed policy changed since the previous iteration.
        policy = pi.getComputedPolicy()
        allStates = pi.getAllStates()
        current_policy = [[(action.ga, action.pSelection)
                           for action in policy.getActionDistributionForState(state)]
                          for state in allStates]
        policy_converged['Policy'].append(current_policy == last_policy)
        if nIter == 1:
            # No previous policy to compare against; count every state as changed.
            convergence['Policy'].append(len(allStates))
        else:
            convergence['Policy'].append(comparePolicies(last_policy, current_policy))
        last_policy = current_policy

        # The first time the delta drops below the threshold, snapshot the value
        # function and dump the policy map for this iteration.
        if convergence['Policy2'][-1] < 1e-6 and flag:
            flag = False
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                                   "Policy Iteration {}".format(nIter))
            dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                          'Policy {} Iter {} Policy Map.pkl'.format(world, nIter))
        # if convergence['Policy2'][-1] < 1e-6:
        #     break

    # Final value-function snapshot, policy map, and CSV dumps after the last iteration.
    print("\n\n\n")
    simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                           "Policy Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                  world + ' Policy Iteration Policy Map.pkl')
    dumpCSVp(iterations, timing['Policy'], rewards['Policy'], steps['Policy'],
             convergence['Policy'], world, 'Policy', policy_converged['Policy'])
    dumpCSV(nIter, timing['Policy'][1:], rewards['Policy'], steps['Policy'],
            convergence['Policy2'], world, 'Policy')
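
# Illustrative sketch (not part of the original experiment code): the loop in
# pIteration tracks convergence two ways -- the planner-reported delta and whether
# the greedy policy stopped changing between iterations. The stand-alone helper
# below, with a hypothetical name, shows that bookkeeping pattern in pure Python
# and can be run on its own.
def _convergence_bookkeeping_sketch(deltas, policies, threshold=1e-6):
    """Return (first iteration whose delta < threshold, per-iteration policy-stable flags)."""
    first_below = None
    policy_stable = []
    last = None
    for i, (delta, pol) in enumerate(zip(deltas, policies), 1):
        if first_below is None and delta < threshold:
            first_below = i
        policy_stable.append(pol == last)
        last = pol
    return first_below, policy_stable

# Example: the delta converges at iteration 3; the policy is stable from iteration 4 on.
# _convergence_bookkeeping_sketch([1.0, 1e-2, 1e-7, 1e-8], ['a', 'b', 'c', 'c'])
# -> (3, [False, False, False, True])
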
# Q-Learning analysis: sweep learning rate, Q-value initialization, and exploration
# epsilon. MAX_ITERATIONS here is the module-level iteration budget (assumed to be
# set earlier in the script); Q-learning gets a much larger budget than planning.
MAX_ITERATIONS = NUM_INTERVALS = MAX_ITERATIONS * 100
increment = MAX_ITERATIONS / NUM_INTERVALS
iterations = range(1, MAX_ITERATIONS + 1)
for lr in [0.1, 0.9]:
    for qInit in [-100, 0, 100]:
        for epsilon in [0.1, 0.3, 0.5]:
            flag = True
            # Rolling window of the most recent change magnitudes, seeded with
            # large values so the run starts out counted as unconverged.
            last10Chg = deque([99] * 10, maxlen=10)