# qLearning: run Q-learning hyper-parameter sweeps on a grid world and record
# per-episode statistics.
#
# Parameters
#   world          -- label inserted into the progress messages, the policy-map
#                     file name and the dumpCSV call.
#   userMap, maxX, maxY
#                  -- forwarded to BasicGridWorld, BasicRewardFunction and
#                     BasicTerminalFunction to build the domain and the
#                     SimulatedEnvironment.
#   discount       -- forwarded to every QLearning agent that is constructed.
#   MAX_ITERATIONS -- number of learning episodes per configuration in the
#                     first sweep; it is multiplied by 100 before the second
#                     sweep.
#
# Sweep 1: lr in [0.01, 0.1, 0.5] x epsilon in [0.3, 0.5, 0.7], with qInit = 0.
#   For each episode: run one learning episode, reset the environment, plan
#   from initialState, append the elapsed time * 1000 to timing[Qname], push
#   agent.maxQChangeInLastEpisode into a 10-slot deque (seeded with 10s) and
#   append that deque's mean to convergence[Qname], then call runEvals with
#   evalTrials=1. A policy map is dumped when nIter is a multiple of 1000.
#
# Sweep 2: lr in [0.1, 0.9] x qInit in [-100, 0, 100] x epsilon in
#   [0.1, 0.3, 0.5]. The agent and runLearningEpisode both receive an extra
#   argument of 300 (presumably a per-episode step cap -- TODO confirm against
#   the BURLAP API). timing[Qname] here is a running total of clock() deltas.
#
# NOTE(review): the line breaks and indentation of this function have been
#   lost, so the nesting of the trailing statements of sweep 1
#   (simpleValueFunctionVis / dumpCSV) cannot be read off the text; restore
#   the layout before relying on it.
# NOTE(review): timing units differ between the sweeps -- sweep 1 stores
#   per-episode values scaled by 1000, sweep 2 stores unscaled cumulative
#   values under the same `timing` dict. Confirm this is intended.
# NOTE(review): `MAX_ITERATIONS = MAX_ITERATIONS` is a no-op, and `ea`,
#   `flag`, `increment` and the first NUM_INTERVALS are not read anywhere in
#   the code shown here.
# NOTE(review): sweep 2 uses a Python 2 `print` statement while sweep 1 uses
#   the call form; the statement form is a syntax error on Python 3.
# NOTE(review): sweep 2 as shown only records timing and resets the
#   environment -- confirm whether planning/evaluation is meant to follow.
def qLearning(world, userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000): gen = BasicGridWorld(userMap, maxX, maxY) domain = gen.generateDomain() initialState = gen.getExampleState(domain); rf = BasicRewardFunction(maxX, maxY, userMap) tf = BasicTerminalFunction(maxX, maxY) env = SimulatedEnvironment(domain, rf, tf, initialState) visualizeInitialGridWorld(domain, gen, env) hashingFactory = SimpleHashableStateFactory() timing = defaultdict(list) rewards = defaultdict(list) steps = defaultdict(list) convergence = defaultdict(list) allStates = getAllStates(domain, rf, tf, initialState) MAX_ITERATIONS = MAX_ITERATIONS NUM_INTERVALS = MAX_ITERATIONS; iterations = range(1, MAX_ITERATIONS + 1) qInit = 0 for lr in [0.01, 0.1, 0.5]: for epsilon in [0.3, 0.5, 0.7]: last10Chg = deque([10] * 10, maxlen=10) Qname = 'Q-Learning L{:0.2f} E{:0.1f}'.format(lr, epsilon) #agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300) agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon) agent.setDebugCode(0) print("*** {}: {}".format(world, Qname)) for nIter in iterations: if nIter % 200 == 0: print('Iteration: {}'.format(nIter)) startTime = clock() #ea = agent.runLearningEpisode(env, 300) ea = agent.runLearningEpisode(env) env.resetEnvironment() agent.initializeForPlanning(rf, tf, 1) p = agent.planFromState(initialState) # run planning from our initial state endTime = clock() timing[Qname].append((endTime-startTime)*1000) last10Chg.append(agent.maxQChangeInLastEpisode) convergence[Qname].append(sum(last10Chg)/10.) 
# evaluate the policy with one roll out visualize the trajectory runEvals(initialState, p, rewards[Qname], steps[Qname], rf, tf, evalTrials=1) if nIter % 1000 == 0: dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()), '{} {} Iter {} Policy Map.pkl'.format(world, Qname, nIter)) simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname) dumpCSV(nIter, timing[Qname], rewards[Qname], steps[Qname], convergence[Qname], world, Qname)
#raise MAX_ITERATIONS = NUM_INTERVALS = MAX_ITERATIONS * 100 increment = MAX_ITERATIONS / NUM_INTERVALS iterations = range(1, MAX_ITERATIONS + 1) for lr in [0.1, 0.9]: for qInit in [-100, 0, 100]: for epsilon in [0.1, 0.3, 0.5]: flag = True last10Chg = deque([99] * 10, maxlen=10) Qname = 'Q-Learning L{:0.1f} q{:0.1f} E{:0.1f}'.format( lr, qInit, epsilon) agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300) #agent.setLearningRateFunction(SoftTimeInverseDecayLR(1.,0.)) agent.setDebugCode(0) print "//{} {} Iteration Analysis//".format(world, Qname) for nIter in iterations: if nIter % 50 == 0: print(nIter) startTime = clock() ea = agent.runLearningEpisode(env, 300) # if len(timing[Qname])> 0: # timing[Qname].append(timing[Qname][-1]+clock()-startTime) # else: #timing[Qname].append((clock()-startTime) * 1000) if len(timing[Qname]) > 0: timing[Qname].append(timing[Qname][-1] + clock() - startTime) else: timing[Qname].append(clock() - startTime) env.resetEnvironment()