MAX_ITERATIONS = 10000
NUM_INTERVALS = 1000
increment = MAX_ITERATIONS / NUM_INTERVALS
iterations = range(1, MAX_ITERATIONS + 1)

for lr in [0.1, 0.5, 0.9]:
    for epsilon in [0.2, 0.8]:
        last10Rewards = deque([10] * 10, maxlen=10)
        Qname = 'Q-Learning L{:0.1f} E{:0.1f}'.format(lr, epsilon)
        agent = QLearning(domain, discount, hashingFactory, 1, lr, epsilon)
        agent.setDebugCode(0)
        print("//hard {} Iteration Analysis//".format(Qname))
        for nIter in iterations:
            startTime = clock()
            ea = agent.runLearningEpisode(env)
            env.resetEnvironment()
            agent.initializeForPlanning(rf, tf, 1)
            p = agent.planFromState(initialState)  # run planning from our initial state
            timing[Qname].append((clock() - startTime) * 1000)
            last10Rewards.append(agent.maxQChangeInLastEpisode)
            convergence[Qname].append(sum(last10Rewards) / 10.)
            # evaluate the policy
            runEvals(initialState, p, rewards[Qname], steps[Qname])
            # uncomment the code below to produce a screenshot at a certain iteration
            # if (nIter == 1 and lr == 0.1 and epsilon == 0.2):
            #     simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname + str(nIter))
            #     break
            # if (nIter == 100/2 and lr == 0.1 and epsilon == 0.2):
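
# The loop above calls a runEvals helper that is not defined in this section.
# A minimal sketch of what it is assumed to do (hypothetical -- the real
# helper may differ): roll the learned greedy policy out once from the
# initial state and record the total reward and trajectory length.
# evaluateBehavior, rewardSequence, and numTimeSteps are BURLAP 2
# Policy/EpisodeAnalysis APIs; rf and tf are the same globals used above.
def runEvals(initialState, policy, rewardsList, stepsList):
    # follow the policy until a terminal state or the step cap is reached
    ea = policy.evaluateBehavior(initialState, rf, tf, 1000)
    rewardsList.append(sum(ea.rewardSequence))  # total reward collected
    stepsList.append(ea.numTimeSteps())         # number of steps taken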
if True:
    for lr in [0.1, 0.9]:
        for qInit in [-100, 0, 100]:
            for epsilon in [0.1, 0.3, 0.5]:
                last10Chg = deque([99] * 10, maxlen=10)
                Qname = 'Q-Learning L{:0.1f} q{:0.1f} E{:0.1f}'.format(lr, qInit, epsilon)
                agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 900)
                # agent.setLearningRateFunction(SoftTimeInverseDecayLR(1., 0.))
                agent.setDebugCode(0)
                print("//{} {} Iteration Analysis//".format(world, Qname))
                for nIter in iterations:
                    if nIter % 50 == 0:
                        print(nIter)
                    startTime = clock()
                    ea = agent.runLearningEpisode(env, 300)
                    if len(timing[Qname]) > 0:
                        timing[Qname].append(timing[Qname][-1] + clock() - startTime)
                    else:
                        timing[Qname].append(clock() - startTime)
                    env.resetEnvironment()
                    agent.initializeForPlanning(rf, tf, 1)
                    p = agent.planFromState(initialState)  # run planning from our initial state
                    last10Chg.append(agent.maxQChangeInLastEpisode)
                    convergence[Qname].append(sum(last10Chg) / 10.)
                    # evaluate the policy with one rollout and visualize the trajectory
                    runEvals(initialState, p, rewards[Qname], steps[Qname])
                    if nIter == 50:
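
# The timing/rewards/steps/convergence containers indexed by Qname above are
# assumed to be per-experiment lists; a minimal sketch of that setup
# (hypothetical -- any dict-of-lists with these names would match the usage):
from collections import defaultdict

timing = defaultdict(list)       # cumulative wall-clock time per iteration
rewards = defaultdict(list)      # total reward of each evaluation rollout
steps = defaultdict(list)        # steps taken in each evaluation rollout
convergence = defaultdict(list)  # moving average of max Q-value change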
last10Rewards = deque([10] * 10, maxlen=10)
Qname = 'QL L{:0.1f} q{:0.1f} E{:0.1f}'.format(lr, qInit, epsilon)
agent = QLearning(domain, disc, hashingFactory, qInit, lr, epsilon)
agent.setDebugCode(0)
print("//Treasure Hunt {} Iteration Analysis//".format(Qname))
print(Qname)
for nIter in iterations:
    # print(" ====> Iter = ", nIter)
    startTime = clock()
    if nIter % 50 == 0:
        print(nIter)
    ea = agent.runLearningEpisode(env)
    env.resetEnvironment()
    agent.initializeForPlanning(rf, tf, 1)
    p = agent.planFromState(initialState)  # run planning from our initial state
    # timing[Qname].append((clock() - startTime) * 1000)
    if len(timing[Qname]) > 0:
        timing[Qname].append(timing[Qname][-1] + clock() - startTime)
    else:
        timing[Qname].append(clock() - startTime)
    last10Rewards.append(agent.maxQChangeInLastEpisode)
    convergence[Qname].append(sum(last10Rewards) / 10.)
    # evaluate the policy with one rollout and visualize the trajectory
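
# The convergence signal recorded above is a 10-episode moving average of the
# largest Q-value change seen during an episode. A tiny standalone
# illustration of how the bounded deque implements that window (plain Python,
# no BURLAP required):
from collections import deque

window = deque([10.0] * 10, maxlen=10)  # seeded high so early averages read as unconverged
for chg in [5.0, 2.0, 1.0, 0.5, 0.1]:
    window.append(chg)         # oldest entry is evicted automatically
    print(sum(window) / 10.0)  # trends toward 0 as learning settles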
lr, epsilon, MAX_EPISODESIZE)
# agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon)
# QLearning(Domain domain, double gamma, HashableStateFactory hashingFactory,
#           double qInit, double learningRate, double epsilon, int maxEpisodeSize)
# agent.setLearningRateFunction(SoftTimeInverseDecayLR(lr, 0.))
agent.setDebugCode(0)
print("//{} {} Iteration Analysis//".format(world, Qname))
for nIter in iterations:
    if nIter % 50 == 0:
        print(nIter)
    # agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
    print("start learning")
    startTime = clock()
    ea = agent.runLearningEpisode(env, MAX_EPISODESIZE)
    # ea = agent.runLearningEpisode(env)
    # runLearningEpisode(Environment env, int maxSteps)
    print("stop learning")
    env.resetEnvironment()
    agent.initializeForPlanning(rf, tf, 1)
    # public void initializeForPlanning(RewardFunction rf, TerminalFunction tf, int numEpisodesForPlanning)
    p = agent.planFromState(initialState)  # run planning from our initial state
    if len(timing[Qname]) > 0:
        timing[Qname].append(timing[Qname][-1] + clock() - startTime)
    else:
        timing[Qname].append(clock() - startTime)
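
# For completeness, a sketch of the imports these snippets rely on, assuming
# a Jython script driving BURLAP 2 (the BURLAP package paths are an
# assumption and may differ across versions):
from collections import deque
from time import clock
from burlap.behavior.singleagent.learning.tdmethods import QLearning
from burlap.behavior.learningrate import SoftTimeInverseDecayLR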