Example 1
def qLearning(world, userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)

    allStates = getAllStates(domain, rf, tf, initialState)

    NUM_INTERVALS = MAX_ITERATIONS
    iterations = range(1, MAX_ITERATIONS + 1)
    qInit = 0
    for lr in [0.01, 0.1, 0.5]:
        for epsilon in [0.3, 0.5, 0.7]:
            last10Chg = deque([10] * 10, maxlen=10)
            Qname = 'Q-Learning L{:0.2f} E{:0.1f}'.format(lr, epsilon)
            #agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
            agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon)
            agent.setDebugCode(0)

            print("*** {}: {}".format(world, Qname))

            for nIter in iterations:
                if nIter % 200 == 0: 
                    print('Iteration: {}'.format(nIter))

                startTime = clock()
                #ea = agent.runLearningEpisode(env, 300)
                ea = agent.runLearningEpisode(env)
                env.resetEnvironment()
                agent.initializeForPlanning(rf, tf, 1)
                p = agent.planFromState(initialState)  # run planning from our initial state
                endTime = clock()
                timing[Qname].append((endTime-startTime)*1000)

                last10Chg.append(agent.maxQChangeInLastEpisode)
                convergence[Qname].append(sum(last10Chg)/10.)
                # evaluate the policy with one rollout and visualize the trajectory
                runEvals(initialState, p, rewards[Qname], steps[Qname], rf, tf, evalTrials=1)
                if nIter % 1000 == 0:
                    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                                  '{} {} Iter {} Policy Map.pkl'.format(world, Qname, nIter))
                    simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname)
                
            dumpCSV(nIter, timing[Qname], rewards[Qname], steps[Qname], convergence[Qname], world, Qname) 
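
The function above only runs once the BURLAP/Jython helpers it references (BasicGridWorld, dumpCSV, runEvals, and so on) are importable. A minimal sketch of how it might be called follows; the map layout, the 'Easy' world label, and the convention that maxX/maxY are the zero-based grid bounds are all assumptions, not part of the original code:

# Hypothetical caller: 0 = open cell, 1 = wall. The values here are illustrative only.
userMap = [[0, 0, 0, 0],
           [0, 1, 0, 0],
           [0, 0, 1, 0],
           [0, 0, 0, 0]]
maxX = len(userMap) - 1      # assumed convention: zero-based grid bounds
maxY = len(userMap[0]) - 1
qLearning('Easy', userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000)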
Example 2
                           "Policy Iteration{}".format(nIter))
    dumpCSV(nIter, timing['Policy'][1:], rewards['Policy'], steps['Policy'],
            convergence['Policy2'], world, 'Policy')
    #raise

    MAX_ITERATIONS = NUM_INTERVALS = MAX_ITERATIONS * 100
    increment = MAX_ITERATIONS / NUM_INTERVALS
    iterations = range(1, MAX_ITERATIONS + 1)
    for lr in [0.1, 0.9]:
        for qInit in [-100, 0, 100]:
            for epsilon in [0.1, 0.3, 0.5]:
                flag = True
                last10Chg = deque([99] * 10, maxlen=10)
                Qname = 'Q-Learning L{:0.1f} q{:0.1f} E{:0.1f}'.format(
                    lr, qInit, epsilon)
                agent = QLearning(domain, discount, hashingFactory, qInit, lr,
                                  epsilon, 300)
                #agent.setLearningRateFunction(SoftTimeInverseDecayLR(1.,0.))
                agent.setDebugCode(0)
                print "//{} {} Iteration Analysis//".format(world, Qname)
                for nIter in iterations:
                    if nIter % 50 == 0: print(nIter)
                    startTime = clock()
                    ea = agent.runLearningEpisode(env, 300)
                    # if len(timing[Qname])> 0:
                    #     timing[Qname].append(timing[Qname][-1]+clock()-startTime)
                    # else:
                    #timing[Qname].append((clock()-startTime) * 1000)
                    if len(timing[Qname]) > 0:
                        timing[Qname].append(timing[Qname][-1] + clock() -
                                             startTime)
                    else:
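
Note that this fragment records cumulative wall-clock time: after the first episode, each entry adds the latest episode's elapsed time to the previous total, so the series is monotonically increasing. Below is a self-contained sketch of that bookkeeping in plain Python, with a stub standing in for agent.runLearningEpisode(env, 300):

from collections import defaultdict
from time import clock  # Jython / Python 2; use time.perf_counter on Python 3

def run_episode():
    pass  # stand-in for agent.runLearningEpisode(env, 300)

timing = defaultdict(list)
for nIter in range(1, 11):
    startTime = clock()
    run_episode()
    elapsed = clock() - startTime
    if len(timing['Q']) > 0:
        timing['Q'].append(timing['Q'][-1] + elapsed)  # running total
    else:
        timing['Q'].append(elapsed)                    # first entry is just the elapsed time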
Example 3
                          #'Value {} Iter {} Policy Map.pkl'.format(world, nIter))
            break
    MapPrinter.printPolicyMap(pi.getAllStates(), p, gen.getMap())
    print("\n\n\n")
    dumpCSV(nIter, timing['Policy'][1:], rewards['Policy'], steps['Policy'],
            convergence['Policy2'], world, 'Policy')

      
    MAX_ITERATIONS = NUM_INTERVALS = MAX_ITERATIONS * 10
    increment = MAX_ITERATIONS / NUM_INTERVALS
    iterations = range(1, MAX_ITERATIONS + 1)
    for lr in [0.1, 0.9]:
        for qInit in [-100, 0, 100]:
            for epsilon in [0.1, 0.3, 0.5]:
                last10Chg = deque([99] * 10, maxlen=10)
                Qname = 'Q-Learning L{:0.1f} q{:0.1f} E{:0.1f}'.format(lr, qInit, epsilon)
                agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
                agent.setLearningRateFunction(SoftTimeInverseDecayLR(1., 0.))
                agent.setLearningPolicy(EpsilonGreedy(agent, epsilon))
                agent.setDebugCode(0)
                print("//{} {} Iteration Analysis//".format(world, Qname))
                for nIter in iterations:
                    if nIter % 50 == 0: print(nIter)
                    startTime = clock()
                    ea = agent.runLearningEpisode(env, 300)
                    if len(timing[Qname]) > 0:
                        timing[Qname].append(timing[Qname][-1] + clock() - startTime)
                    else:
                        timing[Qname].append(clock() - startTime)
                    env.resetEnvironment()
                    agent.initializeForPlanning(rf, tf, 1)
                    p = agent.planFromState(initialState)  # run planning from our initial state
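
Across these examples the last10Chg deque keeps a rolling window of the ten most recent maxQChangeInLastEpisode values, and the first example averages that window into a convergence series. A minimal stand-alone sketch of the pattern is below; the per-episode change values are made up for illustration:

from collections import deque

last10Chg = deque([99] * 10, maxlen=10)  # seeded high so the early average stays large
convergence = []

for maxQChange in [5.0, 3.2, 1.1, 0.6, 0.2]:  # hypothetical per-episode Q-value changes
    last10Chg.append(maxQChange)              # maxlen=10 drops the oldest value automatically
    convergence.append(sum(last10Chg) / 10.)  # smoothed convergence signal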