Example #1
    MAX_ITERATIONS = 10000
    #MAX_ITERATIONS = 10000;
    NUM_INTERVALS = 1000
    increment = MAX_ITERATIONS / NUM_INTERVALS
    iterations = range(1, MAX_ITERATIONS + 1)
    for lr in [0.1, 0.5, 0.9]:
        for epsilon in [0.2, 0.8]:
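            # rolling window of the max Q-value change over the last 10 episodes (despite the "Rewards" name), used as a convergence signal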
            last10Rewards = deque([10] * 10, maxlen=10)
            Qname = 'Q-Learning L{:0.1f} E{:0.1f}'.format(lr, epsilon)
            agent = QLearning(domain, discount, hashingFactory, 1, lr, epsilon)
            agent.setDebugCode(0)
            print "//hard {} Iteration Analysis//".format(Qname)

            for nIter in iterations:
                startTime = clock()
                ea = agent.runLearningEpisode(env)

                env.resetEnvironment()
                agent.initializeForPlanning(rf, tf, 1)
                p = agent.planFromState(
                    initialState)  # run planning from our initial state
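                # record this iteration's wall-clock time in milliseconds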
                timing[Qname].append((clock() - startTime) * 1000)
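                # convergence metric: mean of the max Q-value change over the last 10 episodes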
                last10Rewards.append(agent.maxQChangeInLastEpisode)
                convergence[Qname].append(sum(last10Rewards) / 10.)
                # evaluate the policy
                runEvals(initialState, p, rewards[Qname], steps[Qname])
                # uncomment the code below to produce a screenshot at a certain iteration
                #                 if (nIter == 1 and lr == 0.1 and epsilon == 0.2):
                #                     simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname+str(nIter))
                #                     break
                #                 if (nIter == 100/2 and lr == 0.1 and epsilon == 0.2):
Example #2
 if True:
     for lr in [0.1, 0.9]:
         for qInit in [-100, 0, 100]:
             for epsilon in [0.1, 0.3, 0.5]:
                 last10Chg = deque([99] * 10, maxlen=10)
                 Qname = 'Q-Learning L{:0.1f} q{:0.1f} E{:0.1f}'.format(
                     lr, qInit, epsilon)
                 agent = QLearning(domain, discount, hashingFactory, qInit,
                                   lr, epsilon, 900)
                 #agent.setLearningRateFunction(SoftTimeInverseDecayLR(1.,0.))
                 agent.setDebugCode(0)
                 print("//{} {} Iteration Analysis//".format(world, Qname))
                 for nIter in iterations:
                     if nIter % 50 == 0: print(nIter)
                     startTime = clock()
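                     # run one learning episode, capped at 300 steps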
                     ea = agent.runLearningEpisode(env, 300)
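                     # accumulate total wall-clock time (in seconds) across episodes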
                     if len(timing[Qname]) > 0:
                         timing[Qname].append(timing[Qname][-1] + clock() -
                                              startTime)
                     else:
                         timing[Qname].append(clock() - startTime)
                     env.resetEnvironment()
                     agent.initializeForPlanning(rf, tf, 1)
                     p = agent.planFromState(
                         initialState
                     )  # run planning from our initial state
                     last10Chg.append(agent.maxQChangeInLastEpisode)
                     convergence[Qname].append(sum(last10Chg) / 10.)
                     # evaluate the policy with one rollout and visualize the trajectory
                     runEvals(initialState, p, rewards[Qname], steps[Qname])
                     if nIter == 50:
Example #3
                        last10Rewards = deque([10] * 10, maxlen=10)
                        Qname = 'QL L{:0.1f} q{:0.1f} E{:0.1f}'.format(
                            lr, qInit, epsilon)
                        agent = QLearning(domain, disc, hashingFactory, qInit,
                                          lr, epsilon)
                        agent.setDebugCode(0)
                        print "//Treasure Hunt {} Iteration Analysis//".format(
                            Qname)
                        print Qname

                        for nIter in iterations:
                            #print " ====> Iter = ", nIter
                            startTime = clock()
                            if nIter % 50 == 0:
                                print nIter
                            ea = agent.runLearningEpisode(env)
                            env.resetEnvironment()
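                            # attach the reward and terminal functions before planning (one planning episode)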
                            agent.initializeForPlanning(rf, tf, 1)
                            p = agent.planFromState(
                                initialState
                            )  # run planning from our initial state
                            #timing[Qname].append((clock()-startTime)*1000)
                            if len(timing[Qname]) > 0:
                                timing[Qname].append(timing[Qname][-1] +
                                                     clock() - startTime)
                            else:
                                timing[Qname].append(clock() - startTime)

                            last10Rewards.append(agent.maxQChangeInLastEpisode)
                            convergence[Qname].append(sum(last10Rewards) / 10.)
                            # evaluate the policy with one rollout and visualize the trajectory
Example #4
                    agent = QLearning(domain, discount, hashingFactory, qInit,
                                      lr, epsilon, MAX_EPISODESIZE)
                    # agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon)

                    # QLearning(Domain domain, double gamma, HashableStateFactory hashingFactory,
                    #           double qInit, double learningRate, double epsilon, int maxEpisodeSize)
                    # agent.setLearningRateFunction(SoftTimeInverseDecayLR(lr,0.))
                    agent.setDebugCode(0)

                    print "//{} {} Iteration Analysis//".format(world, Qname)
                    for nIter in iterations:
                        if nIter % 50 == 0: print(nIter)
                        # agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)

                        print("start learning")
                        startTime = clock()
                        ea = agent.runLearningEpisode(env, MAX_EPISODESIZE)
                        # ea = agent.runLearningEpisode(env)
                        # runLearningEpisode(Environment env, int maxSteps)
                        print("stop learning")
                        env.resetEnvironment()

                        agent.initializeForPlanning(rf, tf, 1)
                        # public void initializeForPlanning(RewardFunction rf, TerminalFunction tf, int numEpisodesForPlanning)
                        p = agent.planFromState(
                            initialState
                        )  # run planning from our initial state
                        if len(timing[Qname]) > 0:
                            timing[Qname].append(timing[Qname][-1] + clock() -
                                                 startTime)
                        else:
                            timing[Qname].append(clock() - startTime)