Example #1
# Standard-library imports used in these listings; the BURLAP classes
# (BasicGridWorld, QLearning, ValueIteration, ...) and local helpers
# (runEvals, dumpCSV, visualizeInitialGridWorld, ...) come from the
# surrounding project and are not shown here.
from collections import defaultdict, deque
from time import clock


def qLearning(world, userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)

    allStates = getAllStates(domain, rf, tf, initialState)

    NUM_INTERVALS = MAX_ITERATIONS
    iterations = range(1, MAX_ITERATIONS + 1)
    qInit = 0
    for lr in [0.01, 0.1, 0.5]:
        for epsilon in [0.3, 0.5, 0.7]:
            last10Chg = deque([10] * 10, maxlen=10)
            Qname = 'Q-Learning L{:0.2f} E{:0.1f}'.format(lr, epsilon)
            #agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
            agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon)
            agent.setDebugCode(0)

            print("*** {}: {}".format(world, Qname))

            for nIter in iterations:
                if nIter % 200 == 0: 
                    print('Iteration: {}'.format(nIter))

                startTime = clock()
                #ea = agent.runLearningEpisode(env, 300)
                ea = agent.runLearningEpisode(env)
                env.resetEnvironment()
                agent.initializeForPlanning(rf, tf, 1)
                p = agent.planFromState(initialState)  # run planning from our initial state
                endTime = clock()
                timing[Qname].append((endTime-startTime)*1000)

                last10Chg.append(agent.maxQChangeInLastEpisode)
                convergence[Qname].append(sum(last10Chg)/10.)
                # evaluate the policy with one rollout and visualize the trajectory
                runEvals(initialState, p, rewards[Qname], steps[Qname], rf, tf, evalTrials=1)
                if nIter % 1000 == 0:
                    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                                  '{} {} Iter {} Policy Map.pkl'.format(world, Qname, nIter))
                    simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname)
                
            dumpCSV(nIter, timing[Qname], rewards[Qname], steps[Qname], convergence[Qname], world, Qname) 
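# Hypothetical usage sketch (not part of the original example). It assumes the
# surrounding BURLAP/Jython setup that defines BasicGridWorld, QLearning,
# runEvals, dumpCSV, etc. The grid layout and the 'Easy' label are illustrative
# assumptions: 0 marks an open cell, 1 a wall, and maxX/maxY are taken to be
# the largest x/y indices (an assumption about BasicGridWorld's constructor).
userMap = [[0, 0, 0, 0, 0],
           [0, 1, 1, 1, 0],
           [0, 0, 0, 1, 0],
           [0, 1, 0, 1, 0],
           [0, 0, 0, 0, 0]]
maxX = len(userMap[0]) - 1
maxY = len(userMap) - 1

# Sweeps the learning-rate / epsilon grid defined above and writes one CSV plus
# periodic policy-map pickles per configuration.
qLearning('Easy', userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000)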
Example #2
def pIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    policy_converged = defaultdict(list)    
    last_policy = []

    allStates = getAllStates(domain, rf, tf, initialState)

    print("*** {} Policy Iteration Analysis".format(world))

    iterations = range(1, MAX_ITERATIONS + 1)
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, 1)
    pi.setDebugCode(0)
    for nIter in iterations:
        startTime = clock()
        #pi = PolicyIteration(domain,rf,tf,discount,hashingFactory,-1,1, nIter); 
        #pi.setDebugCode(0)
        # run planning from our initial state
        p = pi.planFromState(initialState)
        endTime = clock()
        timing['Policy'].append((endTime-startTime)*1000)

        convergence['Policy'].append(pi.lastPIDelta)         
        # evaluate the policy with one rollout and visualize the trajectory
        runEvals(initialState, p, rewards['Policy'], steps['Policy'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
 
        policy = pi.getComputedPolicy()
        allStates = pi.getAllStates()
        current_policy = [[(action.ga, action.pSelection) 
            for action in policy.getActionDistributionForState(state)] 
            for state in allStates]
        policy_converged['Policy'].append(current_policy == last_policy)
        last_policy = current_policy
 
    simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
            world + ' Policy Iteration Policy Map.pkl')
    dumpCSVp(iterations, timing['Policy'], rewards['Policy'], steps['Policy'],
             convergence['Policy'], world, 'Policy', policy_converged['Policy'])
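# Added sketch (not part of the original): what the policy-stability check in
# pIteration computes. Each snapshot is a list of (action, probability) pairs
# per state, as built from policy.getActionDistributionForState above; the
# policy is flagged as converged once two successive snapshots compare equal.
# The action names below are illustrative assumptions.
from collections import defaultdict

policy_converged = defaultdict(list)
last_policy = []
snapshots = [
    [[('north', 1.0)], [('east', 1.0)]],   # iteration 1
    [[('north', 1.0)], [('east', 1.0)]],   # iteration 2: unchanged
]
for current_policy in snapshots:
    policy_converged['Policy'].append(current_policy == last_policy)
    last_policy = current_policy

print(policy_converged['Policy'])  # [False, True]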
Example #3
def vIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)

    allStates = getAllStates(domain, rf, tf, initialState)

    print("*** {} Value Iteration Analysis".format(world))

    iterations = range(1, MAX_ITERATIONS + 1)
    vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, 1)
    vi.setDebugCode(0)
    vi.performReachabilityFrom(initialState)
    vi.toggleUseCachedTransitionDynamics(False)
    timing['Value'].append(0)
    for nIter in iterations:
        startTime = clock()
        vi.runVI()
        p = vi.planFromState(initialState)
        endTime = clock()
        timing['Value'].append((endTime-startTime)*1000)

        convergence['Value'].append(vi.latestDelta)
        # evaluate the policy with evalTrials roll outs
        runEvals(initialState, p, rewards['Value'], steps['Value'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))

    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
            world + ' Value Iteration Policy Map.pkl')
    dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value')
    flag = True
    timing['Value'].append(0)
    for nIter in iterations:
        startTime = clock()
        vi.runVI()
        #timing['Value'].append((clock()-startTime) * 1000)
        timing['Value'].append(timing['Value'][-1] + clock() - startTime)
        p = vi.planFromState(initialState)
        convergence['Value'].append(vi.latestDelta)
        # evaluate the policy with evalTrials roll outs
        runEvals(initialState, p, rewards['Value'], steps['Value'])
        #if nIter == 1:
        #simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
        if (vi.latestDelta < 1e-6) and flag:
            flag = False
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Value Iteration {}".format(nIter))
            dumpPolicyMap(
                MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                'Value {} Iter {} Policy Map.pkl'.format(world, nIter))
        # if vi.latestDelta <1e-6:
        #     break
    print("\n\n\n")
    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                           "Value Iteration {}".format(nIter))
    dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'],
            convergence['Value'], world, 'Value')

    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-3, 10, 1)
    pi.toggleUseCachedTransitionDynamics(False)
    print("//{} Policy Iteration Analysis//".format(world))
    flag = True
            lr, qInit, epsilon)
        agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon,
                          300)
        #agent.setLearningRateFunction(SoftTimeInverseDecayLR(1.,0.))
        agent.setDebugCode(0)
        print("//{} {} Iteration Analysis//".format(world, Qname))
        for nIter in iterations:
            if nIter % 50 == 0: print(nIter)
            startTime = clock()
            ea = agent.runLearningEpisode(env, 300)
            if len(timing[Qname]) > 0:
                timing[Qname].append(timing[Qname][-1] + clock() - startTime)
            else:
                timing[Qname].append(clock() - startTime)
            env.resetEnvironment()
            agent.initializeForPlanning(rf, tf, 1)
            p = agent.planFromState(
                initialState)  # run planning from our initial state
            last10Chg.append(agent.maxQChangeInLastEpisode)
            convergence[Qname].append(sum(last10Chg) / 10.)
            # evaluate the policy with one rollout and visualize the trajectory
            runEvals(initialState, p, rewards[Qname], steps[Qname])

            if nIter == 9 or nIter == 100 or nIter == 1066 or nIter == 2900:
                simpleValueFunctionVis(agent, p, initialState, domain,
                                       hashingFactory,
                                       "Q-learning Iteration {}".format(nIter))
                raw_input('Press enter to continue')

    print("C'est fin")
                ea = agent.runLearningEpisode(env)

                env.resetEnvironment()
                agent.initializeForPlanning(rf, tf, 1)
                p = agent.planFromState(
                    initialState)  # run planning from our initial state
                timing[Qname].append((clock() - startTime) * 1000)
                last10Rewards.append(agent.maxQChangeInLastEpisode)
                convergence[Qname].append(sum(last10Rewards) / 10.)
                # evaluate the policy with one rollout and visualize the trajectory
                runEvals(initialState, p, rewards[Qname], steps[Qname])
                # if (lr == 0.9 and epsilon == 0.5 and nIter == 1):
                # simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname + " Iter: 1")

                # # Uncomment to visualize environment after first iteration
                # if nIter == 1:
                #     simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname + " {}".format(nIter))
                #     break
            MapPrinter.printPolicyMap(
                getAllStates(domain, rf, tf, initialState), p, gen.getMap())
            print("\n\n\n")
            simpleValueFunctionVis(agent, p, initialState, domain,
                                   hashingFactory, Qname + " {}".format(nIter))
            dumpCSV(iterations, timing[Qname], rewards[Qname], steps[Qname],
                    convergence[Qname], world, Qname)

    print('done')
    # if lr ==0.9 and epsilon ==0.3:
    # simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname+' {}'.format(nIter))
    # input('s')
                for state in allStates
            }
            if nIter == 1:
                convergence['Policy'].append(18)
            else:
                convergence['Policy'].append(
                    comparePolicies(last_policy, current_policy))
                print('convergence policy = ' +
                      str(comparePolicies(last_policy, current_policy)))
            last_policy = current_policy
            # evaluate the policy with evalTrials roll outs
            runEvals(initialState, p, rewards['Value'], steps['Value'])

            if nIter == 1:
                simpleValueFunctionVis(
                    vi, p, initialState, domain, hashingFactory,
                    "Value Iter {} Disc {}".format(nIter, discount))
                dumpPolicyMap(
                    MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                    'Value {} Iter {} Disc {} Policy Map.pkl'.format(
                        world, nIter, str(discount)))
            if nIter % 2 == 1:
                simpleValueFunctionVis(
                    vi, p, initialState, domain, hashingFactory,
                    "Value Iter {} Disc {}".format(nIter, discount))
                dumpPolicyMap(
                    MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                    'Value {} Iter {} Disc {} Policy Map.pkl'.format(
                        world, nIter, str(discount)))
            if nIter == 5 or vi.latestDelta < 1e-6:
                dumpPolicyMap(
Example #8
        timing['Policy'].append(timing['Policy'][-1] + clock() - startTime)
        policy = pi.getComputedPolicy()
        current_policy = {
            state: policy.getAction(state).toString()
            for state in allStates
        }
        convergence['Policy2'].append(pi.lastPIDelta)
        if nIter == 1:
            convergence['Policy'].append(999)
        else:
            convergence['Policy'].append(
                comparePolicies(last_policy, current_policy))
        last_policy = current_policy
        runEvals(initialState, p, rewards['Policy'], steps['Policy'])
        if nIter == 5 or convergence['Policy2'][-1] < 1e-6:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory,
                                   "Policy Iteration {}".format(nIter))
            dumpPolicyMap(
                MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                'Policy {} Iter {} Policy Map.pkl'.format(world, nIter))
        if convergence['Policy2'][-1] < 1e-6:
            break
    MapPrinter.printPolicyMap(pi.getAllStates(), p, gen.getMap())
    print("\n\n\n")
    dumpCSV(nIter, timing['Policy'][1:], rewards['Policy'], steps['Policy'],
            convergence['Policy2'], world, 'Policy')
    #raise

    MAX_ITERATIONS = NUM_INTERVALS = MAX_ITERATIONS * 10
    increment = MAX_ITERATIONS / NUM_INTERVALS
    iterations = range(1, MAX_ITERATIONS + 1)
    for qInit in [-100, 0, 100]:
    print("//Hard Value Iteration Analysis//")
    for nIter in iterations:
        startTime = clock()
        vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1,
                            nIter)
        # Delta threshold chosen so that value iteration always runs the full
        # nIter iterations, for comparison with the other algorithms.
        # run planning from our initial state
        vi.setDebugCode(0)
        p = vi.planFromState(initialState)
        timing['Value'].append(clock() - startTime)
        convergence['Value'].append(vi.latestDelta)
        # evaluate the policy with evalTrials roll outs
        runEvals(initialState, p, rewards['Value'], steps['Value'])

        if nIter == 2 or nIter == 20 or nIter == 100:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   'Value Iteration %s' % (nIter))

    MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
    print("\n\n\n")
    # simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter))
    # input('c')
    dumpCSV(iterations,
            timing['Value'],
            rewards['Value'],
            steps['Value'],
            convergence['Value'],
            world,
            'Value',
            discount=discount)

    print("//Hard Policy Iteration Analysis//")
            startTime = clock()
            vi.runVI()
            timing['Value'].append(timing['Value'][-1] + clock() - startTime)
            p = vi.planFromState(initialState)
            convergence['Value'].append(vi.latestDelta)
            # evaluate the policy with evalTrials roll outs
            runEvals(initialState, p, rewards['Value'], steps['Value'])
            if nIter == 5 or vi.latestDelta < 1e-6:
                #simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
                dumpPolicyMap(
                    MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                    'Hard/VI/Value {} Iter {} Policy Map.pkl'.format(
                        world, nIter))
            if nIter == 100:
                simpleValueFunctionVis(vi, p, initialState, domain,
                                       hashingFactory,
                                       "Value Iteration {}".format(nIter))
                dumpPolicyMap(
                    MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                    'Hard/VI/Value {} Iter {} Policy Map.pkl'.format(
                        world, nIter))
            #if vi.latestDelta <1e-6:
            #    break
        print("\n\n\n")
        dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'],
                convergence['Value'], world, 'Value')
        time.sleep(20)

    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, 1e-3, 10, 1)
    pi.toggleUseCachedTransitionDynamics(False)
    print("//{} Policy Iteration Analysis//".format(world))
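# Added sketch (not part of the original listings): two timing-bookkeeping
# styles appear in the loops above. Some record per-iteration wall-clock time
# in milliseconds, e.g. timing['Value'].append((clock() - startTime) * 1000),
# while others keep a cumulative running total in seconds seeded with a 0
# entry, e.g. timing['Value'].append(timing['Value'][-1] + clock() - startTime).
# A minimal side-by-side illustration; work() is an assumed placeholder for
# the planning or learning step.
from time import clock  # time.clock(), as used in the listings above

def work():
    pass  # placeholder for vi.runVI() / agent.runLearningEpisode(env)

per_iter_ms = []      # style 1: one elapsed value per iteration, in ms
cumulative_s = [0]    # style 2: running total in seconds, seeded with 0
for _ in range(3):
    start = clock()
    work()
    elapsed = clock() - start
    per_iter_ms.append(elapsed * 1000)
    cumulative_s.append(cumulative_s[-1] + elapsed)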
Example #11
	print("//Easy Value Iteration Analysis//")
	
	for nIter in iterations:
		startTime = clock()
		# Delta threshold chosen so that value iteration always runs the full
		# nIter iterations, for comparison with the other algorithms.
		vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, nIter)
		# run planning from our initial state
		vi.setDebugCode(0)
		p = vi.planFromState(initialState)
		timing['Value'].append(clock() - startTime)
		convergence['Value'].append(vi.latestDelta)
		# evaluate the policy with evalTrials roll outs
		runEvals(initialState, p, rewards['Value'], steps['Value'])
		if nIter in (1, 15, 25, 50, 100):
			simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter))
	MapPrinter.printPolicyMap(vi.getAllStates(), p, gen.getMap())
	print("\n\n\n")
	# simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, 'Value Iteration %s' % (nIter))
	dumpCSV(iterations, timing['Value'], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value',
	        discount=discount)
	#

	print("//Easy Policy Iteration Analysis//")
	for nIter in iterations:
		startTime = clock()
		# Delta threshold chosen so that policy iteration always runs the full
		# nIter iterations, for comparison with the other algorithms.
		pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1,
		                     nIter)
		# run planning from our initial state
Example #12
    for nIter in iterations:
        startTime = clock()
        vi.runVI()
        timing['Value'].append(timing['Value'][-1] + clock() - startTime)
        p = vi.planFromState(initialState)
        convergence['Value'].append(vi.latestDelta)
        # evaluate the policy with evalTrials roll outs
        runEvals(initialState, p, rewards['Value'], steps['Value'])
        if nIter == 10:
            dumpPolicyMap(
                MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                'out/hard/Value {} Iter {} Policy Map.pkl'.format(
                    world, nIter))
        if vi.latestDelta < 1e-5:
            simpleValueFunctionVis(
                vi, p, initialState, domain, hashingFactory,
                "Converged Value Iteration {}".format(nIter))
            dumpPolicyMap(
                MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                'out/hard/Converged Value {} Iter {} Policy Map.pkl'.format(
                    world, nIter))
            break
        elif nIter == MAX_ITERATIONS:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory,
                                   "Final Value Iteration {}".format(nIter))
            dumpPolicyMap(
                MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                'out/hard/Final Value {} Iter {} Policy Map.pkl'.format(
                    world, nIter))

    print("\n\n\n")
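# Added sketch (not part of the original): reloading one of the pickled policy
# maps written above, assuming dumpPolicyMap stores the printed policy-map
# object with the standard pickle module (suggested by the .pkl extension, but
# an assumption). The filename is illustrative; real names are formatted from
# world and nIter as in the calls above.
import pickle

with open('out/hard/Value Hard Iter 10 Policy Map.pkl', 'rb') as f:
    policy_map = pickle.load(f)
print(policy_map)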