Example #1
# Stdlib import used by gridworld(); the Jython/BURLAP helpers referenced below
# (MapPrinter and friends) are assumed to be provided by the surrounding project.
from copy import deepcopy

def gridworld(world='Easy'):
    if world == 'Easy':
        #userMap = [[-4, -4, -4, -4, 100],
        #           [-4, 1, -4, 1, -100],
        #           [-4, 1, 1, 1, -4],
        #           [-4, 1, -4, 1, -4],
        #           [-4, -4, -4, -4, -4]]
        #userMap = [[1,0,0,0],
        #       [0,1,0,0],
        #       [0,1,1,0],
        #       [0,0,0,0]]
        userMap = [[0,1,0,0,0],
                   [0,1,0,1,0],
                   [0,1,0,0,0],
                   [0,1,1,1,0],
                   [0,0,0,0,0]]
    else:
        userMap = [[1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
               [1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0],
               [1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0],
               [1,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0],
               [0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0],
               [0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0],
               [0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0],
               [0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0],
               [0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0],
               [0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0],
               [0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0],
               [0,0,0,1,1,0,0,1,1,1,1,1,1,1,0,0],
               [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
               [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
               [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
               [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

    n = len(userMap)
    tmp = deepcopy(userMap)
    userMap = MapPrinter().mapToMatrix(tmp)
    maxX = n - 1
    maxY = n - 1
    
    # Print the map that is being analyzed
    print("\n\n*** {} Grid World Analysis ***\n".format(world))
    MapPrinter().printMap(MapPrinter.matrixToMap(userMap))
    
    return userMap, maxX, maxY
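
A minimal driver sketch showing how this map builder feeds the solvers in the next examples (assuming gridworld, vIteration, pIteration, and qLearning from Examples #2-#4 live in the same Jython script):

    userMap, maxX, maxY = gridworld('Easy')      # build, convert, and print the grid
    vIteration('Easy', userMap, maxX, maxY)      # value iteration (Example #4)
    pIteration('Easy', userMap, maxX, maxY)      # policy iteration (Example #3)
    qLearning('Easy', userMap, maxX, maxY)       # Q-learning sweep (Example #2)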
Example #2
# Stdlib imports used below; the BURLAP classes (QLearning, BasicGridWorld, ...) and
# project helpers (runEvals, dumpCSV, visualizeInitialGridWorld, ...) are assumed to
# be importable in this Jython script.
from collections import defaultdict, deque
from time import clock

def qLearning(world, userMap, maxX, maxY, discount=0.9, MAX_ITERATIONS=1000):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)

    allStates = getAllStates(domain, rf, tf, initialState)

    NUM_INTERVALS = MAX_ITERATIONS
    iterations = range(1, MAX_ITERATIONS + 1)
    qInit = 0
    for lr in [0.01, 0.1, 0.5]:
        for epsilon in [0.3, 0.5, 0.7]:
            last10Chg = deque([10] * 10, maxlen=10)
            Qname = 'Q-Learning L{:0.2f} E{:0.1f}'.format(lr, epsilon)
            #agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon, 300)
            agent = QLearning(domain, discount, hashingFactory, qInit, lr, epsilon)
            agent.setDebugCode(0)

            print("*** {}: {}".format(world, Qname))

            for nIter in iterations:
                if nIter % 200 == 0: 
                    print('Iteration: {}'.format(nIter))

                startTime = clock()
                #ea = agent.runLearningEpisode(env, 300)
                ea = agent.runLearningEpisode(env)
                env.resetEnvironment()
                agent.initializeForPlanning(rf, tf, 1)
                p = agent.planFromState(initialState)  # run planning from our initial state
                endTime = clock()
                timing[Qname].append((endTime-startTime)*1000)

                last10Chg.append(agent.maxQChangeInLastEpisode)
                convergence[Qname].append(sum(last10Chg)/10.)
                # evaluate the policy with one rollout and visualize the trajectory
                runEvals(initialState, p, rewards[Qname], steps[Qname], rf, tf, evalTrials=1)
                if nIter % 1000 == 0:
                    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
                                  '{} {} Iter {} Policy Map.pkl'.format(world, Qname, nIter))
                    simpleValueFunctionVis(agent, p, initialState, domain, hashingFactory, Qname)
                
            dumpCSV(nIter, timing[Qname], rewards[Qname], steps[Qname], convergence[Qname], world, Qname) 
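
The convergence series recorded above is a rolling average: a deque keeps the largest Q-value change from each of the last 10 episodes, and the mean of that window is appended every iteration. A self-contained sketch of just that bookkeeping, with made-up change values standing in for agent.maxQChangeInLastEpisode:

    from collections import deque

    last10Chg = deque([10] * 10, maxlen=10)       # seeded high so early averages read as unconverged
    convergence = []
    for change in [5.0, 2.5, 1.2, 0.6, 0.3]:      # hypothetical per-episode max Q-value changes
        last10Chg.append(change)                  # the oldest of the 10 entries drops out
        convergence.append(sum(last10Chg) / 10.)  # same smoothing as in the loop above
    print(convergence)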
Example #3
def pIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    policy_converged = defaultdict(list)    
    last_policy = defaultdict(list)

    allStates = getAllStates(domain, rf, tf, initialState)

    print("*** {} Policy Iteration Analysis".format(world))

    iterations = range(1, MAX_ITERATIONS + 1)
    pi = PolicyIteration(domain, rf, tf, discount, hashingFactory, -1, 1, 1)
    pi.setDebugCode(0)
    for nIter in iterations:
        startTime = clock()
        #pi = PolicyIteration(domain,rf,tf,discount,hashingFactory,-1,1, nIter); 
        #pi.setDebugCode(0)
        # run planning from our initial state
        p = pi.planFromState(initialState)
        endTime = clock()
        timing['Policy'].append((endTime-startTime)*1000)

        convergence['Policy'].append(pi.lastPIDelta)         
        # evaluate the policy with one rollout and visualize the trajectory
        runEvals(initialState, p, rewards['Policy'], steps['Policy'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
 
        policy = pi.getComputedPolicy()
        allStates = pi.getAllStates()
        current_policy = [[(action.ga, action.pSelection) 
            for action in policy.getActionDistributionForState(state)] 
            for state in allStates]
        policy_converged['Policy'].append(current_policy == last_policy)
        last_policy = current_policy
 
    simpleValueFunctionVis(pi, p, initialState, domain, hashingFactory, "Policy Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
            world + ' Policy Iteration Policy Map.pkl')
    dumpCSVp(iterations, timing['Policy'], rewards['Policy'], steps['Policy'],convergence['Policy'], 
            world, 'Policy', policy_converged['Policy'])
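
The policy_converged flag above is set by comparing the full action distribution of the newly computed policy against the one from the previous iteration. A toy version of that comparison with hypothetical states and actions (plain Python, no BURLAP objects):

    last_policy = []
    policy_converged = []
    for per_iteration_policy in [
        {'s0': [('up', 1.0)], 's1': [('right', 1.0)]},   # iteration 1
        {'s0': [('up', 1.0)], 's1': [('up', 1.0)]},      # iteration 2: s1's action changed
        {'s0': [('up', 1.0)], 's1': [('up', 1.0)]},      # iteration 3: identical to iteration 2
    ]:
        current_policy = [per_iteration_policy[s] for s in sorted(per_iteration_policy)]
        policy_converged.append(current_policy == last_policy)  # same check as in pIteration
        last_policy = current_policy
    print(policy_converged)   # [False, False, True]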
Example #4
def vIteration(world, userMap, maxX, maxY, discount=0.99, MAX_ITERATIONS=100):
    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)

    allStates = getAllStates(domain, rf, tf, initialState)

    print("*** {} Value Iteration Analysis".format(world))

    iterations = range(1, MAX_ITERATIONS + 1)
    vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, 1)
    vi.setDebugCode(0)
    vi.performReachabilityFrom(initialState)
    vi.toggleUseCachedTransitionDynamics(False)
    timing['Value'].append(0)  # placeholder entry; dropped below via timing['Value'][1:]
    for nIter in iterations:
        startTime = clock()
        vi.runVI()
        p = vi.planFromState(initialState)
        endTime = clock()
        timing['Value'].append((endTime-startTime)*1000)

        convergence['Value'].append(vi.latestDelta)
        # evaluate the policy with evalTrials roll outs
        runEvals(initialState, p, rewards['Value'], steps['Value'], rf, tf, evalTrials=1)
        if nIter == 1 or nIter == 50:
            simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))

    simpleValueFunctionVis(vi, p, initialState, domain, hashingFactory, "Value Iteration {}".format(nIter))
    dumpPolicyMap(MapPrinter.printPolicyMap(allStates, p, gen.getMap()),
            world + ' Value Iteration Policy Map.pkl')
    dumpCSV(nIter, timing['Value'][1:], rewards['Value'], steps['Value'], convergence['Value'], world, 'Value')
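
convergence['Value'] tracks vi.latestDelta, the largest change any state's value received in the most recent sweep; iteration has effectively converged once that delta collapses toward zero. A minimal pure-Python sketch of the same criterion on a hypothetical three-state chain (deterministic moves, terminal reward 100, step cost -4; not the grid world above):

    rewards = {0: -4, 1: -4, 2: 100}             # state 2 is terminal
    discount = 0.99
    V = {0: 0.0, 1: 0.0, 2: 0.0}
    for sweep in range(1, 101):
        latestDelta = 0.0
        for s in (0, 1):                         # the single action moves from s to s + 1
            backup = rewards[s] + discount * V[s + 1]
            latestDelta = max(latestDelta, abs(backup - V[s]))
            V[s] = backup
        V[2] = rewards[2]                        # terminal state keeps its reward as its value
        if latestDelta < 1e-6:                   # analogue of watching vi.latestDelta flatten out
            break
    print("converged after {} sweeps: {}".format(sweep, V))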
Example #5
    #           [ 0, -3, 0, 0, 0, 0, 0, 0, 0, 0],
    #           ]

    userMap = [[0, 0, 0, 0, -5, 0, 0, 0, -1, 0],
               [0, 1, 0, 0, 1, 1, 1, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
               [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0],
               [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0],
               [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
               [0, 0, 0, -3, 0, 0, 0, 0, 0, 0]]

    n = len(userMap)
    tmp = java.lang.reflect.Array.newInstance(java.lang.Integer.TYPE, [n, n])
    for i in range(n):
        for j in range(n):
            tmp[i][j] = userMap[i][j]
    userMap = MapPrinter().mapToMatrix(tmp)
    maxX = maxY = n - 1

    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    #    Print the map that is being analyzed
    print "/////{} Grid World Analysis/////\n".format(world)
    MapPrinter().printMap(MapPrinter.matrixToMap(userMap))
    visualizeInitialGridWorld(domain, gen, env)

    hashingFactory = SimpleHashableStateFactory()
Example #6
                [0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0],
                [0,1,-5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
                [0,0,-5,0,0,1,0,0,0,0,0,1,-5,0,1,0,0,0,0,0],
                [0,1,-5,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,-1,-1],
                [0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,1,-1,-1],
                [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,-1,-1],
                [0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0],
                [0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0],
                [0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]
  
    n = len(userMap)
    tmp = java.lang.reflect.Array.newInstance(java.lang.Integer.TYPE, [n, n])
    for i in range(n):
        for j in range(n):
            tmp[i][j] = userMap[i][j]
    userMap = MapPrinter().mapToMatrix(tmp)
    maxX = maxY = n - 1

    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    # Print the map that is being analyzed
    print "/////{} Grid World Analysis/////\n".format(world)
    MapPrinter().printMap(MapPrinter.matrixToMap(userMap))

    hashingFactory = SimpleHashableStateFactory()
    increment = MAX_ITERATIONS/NUM_INTERVALS
Example #7
               [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
               [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
               [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0],
               [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
               [0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0],
               [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0],
               [0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0],
               [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0],
               [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]

    n = len(userMap)
    tmp = java.lang.reflect.Array.newInstance(java.lang.Integer.TYPE, [n, n])
    for i in range(n):
        for j in range(n):
            tmp[i][j] = userMap[i][j]
    userMap = MapPrinter().mapToMatrix(tmp)
    maxX = maxY = n - 1

    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    # rf = BasicRewardFunction(4,2,userMap)
    # tf = BasicTerminalFunction(4,2)
    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)

    env = SimulatedEnvironment(domain, rf, tf, initialState)

    #    Print the map that is being analyzed
    print "/////Easy Grid World Analysis/////\n"
Example #8
              [ 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
              [ 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
              [ 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
              [ 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
              [ 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
              [ 0, 1, 0, 1, 1, 1, 1, 1, 1, 0],
              [ 0, -3, 0, 0, 0, 0, 0, 0, 0, 0],              
              ]'''
    # userMap = [ [0,0],[0,0]]
    userMap = [[0, 0, -5, 0], [0, 1, 0, 0], [0, 1, 1, 0], [0, 0, 0, 0]]
    n = len(userMap)
    tmp = java.lang.reflect.Array.newInstance(java.lang.Integer.TYPE, [n, n])
    for i in range(n):
        for j in range(n):
            tmp[i][j] = userMap[i][j]
    userMap = MapPrinter().mapToMatrix(tmp)
    maxX = maxY = n - 1

    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)

    # shows starting and ending positions
    #visualizeInitialGridWorld(domain, gen, env)
    #    Print the map that is being analyzed
    print("/////{} Grid World Analysis/////\n".format(world))
    MapPrinter().printMap(MapPrinter.matrixToMap(userMap))
Example #9
File: hardGW.py  Project: solb0039/CS7641
    # for i in range(n):
    #     for j in range(n):
    #         tmp[i][j] = userMap[i][j]
    # userMap = MapPrinter().mapToMatrix(tmp)
    maxX = maxY = n - 1

    gen = BasicGridWorld(userMap, maxX, maxY)
    domain = gen.generateDomain()
    initialState = gen.getExampleState(domain)

    rf = BasicRewardFunction(maxX, maxY, userMap)
    tf = BasicTerminalFunction(maxX, maxY)
    env = SimulatedEnvironment(domain, rf, tf, initialState)
    #    Print the map that is being analyzed
    print "/////{} Grid World Analysis/////\n".format(world)
    MapPrinter().printMap(MapPrinter.matrixToMap(userMap))

    hashingFactory = SimpleHashableStateFactory()
    increment = MAX_ITERATIONS / NUM_INTERVALS
    timing = defaultdict(list)
    rewards = defaultdict(list)
    steps = defaultdict(list)
    convergence = defaultdict(list)
    allStates = getAllStates(domain, rf, tf, initialState)
    # Value Iteration
    iterations = range(1, MAX_ITERATIONS + 1)
    vi = ValueIteration(domain, rf, tf, discount, hashingFactory, -1, 1)
    vi.toggleUseCachedTransitionDynamics(False)
    vi.setDebugCode(0)
    vi.performReachabilityFrom(initialState)
    print "//{} Value Iteration Analysis//".format(world)