def f(in1,in2):
    " write your linear function approximator here (5 lines or so)"
    tilecode(in1, in2, tileIndices)
    fvalue = 0.0
    for i in range(numTilings):
        fvalue += w[tileIndices[i]]
    return fvalue
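Most examples on this page assume module-level globals that are not shown: a tilecode function plus tileIndices, numTilings, a weight vector (w, weights, or theta), and a step size alpha, all provided by the course's tile-coding assignment. A rough, self-contained sketch of such a setup (the grid sizes, the input range, and this tilecode itself are illustrative assumptions, not the assignment's actual coder):

import numpy as np

numTilings = 4                     # assumed number of overlapping tilings
tilesPerTiling = 9                 # assumed 9 x 9 grid per tiling
numTiles = numTilings * tilesPerTiling * tilesPerTiling
alpha = 0.1 / numTilings
w = np.zeros(numTiles)
tileIndices = [-1] * numTilings

def tilecode(in1, in2, indices):
    # Hypothetical tile coder: fill `indices` with one active tile per tiling,
    # for inputs assumed to lie in [0, 1).
    for t in range(numTilings):
        shift = t / numTilings                       # offset each tiling slightly
        x = int((in1 + shift) * tilesPerTiling) % tilesPerTiling
        y = int((in2 + shift) * tilesPerTiling) % tilesPerTiling
        indices[t] = t * tilesPerTiling * tilesPerTiling + x * tilesPerTiling + y
    return indices

With globals like these in place, f above returns the sum of the active weights, and the learn functions further down perform the matching gradient-descent update.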
Example #2
def learnEpisode(alpha, eps, gamma, theta1, theta2):
        in1, in2 = mountaincar.init()
        currentStates = tilecode(in1, in2, [-1]*numTilings) # tile indices of the initial state
        episodeReturn = 0
        step = 0
        while(True): # continue until we reach terminal state (None)
            action = epsGreedyPolicy(currentStates, eps, theta1, theta2)
            reward, nextStatePosVel = mountaincar.sample((in1, in2), action)
            episodeReturn += reward
            step += 1
            if nextStatePosVel:
                nextIn1, nextIn2 = nextStatePosVel
                nextStates = tilecode(nextIn1, nextIn2, [-1]*numTilings)
                if(np.random.randint(0,2)):  # will return ints between [0,2)
                    updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
                else:
                    updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
                currentStates = nextStates
                in1, in2 = nextIn1, nextIn2
            else: # next state is terminal
                nextStates = None  # the terminal state's value is zero (assumes updateTheta handles None)
                if(np.random.randint(0,2)):  # 0 or 1: pick which estimate to update
                    updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
                else:
                    updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
                return episodeReturn, step
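learnEpisode depends on helpers that are not shown on this page (epsGreedyPolicy and updateTheta). A minimal sketch of what updateTheta might look like, assuming the theta[tileIndex + action * numTiles] layout used by several other examples here and a terminal next state passed as None (both assumptions, not the author's code):

def Qhat(states, action, theta):
    # Hypothetical helper: Q(s, a) as the sum of the action's weight block
    # over the active tile indices (numTiles assumed to be a global).
    return sum(theta[i + action * numTiles] for i in states)

def updateTheta(thetaA, thetaB, states, nextStates, action, reward, alpha, gamma):
    # One plausible double Q-learning update matching the calls above:
    # thetaA picks the greedy next action, thetaB evaluates it, thetaA is updated.
    if nextStates is None:                     # terminal state: bootstrap value is 0
        target = reward
    else:
        qNext = [Qhat(nextStates, a, thetaA) for a in range(3)]
        target = reward + gamma * Qhat(nextStates, qNext.index(max(qNext)), thetaB)
    delta = target - Qhat(states, action, thetaA)
    for i in states:
        thetaA[i + action * numTiles] += alpha * delta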
Example #3
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    sum = 0.0
    tilecode(in1, in2, tileIndices)
    for i in tileIndices:
        sum = sum + theta[i] * 1
    return sum
def learn(x, y, target):
    global weights
    tilecode(x, y, tileIndices)
    innerProduct = f(x, y)
    for index in tileIndices:
        newWeight = weights[index] + step_size * (target - innerProduct)
        weights[index] = newWeight
def f(x,y):
    # write your linear function approximator here (5 lines or so)
    total = 0.0
    tilecode(x, y, tileIndices)
    for i in tileIndices:
        total += weight[i]
    return total
Example #6
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    tilecode(x, y, tileIndices)
    sum = 0.0
    for i in tileIndices:
        sum += weights[i]
    return sum
Example #7
def f(in1,in2):
    tilecode(in1, in2, tileIndices)
    # Calculate the estimated function value for the inputs in1, in2
    value = 0
    for index in tileIndices:
        value += weights[index]
    return value
def learn(x, y, target):
    global weights
    tilecode(x, y, tileIndices)
    innerProduct = f(x, y)
    for index in tileIndices:
        newWeight = weights[index] + step_size * (target - innerProduct)
        weights[index] = newWeight
Example #9
def f(x,y):
	# write your linear function approximator here (5 lines or so)
	tilecode(x,y,tileIndices)
	sum = 0.0
	for i in tileIndices:
		sum+=weights[i]
	return sum
Example #10
def f(x,y):
    # write your linear function approximator here (5 lines or so)
    tilecode(x, y, tileIndices)
    f = 0
    for i in tileIndices:
        f = f + weight[int(i)]
    return f
Example #11
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    total = 0.0
    tilecode(x, y, tileIndices)
    for i in tileIndices:
        total += weight[i]
    return total
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    tilecode(in1, in2, tileIndices)
    sum1 = 0
    for index in tileIndices:
        sum1 += theta[index]
    return sum1
Example #13
def f(in1, in2):
    #i = 0
    thef = 0
    tilecode(in1, in2, tileIndices)
    #print(tileIndices)
    for a in tileIndices:
        thef += theta[a]
    return thef
Example #14
def learn(in1, in2, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    tileIndices = [-1] * numTilings  # initialize your list of tile indices here
    tilecode(in1, in2, tileIndices)
    estimate = f(in1, in2)
    for index in tileIndices:
        theta[index] += alpha * (target - estimate)
def learn(x,y,target):
    # gradient descent learning algorithm

    # Get the active features
    tilecode(x,y,tileIndices)
    # Compute the current estimate once, then apply the update to each active feature
    estimate = f(x,y)
    for feature in tileIndices:
        weights[int(feature)] = weights[int(feature)] + alpha*(target - estimate)
Example #16
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    sumOfWeights = 0
    tilecode(in1, in2, indices)

    for tile in indices:
        sumOfWeights += theta[tile]

    return sumOfWeights
Example #17
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    tileIndices = [-1] * numTilings  # initialize your list of tile indices here
    tilecode(in1, in2, tileIndices)
    rValue = 0
    for index in tileIndices:
        rValue += theta[index]

    return rValue
Example #18
def writeF(theta1, theta2):
    fout = open('value', 'w')
    steps = 50
    for i in range(steps):
        for j in range(steps):
            F = [-1] * numTilings
            tilecode(-1.2 + (i * 1.7 / steps), -0.07 + (j * 0.14 / steps), F)
            height = -max(Qs(F, (theta1 + theta2) / 2))  # average the two estimates, then take the greedy value
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
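Qs is used by this writeF (and by later examples) but never defined on this page; one plausible sketch for the two-argument form used just above, assuming three actions and the same theta[index + action * numTiles] layout (the single-argument Qs(F) below presumably closes over a global theta instead):

def Qs(F, theta):
    # Hypothetical: the three action values for the active tile indices F.
    return [sum(theta[i + a * numTiles] for i in F) for a in range(3)]

-max(Qs(F, (theta1 + theta2) / 2)) is then the negated greedy value that writeF writes out for plotting.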
def writeF():
    fout = open('value', 'w')
    F = [0]*numTilings
    steps = 50
    for i in range(steps):
        for j in range(steps):
            tilecode(-1.2+i*1.7/steps, -0.07+j*0.14/steps, F)
            height = -max(Qs(F))
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def f(x,y):
    
    #Getting which tiles are important. 
    tilecode(x,y,tileIndices)
    
    #Value to store the sum of the weights for the features
    sum = 0
    
    #Adding important weights for the features
    for features in tileIndices:
        sum += weights[int(features)]
    return sum
Example #21
def learn():    
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01*rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0        
            traces = zeros(n)
            S=mountaincar.init()
            # Until S is terminal:
            while S!=None:
                # Choose action
                tilecode(S,F)
                if rand() <= Emu:                 # randomly explore
                    a = randint(0, 2)
                else:                             # greedy action choice
                    a = argmax([QValue(F,0,theta),QValue(F,1,theta),QValue(F,2,theta)])
                # Replacing traces on indices where feature vector is 1
                for index in F:
                    traces[index+(a*numTiles)] = 1                     
                # Take action, observe r,Sp
                r,Sp=mountaincar.sample(S,a)
                G += r
                # If terminal action update theta and end episode
                if Sp == None:
                    delta = r - QValue(F,a,theta)
                    theta =  theta + alpha*delta*traces
                    break
                # Choose expected next action
                tilecode(Sp,Fp)
                ap = argmax([QValue(Fp,0,theta),QValue(Fp,1,theta),QValue(Fp,2,theta)])
                # Update theta
                randomAction = (Epi/3)*QValue(Fp,0,theta) + (Epi/3)*QValue(Fp,1,theta)+ (Epi/3)*QValue(Fp,2,theta)
                delta = r + randomAction + (1-Epi)*QValue(Fp,ap,theta) - QValue(F,a,theta)
                theta = theta + alpha*delta*traces
                # Decay every component
                traces = gamma*lmbda*traces
                S=Sp
                step += 1
            returnSum += G

            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G-episodeReturn[episodeNum])/(run+1)   # incremental average over runs
            episodeSteps[episodeNum] += (step-episodeSteps[episodeNum])/(run+1)
        print "Average return:", returnSum/numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum/numRuns
    writeAverages(episodeReturn,episodeSteps)
Example #22
def writeF():
    fout = open('value', 'w')
    F = [0]*numTilings
    steps = 50
    for i in range(steps):
        for j in range(steps):
            S = (-1.2+i*1.7/steps, -0.07+j*0.14/steps)
            tilecode(S, F)
            Qa = zeros(3)
            for a_poss in [0,1,2]:
                Qa[a_poss] = getStateActionValue(w,F,a_poss)
            height = -max(Qa)
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
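getStateActionValue is called here and again in a later fragment but is not defined on this page; a minimal sketch consistent with those call sites, under the usual action-block layout (an assumption, with numTiles as a global):

def getStateActionValue(w, F, a):
    # Hypothetical: sum the weights of the active tiles, offset into action a's block.
    return sum(w[int(i) + a * numTiles] for i in F)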
Example #23
def Total(state, action, theta):
    tileIndices = [-1] * numTilings
    tileIndices = tilecode(state[0], state[1], tileIndices)
    total = 0
    for i in range(0, numTilings):
        total += theta[tileIndices[i] + (action * numTiles)]
    return total
def learn(in1, in2, target):
    error = target - f(in1, in2)
    global theta
    update = alpha * error
    active_features = tilecode(in1, in2, indices)
    for i in active_features:
        theta[i] += update
Example #25
def learn(x,y,target):
    # write your gradient descent learning algorithm here (3 lines or so)
    currentFXY = f(x,y)
    featureVectorArray = tilecode(x,y,[-1]*numTilings)
    for i in featureVectorArray:
        theta[i] = theta[i] + alpha*(target - currentFXY)
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    features = zeros(n)
    global theta
    for i in tilecode(in1, in2, indices):
        features[i] = 1
    return dot(theta, features)
Example #27
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    total_f = 0
    TileCoderIndices = tilecode(in1, in2, tileIndices)
    # list of indices j where phi_j(i) is 1, with all others assumed 0.
    for i in TileCoderIndices:
        total_f = total_f + theta[i]
    return total_f
Example #28
def learn(in1, in2, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    f_new = f(in1, in2)
    TileCoderIndices = tilecode(in1, in2, tileIndices)
    for j in TileCoderIndices:
        theta[j] = theta[j] + alpha * (target - f_new)
    #print(target)
    return theta
def updateDelta(tiles, theta, action, newState):
    nextTiles = tilecode(newState[0], newState[1],[-1]*numTilings)
    delta = 0
    nextAction = getBestAction(nextTiles, theta)
    for i in nextTiles:
        delta += theta[i + nextAction*4*81]
    for i in tiles:
        delta -= theta[i + action*4*81]
    return delta
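updateDelta's companions, updateETrace and updateTheta, are used in the Sarsa(lambda)-style fragment further down this page but are not shown; a rough sketch with the same i + action*4*81 indexing (the gamma, lmbda, and alpha defaults are assumptions for illustration):

import numpy as np

def updateETrace(eTrace, tiles, action, gamma=1.0, lmbda=0.9):
    # Decay every component, then set replacing traces on the active features.
    eTrace = gamma * lmbda * np.asarray(eTrace, dtype=float)
    for i in tiles:
        eTrace[i + action * 4 * 81] = 1
    return eTrace

def updateTheta(theta, delta, eTrace, alpha=0.1 / 4):
    # Semi-gradient step along the eligibility trace.
    return np.asarray(theta, dtype=float) + alpha * delta * np.asarray(eTrace)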
Example #30
def f(x,y):
    # write your linear function approximator here (5 lines or so)
    total = 0
    featureVectorArray = tilecode(x,y,[-1]*numTilings)
    for i in featureVectorArray:
        total += theta[i]   # every other component of the length-n feature vector is 0
    return total
Example #31
def writeF(theta1, theta2):
    fout = open('value', 'w')
    steps = 50
    for i in range(steps):
        for j in range(steps):
            F = tilecode(-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps)
            height = -max(Qs(F, theta1, theta2))
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def writeF():
    fout = open('value', 'w')
    F = [0] * numTilings
    steps = 50
    for i in range(steps):
        for j in range(steps):
            tilecode(-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps,
                     F)
            Q = np.sum(theta[F],axis=0)
            height = -max(Q)
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()

    fout = open('returnVal', 'w')
    fout1 = open('stepAvg', 'w')
    for i in range(numEpisodes):
        fout1.write(repr(averageStep[i]) + ' ')
        fout.write(repr(averageReturn[i]) + ' ')
    fout.close()
    fout1.close()
Example #33
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        #your code goes here (20-30 lines, depending on modularity)
        state = mountaincar.init()
        #q1 = [0] * 3 # state-action value q for each
        #q2 = [0] * 3
        #feature_vectors = np.zeros(n)

        step = 0
        while state != None:
            tileIndices = [-1]*numTilings
            tilecode(state[0], state[1], tileIndices) # state[0]: position, state[1]: velocity
            # offset the active indices into each action's block of the weight vector
            q0 = Qs(theta1, tileIndices) + Qs(theta2, tileIndices)                                                   # action 0
            q1 = Qs(theta1, [i + numTiles for i in tileIndices]) + Qs(theta2, [i + numTiles for i in tileIndices])   # action 1
            q2 = Qs(theta1, [i + 2*numTiles for i in tileIndices]) + Qs(theta2, [i + 2*numTiles for i in tileIndices]) # action 2
            Q = np.array([q0, q1, q2])

            # apply epsilon greedy to choose actions
            greedy = np.random.random()
            if(greedy >= epsilon):
                action = Q.argmax()
            else:
                action = np.random.randint(0,3)

            reward, nextS = mountaincar.sample(state, action)
            G = G + reward
            step += 1

            if nextS == None: # next state is terminal: end the episode
                break
            # (the double Q-learning update of theta1 and theta2 belongs here; it is missing from this example)
            state = nextS

        print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
Example #34
def learn(alpha=.1/numTilings, epsilon=0, numEpisodes=1000, numRuns=1):

    returnSum = 0.0
    avgEpisodeReturns = [0]*numEpisodes
    doubleQ = DoubleQ(alpha, epsilon)

    for run in range(numRuns):
        doubleQ.resetQ()
        for episodeNum in range(numEpisodes):
            print("Run: " + str(run) + ", Episode: " + str(episodeNum) + " ....")
            G = 0
            step = 0
            isTerminal = False
            #initialize the mountain car
            stateTuple = mountaincar.init()
            state = tilecode(stateTuple[0], stateTuple[1])

            while (not isTerminal):
                action = doubleQ.policy(state)
                reward, stateTuple = mountaincar.sample(stateTuple, action)
                G+=reward
                step += 1
                if stateTuple:
                    nextState = tilecode(stateTuple[0], stateTuple[1])
                else:
                    nextState = None
                    
                doubleQ.learn(state, action, nextState, reward)           

                if not stateTuple:
                    isTerminal = True
                else:
                    state = nextState

            print("Run: ",  run+1, " Episode: ", episodeNum, " Steps:", step, " Return: ", G)
            returnSum = returnSum + G
            avgEpisodeReturns[episodeNum] = avgEpisodeReturns[episodeNum] +  (1/(run+1))*(G - avgEpisodeReturns[episodeNum])

    return avgEpisodeReturns, doubleQ.theta1, doubleQ.theta2
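The DoubleQ class this example (and the writeF further down) relies on is not shown on this page. A sketch of the interface it appears to assume (resetQ, policy, learn, qHat, and the theta1/theta2 vectors), written as one plausible implementation with gamma = 1 and the theta[index + action * numTiles] layout (n and numTiles assumed to be globals; this is not the course's implementation):

import numpy as np

class DoubleQ:
    def __init__(self, alpha, epsilon, numActions=3):
        self.alpha = alpha
        self.epsilon = epsilon
        self.numActions = numActions
        self.resetQ()

    def resetQ(self):
        self.theta1 = np.zeros(n)
        self.theta2 = np.zeros(n)

    def qHat(self, state, action, theta):
        # Q(s, a): sum of the action's weight block over the active tile indices.
        return sum(theta[i + action * numTiles] for i in state)

    def policy(self, state):
        # epsilon-greedy with respect to theta1 + theta2
        if np.random.random() < self.epsilon:
            return np.random.randint(self.numActions)
        q = [self.qHat(state, a, self.theta1) + self.qHat(state, a, self.theta2)
             for a in range(self.numActions)]
        return int(np.argmax(q))

    def learn(self, state, action, nextState, reward):
        # Double Q-learning: randomly pick which estimate to update.
        if np.random.randint(2):
            toUpdate, other = self.theta1, self.theta2
        else:
            toUpdate, other = self.theta2, self.theta1
        if nextState is None:                  # terminal state has value zero
            target = reward
        else:
            qNext = [self.qHat(nextState, a, toUpdate) for a in range(self.numActions)]
            target = reward + self.qHat(nextState, int(np.argmax(qNext)), other)
        delta = target - self.qHat(state, action, toUpdate)
        for i in state:
            toUpdate[i + action * numTiles] += self.alpha * delta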
def test_params(_lmbda, _alpha, _epsilon):
    global theta, e
    Epi = Emu = _epsilon
    alpha = _alpha
    lmbda = _lmbda
    runSum = 0.0
    for run in xrange(numRuns):
        e = np.zeros(numTilings*n*3)
        theta = -0.01*np.random.random_sample(numTilings*n*3)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            G = 0
            S = mountaincar.init()
            step = 0
            while(S!=None):
                step += 1
                A = epsilon_greedy_policy(S)
                R, S_next = mountaincar.sample(S,A)
                G += R
                #since the value of the terminal state is 0 by definition,
                #the computation of delta is simplified
                if(S_next==None):
                    delta = R - q(S,A)
                else:
                    delta = R + Epi*np.average([q(S_next,a) for a in [0,1,2]]) +\
                        (1-Epi)*np.max([q(S_next,a) for a in [0,1,2]]) - q(S,A)
                e *= gamma*lmbda
                tilecode(S[0], S[1], F)
                for index in [i+A*numTilings*n for i in F]:
                    e[index] = 1
                theta += alpha*delta*e
                S = S_next
                if(step > 10000): return -10000000000
            returnSum = returnSum + G
        runSum += returnSum
    return runSum/numRuns
Example #36
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        tileIndices = [-1] * numTilings
        pos, vel = mountaincar.init()
        state = (pos, vel)
        step = 0
        while state != None:
            tilecode(pos, vel, tileIndices)
            action = chooseaction(state, theta1, theta2)
            r, nstate = mountaincar.sample(state, action)
            tileIndices = [-1] * numTilings
            if nstate != None:
                if randint(0, 2) == 0:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r + Total(nstate, naction, theta2) -
                                   Total(state, action, theta1))
                else:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r + Total(nstate, naction, theta1) -
                                   Total(state, action, theta2))
            else:
                if randint(0, 2) == 0:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r - Total(state, action, theta1))
                else:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] +
                               (action * numTiles)] += alpha * (
                                   r - Total(state, action, theta2))
            state = nstate
            G += r
            step += 1
        #print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        avgrlist[episodeNum] += G
        avgslist[episodeNum] += step
        returnSum += G
    #print("Average return:", returnSum / numEpisodes)

    return returnSum, theta1, theta2, step
Example #37
def writeF(theta1, theta2):
    doubleQ = DoubleQ(0.1/4, 0)
    doubleQ.theta1 = theta1
    doubleQ.theta2 = theta2
    
    
    fout = open('value', 'w')
    steps = 50
    for i in range(steps):
        for j in range(steps):
            #F = tilecode(-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps)
            state = (-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps)
            state = tilecode(state[0], state[1])
            bestAction = doubleQ.policy(state)
            #height = -max(Qs(F, theta1, theta2))
            #def qHat(self, state, action, theta):            
            height = -doubleQ.qHat(state, bestAction, np.add(theta1,theta2)/2)
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def learn(in1,in2,target):
    " write your gradient descent learning algorithm here (3 lines or so)"
    tilecode(in1, in2, tileIndices)
    estimate = f(in1,in2)   # compute the estimate once, before any weights change
    for i in range(numTilings):
        w[tileIndices[i]] += alpha * (target - estimate)
Example #39
def learn(x, y, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    tilecode(x, y, tileIndices)
    estimate = f(x, y)
    for i in tileIndices:
        weight[i] += alpha * (target - estimate)
            step+=1
            A = epsilon_greedy_policy(S)
            R, S_next = mountaincar.sample(S,A)
            G+=R
            #value of terminal state is 0 by definition so no need to compute
            #q values for it
            if(S_next==None):
                delta = R - q(S,A)
            #otherwise the expected q value is the average value weighted by the
            #probability we choose randomly, plus the max value weighted by the
            #probability we choose greedily
            else:
                delta = R+Epi*np.average([q(S_next,a) for a in [0,1,2]]) +\
                    (1-Epi)*np.max([q(S_next,a) for a in [0,1,2]]) - q(S,A)
            e*=gamma*lmbda
            tilecode(S[0], S[1], F)
            for index in [i+A*numTilings*n for i in F]:
                e[index] = 1
            theta +=alpha*delta*e
            S=S_next
        returnSum = returnSum + G
        #running average for each episode number
        avgret[episodeNum] = (avgret[episodeNum]*run + G)/(run+1)
        avgstep[episodeNum] = (avgstep[episodeNum]*run + step)/(run+1)
        print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
    print "Average return:", returnSum/numEpisodes
    runSum += returnSum
print "Overall performance: Average sum of return per run:", runSum/numRuns
writeF()
writeAvgret()
Example #41
def learn(x,y,target):
    # write your gradient descent learning algorithm here (3 lines or so)
    tilecode(x, y, tileIndices)
    estimate = f(x, y)   # compute the estimate once, before any weights change
    for i in tileIndices:
        weight[int(i)] = weight[int(i)] + alpha * (target - estimate)
def q(s, a):
    p = s[0]
    v = s[1]
    tilecode(p, v, F)
    return np.sum([theta[a*numTilings*n+index] for index in F])
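The epsilon_greedy_policy called in the learning fragments above is not shown on this page; a minimal sketch consistent with q(s, a), assuming numpy and the behaviour-policy epsilon Emu are module globals:

def epsilon_greedy_policy(S):
    # With probability Emu explore uniformly, otherwise act greedily w.r.t. q(S, a).
    if np.random.random() < Emu:
        return np.random.randint(3)
    return int(np.argmax([q(S, a) for a in [0, 1, 2]]))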
def f(x, y):
    tilecode(x, y, tileIndices)
    innerProduct = float(0)
    for index in tileIndices:
        innerProduct += weights[index]
    return innerProduct
def f(in1,in2):
    " write your linear function approximator here (5 lines or so)"
    tilecode(in1, in2, tileIndices)
    sum = 0.0
    for i in tileIndices: sum += w[i]
    return sum    
def actionTileCode(F,S,A):
    tilecode(S[0],S[1],F)
    F = [x + A*(numTilings*tiles*tiles) for x in F]
    return F
numActions = 3
returns = np.zeros([numRuns,numEpisodes])
stepList = np.zeros([numRuns,numEpisodes])
runList = np.zeros(numRuns)

runSum = 0.0
for run in xrange(numRuns):
    theta = -1*ones([numTiles,3]) #*rand(numTiles,3)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        step = 0
        e = np.zeros([numTiles,3])
        (position, velocity) = mountaincar.init()
        while 1: 
            tilecode(position, velocity, F)
            Q = np.sum(theta[F],axis=0) 

            if np.random.random() > epsilon:
                A = np.argmax(Q)
            else:
                A = np.random.randint(numActions)
     
            R, result = mountaincar.sample((position, velocity), A)
            error = R - Q[A]
            eOld = copy.copy(e)
            e[F,A] = 1
            G += R
            if result == None:
                theta = theta + alpha * error * e
                break
def learn(in1,in2,target):
    " write your gradient descent learning algorithm here (3 lines or so)"
    tilecode(in1, in2, tileIndices)
    estimate = f(in1, in2)   # compute the estimate once, before any weights change
    for i in tileIndices:
        w[i] += alpha * (target - estimate)
Example #48
def learn(in1, in2, target):
    tilecodesList = tilecode(in1, in2, tileIndices)
    thetaSum = f(in1, in2)
    for tileCodeIndex in tilecodesList:
        theta[tileCodeIndex] = theta[tileCodeIndex] + alpha * (
            target - thetaSum)  # phi_j(i) is always 1
Example #49
returns=np.zeros(numEpisodes)

runSum = 0.0
for run in xrange(numRuns):
	theta = -0.01*rand(n)
	returnSum = 0.0
	for episodeNum in xrange(numEpisodes):
		G = 0	
#	your code goes here (20-30 lines, depending on modularity)
		step=0
		e=np.zeros(n)
		s=mc.init()
		Q=np.zeros(numActions)
		while s!=None:
			step=step+1
			tilecode(s[0],s[1],F)
			Q=np.zeros(numActions)
			for a in range(3):
				for _ in F:
					Q[a]=Q[a]+theta[_+a*324]
			a=np.argmax(Q)
			r, s1=mc.sample(s,a)
			G+=r
			delta=r-Q[a]
			for i in F:
				e[i+a*324]=1
			if s1==None:
				for i in range(n):
					theta[i]=theta[i]+alpha*delta*e[i]
				break
			tilecode(s1[0],s1[1],F)
Example #50
def f(in1, in2):
    tilecodesList = tilecode(in1, in2, tileIndices)
    thetaSum = 0
    for tileCodeIndex in tilecodesList:
        thetaSum += theta[tileCodeIndex]
    return thetaSum
Example #51
    savetxt(filename,returnsAverages)

runSum = 0.0
runSums = zeros(numRuns)
for runNum in range(numRuns):
    returnSum = 0.0
    w = zeros(n)
    for episodeNum in range(numEpisodes):
        G = 0
        e = zeros(n)
        carState = mountaincar.init()
        while not carState==None:
            Qa = zeros(3)
            Fa = [-1]*4                       # integer tile indices, one per tiling
            tilecode(carState,Fa)             # tile-code the state once for all three actions
            assert (sum(Fa) > 0) # make sure Fa is populated
            for a_poss in [0,1,2]:
                Qa[a_poss] = getStateActionValue(w,Fa,a_poss)

            # get an action, act on it, and observe the reward
            A = getEpsilonGreedyAction(Qa)
            R,carStateNew = mountaincar.sample(carState,A)
            G = G + R

            delta = R - Qa[A]

            for i in Fa: # for each active feature index
                e[i+numTiles*A] = 1

            # if the new state is the terminal state, update the weight vector and break
            if carStateNew==None:
for run in xrange(numRuns):
    theta = -0.01*rand(n) 
    returnSum = 0.0
    #stepSum = 0
    print "Run: ", run
    for episodeNum in xrange(numEpisodes):
        eTrace = [0]*n
        G = 0
        delta = 0

        state = mountaincar.init()
        step = 0
        while state != None:
            step += 1

            tiles = tilecode(state[0], state[1],[-1]*numTilings)
            explore = (random.random() < epsilon)

            if explore:
                action = random.randint(0,2)
                reward, newState = mountaincar.sample(state, action)
            else:
                action = getBestAction(tiles, theta)
                reward, newState = mountaincar.sample(state, action)
            G += reward

            if newState != None:
                delta = reward + updateDelta(tiles, theta, action, newState)
                eTrace = updateETrace(eTrace, tiles, action)
                theta = updateTheta(theta, delta, eTrace)
            else:
def get_features(S):
    F = [-1]*numTilings      # integer tile indices, one per tiling
    tilecode(S[0],S[1],F)
    return F
    
    #Initializing the weight vec
    w = -0.01*rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        "..."
        "your code goes here (20-30 lines, depending on modularity)"
        S = mountaincar.init()                         #Initialize state
        e = np.zeros(n)                                #Initialize eligibility vector
        steps = 0

        while (True):
            Q = [0, 0, 0]                              # action values for the current state
            A = 0
            tilecode(S[0], S[1], F)                    # get the active features for (position, velocity)
            for j in range(3):
                for i in F:
                    Q[j] = Q[j] + w[i + (j*9*9*4)]     # each action owns a block of 9*9 tiles x 4 tilings

            if (random.uniform(0,1) < epsilon):        # epsilon greedy action selection
                A = random.choice(actions)
            else:
                A = Q.index(max(Q))
            R,Sp = mountaincar.sample(S,A)             # take the action, observe reward and next state
            delta = R - Q[A]
            G += R

            for i in F: e[i+(A*4*9*9)] = 1

            if (Sp == None): w += alpha*delta*e; break # if terminal state, end the episode
Example #55
 for episodeNum in xrange(numEpisodes):
     G = 0
     #your code goes here (20-30 lines, depending on modularity)
     
     #initialize
     
     Q=numpy.zeros(3)
     St = mountaincar.init()
     et = numpy.zeros(n)
     step = 0
     
     
     while St != None:  
         
         step+=1
         tilecode(St[0],St[1],F)
         Q=newQ(F)
         
         # epsilon-greedy policy: with probability Epi take a random exploratory action
         action = numpy.argmax(Q)
         if Epi > random_sample():
             action = randint(0,3)
             
         r, St1 = mountaincar.sample(St,action)
         G+=r
         delta=r-Q[action]
         for i in F:
                 et[i+action*e_para]=1
         if St1 == None:
             for i in range(n):
                 theta[i]+=alpha*delta*et[i]