def f(in1,in2): " write your linear function approximator here (5 lines or so)" tilecode(in1, in2, tileIndices) fvalue = 0.0 for i in range (0, numTilings): fvalue += w[tileIndices[i]] return fvalue
def learnEpisode(alpha, eps, gamma, theta1, theta2):
    in1, in2 = mountaincar.init()
    currentStates = tilecode(in1, in2, [-1]*numTilings)  # tile indices for the initial state
    episodeReturn = 0
    step = 0
    while True:  # continue until we reach the terminal state (None)
        action = epsGreedyPolicy(currentStates, eps, theta1, theta2)
        reward, nextStatePosVel = mountaincar.sample((in1, in2), action)
        episodeReturn += reward
        step += 1
        if nextStatePosVel:
            nextIn1, nextIn2 = nextStatePosVel
            nextStates = tilecode(nextIn1, nextIn2, [-1]*numTilings)
            if np.random.randint(0, 2):  # returns ints in [0, 2)
                updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
            else:
                updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
            currentStates = nextStates
            in1, in2 = nextIn1, nextIn2
        else:  # next state is the terminal state
            if np.random.randint(0, 2):  # returns ints in [0, 2)
                updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
            else:
                updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
            return episodeReturn, step
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    total = 0.0
    tilecode(in1, in2, tileIndices)
    for i in tileIndices:
        total = total + theta[i] * 1
    return total
def learn(x, y, target):
    global weights
    tilecode(x, y, tileIndices)
    innerProduct = f(x, y)
    for index in tileIndices:
        newWeight = weights[index] + step_size * (target - innerProduct)
        weights[index] = newWeight
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    total = 0.0
    tilecode(x, y, tileIndices)
    for i in tileIndices:
        total += weight[i]
    return total
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    tilecode(x, y, tileIndices)
    total = 0.0
    for i in tileIndices:
        total += weights[i]
    return total
def f(in1, in2):
    tilecode(in1, in2, tileIndices)
    # Calculate the estimated function value for the inputs in1, in2
    value = 0
    for index in tileIndices:
        value += weights[index]
    return value
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    tilecode(x, y, tileIndices)
    total = 0.0
    for i in tileIndices:
        total += weights[i]
    return total
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    tilecode(x, y, tileIndices)
    value = 0
    for i in tileIndices:
        value = value + weight[int(i)]
    return value
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    total = 0.0
    tilecode(x, y, tileIndices)
    for i in tileIndices:
        total += weight[i]
    return total
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    tilecode(in1, in2, tileIndices)
    sum1 = 0
    for index in tileIndices:
        sum1 += theta[index]
    return sum1
def f(in1, in2):
    #i = 0
    thef = 0
    tilecode(in1, in2, tileIndices)
    #print(tileIndices)
    for a in tileIndices:
        thef += theta[a]
    return thef
def learn(in1, in2, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    tileIndices = [-1] * numTilings  # initialize your list of tile indices here
    tilecode(in1, in2, tileIndices)
    estimate = f(in1, in2)
    for index in tileIndices:
        theta[index] += alpha * (target - estimate)
def learn(x, y, target):
    # gradient descent learning algorithm
    # Getting the active features
    tilecode(x, y, tileIndices)
    # Running the update for the active features; the estimate is computed
    # once so every updated weight sees the same error
    estimate = f(x, y)
    for feature in tileIndices:
        weights[int(feature)] = weights[int(feature)] + alpha*(target - estimate)
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    sumOfWeights = 0
    tilecode(in1, in2, indices)
    for tile in indices:
        sumOfWeights += theta[tile]
    return sumOfWeights
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    tileIndices = [-1] * numTilings  # initialize your list of tile indices here
    tilecode(in1, in2, tileIndices)
    rValue = 0
    for index in tileIndices:
        rValue += theta[index]
    return rValue
def writeF(theta1, theta2):
    fout = open('value', 'w')
    steps = 50
    for i in range(steps):
        for j in range(steps):
            F = [-1] * numTilings
            tilecode(-1.2 + (i * 1.7 / steps), -0.07 + (j * 0.14 / steps), F)
            height = -max(Qs(F, (theta1 + theta2) / 2))  # average of the two weight vectors
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def writeF():
    fout = open('value', 'w')
    F = [0]*numTilings
    steps = 50
    for i in range(steps):
        for j in range(steps):
            tilecode(-1.2 + i*1.7/steps, -0.07 + j*0.14/steps, F)
            height = -max(Qs(F))
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def f(x, y):
    # Get the active tiles for this input.
    tilecode(x, y, tileIndices)
    # Sum the weights of the active features
    total = 0
    for feature in tileIndices:
        total += weights[int(feature)]
    return total
def learn():
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01*rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0
            traces = zeros(n)
            S = mountaincar.init()
            # Until S is terminal:
            while S != None:
                # Choose action
                tilecode(S, F)
                if rand() <= Emu:
                    # randomly explore
                    a = randint(0, 2)
                else:
                    # greedy action choice
                    a = argmax([QValue(F, 0, theta), QValue(F, 1, theta), QValue(F, 2, theta)])
                # Replacing traces on indices where the feature vector is 1
                for index in F:
                    traces[index + (a*numTiles)] = 1
                # Take action, observe r, Sp
                r, Sp = mountaincar.sample(S, a)
                G += r
                # If Sp is terminal, update theta and end the episode
                if Sp == None:
                    delta = r - QValue(F, a, theta)
                    theta = theta + alpha*delta*traces
                    break
                # Choose expected next action
                tilecode(Sp, Fp)
                ap = argmax([QValue(Fp, 0, theta), QValue(Fp, 1, theta), QValue(Fp, 2, theta)])
                # Update theta (Expected Sarsa target)
                randomAction = (Epi/3)*QValue(Fp, 0, theta) + (Epi/3)*QValue(Fp, 1, theta) + (Epi/3)*QValue(Fp, 2, theta)
                delta = r + randomAction + (1-Epi)*QValue(Fp, ap, theta) - QValue(F, a, theta)
                theta = theta + alpha*delta*traces
                # Decay every component
                traces = gamma*lmbda*traces
                S = Sp
                step += 1
            returnSum += G
            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G - episodeReturn[episodeNum])/(numRuns+1)
            episodeSteps[episodeNum] += (step - episodeSteps[episodeNum])/(numRuns+1)
        print "Average return:", returnSum/numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum/numRuns
    writeAverages(episodeReturn, episodeSteps)
def writeF():
    fout = open('value', 'w')
    F = [0]*numTilings
    steps = 50
    for i in range(steps):
        for j in range(steps):
            S = (-1.2 + i*1.7/steps, -0.07 + j*0.14/steps)
            tilecode(S, F)
            Qa = zeros(3)
            for a_poss in [0, 1, 2]:
                Qa[a_poss] = getStateActionValue(w, F, a_poss)
            height = -max(Qa)
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def Total(state, action, theta):
    tileIndices = [-1] * numTilings
    tileIndices = tilecode(state[0], state[1], tileIndices)
    total = 0
    for i in range(0, numTilings):
        total += theta[tileIndices[i] + (action * numTiles)]
    return total
def learn(in1, in2, target):
    global theta
    error = target - f(in1, in2)
    update = alpha * error
    active_features = tilecode(in1, in2, indices)
    for i in active_features:
        theta[i] += update
def learn(x, y, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    currentFXY = f(x, y)
    featureVectorArray = tilecode(x, y, [-1]*numTilings)
    for i in range(len(theta)):
        if i in featureVectorArray:
            theta[i] = theta[i] + alpha*(target - currentFXY)
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    global theta
    features = zeros(n)
    for i in tilecode(in1, in2, indices):
        features[i] = 1
    return dot(theta, features)
def f(in1, in2):
    # write your linear function approximator here (5 lines or so)
    total_f = 0
    TileCoderIndices = tilecode(in1, in2, tileIndices)  # list of indices j where phi_j(i) is 1, with all others assumed 0
    for i in TileCoderIndices:
        total_f = total_f + theta[i]
    return total_f
def learn(in1, in2, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    f_new = f(in1, in2)
    TileCoderIndices = tilecode(in1, in2, tileIndices)
    for j in TileCoderIndices:
        theta[j] = theta[j] + alpha * (target - f_new)
        #print(target)
    return theta
def updateDelta(tiles, theta, action, newState):
    nextTiles = tilecode(newState[0], newState[1], [-1]*numTilings)
    delta = 0
    nextAction = getBestAction(nextTiles, theta)
    for i in nextTiles:
        delta += theta[i + nextAction*4*81]
    for i in tiles:
        delta -= theta[i + action*4*81]
    return delta
def f(x, y):
    # write your linear function approximator here (5 lines or so)
    total = 0
    vectorLength = len(theta)  # corresponds to n in the algorithm
    featureVectorArray = tilecode(x, y, [-1]*numTilings)
    for i in range(vectorLength):
        if i in featureVectorArray:
            total += theta[i]
    return total
def writeF(theta1, theta2):
    fout = open('value', 'w')
    steps = 50
    for i in range(steps):
        for j in range(steps):
            F = tilecode(-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps)
            height = -max(Qs(F, theta1, theta2))
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def writeF():
    fout = open('value', 'w')
    F = [0] * numTilings
    steps = 50
    for i in range(steps):
        for j in range(steps):
            tilecode(-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps, F)
            Q = np.sum(theta[F], axis=0)
            height = -max(Q)
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
    fout = open('returnVal', 'w')
    fout1 = open('stepAvg', 'w')
    for i in range(numEpisodes):
        fout1.write(repr(averageStep[i]) + ' ')
        fout.write(repr(averageReturn[i]) + ' ')
    fout.close()
    fout1.close()
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        # your code goes here (20-30 lines, depending on modularity)
        state = mountaincar.init()
        #q1 = [0] * 3  # state-action value q for each
        #q2 = [0] * 3
        #feature_vectors = np.zeros(n)
        step = 0
        while state != None:
            tileIndices = [-1]*numTilings
            tilecode(state[0], state[1], tileIndices)  # state[0]: position, state[1]: velocity
            # offset the tile indices by numTiles per action to select that action's weights
            q0 = Qs(theta1, tileIndices) + Qs(theta2, tileIndices)                                                      # action 0
            q1 = Qs(theta1, [i + numTiles for i in tileIndices]) + Qs(theta2, [i + numTiles for i in tileIndices])      # action 1
            q2 = Qs(theta1, [i + numTiles*2 for i in tileIndices]) + Qs(theta2, [i + numTiles*2 for i in tileIndices])  # action 2
            Q = np.array([q0, q1, q2])
            # apply epsilon-greedy to choose the action
            greedy = np.random.random()
            if greedy >= epsilon:
                action = Q.argmax()
            else:
                action = np.random.randint(0, 3)
            reward, nextS = mountaincar.sample(state, action)
            G = G + reward
            step += 1
            if nextS == None:  # next state is the terminal state
                break
            state = nextS
        print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
def learn(alpha=.1/numTilings, epsilon=0, numEpisodes=1000, numRuns=1):
    returnSum = 0.0
    avgEpisodeReturns = [0]*numEpisodes
    doubleQ = DoubleQ(alpha, epsilon)
    for run in range(numRuns):
        doubleQ.resetQ()
        for episodeNum in range(numEpisodes):
            print("Run: " + str(run) + ", Episode: " + str(episodeNum) + " ....")
            G = 0
            step = 0
            isTerminal = False
            # initialize the mountain car
            stateTuple = mountaincar.init()
            state = tilecode(stateTuple[0], stateTuple[1])
            while not isTerminal:
                action = doubleQ.policy(state)
                reward, stateTuple = mountaincar.sample(stateTuple, action)
                G += reward
                step += 1
                if stateTuple:
                    nextState = tilecode(stateTuple[0], stateTuple[1])
                else:
                    nextState = None
                doubleQ.learn(state, action, nextState, reward)
                if not stateTuple:
                    isTerminal = True
                else:
                    state = nextState
            print("Run: ", run+1, " Episode: ", episodeNum, " Steps:", step, " Return: ", G)
            returnSum = returnSum + G
            avgEpisodeReturns[episodeNum] = avgEpisodeReturns[episodeNum] + (1/(run+1))*(G - avgEpisodeReturns[episodeNum])
    return avgEpisodeReturns, doubleQ.theta1, doubleQ.theta2
def test_params(_lmbda, _alpha, _epsilon):
    global theta, e
    Epi = Emu = _epsilon
    alpha = _alpha
    lmbda = _lmbda
    runSum = 0.0
    for run in xrange(numRuns):
        e = np.zeros(numTilings*n*3)
        theta = -0.01*np.random.random_sample(numTilings*n*3)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            G = 0
            S = mountaincar.init()
            step = 0
            while S != None:
                step += 1
                A = epsilon_greedy_policy(S)
                R, S_next = mountaincar.sample(S, A)
                G += R
                # since the value of the terminal state is 0 by definition,
                # the computation of delta is simplified
                if S_next == None:
                    delta = R - q(S, A)
                else:
                    delta = R + Epi*np.average([q(S_next, a) for a in [0, 1, 2]]) + \
                            (1-Epi)*np.max([q(S_next, a) for a in [0, 1, 2]]) - q(S, A)
                e *= gamma*lmbda
                tilecode(S[0], S[1], F)
                for index in [i + A*numTilings*n for i in F]:
                    e[index] = 1
                theta += alpha*delta*e
                S = S_next
                if step > 10000:
                    return -10000000000
            returnSum = returnSum + G
        runSum += returnSum
    return runSum/numRuns
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        tileIndices = [-1] * numTilings
        pos, vel = mountaincar.init()
        state = (pos, vel)
        step = 0
        while state != None:
            tilecode(pos, vel, tileIndices)
            action = chooseaction(state, theta1, theta2)
            r, nstate = mountaincar.sample(state, action)
            tileIndices = [-1] * numTilings
            if nstate != None:
                if randint(0, 2) == 0:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] + (action * numTiles)] += alpha * (
                            r + Total(nstate, naction, theta2) - Total(state, action, theta1))
                else:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] + (action * numTiles)] += alpha * (
                            r + Total(nstate, naction, theta1) - Total(state, action, theta2))
            else:
                if randint(0, 2) == 0:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] + (action * numTiles)] += alpha * (
                            r - Total(state, action, theta1))
                else:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] + (action * numTiles)] += alpha * (
                            r - Total(state, action, theta2))
            state = nstate
            G += r
            step += 1
        #print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        avgrlist[episodeNum] += G
        avgslist[episodeNum] += step
        returnSum += G
    #print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2, step
def writeF(theta1, theta2):
    doubleQ = DoubleQ(0.1/4, 0)
    doubleQ.theta1 = theta1
    doubleQ.theta2 = theta2
    fout = open('value', 'w')
    steps = 50
    for i in range(steps):
        for j in range(steps):
            state = (-1.2 + i * 1.7 / steps, -0.07 + j * 0.14 / steps)
            state = tilecode(state[0], state[1])
            bestAction = doubleQ.policy(state)
            # qHat(self, state, action, theta) evaluated with the averaged weight vectors
            height = -doubleQ.qHat(state, bestAction, np.add(theta1, theta2)/2)
            fout.write(repr(height) + ' ')
        fout.write('\n')
    fout.close()
def learn(in1,in2,target): " write your gradient descent learning algorithm here (3 lines or so)" tilecode(in1, in2, tileIndices) for i in range (0, numTilings): w[tileIndices[i]] += alpha * (target - f(in1,in2))
def learn(x, y, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    tilecode(x, y, tileIndices)
    estimate = f(x, y)
    for i in tileIndices:
        weight[i] += alpha * (target - estimate)
            step += 1
            A = epsilon_greedy_policy(S)
            R, S_next = mountaincar.sample(S, A)
            G += R
            # the value of the terminal state is 0 by definition, so there is
            # no need to compute q values for it
            if S_next == None:
                delta = R - q(S, A)
            # otherwise the expected q value is the average value weighted by the probability
            # we choose randomly, plus the max value weighted by the probability we choose greedily
            else:
                delta = R + Epi*np.average([q(S_next, a) for a in [0, 1, 2]]) + \
                        (1-Epi)*np.max([q(S_next, a) for a in [0, 1, 2]]) - q(S, A)
            e *= gamma*lmbda
            tilecode(S[0], S[1], F)
            for index in [i + A*numTilings*n for i in F]:
                e[index] = 1
            theta += alpha*delta*e
            S = S_next
        returnSum = returnSum + G
        # running average for each episode number
        avgret[episodeNum] = (avgret[episodeNum]*run + G)/(run+1)
        avgstep[episodeNum] = (avgstep[episodeNum]*run + step)/(run+1)
        print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
    print "Average return:", returnSum/numEpisodes
    runSum += returnSum
print "Overall performance: Average sum of return per run:", runSum/numRuns
writeF()
writeAvgret()
def learn(x, y, target):
    # write your gradient descent learning algorithm here (3 lines or so)
    tilecode(x, y, tileIndices)
    estimate = f(x, y)  # compute once so all active weights share the same error
    for i in tileIndices:
        weight[int(i)] = weight[int(i)] + alpha * (target - estimate)
def q(s, a):
    p = s[0]
    v = s[1]
    tilecode(p, v, F)
    return np.sum([theta[a*numTilings*n + index] for index in F])
def f(x, y):
    tilecode(x, y, tileIndices)
    innerProduct = 0.0
    for index in tileIndices:
        innerProduct += weights[index]
    return innerProduct
def f(in1,in2): " write your linear function approximator here (5 lines or so)" tilecode(in1, in2, tileIndices) sum = 0.0 for i in tileIndices: sum += w[i] return sum
def actionTileCode(F, S, A):
    tilecode(S[0], S[1], F)
    F = [x + A*(numTilings*tiles*tiles) for x in F]
    return F
numActions = 3
returns = np.zeros([numRuns, numEpisodes])
stepList = np.zeros([numRuns, numEpisodes])
runList = np.zeros(numRuns)
runSum = 0.0
for run in xrange(numRuns):
    theta = -1*ones([numTiles, 3])  # *rand(numTiles,3)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        step = 0
        e = np.zeros([numTiles, 3])
        (position, velocity) = mountaincar.init()
        while 1:
            tilecode(position, velocity, F)
            Q = np.sum(theta[F], axis=0)
            if np.random.random() > epsilon:
                A = np.argmax(Q)
            else:
                A = np.random.randint(numActions)
            R, result = mountaincar.sample((position, velocity), A)
            error = R - Q[A]
            eOld = copy.copy(e)
            e[F, A] = 1
            G += R
            if result == None:
                theta = theta + alpha * error * e
                break
def learn(in1,in2,target): " write your gradient descent learning algorithm here (3 lines or so)" tilecode(in1, in2, tileIndices) for i in tileIndices: w[i] += alpha * (target - f(in1, in2))
def learn(in1, in2, target):
    tilecodesList = tilecode(in1, in2, tileIndices)
    thetaSum = f(in1, in2)
    for tileCodeIndex in tilecodesList:
        theta[tileCodeIndex] = theta[tileCodeIndex] + alpha * (target - thetaSum)  # phi_j(i) is always 1
returns = np.zeros(numEpisodes)
runSum = 0.0
for run in xrange(numRuns):
    theta = -0.01*rand(n)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        # your code goes here (20-30 lines, depending on modularity)
        step = 0
        e = np.zeros(n)
        s = mc.init()
        Q = np.zeros(numActions)
        while s != None:
            step = step + 1
            tilecode(s[0], s[1], F)
            Q = np.zeros(numActions)
            for a in range(3):
                for _ in F:
                    Q[a] = Q[a] + theta[_ + a*324]
            a = np.argmax(Q)
            r, s1 = mc.sample(s, a)
            G += r
            delta = r - Q[a]
            for i in F:
                e[i + a*324] = 1
            if s1 == None:
                for i in range(n):
                    theta[i] = theta[i] + alpha*delta*e[i]
                break
            tilecode(s1[0], s1[1], F)
def f(in1, in2):
    tilecodesList = tilecode(in1, in2, tileIndices)
    thetaSum = 0
    for tileCodeIndex in tilecodesList:
        thetaSum += theta[tileCodeIndex]
    return thetaSum
savetxt(filename, returnsAverages)
runSum = 0.0
runSums = zeros(numRuns)
for runNum in range(numRuns):
    returnSum = 0.0
    w = zeros(n)
    for episodeNum in range(numEpisodes):
        G = 0
        e = zeros(n)
        carState = mountaincar.init()
        while not carState == None:
            Qa = zeros(3)
            Fa = zeros(4)
            for a_poss in [0, 1, 2]:
                tilecode(carState, Fa)
                assert (sum(Fa) > 0)  # make sure Fa is populated
                Qa[a_poss] = getStateActionValue(w, Fa, a_poss)
            # get an action, act on it, and observe the reward
            A = getEpsilonGreedyAction(Qa)
            R, carStateNew = mountaincar.sample(carState, A)
            G = G + R
            delta = R - Qa[A]
            for i in Fa:  # for each active feature index
                e[i + numTiles*A] = 1
            # if the new state is the terminal state, update the weight vector and break
            if carStateNew == None:
for run in xrange(numRuns):
    theta = -0.01*rand(n)
    returnSum = 0.0
    #stepSum = 0
    print "Run: ", run
    for episodeNum in xrange(numEpisodes):
        eTrace = [0]*n
        G = 0
        delta = 0
        state = mountaincar.init()
        step = 0
        while state != None:
            step += 1
            tiles = tilecode(state[0], state[1], [-1]*numTilings)
            explore = (random.random() < epsilon)
            if explore:
                action = random.randint(0, 2)
                reward, newState = mountaincar.sample(state, action)
            else:
                action = getBestAction(tiles, theta)
                reward, newState = mountaincar.sample(state, action)
            G += reward
            if newState != None:
                delta = reward + updateDelta(tiles, theta, action, newState)
                eTrace = updateETrace(eTrace, tiles, action)
                theta = updateTheta(theta, delta, eTrace)
            else:
def get_features(S):
    F = np.zeros(numTilings)
    tilecode(S[0], S[1], F)
    return F
# Initializing the weight vector
w = -0.01*rand(n)
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    "..."
    "your code goes here (20-30 lines, depending on modularity)"
    S = mountaincar.init()   # Initialize state
    e = np.zeros(n)          # Initialize eligibility vector
    steps = 0
    while True:
        Q = [0, 0, 0]        # Q values for each action in state S, built from the features
        A = 0
        tilecode(S[0], S[1], F)  # Get the features for the (position, velocity) pair
        for j in range(3):
            for i in F:
                Q[j] = Q[j] + w[i + (j*9*9*4)]  # each action owns a block of 9*9*4 weights (4 tilings of 9x9 tiles)
        if random.uniform(0, 1) < epsilon:  # Epsilon-greedy action selection
            A = random.choice(actions)
        else:
            A = Q.index(max(Q))
        R, Sp = mountaincar.sample(S, A)  # Learning update within one episode
        delta = R - Q[A]
        G += R
        for i in F:
            e[i + (A*4*9*9)] = 1
        if Sp == None:  # If terminal state, update the weights and end the episode
            w += alpha*delta*e
            break
for episodeNum in xrange(numEpisodes):
    G = 0
    # your code goes here (20-30 lines, depending on modularity)
    # initialize
    Q = numpy.zeros(3)
    St = mountaincar.init()
    et = numpy.zeros(n)
    step = 0
    while St != None:
        step += 1
        tilecode(St[0], St[1], F)
        Q = newQ(F)
        # policy here; if Epi is changed, the action may be selected differently
        action = numpy.argmax(Q)
        if Epi > random_sample():
            action = randint(0, 3)
        r, St1 = mountaincar.sample(St, action)
        G += r
        delta = r - Q[action]
        for i in F:
            et[i + action*e_para] = 1
        if St1 == None:
            for i in range(n):
                theta[i] += alpha*delta*et[i]