def learn(alpha, eps, numTrainingEpisodes):
    """Train the module-level tables Q1 and Q2 with double Q-learning.

    alpha is the step size, eps the exploration probability and
    numTrainingEpisodes the number of episodes to run. Q1 and Q2 are updated
    in place; the function accumulates the episode returns locally but does
    not return them.
    """
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        # Every episode opens with action 1 taken from the initial state.
        R, S = mountaincar.sample(mountaincar.init(), 1)
        G = 0
        G += R
        # The episode runs for as long as the sampled state is truthy.
        while S:
            combined = Q1[S, :] + Q2[S, :]
            if np.random.random() < eps:
                A = np.random.choice([0, 1])  # explore
            else:
                A = combined.argmax()  # greedy w.r.t. Q1 + Q2
            R, S_prime = mountaincar.sample(S, A)
            G += R
            S_prime = int(S_prime)

            # Coin flip: the "learner" table picks the next action, the
            # "evaluator" table supplies the value of that action.
            if np.random.choice([1, 2]) == 1:
                learner, evaluator = Q1, Q2
            else:
                learner, evaluator = Q2, Q1
            best_next = learner[S_prime].argmax()
            target = R + GAMMA * evaluator[S_prime, best_next]
            learner[S, A] = learner[S, A] + alpha * (target - learner[S, A])

            S = S_prime
        returnSum = returnSum + G
def updatePlot(i):
    """Advance the simulation and refresh the prototype scatter plot.

    i is unused by the body (presumably the frame number handed in by the
    animation driver - confirm against the caller). Returns a 1-tuple
    holding the updated prototypePlot artist.
    """
    global prototypePlot, steps, S, x, y, s, gotOut
    # The original chose speed = 1 in both arms of an `if gotOut` test, so
    # the test is dropped. The loop variable no longer shadows parameter i.
    speed = 1
    for _ in range(speed):
        res = loop()
        if steps >= 5000 or res:
            # Episode ended or ran too long: restart the environment.
            S = mountaincar.init()
            print('init mc')
            steps = 0
            acrl.Erase_Traces()
            break

    s = [50] * n + [200]  # reset sizes
    for index in get_features(S, False):  # enlarge the close prototypes
        s[index] = 100
    prototypePlot._sizes = s

    x[-1], y[-1] = S[0], S[1]  # the last point tracks the current state
    # zip() is a one-shot iterator on Python 3; materialise it so the plot
    # receives a real sequence of (x, y) pairs.
    prototypePlot.set_offsets(list(zip(x, y)))

    values = clamp(acrl.w + [acrl.value], 0, 1)
    prototypePlot.set_array(values)  # display the values
    return prototypePlot,
def learnEpisode(alpha, eps, gamma, theta1, theta2):
    """Run one episode, updating theta1/theta2 through updateTheta.

    On every step one of the two weight vectors is picked at random to be
    passed first to updateTheta, with the other passed second. Returns a
    tuple (episodeReturn, step).
    """
    in1, in2 = mountaincar.init()
    currentStates = tilecode(in1, in2, [-1] * numTilings)
    episodeReturn = 0
    step = 0
    while True:
        action = epsGreedyPolicy(currentStates, eps, theta1, theta2)
        reward, nextStatePosVel = mountaincar.sample((in1, in2), action)
        episodeReturn += reward
        step += 1

        terminal = not nextStatePosVel
        if not terminal:
            nextIn1, nextIn2 = nextStatePosVel
            nextStates = tilecode(nextIn1, nextIn2, [-1] * numTilings)

        # NOTE(review): on the terminal step nextStates still holds the
        # previous step's value (or is unbound if the very first step is
        # terminal). Confirm how updateTheta is meant to learn about
        # termination.
        if np.random.randint(0, 2):  # randint yields ints in [0, 2)
            first, second = theta1, theta2
        else:
            first, second = theta2, theta1
        updateTheta(first, second, currentStates, nextStates, action, reward,
                    alpha, gamma)

        if terminal:
            return episodeReturn, step
        currentStates = nextStates
        in1, in2 = nextIn1, nextIn2
def updatePlot(i):
    """Step the simulation once and redraw the prototype scatter plot.

    i is not read by the body (presumably the animation frame index -
    confirm against the caller). Returns a 1-tuple containing the updated
    prototypePlot artist.
    """
    global prototypePlot, steps, S, x, y, s, gotOut
    # Both arms of the original `if gotOut` test assigned speed = 1, so the
    # branch was dead. The loop variable is renamed so it does not clobber i.
    speed = 1
    for _ in range(speed):
        res = loop()
        if steps >= 5000 or res:
            # Restart the environment when the episode ends or drags on.
            S = mountaincar.init()
            print('init mc')
            steps = 0
            acrl.Erase_Traces()
            break

    s = [50] * n + [200]  # reset sizes
    for index in get_features(S, False):  # change sizes of close prototypes
        s[index] = 100
    prototypePlot._sizes = s

    x[-1], y[-1] = S[0], S[1]  # update the state marker
    # Pass a concrete list: on Python 3 zip() returns an iterator, which is
    # not a valid sequence of offsets.
    prototypePlot.set_offsets(list(zip(x, y)))  # display the points

    values = clamp(acrl.w + [acrl.value], 0, 1)
    prototypePlot.set_array(values)  # display the values
    return prototypePlot,
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    """Double Q-learning with tile-coded linear function approximation.

    Runs numEpisodes episodes and adds each episode's return and step count
    into the module-level avgrlist / avgslist accumulators.

    Returns (returnSum, theta1, theta2, step), where step is the step count
    of the last episode (0 when numEpisodes is 0).

    NOTE(review): epsilon is accepted but never used here; action selection
    is delegated entirely to chooseaction(state, theta1, theta2).
    """
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    step = 0  # keeps the return statement valid when numEpisodes == 0
    for episodeNum in range(numEpisodes):
        G = 0.0
        pos, vel = mountaincar.init()
        state = (pos, vel)
        step = 0
        while state is not None:
            action = chooseaction(state, theta1, theta2)
            r, nstate = mountaincar.sample(state, action)

            # Coin flip: the learner is updated towards the evaluator's
            # estimate of the next state-action value.
            if randint(0, 2) == 0:
                learner, evaluator = theta1, theta2
            else:
                learner, evaluator = theta2, theta1

            target = r
            if nstate is not None:
                naction = chooseaction(nstate, theta1, theta2)
                target = r + Total(nstate, naction, evaluator)

            # Compute the TD error once, before touching any weight. The
            # original re-evaluated it inside the loop below, so each tile
            # after the first saw an error already shrunk by earlier updates.
            delta = target - Total(state, action, learner)

            tileIndices = tilecode(state[0], state[1], [-1] * numTilings)
            for i in range(numTilings):
                learner[tileIndices[i] + (action * numTiles)] += alpha * delta

            state = nstate
            G += r
            step += 1
        avgrlist[episodeNum] += G
        avgslist[episodeNum] += step
        returnSum += G
    return returnSum, theta1, theta2, step
def learn(): runSum = 0.0 for run in xrange(numRuns): theta = -0.01*rand(n) returnSum = 0.0 for episodeNum in xrange(numEpisodes): step = 0 G = 0 traces = zeros(n) S=mountaincar.init() # Until S is terminal: while S!=None: # Choose action tilecode(S,F) if rand() <= Emu: # randomly explore a = randint(0, 2) else: # greedy action choice a = argmax([QValue(F,0,theta),QValue(F,1,theta),QValue(F,2,theta)]) # Replacing traces on indices where feature vector is 1 for index in F: traces[index+(a*numTiles)] = 1 # Take action, observe r,Sp r,Sp=mountaincar.sample(S,a) G += r # If terminal action update theta and end episode if Sp == None: delta = r - QValue(F,a,theta) theta = theta + alpha*delta*traces break # Choose expected next action tilecode(Sp,Fp) ap = argmax([QValue(Fp,0,theta),QValue(Fp,1,theta),QValue(Fp,2,theta)]) # Update theta randomAction = (Epi/3)*QValue(Fp,0,theta) + (Epi/3)*QValue(Fp,1,theta)+ (Epi/3)*QValue(Fp,2,theta) delta = r + randomAction + (1-Epi)*QValue(Fp,ap,theta) - QValue(F,a,theta) theta = theta + alpha*delta*traces # Decay every component traces = gamma*lmbda*traces S=Sp step += 1 returnSum += G print "Episode: ", episodeNum, "Steps:", step, "Return: ", G episodeReturn[episodeNum] += (G-episodeReturn[episodeNum])/(numRuns+1) episodeSteps[episodeNum] += (step-episodeSteps[episodeNum])/(numRuns+1) returnSum = returnSum + G print "Average return:", returnSum/numEpisodes runSum += returnSum print "Overall performance: Average sum of return per run:", runSum/numRuns writeAverages(episodeReturn,episodeSteps)
def evaluate(numEvaluationEpisodes):
    """Return the mean episode return of the greedy policy on Q1 + Q2.

    No exploration and no learning take place; the module-level tables Q1
    and Q2 are only read.
    """
    returnSum = 0.0
    for _ in range(numEvaluationEpisodes):
        # Each episode opens with action 1 taken from the initial state.
        R, S = mountaincar.sample(mountaincar.init(), 1)
        G = 0
        G += R
        while S:
            greedy = (Q1[S, :] + Q2[S, :]).argmax()
            R, S = mountaincar.sample(S, greedy)
            G += R
        returnSum += G
    return returnSum / numEvaluationEpisodes
def episode(self, discount=1.0, max_steps=1e3):
    """Run n-step Q(sigma) for one episode and return the reward sum.

    Resets the per-episode bookkeeping on self, then calls self.act
    repeatedly until tau reaches T - 1 or max_steps time steps have elapsed.
    Afterwards self._sig is scaled by self._beta.
    """
    self._s = mountaincar.init()
    self._r_sum = 0.0
    self._time = 0  # step counter
    self._T = float('inf')
    self._tau = 0

    action = self.pick_action(self._s)

    # Buffers of length n (n + 1 for the Q values), pre-filled with the
    # values for the initial state.
    n = self._n
    self._tr = [(self._s, self._r_sum)] * n
    self._delta = [0.0] * n
    self._Qt = [self._Q[self._s, action]] * (n + 1)
    self._pi = [0.0] * n
    self._sigma = [0.0] * n

    while True:
        if self._tau == (self._T - 1) or not (self._time < max_steps):
            break
        action = self.act(action, discount)

    self._sig *= self._beta
    return self._r_sum
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    """Double Q-learning with tile coding (completed implementation).

    The original body was unfinished: it read an undefined name `s`, added
    an int to a Python list, never applied an update, never advanced the
    state and printed an undefined `step`. This version finishes the loop.

    Assumes Qs(theta, indices) returns the scalar action value obtained from
    the weights at `indices`, which is how the original code used it -
    TODO confirm against the helper.

    Returns (returnSum, theta1, theta2).
    """
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        step = 0
        state = mountaincar.init()
        while state is not None:
            tileIndices = [-1] * numTilings
            # state[0]: position, state[1]: velocity
            tilecode(state[0], state[1], tileIndices)
            # An array lets the per-action offset be added element-wise.
            tileIndices = np.array(tileIndices)

            # Behave epsilon-greedily w.r.t. the sum of both estimators.
            Q = np.array([
                Qs(theta1, tileIndices + a * numTiles) +
                Qs(theta2, tileIndices + a * numTiles)
                for a in range(3)
            ])
            if np.random.random() >= epsilon:
                action = Q.argmax()
            else:
                action = np.random.randint(0, 3)

            reward, nextS = mountaincar.sample(state, action)
            G = G + reward
            step += 1

            # Coin flip: which weight vector learns on this step.
            if np.random.randint(0, 2):
                learner, evaluator = theta1, theta2
            else:
                learner, evaluator = theta2, theta1

            target = reward
            if nextS is not None:
                nextIndices = [-1] * numTilings
                tilecode(nextS[0], nextS[1], nextIndices)
                nextIndices = np.array(nextIndices)
                # The learner picks the next action, the evaluator scores it.
                nextAction = np.array([
                    Qs(learner, nextIndices + a * numTiles) for a in range(3)
                ]).argmax()
                target = reward + Qs(evaluator,
                                     nextIndices + nextAction * numTiles)

            active = tileIndices + action * numTiles
            delta = target - Qs(learner, active)
            for index in active:
                learner[index] += alpha * delta

            state = nextS
        print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
def learn(alpha=.1/numTilings, epsilon=0, numEpisodes=1000, numRuns=1):
    """Train a DoubleQ agent and average the episode returns over runs.

    Returns (avgEpisodeReturns, theta1, theta2), where avgEpisodeReturns[k]
    is the running mean over runs of the return of episode k, and theta1 /
    theta2 are the agent's weights after the final run.
    """
    avgEpisodeReturns = [0] * numEpisodes
    doubleQ = DoubleQ(alpha, epsilon)
    for run in range(numRuns):
        doubleQ.resetQ()
        for episodeNum in range(numEpisodes):
            print("Run: " + str(run) + ", Episode: " + str(episodeNum) + " ....")
            G = 0
            # The original printed `step` without ever defining it.
            step = 0
            # initialize the mountain car
            stateTuple = mountaincar.init()
            state = tilecode(stateTuple[0], stateTuple[1])
            isTerminal = False
            while not isTerminal:
                action = doubleQ.policy(state)
                reward, stateTuple = mountaincar.sample(stateTuple, action)
                G += reward
                step += 1
                # A falsy stateTuple marks the end of the episode; the agent
                # is told so by receiving nextState = None.
                if stateTuple:
                    nextState = tilecode(stateTuple[0], stateTuple[1])
                else:
                    nextState = None
                doubleQ.learn(state, action, nextState, reward)
                if not stateTuple:
                    isTerminal = True
                else:
                    state = nextState
            print("Run: ", run+1, " Episode: ", episodeNum, " Steps:", step, " Return: ", G)
            # Incremental mean across runs; 1.0 forces true division even
            # under Python 2.
            avgEpisodeReturns[episodeNum] = avgEpisodeReturns[episodeNum] + (
                1.0 / (run + 1)) * (G - avgEpisodeReturns[episodeNum])
    return avgEpisodeReturns, doubleQ.theta1, doubleQ.theta2
def test_params(_lmbda, _alpha, _epsilon):
    """Score one (lambda, alpha, epsilon) setting.

    Runs numRuns x numEpisodes of trace-based control, rebinding the
    module-level `theta` and `e` at the start of every run.

    Returns the mean total return per run, or -10000000000 as soon as any
    episode exceeds 10000 steps.
    """
    global theta, e
    # NOTE(review): these are local names. Emu is never read below, so if
    # epsilon_greedy_policy reads a module-level exploration rate it will
    # not see _epsilon - confirm.
    Epi = Emu = _epsilon
    alpha = _alpha
    lmbda = _lmbda
    runSum = 0.0
    for run in xrange(numRuns):
        e = np.zeros(numTilings*n*3)
        theta = -0.01*np.random.random_sample(numTilings*n*3)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            G = 0
            S = mountaincar.init()
            step = 0
            # Eligibility traces are per-episode quantities. The original
            # only cleared them once per run, so credit leaked from the end
            # of one episode into the start of the next.
            e[:] = 0
            while S is not None:
                step += 1
                A = epsilon_greedy_policy(S)
                R, S_next = mountaincar.sample(S, A)
                G += R
                # The value of the terminal state is 0 by definition, so the
                # bootstrap term disappears on the last transition.
                if S_next is None:
                    delta = R - q(S, A)
                else:
                    delta = R+Epi*np.average([q(S_next,a) for a in [0,1,2]]) +\
                        (1-Epi)*np.max([q(S_next,a) for a in [0,1,2]]) - q(S,A)
                # Decay, then set replacing traces for the active features.
                e *= gamma*lmbda
                tilecode(S[0], S[1], F)
                offset = A*numTilings*n
                for i in F:
                    e[i + offset] = 1
                theta += alpha*delta*e
                S = S_next
                if step > 10000:
                    # Treat a runaway episode as a failed parameter setting.
                    return -10000000000
            returnSum = returnSum + G
        runSum += returnSum
    return runSum/numRuns
# Script fragment (the snippet is cut off inside the step loop).
# Sets up shared buffers and result arrays, then starts the run/episode loops.
F = [-1] * numTilings  # tile-index buffer filled in place by tilecode()
Q = [0] * 3
numActions = 3
returns = np.zeros([numRuns,numEpisodes])
stepList = np.zeros([numRuns,numEpisodes])
runList = np.zeros(numRuns)
runSum = 0.0
for run in xrange(numRuns):
    # One weight row per tile, one column per action, all starting at -1.
    theta = -1*ones([numTiles,3]) #*rand(numTiles,3)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        step = 0
        e = np.zeros([numTiles,3])  # eligibility traces, same shape as theta
        (position, velocity) = mountaincar.init()
        while 1:
            tilecode(position, velocity, F)
            # Action values: sum of the active tiles' weights per action.
            Q = np.sum(theta[F],axis=0)
            # Epsilon-greedy action selection.
            if np.random.random() > epsilon:
                A = np.argmax(Q)
            else:
                A = np.random.randint(numActions)
            R, result = mountaincar.sample((position, velocity), A)
            error = R - Q[A]
            eOld = copy.copy(e)
            e[F,A] = 1  # replacing traces for the active tiles of action A
            G += R
            # NOTE(review): the snippet ends here; the body of this branch
            # is not visible.
            if result == None:
# Script fragment (the snippet is cut off inside the step loop).
Epi = Emu = epsilon = 0
n = numTilings*numTiles*numActions
# NOTE(review): this is a float array of -1.0 values; its entries are later
# used as indices into theta - confirm tilecode() stores integers here.
F = [-1]*np.ones(numTilings)
steps=np.zeros(numEpisodes)
returns=np.zeros(numEpisodes)
runSum = 0.0
for run in xrange(numRuns):
    theta = -0.01*rand(n)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        # your code goes here (20-30 lines, depending on modularity)
        step=0
        e=np.zeros(n)  # eligibility traces
        s=mc.init()
        Q=np.zeros(numActions)
        while s!=None:
            step=step+1
            tilecode(s[0],s[1],F)
            # Action values: sum of the weights of the active tiles, with a
            # hard-coded per-action offset of 324.
            Q=np.zeros(numActions)
            for a in range(3):
                for _ in F:
                    Q[a]=Q[a]+theta[_+a*324]
            a=np.argmax(Q)  # always greedy
            r, s1=mc.sample(s,a)
            G+=r
            delta=r-Q[a]
            # Replacing traces for the active features of the chosen action.
            for i in F:
                e[i+a*324]=1
            # NOTE(review): the snippet ends here; the body of this branch
            # is not visible.
            if s1==None:
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    """Double Q-learning with tile-coded linear function approximation.

    Actions are chosen greedily with respect to the sum of both estimators.

    Returns (returnSum, theta1, theta2, runEpisodeReturns), where
    runEpisodeReturns lists the return of every episode in order.

    NOTE(review): epsilon is accepted but never used; the behaviour policy
    is always greedy.
    """
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    runEpisodeReturns = []
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        currentState = mountaincar.init()
        while True:
            action = argmax([
                qHat(currentState, a, theta1) + qHat(currentState, a, theta2)
                for a in range(3)
            ])
            R, nextState = mountaincar.sample(currentState, action)

            # 0.5 probability each: which estimator learns on this step.
            if randint(0, 2) == 0:
                learner, evaluator = theta1, theta2
            else:
                learner, evaluator = theta2, theta1

            target = R
            if nextState is not None:
                # The learner picks the next action, the evaluator scores it.
                nextAction = argmax(
                    [qHat(nextState, a, learner) for a in range(3)])
                target = R + qHat(nextState, nextAction, evaluator)

            # Compute the TD error once, before any weight changes. The
            # original re-evaluated qHat inside the loop below, so later
            # tiles were updated with an error already reduced by the
            # earlier ones.
            delta = target - qHat(currentState, action, learner)

            phi = tilecode(currentState[0], currentState[1])
            for i in range(numTilings):
                learner[phi[i] + (action * numTiles)] += alpha * delta

            G = G + R
            step += 1
            if nextState is None:
                break
            currentState = nextState
        runEpisodeReturns.append(G)
        returnSum = returnSum + G
    return returnSum, theta1, theta2, runEpisodeReturns
def learn(): runSum = 0.0 for run in xrange(numRuns): theta = -0.01 * rand(n) returnSum = 0.0 for episodeNum in xrange(numEpisodes): step = 0 G = 0 traces = zeros(n) S = mountaincar.init() # Until S is terminal: while S != None: # Choose action tilecode(S, F) if rand() <= Emu: # randomly explore a = randint(0, 2) else: # greedy action choice a = argmax([ QValue(F, 0, theta), QValue(F, 1, theta), QValue(F, 2, theta) ]) # Replacing traces on indices where feature vector is 1 for index in F: traces[index + (a * numTiles)] = 1 # Take action, observe r,Sp r, Sp = mountaincar.sample(S, a) G += r # If terminal action update theta and end episode if Sp == None: delta = r - QValue(F, a, theta) theta = theta + alpha * delta * traces break # Choose expected next action tilecode(Sp, Fp) ap = argmax([ QValue(Fp, 0, theta), QValue(Fp, 1, theta), QValue(Fp, 2, theta) ]) # Update theta randomAction = (Epi / 3) * QValue( Fp, 0, theta) + (Epi / 3) * QValue( Fp, 1, theta) + (Epi / 3) * QValue(Fp, 2, theta) delta = r + randomAction + (1 - Epi) * QValue( Fp, ap, theta) - QValue(F, a, theta) theta = theta + alpha * delta * traces # Decay every component traces = gamma * lmbda * traces S = Sp step += 1 returnSum += G print "Episode: ", episodeNum, "Steps:", step, "Return: ", G episodeReturn[episodeNum] += (G - episodeReturn[episodeNum]) / ( numRuns + 1) episodeSteps[episodeNum] += (step - episodeSteps[episodeNum]) / ( numRuns + 1) returnSum = returnSum + G print "Average return:", returnSum / numEpisodes runSum += returnSum print "Overall performance: Average sum of return per run:", runSum / numRuns writeAverages(episodeReturn, episodeSteps)
# represent actions decelerate, coast, accelerate as integers for run in range(numRuns): w = -0.01*np.random.rand(n) returnSum = 0.0 for episodeNum in range(numEpisodes): G = 0 step = 0 # From Figure 9.9 in Sutton RL 2014 # n-component eligibility trace vector e = np.zeros(n) # initialize observation observation = mountaincar.init() # use function approximation to generate next state tilecode(observation[0], observation[1], observation[2], state) # compute the Q values for the state and every action Q = Qs(state) terminal = False A = chooseAction(Q) unknownObs = observation if flipped: R, observation, terminal = mountaincar.sample(observation, A, terminal, False) someRandomAmountOfTime = random.randint(minNumExtraSteps,maxNumExtraSteps) for i in range(1, someRandomAmountOfTime):
def trueOnlinePolicyGradient():
    """Sweep step sizes for an actor-critic learner with eligibility traces.

    For every (alpha_v, alpha_pi) pair from alpha_v_list x alpha_pi_list
    (each divided by num_tilings) this runs num_runs x num_episodes episodes
    and prints timing plus the mean steps per episode with its standard
    error. All settings are read from module-level names.
    """
    # logging.basicConfig(filename='example.log',level=logging.DEBUG)
    for alpha_v in alpha_v_list:
        alpha_v = alpha_v * 1.0 / num_tilings
        for alpha_pi in alpha_pi_list:
            alpha_pi = alpha_pi * 1.0 / num_tilings
            print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi
            avg_steps_overall = 0.0
            avg_steps_per_run = np.zeros((num_runs, ))
            avg_steps_per_episode = np.zeros((num_episodes, ))
            start_time = time.clock()
            for current_run in range(num_runs):
                logging.debug("Run #:" + str(current_run))
                # print 'Run #:', current_run
                # Actor weights (theta) and critic weights (w) start as
                # small Gaussian noise.
                theta = 0.00001 * np.random.randn(mem_size, num_actions)
                w = 0.00001 * np.random.randn(mem_size, )
                # w_old = np.zeros((mem_size, ))
                v_old = 0.0
                steps_per_episode = np.zeros((num_episodes, ))
                avg_steps = 0.0
                for current_episode in range(num_episodes):
                    # if (current_episode+1) % 10 == 0:
                    #     plotWeights(theta, w, current_episode)
                    G = 0.0
                    step = 0
                    # Eligibility traces for the actor and the critic.
                    z_theta = np.zeros((mem_size, num_actions))
                    z_theta_old = np.zeros((mem_size, num_actions))
                    z_w = np.zeros((mem_size, ))
                    (pos, vel) = mountaincar.init()
                    # Binary feature vector: 1 at the active tile indices.
                    phi = np.zeros((mem_size, ))
                    tiled_indices = tilecode(pos, vel)
                    phi[tiled_indices] = 1
                    current_state = (pos, vel)
                    (a_star, PG_star) = sampleAction(theta, phi)
                    a_prime = 0
                    PG_prime = np.zeros((mem_size, num_actions))
                    while current_state is not None and step < max_steps:
                        reward, next_state = mountaincar.sample(current_state, a_star)
                        G += (gamma * reward)
                        step += 1
                        v_current = np.dot(w.transpose(), phi)
                        # Defaults for a terminal next state: zero features
                        # and zero value.
                        v_next = 0.0
                        phi_prime = np.zeros((mem_size, ))
                        if next_state is not None:
                            tiled_indices = tilecode(next_state[0], next_state[1])
                            phi_prime[tiled_indices] = 1
                            v_next = np.dot(w.transpose(), phi_prime)
                            (a_prime, PG_prime) = sampleAction(theta, phi_prime)
                        # One-step TD error.
                        delta = reward + (gamma * v_next) - v_current
                        # z_w = (gamma * lmbda * z_w) + phi - (alpha_v * gamma * lmbda * np.dot(z_w.transpose(), phi) * phi)
                        # w += (alpha_v * (delta + v_current - v_old) * z_w - alpha_v * (v_current - v_old) * phi)
                        # Critic: accumulate the trace, then step along it.
                        z_w = (gamma * lmbda * z_w) + phi
                        w += (alpha_v * delta * z_w)
                        # z_theta = (gamma * lmbda * z_theta) + PG_star
                        # theta += ((alpha_pi * z_theta * delta) + ((alpha_pi * z_theta_old) * (v_current - v_old)))
                        # Actor: same scheme, using the term returned by
                        # sampleAction for the action just taken.
                        z_theta = (gamma * lmbda * z_theta) + PG_star
                        theta += (alpha_pi * delta * z_theta)
                        # Shift "next" quantities into "current" ones.
                        v_old = v_next
                        z_theta_old = np.copy(z_theta)
                        phi = np.copy(phi_prime)
                        a_star = a_prime
                        current_state = next_state
                        PG_star = np.copy(PG_prime)
                    # print '########### Episode: ', current_episode, ' Return: ', G, ' Steps: ', step, " Run: ", current_run
                    steps_per_episode[current_episode] = step
                    avg_steps += step
                avg_steps = avg_steps * 1.0 / num_episodes
                avg_steps_overall += avg_steps
                avg_steps_per_run[current_run] = avg_steps
                # Incremental mean of steps per episode across runs.
                avg_factor = 1.0 / (current_run + 1)
                for episode_i in range(num_episodes):
                    avg_steps_per_episode[episode_i] *= (1 - avg_factor)
                    avg_steps_per_episode[episode_i] += (avg_factor * steps_per_episode[episode_i])
            end_time = time.clock()
            elapsed_time = (end_time - start_time) / 60.0  # minutes
            print 'Elapsed time: ', elapsed_time
            # logging.debug('Elapsed time: ' + str(elapsed_time))
            avg_steps_overall = avg_steps_overall * 1.0 / num_runs
            # Incremental mean of squared deviations, then sqrt(var / runs).
            std_error = 0.0
            for run_i in range(num_runs):
                avg_factor_run = 1.0 / (run_i + 1)
                std_error = ((1 - avg_factor_run) * std_error) + (avg_factor_run * (avg_steps_per_run[run_i] - avg_steps_overall) * (avg_steps_per_run[run_i] - avg_steps_overall))
            std_error = np.sqrt(std_error * 1.0 / num_runs)
            total_steps = avg_steps_overall * num_episodes * num_runs
            print 'Time per step: ', (elapsed_time * 1.0 / total_steps)
            print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi, ' lmbda: ', lmbda
            print 'average reward: ', -1.0 * avg_steps_overall, ' std. error: ', std_error
    print 'Policy gradient'
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    """Double Q-learning with tile-coded linear function approximation.

    Behaves epsilon-greedily with respect to the sum of both estimators and
    updates one randomly chosen estimator per step.

    Returns (returnSum, theta1, theta2).
    """
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        S = mountaincar.init()
        step = 0
        while S:
            indexList = [-1] * numTilings
            tilecode(S[0], S[1], indexList)
            # An array lets the per-action offset be added element-wise.
            indexList = np.array(indexList)
            Q = np.array([
                qVal(theta1, indexList + a * numTiles) +
                qVal(theta2, indexList + a * numTiles)
                for a in range(3)
            ])

            if np.random.random() < epsilon:
                A = np.random.choice([0, 1, 2])  # explore
            else:
                A = Q.argmax()  # greedy

            R, S_prime = mountaincar.sample(S, A)
            G += R
            # Count the step here so the terminal transition is included
            # (the original broke out of the loop before incrementing).
            step += 1

            # Coin flip: theta_n learns, theta_prime evaluates.
            if np.random.choice([1, 2]) == 1:
                theta_n, theta_prime = theta1, theta2
            else:
                theta_n, theta_prime = theta2, theta1

            active = [x + A * numTiles for x in indexList]
            qval_theta_n = qVal(theta_n, active)

            if not S_prime:
                # Terminal transition: no bootstrap term.
                for index in active:
                    theta_n[index] = theta_n[index] + alpha * (R - qval_theta_n)
                break

            # Size the buffer from numTilings rather than a hard-coded 4, to
            # match the current-state buffer above.
            indexList_prime = [-1] * numTilings
            tilecode(S_prime[0], S_prime[1], indexList_prime)
            indexList_prime = np.array(indexList_prime)
            A_prime = np.array([
                qVal(theta_n, indexList_prime + a * numTiles)
                for a in range(3)
            ]).argmax()
            q_prime_max = qVal(theta_prime,
                               A_prime * numTiles + indexList_prime)

            for index in active:
                theta_n[index] = theta_n[index] + alpha * (
                    R + q_prime_max - qval_theta_n)
            S = S_prime
        returnSum = returnSum + G
    return returnSum, theta1, theta2
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    """Double Q-learning with tile-coded linear function approximation.

    Relies on Qs(tiles, theta) returning one action value per action for the
    given active tiles, as the original code indexed its result by action.

    Returns (returnSum, theta1, theta2).

    Fixes relative to the previous version:
    * terminal transitions no longer bootstrap (the target is just R);
    * when theta2 is selected, its own estimate is used in the TD error;
    * the TD error subtracts Q(S, act) of the *current* state rather than
      the next state's value for `act`.
    """
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        S = mountaincar.init()
        tileindec = tilecode(S[0], S[1], [-1] * numTilings)
        while S is not None:
            step += 1
            # Epsilon-greedy on the sum of both estimators.
            if random() < epsilon:
                act = randint(0, 3)
            else:
                act = argmax(Qs(tileindec, theta1 + theta2))
            R, Stemp = mountaincar.sample(S, act)
            G += R

            # Coin flip: the learner is updated, the evaluator scores the
            # learner's greedy next action.
            if randint(0, 2) == 1:
                learner, evaluator = theta1, theta2
            else:
                learner, evaluator = theta2, theta1

            target = R
            if Stemp is not None:
                tileindec_tem = tilecode(Stemp[0], Stemp[1],
                                         [-1] * numTilings)
                best_next = argmax(Qs(tileindec_tem, learner))
                target = R + Qs(tileindec_tem, evaluator)[best_next]

            update = alpha * (target - Qs(tileindec, learner)[act])
            # 324 is the per-action offset hard-coded by the original
            # (presumably the number of tiles - confirm).
            for i in tileindec:
                learner[i + act * 324] += update

            if Stemp is None:
                break
            S = Stemp
            tileindec = tileindec_tem
        print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
# Fragment: the first five statements are the tail of a function whose
# header is not visible (it is called below as Policy(F, 3, theta)); the
# rest is the top-level run/episode loop, cut off inside the step loop.
# NOTE(review): the loop variable reuses the name `a` that also sizes Q.
Q = zeros(a)
for a in range(a):
    for i in F:
        # Sum the active tiles' weights; 324 is the per-action offset.
        Q[a] += theta[i + a * 324]
return Q
runSum = 0.0
for run in xrange(numRuns):
    theta = -0.01 * rand(n)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        #your code goes here (20-30 lines, depending on modularity)
        step = 0.0
        s = mountaincar.init()
        trace = zeros(n)  # eligibility traces
        Q = zeros(3)
        while s is not None:
            step += 1
            tilecode(s[0], s[1], F)
            Q = Policy(F, 3, theta)
            # Epsilon-greedy action selection.
            # NOTE(review): if randint is NumPy's, action 2 is never drawn.
            if rand() <= epsilon:
                action = randint(0, 2)
            else:
                action = argmax(Q)
            r, sp = mountaincar.sample(s, action)
            delta = r - Q[action]
            G += r
            # Replacing traces for the active features of the chosen action.
            for i in F:
                trace[i + action * numTiles] = 1
# Fragment: the first four statements are the tail of a function whose
# header is not visible (it is called below as Qs(F)); the rest is the
# top-level run/episode loop, cut off inside the step loop.
for a in range(3):
    for i in F:
        # Sum the active tiles' weights; 324 is the per-action offset.
        Q[a] += theta[i + a * 324]
return Q
runSum = 0.0
for run in xrange(numRuns):
    theta = -0.01 * numpy.random.rand(n)
    returnSum = 0.0
    for episodeNum in xrange(numEpisodes):
        G = 0
        #your code goes here (20-30 lines, depending on modularity)
        steps = 0
        e = numpy.zeros(n)  # eligibility traces
        s = mc.init()
        Q = numpy.zeros(numActions)
        while s != None:
            #print Q
            steps += 1
            tilecode(s[0], s[1], F)
            Q = Qs(F)
            a = numpy.argmax(Q)  # always greedy
            r, s1 = mc.sample(s, a)
            G += r
            delta = r - Q[a]
            # Replacing traces for the active features of the chosen action.
            for i in F:
                e[i + a * 324] = 1
            # Terminal transition: apply the update without a bootstrap term.
            if s1 == None:
                for i in range(n):
                    theta[i] += alpha * delta * e[i]
# Fragment: `return q` closes a function whose header and body are not
# visible; the rest is the top-level run/episode loop, cut off inside the
# step loop.
return q
returnAvg = zeros(200)
numSteps = zeros(200)
runSum = []
for run in range(numRuns):
    w = -0.01*rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        zerovec = zeros(n)
        G = 0
        A = 0
        S = mountaincar.init()
        # F holds the feature indices of the (state, action) pair.
        F = actionTileCode(F,S,A)
        zerovec[F] = 1
        episodeLen = 0
        while(S is not None):
            episodeLen = episodeLen + 1
            RSA = mountaincar.sample(S,A)
            R = RSA[0]
            S = RSA[1]
            G = G + R
            # Error against the current estimate (sum of active weights).
            delta = R - sum(w[F])
            q = zeros(3)
            if(S is not None):
                for a in range(3):
                    F = actionTileCode(F,S,a)
returns = np.zeros(shape=(numRuns,numEpisodes)) # I add it # Main function main part for run in range(numRuns): #Initializing the weight vec w = -0.01*rand(n) returnSum = 0.0 for episodeNum in range(numEpisodes): G = 0 "..." "your code goes here (20-30 lines, depending on modularity)" S = mountaincar.init() #Initialize state e = np.zeros(n) #Initialize eligibility vector steps = 0 while (True): Q = [0, 0, 0] #The Q learning (S, A) pair with Feature A = 0 tilecode (S[0], S[1], F) #Get the (Position, velocity) and Fearture for j in range(3): for i in F: Q[j] = Q[j] + w[i + (j*9*9*4)] # To compplete one tiling, 4 mapping is needed if (random.uniform(0,1) < epsilon): # Epsilon greedy A = random.choice(actions) else: A = Q.index(max(Q))
#numRuns = 50 #numEpisodes = 200 #averageArray = [(0,0)]*numEpisodes ## tuple ordered (return, steps) ## ======================= for run in xrange(numRuns): theta = -0.01*rand(n) returnSum = 0.0 #stepSum = 0 print "Run: ", run for episodeNum in xrange(numEpisodes): eTrace = [0]*n G = 0 delta = 0 state = mountaincar.init() step = 0 while state != None: step += 1 tiles = tilecode(state[0], state[1],[-1]*numTilings) explore = (random.random() < epsilon) if explore: action = random.randint(0,2) reward, newState = mountaincar.sample(state, action) else: action = getBestAction(tiles, theta) reward, newState = mountaincar.sample(state, action) G += reward
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    """Double Q-learning with tile-coded linear function approximation.

    Adds each episode's step count and return into the module-level
    `steps` / `returns` accumulators.

    Returns (returnSum, theta1, theta2).
    """
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        S = mountaincar.init()  # S[0] is the position, S[1] is the velocity
        step = 0
        while True:
            tileIndices = [-1] * numTilings
            tilecode(S[0], S[1], tileIndices)

            # Action values of the current state under each estimator
            # (theta^T * phi with binary features = sum of active weights).
            # These are computed on every step: the original skipped them on
            # exploratory steps, leaving q1/q2 at zero and corrupting the TD
            # error used below.
            q1 = [sum(theta1[a * numTiles + index] for index in tileIndices)
                  for a in range(3)]
            q2 = [sum(theta2[a * numTiles + index] for index in tileIndices)
                  for a in range(3)]

            # Epsilon-greedy on the sum of both estimators.
            if np.random.random() >= epsilon:
                action = argmax([a + b for a, b in zip(q1, q2)])
            else:
                action = np.random.randint(0, 3)

            # Positions in the weight vector that are active for `action`.
            indices = [action * numTiles + index for index in tileIndices]

            reward, nextS = mountaincar.sample(S, action)
            G = G + reward
            step += 1

            # With 0.5 probability each, one estimator learns while the
            # other evaluates the learner's greedy next action.
            if np.random.randint(0, 2):
                learner, evaluator, qCurrent = theta1, theta2, q1[action]
            else:
                learner, evaluator, qCurrent = theta2, theta1, q2[action]

            target = reward
            if nextS is not None:
                nextTileIndices = [-1] * numTilings
                tilecode(nextS[0], nextS[1], nextTileIndices)
                nextAction = argmax(Qs(nextTileIndices, learner))
                target = reward + Qs(nextTileIndices, evaluator)[nextAction]

            delta = target - qCurrent
            for i in indices:
                learner[i] = learner[i] + alpha * delta

            if nextS is None:
                break
            S = nextS
        steps[episodeNum] = steps[episodeNum] + step
        returns[episodeNum] = returns[episodeNum] + G
        returnSum += G
    return returnSum, theta1, theta2