def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        G = 0
        S = mountaincar.init()
        R, S = mountaincar.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            prob1 = np.random.random()
            if prob1 < eps:  # explore
                A = np.random.choice([0, 1])
            else:  # greedy
                A = Q.argmax()
            R, S_prime = mountaincar.sample(S, A)
            G += R
            S_prime = int(S_prime)
            prob2 = np.random.choice([1, 2])
            if prob2 == 1:
                Q1[S, A] = Q1[S, A] + alpha * (
                    R + GAMMA * Q2[S_prime, (Q1[S_prime]).argmax()] - Q1[S, A])
            else:
                Q2[S, A] = Q2[S, A] + alpha * (
                    R + GAMMA * Q1[S_prime, (Q2[S_prime]).argmax()] - Q2[S, A])
            S = S_prime
        # print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
def evaluate(numEvaluationEpisodes):
    returnSum = 0.0
    for episodeNum in range(numEvaluationEpisodes):
        G = 0
        S = mountaincar.init()
        R, S = mountaincar.sample(S, 1)
        G += R
        while (S):
            Q = Q1[S, :] + Q2[S, :]
            A = Q.argmax()
            R, S = mountaincar.sample(S, A)
            G += R
        returnSum = returnSum + G
    return returnSum / numEvaluationEpisodes
def loop():
    global S, acrl, steps, episodeNum, gotOut
    prev_features = get_features(S, True)
    updatePrototypePlacements()
    if steps % 100 == 0:
        print('num steps: ' + str(steps))
    A = acrl.getAction(prev_features)
    R, Snext = mountaincar.sample(S, A)
    acrl.R = R
    if steps >= 5000 or Snext == None or isnan(Snext[0]) or isnan(Snext[1]):
        print('num steps: ' + str(steps))
        print('Snext: ' + str(Snext))
        print('Breaking from episode: ' + str(episodeNum))
        episodeNum += 1
        if Snext == None or isnan(Snext[0]) or isnan(Snext[1]):
            gotOut = True
        return True
    acrl.Value(prev_features)
    acrl.Delta()
    next_features = get_features(Snext, False)
    acrl.Next_Value(next_features)
    acrl.Delta_Update()
    acrl.Average_Reward_Update()
    acrl.Trace_Update_Critic(prev_features)
    acrl.Weights_Update_Critic()
    acrl.Compatible_Features(A, prev_features)
    acrl.Trace_Update_Actor()
    acrl.Weights_Update_Actor()
    S = Snext
    steps += 1
    return False
def act(self, action, discount):
    """ do an action and update Q given the discount factor and step size """
    if self._time < self._T:
        (r, sp) = mountaincar.sample(self._s, action)
        self._r_sum += r
        self._tr[self._time % self._n] = (self._s, action)
        if sp == None:  # if terminal
            self._T = self._time + 1
            self._delta[self._time % self._n] = r - self._Qt[self._time % (self._n + 1)]  # TD error
        else:
            # commit the next action: select it now and store it as A_(t+1)
            action = self.pick_action(sp)
            self._Qt[(self._time + 1) % (self._n + 1)] = self._Q[sp, action]  # store Q(S_{t+1}, A_{t+1}) as Q_{t+1}
            self._sigma[(self._time + 1) % self._n] = self._sig
            self._delta[self._time % self._n] = r - self._Qt[self._time % (self._n + 1)] + \
                discount * ((1 - self._sigma[(self._time + 1) % self._n]) * self.expected_Q(sp) +
                            self._sigma[(self._time + 1) % self._n] * self._Q[sp, action])
            self._pi[(self._time + 1) % self._n] = self.get_action_probability(sp, action)
            self._s = sp  # update the agent's state
    self._tau = self._time + 1 - self._n  # time whose estimate is being updated
    if self._tau >= 0:
        E = 1.0
        G = self._Qt[self._tau % (self._n + 1)]
        for k in range(self._tau, int(min(self._time, self._T - 1)) + 1):
            G += E * self._delta[k % self._n]
            E *= discount * ((1 - self._sigma[(k + 1) % self._n]) * self._pi[(k + 1) % self._n] +
                             self._sigma[(k + 1) % self._n])
        s, a = self._tr[self._tau % self._n]
        self._Q[s, a] = G
    self._time += 1
    return action  # return the committed next action
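# The act() method above relies on several circular buffers and helper policies that
# are not shown in this snippet. The class below is a hedged sketch, not the original
# source: it only illustrates the shapes and roles those members appear to have
# (buffers of length n for the trajectory, sigmas, TD errors and probabilities, a
# buffer of length n+1 for the stored Q estimates, and an epsilon-greedy policy).
import numpy as np

class NStepQSigmaAgent(object):  # hypothetical name, for illustration only
    def __init__(self, num_states, num_actions, n, sigma, eps):
        self._n = n                      # number of steps of the n-step return
        self._sig = sigma                # fixed degree of sampling (sigma)
        self._eps = eps                  # exploration rate of the behaviour policy
        self._time = 0                   # current time step t
        self._T = float('inf')           # episode length, unknown until terminal
        self._tau = -1                   # time whose estimate is being updated
        self._r_sum = 0.0                # running sum of rewards
        self._s = None                   # current state id (set when the episode starts)
        self._Q = np.zeros((num_states, num_actions))  # tabular action values
        self._Qt = np.zeros(n + 1)       # stored Q(S_t, A_t), indexed mod n+1
        self._delta = np.zeros(n)        # TD errors, indexed mod n
        self._sigma = np.zeros(n)        # per-step sigma values, indexed mod n
        self._pi = np.zeros(n)           # target-policy probabilities, indexed mod n
        self._tr = [None] * n            # (state, action) trajectory buffer

    def pick_action(self, s):
        # epsilon-greedy behaviour policy
        if np.random.random() < self._eps:
            return np.random.randint(self._Q.shape[1])
        return int(np.argmax(self._Q[s]))

    def get_action_probability(self, s, a):
        # probability of taking a in s under the epsilon-greedy policy
        num_actions = self._Q.shape[1]
        p = self._eps / num_actions
        if a == int(np.argmax(self._Q[s])):
            p += 1.0 - self._eps
        return p

    def expected_Q(self, s):
        # expected action value of s under the epsilon-greedy policy
        num_actions = self._Q.shape[1]
        probs = np.full(num_actions, self._eps / num_actions)
        probs[int(np.argmax(self._Q[s]))] += 1.0 - self._eps
        return float(np.dot(probs, self._Q[s]))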
def learnEpisode(alpha, eps, gamma, theta1, theta2):
    in1, in2 = mountaincar.init()
    currentStates = tilecode(in1, in2, [-1]*numTilings)  # tile indices of the initial state
    episodeReturn = 0
    step = 0
    while (True):  # continue until we reach the terminal state (None)
        action = epsGreedyPolicy(currentStates, eps, theta1, theta2)
        reward, nextStatePosVel = mountaincar.sample((in1, in2), action)
        episodeReturn += reward
        step += 1
        if nextStatePosVel:
            nextIn1, nextIn2 = nextStatePosVel
            nextStates = tilecode(nextIn1, nextIn2, [-1]*numTilings)
            if (np.random.randint(0, 2)):  # returns 0 or 1, i.e. a coin flip
                updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
            else:
                updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
            currentStates = nextStates
            in1, in2 = nextIn1, nextIn2
        else:  # the next state is terminal
            if (np.random.randint(0, 2)):  # coin flip
                updateTheta(theta1, theta2, currentStates, nextStates, action, reward, alpha, gamma)
            else:
                updateTheta(theta2, theta1, currentStates, nextStates, action, reward, alpha, gamma)
            return episodeReturn, step
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        tileIndices = [-1] * numTilings
        pos, vel = mountaincar.init()
        state = (pos, vel)
        step = 0
        while state != None:
            action = chooseaction(state, theta1, theta2)
            r, nstate = mountaincar.sample(state, action)
            tileIndices = [-1] * numTilings
            if nstate != None:
                if randint(0, 2) == 0:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] + (action * numTiles)] += alpha * (
                            r + Total(nstate, naction, theta2) - Total(state, action, theta1))
                else:
                    naction = chooseaction(nstate, theta1, theta2)
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] + (action * numTiles)] += alpha * (
                            r + Total(nstate, naction, theta1) - Total(state, action, theta2))
            else:
                if randint(0, 2) == 0:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta1[tileIndices[i] + (action * numTiles)] += alpha * (
                            r - Total(state, action, theta1))
                else:
                    tileIndices = tilecode(state[0], state[1], tileIndices)
                    for i in range(numTilings):
                        theta2[tileIndices[i] + (action * numTiles)] += alpha * (
                            r - Total(state, action, theta2))
            state = nstate
            G += r
            step += 1
        # print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        avgrlist[episodeNum] += G
        avgslist[episodeNum] += step
        returnSum += G
    # print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2, step
def learn():
    runSum = 0.0
    for run in xrange(numRuns):
        theta = -0.01*rand(n)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            step = 0
            G = 0
            traces = zeros(n)
            S = mountaincar.init()
            # Until S is terminal:
            while S != None:
                # Choose an action
                tilecode(S, F)
                if rand() <= Emu:
                    # randomly explore
                    a = randint(0, 2)
                else:
                    # greedy action choice
                    a = argmax([QValue(F, 0, theta), QValue(F, 1, theta), QValue(F, 2, theta)])
                # Replacing traces on the indices where the feature vector is 1
                for index in F:
                    traces[index + (a*numTiles)] = 1
                # Take the action, observe r and Sp
                r, Sp = mountaincar.sample(S, a)
                G += r
                # If Sp is terminal, update theta and end the episode
                if Sp == None:
                    delta = r - QValue(F, a, theta)
                    theta = theta + alpha*delta*traces
                    break
                # Choose the expected next action
                tilecode(Sp, Fp)
                ap = argmax([QValue(Fp, 0, theta), QValue(Fp, 1, theta), QValue(Fp, 2, theta)])
                # Update theta (Expected Sarsa target: epsilon-weighted average plus greedy value)
                randomAction = (Epi/3)*QValue(Fp, 0, theta) + (Epi/3)*QValue(Fp, 1, theta) + (Epi/3)*QValue(Fp, 2, theta)
                delta = r + randomAction + (1 - Epi)*QValue(Fp, ap, theta) - QValue(F, a, theta)
                theta = theta + alpha*delta*traces
                # Decay every component of the traces
                traces = gamma*lmbda*traces
                S = Sp
                step += 1
            print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
            episodeReturn[episodeNum] += (G - episodeReturn[episodeNum])/(numRuns + 1)
            episodeSteps[episodeNum] += (step - episodeSteps[episodeNum])/(numRuns + 1)
            returnSum = returnSum + G
        print "Average return:", returnSum/numEpisodes
        runSum += returnSum
    print "Overall performance: Average sum of return per run:", runSum/numRuns
    writeAverages(episodeReturn, episodeSteps)
def learn(alpha=0.1 / numTilings, epsilon=0.0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        step = 0
        state = mountaincar.init()
        while state != None:
            tileIndices = np.array([-1] * numTilings)
            tilecode(state[0], state[1], tileIndices)  # state[0]: position, state[1]: velocity
            # state-action values under the sum of both estimates
            q0 = Qs(theta1, tileIndices) + Qs(theta2, tileIndices)                        # action 0
            q1 = Qs(theta1, tileIndices + numTiles) + Qs(theta2, tileIndices + numTiles)  # action 1
            q2 = Qs(theta1, tileIndices + numTiles*2) + Qs(theta2, tileIndices + numTiles*2)  # action 2
            Q = np.array([q0, q1, q2])
            # apply epsilon-greedy to choose the action
            greedy = np.random.random()
            if (greedy >= epsilon):
                action = Q.argmax()
            else:
                action = np.random.randint(0, 3)
            reward, nextS = mountaincar.sample(state, action)
            G = G + reward
            step += 1
            actionIndices = tileIndices + action * numTiles  # active features for the chosen action
            if nextS == None:  # the next state is terminal: no bootstrap term
                if np.random.randint(0, 2):
                    delta = reward - Qs(theta1, actionIndices)
                    for i in actionIndices:
                        theta1[i] += alpha * delta
                else:
                    delta = reward - Qs(theta2, actionIndices)
                    for i in actionIndices:
                        theta2[i] += alpha * delta
            else:
                # double Q-learning update (the original snippet left this step incomplete;
                # this follows the same form as the other double-Q implementations here,
                # assuming Qs(theta, indices) returns the summed value at those indices)
                nextIndices = np.array([-1] * numTilings)
                tilecode(nextS[0], nextS[1], nextIndices)
                if np.random.randint(0, 2):
                    nextA = np.argmax([Qs(theta1, nextIndices + a * numTiles) for a in range(3)])
                    delta = reward + Qs(theta2, nextIndices + nextA * numTiles) - Qs(theta1, actionIndices)
                    for i in actionIndices:
                        theta1[i] += alpha * delta
                else:
                    nextA = np.argmax([Qs(theta2, nextIndices + a * numTiles) for a in range(3)])
                    delta = reward + Qs(theta1, nextIndices + nextA * numTiles) - Qs(theta2, actionIndices)
                    for i in actionIndices:
                        theta2[i] += alpha * delta
            state = nextS
        print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
def learn(alpha=.1/numTilings, epsilon=0, numEpisodes=1000, numRuns=1):
    returnSum = 0.0
    avgEpisodeReturns = [0]*numEpisodes
    doubleQ = DoubleQ(alpha, epsilon)
    for run in range(numRuns):
        doubleQ.resetQ()
        for episodeNum in range(numEpisodes):
            print("Run: " + str(run) + ", Episode: " + str(episodeNum) + " ....")
            G = 0
            step = 0
            isTerminal = False
            # initialize the mountain car
            stateTuple = mountaincar.init()
            state = tilecode(stateTuple[0], stateTuple[1])
            while (not isTerminal):
                action = doubleQ.policy(state)
                reward, stateTuple = mountaincar.sample(stateTuple, action)
                G += reward
                step += 1
                if stateTuple:
                    nextState = tilecode(stateTuple[0], stateTuple[1])
                else:
                    nextState = None
                doubleQ.learn(state, action, nextState, reward)
                if not stateTuple:
                    isTerminal = True
                else:
                    state = nextState
            print("Run: ", run+1, " Episode: ", episodeNum, " Steps:", step, " Return: ", G)
            returnSum = returnSum + G
            avgEpisodeReturns[episodeNum] = avgEpisodeReturns[episodeNum] + (1/(run+1))*(G - avgEpisodeReturns[episodeNum])
    return avgEpisodeReturns, doubleQ.theta1, doubleQ.theta2
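# The driver above only uses the DoubleQ object through policy(), learn(), resetQ()
# and its theta1/theta2 weight vectors. The class below is a hedged sketch of that
# interface, not the original implementation; it assumes the tile-coding globals n
# (total number of features) and numTiles (features per action block) used by the
# other snippets in this collection.
import numpy as np

class DoubleQ(object):
    def __init__(self, alpha, epsilon, numActions=3):
        self.alpha = alpha
        self.epsilon = epsilon
        self.numActions = numActions
        self.resetQ()

    def resetQ(self):
        self.theta1 = -0.001 * np.random.rand(n)
        self.theta2 = -0.001 * np.random.rand(n)

    def _q(self, theta, tiles, a):
        # value of one action: sum of the weights at the active tiles in that action's block
        return sum(theta[i + a * numTiles] for i in tiles)

    def policy(self, tiles):
        # epsilon-greedy with respect to the sum of both estimates
        if np.random.random() < self.epsilon:
            return np.random.randint(self.numActions)
        totals = [self._q(self.theta1, tiles, a) + self._q(self.theta2, tiles, a)
                  for a in range(self.numActions)]
        return int(np.argmax(totals))

    def learn(self, tiles, action, nextTiles, reward):
        # flip a coin: one estimate selects the next action, the other evaluates it
        if np.random.randint(2):
            selector, evaluator = self.theta1, self.theta2
        else:
            selector, evaluator = self.theta2, self.theta1
        target = reward
        if nextTiles is not None:
            nextA = int(np.argmax([self._q(selector, nextTiles, a)
                                   for a in range(self.numActions)]))
            target += self._q(evaluator, nextTiles, nextA)
        delta = target - self._q(selector, tiles, action)
        for i in tiles:
            selector[i + action * numTiles] += self.alpha * delta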
def test_params(_lmbda, _alpha, _epsilon):
    global theta, e
    Epi = Emu = _epsilon
    alpha = _alpha
    lmbda = _lmbda
    runSum = 0.0
    for run in xrange(numRuns):
        e = np.zeros(numTilings*n*3)
        theta = -0.01*np.random.random_sample(numTilings*n*3)
        returnSum = 0.0
        for episodeNum in xrange(numEpisodes):
            G = 0
            S = mountaincar.init()
            step = 0
            while (S != None):
                step += 1
                A = epsilon_greedy_policy(S)
                R, S_next = mountaincar.sample(S, A)
                G += R
                # the value of the terminal state is 0 by definition,
                # so the computation of delta simplifies in that case
                if (S_next == None):
                    delta = R - q(S, A)
                else:
                    delta = R + Epi*np.average([q(S_next, a) for a in [0, 1, 2]]) + \
                        (1 - Epi)*np.max([q(S_next, a) for a in [0, 1, 2]]) - q(S, A)
                e *= gamma*lmbda
                tilecode(S[0], S[1], F)
                for index in [i + A*numTilings*n for i in F]:
                    e[index] = 1
                theta += alpha*delta*e
                S = S_next
                if (step > 10000):
                    return -10000000000
            returnSum = returnSum + G
        runSum += returnSum
    return runSum/numRuns
# Summing up the values for coasting (continuing the loop over the active features)
Q[1] += theta[features + numTiles]
# Summing up the values for accelerating
Q[2] += theta[features + (2*numTiles)]

# Selecting the action to take
if rand() <= Emu:
    action = randint(3)
else:
    action = argmax(Q)

# Taking the action; store the results
result = mountaincar.sample(state, action)
G += result[0]
nextState = result[1]

# Calculating delta
delta = result[0] - Q[action]

# Updating the traces
for features in F:
    e[features + (action * numTiles)] = 1

# Breaking out if the next state is None (terminal)
if nextState == None:
    theta = theta + (alpha * delta * e)
    break
# repeat for each step of the episode
while S is not None:
    # initialize A
    A = 0
    # get a list of four tile indices
    tilecode(S[0], S[1], F)
    Q = Qs(F)
    # pick the action
    A = egreedy(Q, Emu)
    # observe the reward and the next state
    R, Sprime = mountaincar.sample(S, A)
    delta = R - Q[A]
    G = G + R
    for i in F:  # replacing traces
        e[i + (A * numTiles)] = 1
    # if S' is terminal, then update theta and go to the next episode
    if Sprime == None:
        theta = theta + alpha * delta * e
        break
    tilecode(Sprime[0], Sprime[1], F)
e = np.zeros(n)  # initialize the eligibility vector
steps = 0
while (True):
    Q = [0, 0, 0]  # Q values of (S, A) for each action under the current features
    A = 0
    tilecode(S[0], S[1], F)  # tile-code the (position, velocity) pair into the feature list F
    for j in range(3):
        for i in F:
            Q[j] = Q[j] + w[i + (j*9*9*4)]  # each action owns a block of 4 tilings of a 9x9 grid
    if (random.uniform(0, 1) < epsilon):  # epsilon-greedy action selection
        A = random.choice(actions)
    else:
        A = Q.index(max(Q))
    R, Sp = mountaincar.sample(S, A)  # take the action; the learning update for this step follows
    delta = R - Q[A]
    G += R
    for i in F:
        e[i + (A*4*9*9)] = 1
    if (Sp == None):
        w += alpha*delta*e
        break  # if terminal state, end the episode
    Qp = [0, 0, 0]
    tilecode(Sp[0], Sp[1], F)
    for j in range(3):
        for i in F:
            Qp[j] = Qp[j] + w[i + (j*9*9*4)]  # Q values of the next state
    steps += 1
    delta += Qp[argmax(Qp)]
w = zeros(n)
for episodeNum in range(numEpisodes):
    G = 0
    e = zeros(n)
    carState = mountaincar.init()
    while not carState == None:
        Qa = zeros(3)
        Fa = zeros(4)
        for a_poss in [0, 1, 2]:
            tilecode(carState, Fa)
            assert (sum(Fa) > 0)  # make sure Fa is populated
            Qa[a_poss] = getStateActionValue(w, Fa, a_poss)
        # get an action, act on it, and observe the reward
        A = getEpsilonGreedyAction(Qa)
        R, carStateNew = mountaincar.sample(carState, A)
        G = G + R
        delta = R - Qa[A]
        for i in Fa:  # for each active feature index
            e[i + numTiles*A] = 1
        # if the new state is the terminal state, update the weight vector and break
        if carStateNew == None:
            w = w + alpha*delta*e
            break
        # update values for the weight vector and the eligibility traces
        Qa = zeros(3)
        Fa = zeros(4)
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0.0
        S = mountaincar.init()  # S[0] is the position, S[1] is the velocity
        step = 0
        while True:
            q1 = [0] * 3  # one q value per possible action, for each weight vector
            q2 = [0] * 3
            tileIndices = [-1] * numTilings
            tilecode(S[0], S[1], tileIndices)
            # compute the q value of every action: theta^T * phi, with phi a binary feature vector
            for possibleAction in range(0, 3):
                for index in tileIndices:
                    q1[possibleAction] += theta1[possibleAction * numTiles + index]
                    q2[possibleAction] += theta2[possibleAction * numTiles + index]
            # choose an action epsilon-greedily with respect to q1 + q2
            if np.random.random() >= epsilon:
                action = argmax([a + b for a, b in zip(q1, q2)])  # greedy action
            else:
                action = np.random.randint(0, 3)  # exploratory action
            # positions in phi that are 1 for the chosen action
            indices = [action * numTiles + index for index in tileIndices]
            # sample the reward and the next state
            reward, nextS = mountaincar.sample(S, action)
            G = G + reward
            step += 1
            if nextS == None:  # terminal state: no bootstrap term
                if np.random.randint(0, 2):
                    for i in indices:
                        theta1[i] += alpha * (reward - q1[action])
                else:
                    for i in indices:
                        theta2[i] += alpha * (reward - q2[action])
                break
            else:  # non-terminal state
                nextTileIndices = [-1] * numTilings
                tilecode(nextS[0], nextS[1], nextTileIndices)
                nextQ1 = Qs(nextTileIndices, theta1)
                nextQ2 = Qs(nextTileIndices, theta2)
                if np.random.randint(0, 2):  # with probability 0.5 update theta1
                    nextAction = argmax(nextQ1)
                    for i in indices:
                        theta1[i] += alpha * (reward + nextQ2[nextAction] - q1[action])
                else:  # otherwise update theta2
                    nextAction = argmax(nextQ2)
                    for i in indices:
                        theta2[i] += alpha * (reward + nextQ1[nextAction] - q2[action])
                S = nextS
        steps[episodeNum] += step
        returns[episodeNum] += G
        # print("Episode:", episodeNum, "Steps:", step, "Return: ", G)
        returnSum += G
    # print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
for episodeNum in xrange(numEpisodes):
    G = 0
    step = 0.0
    s = mountaincar.init()
    trace = zeros(n)
    Q = zeros(3)
    while s is not None:
        step += 1
        tilecode(s[0], s[1], F)
        Q = Policy(F, 3, theta)
        if rand() <= epsilon:
            action = randint(0, 2)
        else:
            action = argmax(Q)
        r, sp = mountaincar.sample(s, action)
        delta = r - Q[action]
        G += r
        for i in F:
            trace[i + action * numTiles] = 1
        if sp == None:
            theta += alpha * delta * trace
            break
        tilecode(sp[0], sp[1], F)
        delta += max(Policy(F, 3, theta))
        theta += alpha * delta * trace
        trace = lmbda * trace * gamma
        s = sp
    print "Episode: ", episodeNum, "Steps:", step, "Return: ", G
    returnSum = returnSum + G
runs = np.zeros(numRuns)
timeSteps = np.zeros((numRuns, numEpisodes))
returns = np.zeros((numRuns, numEpisodes))
for run in range(numRuns):
    mc = ACRL()
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        S = mountaincar.init()
        G = 0
        steps = 0
        mc.Erase_Traces()
        while (1):
            prev_features = get_features(S)
            A = mc.getAction(prev_features)
            R, Snext = mountaincar.sample(S, A)
            mc.R = R
            if steps >= 5000 or Snext == None:  # or isnan(Snext[0]) or isnan(Snext[1]):
                break
            mc.Value(prev_features)
            mc.Delta()
            next_features = get_features(Snext)
            mc.Next_Value(next_features)
            mc.Delta_Update()
            mc.Average_Reward_Update()
            mc.Trace_Update_Critic(prev_features)
            mc.Weights_Update_Critic()
            mc.Compatible_Features(A, prev_features)
            mc.Trace_Update_Actor()
            mc.Weights_Update_Actor()
            S = Snext
    return pos * len(vel_range) + vel

# add states and actions to the MDP
mcar = MDP()
mcar.add_states(n_states + 1)  # add a terminal state at the end
for i in range(n_states):
    mcar.add_actions(i, n_actions)

# wire up the MDP
print('building mdp...')
for p in pos_range:
    for v in vel_range:
        s = (p, v)
        for a in range(3):
            R, sp = mountaincar.sample(s, a)
            mcar.add_transition(state_id(s), a, (state_id(sp), R, 1.0))

# compute values
print('solving mdp...')
V = mcar.value_iteration(1.0)

# map values for plotting
print('mapping function...')
x = pos_range
y = vel_range
plot_V = np.zeros([len(y), len(x)])
for i in range(len(x)):
    for j in range(len(y)):
        plot_V[j, i] = -V[state_id((x[i], y[j]))]
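# The snippet above opens with the tail of a state_id helper. A hedged sketch of the
# full function is shown below (an assumption, not the original code): it snaps a
# continuous (position, velocity) pair to the nearest grid indices in pos_range and
# vel_range, maps the terminal state (None) to the extra state added at the end, and
# returns the flat index used by the MDP.
import numpy as np

def state_id(s):
    if s is None:
        return n_states  # terminal state occupies the extra slot at the end
    p, v = s
    pos = int(np.argmin(np.abs(np.asarray(pos_range) - p)))  # nearest position grid point
    vel = int(np.argmin(np.abs(np.asarray(vel_range) - v)))  # nearest velocity grid point
    return pos * len(vel_range) + vel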
# initialize the observation
observation = mountaincar.init()
# use function approximation to generate the feature state
tilecode(observation[0], observation[1], state)
# compute the Q values for the state and every action
Q = Qs(state)
terminal = False
A = chooseAction(Q)
unknownObs = observation
if flipped:
    R, observation, terminal = mountaincar.sample(observation, A, terminal)
    someRandomAmountOfTime = random.randint(minNumExtraSteps, maxNumExtraSteps)
    for i in range(1, someRandomAmountOfTime):
        unknownR, unknownObs, terminal = mountaincar.sample(unknownObs, A, terminal)
        G += unknownR
    step += someRandomAmountOfTime

# repeat for each step of the episode
while True:
    if not flipped:
        # take action A and get reward R and a new observation
        R, observation, terminal = mountaincar.sample(unknownObs, A, terminal)
    # if the new observation is terminal
    if terminal:
        w += alpha*delta*e
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        S = mountaincar.init()
        step = 0
        while (S):
            indexList = [-1] * numTilings
            tilecode(S[0], S[1], indexList)
            indexList = np.array(indexList)
            q0 = qVal(theta1, indexList) + qVal(theta2, indexList)
            q1 = qVal(theta1, indexList + numTiles) + qVal(theta2, indexList + numTiles)
            q2 = qVal(theta1, indexList + 2*numTiles) + qVal(theta2, indexList + 2*numTiles)
            Q = np.array([q0, q1, q2])
            prob1 = np.random.random()
            if prob1 < epsilon:  # explore
                A = np.random.choice([0, 1, 2])
            else:  # greedy
                A = Q.argmax()
            R, S_prime = mountaincar.sample(S, A)
            G += R
            prob2 = np.random.choice([1, 2])
            if prob2 == 1:
                theta_n = theta1
                theta_prime = theta2
            else:
                theta_n = theta2
                theta_prime = theta1
            indexList = [x + A * numTiles for x in indexList]
            qval_theta_n = qVal(theta_n, indexList)
            if not S_prime:
                for index in indexList:
                    theta_n[index] = theta_n[index] + alpha * (R - qval_theta_n)
                break
            indexList_prime = [-1] * 4
            tilecode(S_prime[0], S_prime[1], indexList_prime)
            indexList_prime = np.array(indexList_prime)
            q0_n = qVal(theta_n, indexList_prime)
            q1_n = qVal(theta_n, indexList_prime + numTiles)
            q2_n = qVal(theta_n, indexList_prime + 2*numTiles)
            A_prime = np.array([q0_n, q1_n, q2_n]).argmax()
            q_prime_max = qVal(theta_prime, A_prime * numTiles + indexList_prime)
            for index in indexList:
                theta_n[index] = theta_n[index] + alpha * (R + q_prime_max - qval_theta_n)
            S = S_prime
            step += 1
        # print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    # print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    runEpisodeReturns = []
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        currentState = mountaincar.init()
        terminate = False
        while not terminate:
            action = argmax([
                qHat(currentState, 0, theta1) + qHat(currentState, 0, theta2),
                qHat(currentState, 1, theta1) + qHat(currentState, 1, theta2),
                qHat(currentState, 2, theta1) + qHat(currentState, 2, theta2)
            ])
            R, nextState = mountaincar.sample(currentState, action)
            if (nextState is None):
                if randint(0, 2) == 0:  # 0.5 probability
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta1[phi[i] + (action * numTiles)] += alpha * (
                            R - qHat(currentState, action, theta1))
                else:  # 0.5 probability
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta2[phi[i] + (action * numTiles)] += alpha * (
                            R - qHat(currentState, action, theta2))
                terminate = True
            else:
                if randint(0, 2) == 0:  # 0.5 probability
                    nextAction = argmax([
                        qHat(nextState, 0, theta1),
                        qHat(nextState, 1, theta1),
                        qHat(nextState, 2, theta1)
                    ])
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta1[phi[i] + (action * numTiles)] += alpha * (
                            R + qHat(nextState, nextAction, theta2) -
                            qHat(currentState, action, theta1))
                else:  # 0.5 probability
                    nextAction = argmax([
                        qHat(nextState, 0, theta2),
                        qHat(nextState, 1, theta2),
                        qHat(nextState, 2, theta2)
                    ])
                    phi = tilecode(currentState[0], currentState[1])
                    for i in range(numTilings):
                        theta2[phi[i] + (action * numTiles)] += alpha * (
                            R + qHat(nextState, nextAction, theta1) -
                            qHat(currentState, action, theta2))
                currentState = nextState
            G = G + R
            step += 1
        runEpisodeReturns.append(G)
        # print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    # print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2, runEpisodeReturns
returnSum = 0.0
for episodeNum in xrange(numEpisodes):
    G = 0
    step = 0
    e = np.zeros([numTiles, 3])
    (position, velocity) = mountaincar.init()
    while 1:
        tilecode(position, velocity, F)
        Q = np.sum(theta[F], axis=0)
        if np.random.random() > epsilon:
            A = np.argmax(Q)
        else:
            A = np.random.randint(numActions)
        R, result = mountaincar.sample((position, velocity), A)
        error = R - Q[A]
        eOld = copy.copy(e)
        e[F, A] = 1
        G += R
        if result == None:
            theta = theta + alpha * error * e
            break
        newPosition, newVelocity = result
        oldF = copy.copy(F)
        tilecode(newPosition, newVelocity, F)
        Q = np.sum(theta[F], axis=0)
        # expected-Sarsa-style target; the exploration term was cut off in the original
        # and is assumed here to be the epsilon-weighted mean over the actions
        error = error + (1 - epsilon) * np.max(Q) + epsilon * np.mean(Q)
runSum = []
for run in range(numRuns):
    w = -0.01*rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        zerovec = zeros(n)
        G = 0
        A = 0
        S = mountaincar.init()
        F = actionTileCode(F, S, A)
        zerovec[F] = 1
        episodeLen = 0
        while (S is not None):
            episodeLen = episodeLen + 1
            RSA = mountaincar.sample(S, A)
            R = RSA[0]
            S = RSA[1]
            G = G + R
            delta = R - sum(w[F])
            q = zeros(3)
            if (S is not None):
                for a in range(3):
                    F = actionTileCode(F, S, a)
                    q[a] = sum(w[F])
            else:
                w = w + alpha*delta*zerovec
                break
            expected_q = getExpected(q)
theta = -0.01 * numpy.random.rand(n)
returnSum = 0.0
for episodeNum in xrange(numEpisodes):
    G = 0
    steps = 0
    e = numpy.zeros(n)
    s = mc.init()
    Q = numpy.zeros(numActions)
    while s != None:
        steps += 1
        tilecode(s[0], s[1], F)
        Q = Qs(F)
        a = numpy.argmax(Q)
        r, s1 = mc.sample(s, a)
        G += r
        delta = r - Q[a]
        for i in F:
            e[i + a*324] = 1
        if s1 == None:
            for i in range(n):
                theta[i] += alpha * delta * e[i]
            break
        tilecode(s1[0], s1[1], F)
        Q = Qs(F)
        delta = delta + numpy.max(Q)
        for i in range(n):
            theta[i] += alpha * delta * e[i]
            e[i] = lmbda * e[i]
        s = s1
for episodeNum in xrange(numEpisodes):
    G = 0
    step = 0
    e = np.zeros(n)
    s = mc.init()
    Q = np.zeros(numActions)
    while s != None:
        step = step + 1
        tilecode(s[0], s[1], F)
        Q = np.zeros(numActions)
        for a in range(3):
            for i in F:
                Q[a] = Q[a] + theta[i + a*324]
        a = np.argmax(Q)
        r, s1 = mc.sample(s, a)
        G += r
        delta = r - Q[a]
        for i in F:
            e[i + a*324] = 1
        if s1 == None:
            for i in range(n):
                theta[i] = theta[i] + alpha*delta*e[i]
            break
        tilecode(s1[0], s1[1], F)
        Q = np.zeros(numActions)
        for a in range(3):
            for i in F:
                Q[a] = Q[a] + theta[i + a*324]
        delta = delta + np.max(Q)
        # the original snippet breaks off here; the remaining update is assumed to
        # mirror the terminal-state update above, followed by trace decay
        for i in range(n):
            theta[i] = theta[i] + alpha*delta*e[i]
            e[i] = lmbda*e[i]
        s = s1
returnsArray = numpy.zeros((numRuns, numEpisodes))
runSum = 0.0
for run in range(numRuns):
    w = -0.01*pylab.rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        state = mountaincar.init()
        e = numpy.zeros(n)
        steps = 0
        while state != None:
            Tilecoder.tilecode(state[0], state[1], listOfTiles)
            Q = getQValues(w)
            action = eGreedy(Q)
            reward, statePrime = mountaincar.sample(state, action)
            G = G + reward
            delta = reward - Q[action]
            for index in listOfTiles:
                e[(numTiles*action) + index] = 1
            if statePrime == None:
                for i in range(len(w)):
                    w[i] = w[i] + alpha * delta * e[i]
                state = statePrime
            else:
                Tilecoder.tilecode(statePrime[0], statePrime[1], listOfTiles)
                Q = getQValues(w)
et = numpy.zeros(n)
step = 0
while St != None:
    step += 1
    tilecode(St[0], St[1], F)
    Q = newQ(F)
    # epsilon-greedy: with probability Epi a random action replaces the greedy one
    action = numpy.argmax(Q)
    if Epi > random_sample():
        action = randint(0, 3)
    r, St1 = mountaincar.sample(St, action)
    G += r
    delta = r - Q[action]
    for i in F:
        et[i + action*e_para] = 1
    if St1 == None:
        for i in range(n):
            theta[i] += alpha*delta*et[i]
        break
    tilecode(St1[0], St1[1], F)
    Q = newQ(F)
    delta = delta + numpy.max(Q)
    for i in range(n):
        theta[i] += alpha*delta*et[i]
        et[i] = lmbda*et[i]
def trueOnlinePolicyGradient():
    # logging.basicConfig(filename='example.log', level=logging.DEBUG)
    for alpha_v in alpha_v_list:
        alpha_v = alpha_v * 1.0 / num_tilings
        for alpha_pi in alpha_pi_list:
            alpha_pi = alpha_pi * 1.0 / num_tilings
            print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi
            avg_steps_overall = 0.0
            avg_steps_per_run = np.zeros((num_runs, ))
            avg_steps_per_episode = np.zeros((num_episodes, ))
            start_time = time.clock()
            for current_run in range(num_runs):
                logging.debug("Run #:" + str(current_run))
                theta = 0.00001 * np.random.randn(mem_size, num_actions)
                w = 0.00001 * np.random.randn(mem_size, )
                # w_old = np.zeros((mem_size, ))
                v_old = 0.0
                steps_per_episode = np.zeros((num_episodes, ))
                avg_steps = 0.0
                for current_episode in range(num_episodes):
                    # if (current_episode + 1) % 10 == 0:
                    #     plotWeights(theta, w, current_episode)
                    G = 0.0
                    step = 0
                    z_theta = np.zeros((mem_size, num_actions))
                    z_theta_old = np.zeros((mem_size, num_actions))
                    z_w = np.zeros((mem_size, ))
                    (pos, vel) = mountaincar.init()
                    phi = np.zeros((mem_size, ))
                    tiled_indices = tilecode(pos, vel)
                    phi[tiled_indices] = 1
                    current_state = (pos, vel)
                    (a_star, PG_star) = sampleAction(theta, phi)
                    a_prime = 0
                    PG_prime = np.zeros((mem_size, num_actions))
                    while current_state is not None and step < max_steps:
                        reward, next_state = mountaincar.sample(current_state, a_star)
                        G += (gamma * reward)
                        step += 1
                        v_current = np.dot(w.transpose(), phi)
                        v_next = 0.0
                        phi_prime = np.zeros((mem_size, ))
                        if next_state is not None:
                            tiled_indices = tilecode(next_state[0], next_state[1])
                            phi_prime[tiled_indices] = 1
                            v_next = np.dot(w.transpose(), phi_prime)
                            (a_prime, PG_prime) = sampleAction(theta, phi_prime)
                        delta = reward + (gamma * v_next) - v_current
                        # critic update (the commented lines are the true-online/dutch-trace variant)
                        # z_w = (gamma * lmbda * z_w) + phi - (alpha_v * gamma * lmbda * np.dot(z_w.transpose(), phi) * phi)
                        # w += (alpha_v * (delta + v_current - v_old) * z_w - alpha_v * (v_current - v_old) * phi)
                        z_w = (gamma * lmbda * z_w) + phi
                        w += (alpha_v * delta * z_w)
                        # actor update (the commented lines are the true-online variant)
                        # z_theta = (gamma * lmbda * z_theta) + PG_star
                        # theta += ((alpha_pi * z_theta * delta) + ((alpha_pi * z_theta_old) * (v_current - v_old)))
                        z_theta = (gamma * lmbda * z_theta) + PG_star
                        theta += (alpha_pi * delta * z_theta)
                        v_old = v_next
                        z_theta_old = np.copy(z_theta)
                        phi = np.copy(phi_prime)
                        a_star = a_prime
                        current_state = next_state
                        PG_star = np.copy(PG_prime)
                    # print '########### Episode: ', current_episode, ' Return: ', G, ' Steps: ', step, " Run: ", current_run
                    steps_per_episode[current_episode] = step
                    avg_steps += step
                avg_steps = avg_steps * 1.0 / num_episodes
                avg_steps_overall += avg_steps
                avg_steps_per_run[current_run] = avg_steps
                avg_factor = 1.0 / (current_run + 1)
                for episode_i in range(num_episodes):
                    avg_steps_per_episode[episode_i] *= (1 - avg_factor)
                    avg_steps_per_episode[episode_i] += (avg_factor * steps_per_episode[episode_i])
            end_time = time.clock()
            elapsed_time = (end_time - start_time) / 60.0
            print 'Elapsed time: ', elapsed_time
            # logging.debug('Elapsed time: ' + str(elapsed_time))
            avg_steps_overall = avg_steps_overall * 1.0 / num_runs
            std_error = 0.0
            for run_i in range(num_runs):
                avg_factor_run = 1.0 / (run_i + 1)
                std_error = ((1 - avg_factor_run) * std_error) + \
                    (avg_factor_run * (avg_steps_per_run[run_i] - avg_steps_overall) *
                     (avg_steps_per_run[run_i] - avg_steps_overall))
            std_error = np.sqrt(std_error * 1.0 / num_runs)
            total_steps = avg_steps_overall * num_episodes * num_runs
            print 'Time per step: ', (elapsed_time * 1.0 / total_steps)
            print 'alpha_v: ', alpha_v, ' alpha_pi: ', alpha_pi, ' lmbda: ', lmbda
            print 'average reward: ', -1.0 * avg_steps_overall, ' std. error: ', std_error
    print 'Policy gradient'
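# The policy-gradient loop above treats sampleAction(theta, phi) as returning both a
# sampled action and PG, the gradient of log pi with respect to theta. A hedged
# sketch of such a function is given below (an assumption, not the original code):
# a softmax (Gibbs) policy over the linear preferences theta^T phi.
import numpy as np

def sampleAction(theta, phi):
    prefs = np.dot(theta.T, phi)              # one preference per action
    prefs -= np.max(prefs)                    # subtract the max for numerical stability
    probs = np.exp(prefs) / np.sum(np.exp(prefs))
    a = int(np.random.choice(len(probs), p=probs))
    # gradient of log pi(a | phi) w.r.t. theta: column b holds phi * (1[b == a] - pi(b))
    PG = -np.outer(phi, probs)
    PG[:, a] += phi
    return a, PG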
def learn(alpha=.1 / numTilings, epsilon=0, numEpisodes=200):
    theta1 = -0.001 * rand(n)
    theta2 = -0.001 * rand(n)
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        step = 0
        S = mountaincar.init()
        tileindec = tilecode(S[0], S[1], [-1] * numTilings)
        while S != None:
            step += 1
            # epsilon-greedy with respect to theta1 + theta2
            if random() < epsilon:
                act = randint(0, 3)
            else:
                act = argmax(Qs(tileindec, theta1 + theta2))
            R, Stemp = mountaincar.sample(S, act)
            G += R
            if Stemp == None:
                # terminal transition: the target is just R (no bootstrap term)
                if randint(0, 2) == 1:
                    q1 = Qs(tileindec, theta1)
                    for i in tileindec:
                        theta1[i + act*324] += alpha * (R - q1[act])
                else:
                    q2 = Qs(tileindec, theta2)
                    for i in tileindec:
                        theta2[i + act*324] += alpha * (R - q2[act])
                break
            else:
                tileindec_tem = tilecode(Stemp[0], Stemp[1], [-1] * numTilings)
                pro = randint(0, 2)
                if pro == 1:
                    # theta1 selects the next action, theta2 evaluates it;
                    # the subtracted term is the current estimate of Q(S, act) under theta1
                    q1 = Qs(tileindec_tem, theta1)
                    q2 = Qs(tileindec_tem, theta2)
                    update = alpha * (R + q2[argmax(q1)] - Qs(tileindec, theta1)[act])
                    for i in tileindec:
                        theta1[i + act*324] += update
                else:
                    q1 = Qs(tileindec_tem, theta1)
                    q2 = Qs(tileindec_tem, theta2)
                    update = alpha * (R + q1[argmax(q2)] - Qs(tileindec, theta2)[act])
                    for i in tileindec:
                        theta2[i + act*324] += update
                S = Stemp
                tileindec = tileindec_tem
        print("Episode: ", episodeNum, "Steps:", step, "Return: ", G)
        returnSum = returnSum + G
    print("Average return:", returnSum / numEpisodes)
    return returnSum, theta1, theta2
for episodeNum in xrange(numEpisodes):
    eTrace = [0]*n
    G = 0
    delta = 0
    state = mountaincar.init()
    step = 0
    while state != None:
        step += 1
        tiles = tilecode(state[0], state[1], [-1]*numTilings)
        explore = (random.random() < epsilon)
        if explore:
            action = random.randint(0, 2)
            reward, newState = mountaincar.sample(state, action)
        else:
            action = getBestAction(tiles, theta)
            reward, newState = mountaincar.sample(state, action)
        G += reward
        if newState != None:
            delta = reward + updateDelta(tiles, theta, action, newState)
            eTrace = updateETrace(eTrace, tiles, action)
            theta = updateTheta(theta, delta, eTrace)
        else:
            Qa = 0
            for i in tiles:
                Qa += theta[i + action*4*81]
            delta = reward - Qa
            updateETrace(eTrace, tiles, action)
# initialize the observation
observation = mountaincar.init()
# use function approximation to generate the feature state
tilecode(observation[0], observation[1], observation[2], state)
# compute the Q values for the state and every action
Q = Qs(state)
terminal = False
A = chooseAction(Q)
unknownObs = observation
if flipped:
    R, observation, terminal = mountaincar.sample(observation, A, terminal, False)
    someRandomAmountOfTime = random.randint(minNumExtraSteps, maxNumExtraSteps)
    for i in range(1, someRandomAmountOfTime):
        unknownR, observation, terminal = mountaincar.sample(observation, A, terminal, True)
        G += unknownR
    step += someRandomAmountOfTime

# repeat for each step of the episode
while True:
    if not flipped:
        # take action A and get reward R and a new observation
        R, observation, terminal = mountaincar.sample(observation, A, terminal, False)
    # if the new observation is terminal
    if terminal: