from collections import defaultdict

# NOTE: MCRL, TD, simulate and the decay() helper are provided elsewhere in the
# project (see the my_utils.rl imports in the FrogRL script further below).


def GetValueMC(mdp, rl: MCRL, nIter=20):
    ''' Every-visit Monte Carlo evaluation: average the observed returns of each visited state. '''
    V = defaultdict(int)
    N = defaultdict(int)
    # For the MC convergence plot: maps each state to the history of its V values.
    Vhisto = defaultdict(list)
    for i in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            totalRewards = simulate(mdp, rl, start_state, numTrials=1)
            # Walk backward through the episode and accumulate the discounted
            # return G_t = R_t + gamma * G_{t+1} for every visited state.
            # RewardSequence holds (state, reward, nextState) triples, as in the
            # TD(lambda) helpers below.
            G_t = 0
            for state, R_t, nextState in reversed(rl.RewardSequence):
                G_t = R_t + rl.gamma * G_t
                N[state] += 1
                V[state] += G_t
                # Running mean of the returns observed so far, for the convergence plot.
                Vhisto[state].append(V[state] / N[state])
    # Turn the accumulated return sums into means.
    for state in N:
        V[state] /= N[state]
    return V, Vhisto
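# Quick illustrative check (not part of the original pipeline): the backward
# accumulation G_t = R_t + gamma * G_{t+1} used above reproduces the explicit
# discounted sum sum_k gamma**k * R_{t+k} on a hand-made reward list.
def _check_backward_return(rewards=(1.0, 0.0, 2.0, 5.0), gamma=0.9):
    explicit = sum(gamma ** k * r for k, r in enumerate(rewards))
    G = 0.0
    for r in reversed(rewards):
        G = r + gamma * G
    assert abs(G - explicit) < 1e-12
    return G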
def GetValueTDLambda(mdp, rl: MCRL, lbda=0.95, online=True, alpha=0.01, max_T=10000, nIter=20):
    ''' Forward-view TD(lambda): move each visited state towards its lambda-return. '''
    V = defaultdict(int)
    # For the convergence plot: maps each state to the history of its V values.
    Vhisto = defaultdict(list)
    for i in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            totalRewards = simulate(mdp, rl, start_state, numTrials=1, maxIterations=max_T)
            if not online:
                episodeUpdate = defaultdict(int)
            T = len(rl.RewardSequence)
            for t in range(T):
                curSumReward = 0
                G_t_lbda = 0
                totalDiscount = 1
                state_t = rl.RewardSequence[t][0]
                n = 0
                # Mix the n-step returns: weight (1 - lambda) * lambda**(n - 1) for the
                # intermediate returns, and the remaining lambda**(T - t - 1) for the
                # final (Monte Carlo) return, so the weights sum to 1.
                for state, R_t, nextState in rl.RewardSequence[t:]:
                    n += 1
                    if n < T - t:
                        scaling = (1 - lbda) * lbda ** (n - 1)
                    else:
                        scaling = lbda ** (T - t - 1)
                    curSumReward += R_t * totalDiscount
                    G_t_n = curSumReward + totalDiscount * rl.gamma * V[nextState]
                    totalDiscount *= rl.gamma
                    G_t_lbda += scaling * G_t_n
                if online:
                    V[state_t] += alpha * (G_t_lbda - V[state_t])
                    Vhisto[state_t].append(V[state_t])
                else:
                    episodeUpdate[state_t] += alpha * (G_t_lbda - V[state_t])
            if not online:
                for state, val in episodeUpdate.items():
                    V[state] += val
                    Vhisto[state].append(V[state])
    return V, Vhisto
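# Illustrative check (not in the original code): with N available n-step
# returns (N = T - t), the weights used above -- (1 - lambda) * lambda**(n - 1)
# for n < N and lambda**(N - 1) for the final return -- sum to 1, so the
# lambda-return is a proper convex combination of the n-step returns.
def _check_lambda_weights(N=7, lbda=0.95):
    weights = [(1 - lbda) * lbda ** (n - 1) for n in range(1, N)]
    weights.append(lbda ** (N - 1))
    assert abs(sum(weights) - 1.0) < 1e-12
    return weights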
def GetValueTD(mdp, rl: TD, nIter=20):
    ''' TD(0): the TD learner updates its estimates at every time step inside
    incorporateFeedback, so here we only drive the simulations and read back
    the values it accumulated. '''
    for start_state in mdp.states:
        totalRewards = simulate(mdp, rl, start_state, numTrials=nIter)
    return rl.V, rl.Vhisto
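# The TD class used above is provided by the project and not shown in this
# file. Purely as an illustration, here is a minimal TD(0) learner exposing the
# V / Vhisto / gamma attributes that GetValueTD relies on; the constructor
# arguments and the dict-valued policy are assumptions, not the project's API.
class TD0Sketch:
    def __init__(self, policy, gamma=1.0, alpha=0.01):
        self.policy = policy              # assumed: dict mapping state -> action
        self.gamma = gamma
        self.alpha = alpha
        self.V = defaultdict(float)
        self.Vhisto = defaultdict(list)

    def getAction(self, state):
        return self.policy[state]

    def incorporateFeedback(self, state, action, reward, newState):
        # TD(0) update at every step: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)).
        bootstrap = 0 if newState is None else self.gamma * self.V[newState]
        self.V[state] += self.alpha * (reward + bootstrap - self.V[state])
        self.Vhisto[state].append(self.V[state])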
def GetValueBackwardTDLambda(mdp, rl: MCRL, lbda=0.97, online=True, alpha=0.01, max_T=10000, nIter=20):
    ''' Backward-view TD(lambda) with accumulating eligibility traces. '''
    V = defaultdict(int)
    # For the convergence plot: maps each state to the history of its V values.
    Vhisto = defaultdict(list)
    for i in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            totalRewards = simulate(mdp, rl, start_state, numTrials=1, maxIterations=max_T)
            # Eligibility traces
            E_t = defaultdict(float)
            if not online:
                episodeUpdate = defaultdict(int)
            T = len(rl.RewardSequence)
            for t in range(T):
                # Decay all traces by gamma * lambda, then bump the trace of the
                # state visited at time t (accumulating trace).
                decay(E_t, lbda, rl.gamma)
                state_t, r_t, next_state = rl.RewardSequence[t]
                E_t[state_t] += 1
                delta_t = r_t + rl.gamma * V[next_state] - V[state_t]
                # Propagate the TD error to every state in proportion to its trace.
                for state, e in E_t.items():
                    if online:
                        V[state] += alpha * delta_t * e
                        Vhisto[state].append(V[state])
                    else:
                        episodeUpdate[state] += alpha * delta_t * e
            if not online:
                for state, val in episodeUpdate.items():
                    V[state] += val
                    Vhisto[state].append(V[state])
    return V, Vhisto
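# decay() is used above but defined elsewhere in the project. The assumed
# behaviour, sketched here under a different name, is the standard trace decay
# for backward TD(lambda): every eligibility trace shrinks by gamma * lambda at
# each time step.
def decay_sketch(E_t, lbda, gamma):
    for state in E_t:
        E_t[state] *= gamma * lbda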
from my_utils.frog_mdp import FrogMDP, f, generate_transitions_rewards, get_frog_mdp
from my_utils.rl import RLAlgorithm, FixedRLAlgorithm, simulate


class FrogRL(RLAlgorithm):
    # Return the Q value associated with the current weights and features.
    def getQ(self, state, action):
        score = 0
        for feature, value in self.featureExtractor(state, action):
            score += self.weights[feature] * value
        return score

    # Pick the action to take in |state| (left as a stub here).
    def getAction(self, state):
        pass

    # When simulating an MDP, update parameters.
    # If |state| is a terminal state, this function will be called with (s, a,
    # 0, None). When this function is called, it indicates that taking action
    # |action| in state |state| resulted in reward |reward| and a transition to
    # state |newState|.
    def incorporateFeedback(self, state, action, reward, newState):
        pass


if __name__ == "__main__":
    n = 10
    mdp, a_policy = get_frog_mdp(n)
    rl = FixedRLAlgorithm(a_policy)
    start_state = n // 2
    totalRewards = simulate(mdp, rl, start_state, numTrials=10, maxIterations=1000)
    print(totalRewards)
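# The getAction / incorporateFeedback stubs in FrogRL are left unimplemented.
# A hedged sketch of one common way to fill them in: epsilon-greedy Q-learning
# with linear function approximation over featureExtractor. The constructor and
# its arguments (actions, featureExtractor, discount, alpha, epsilon) are
# assumptions for this sketch, not part of the original class.
import random
from collections import defaultdict


class FrogQLearningSketch(FrogRL):
    def __init__(self, actions, featureExtractor, discount=1.0, alpha=0.01, epsilon=0.1):
        self.actions = actions                    # assumed: callable state -> list of actions
        self.featureExtractor = featureExtractor  # callable (state, action) -> [(feature, value)]
        self.discount = discount
        self.alpha = alpha
        self.epsilon = epsilon
        self.weights = defaultdict(float)

    def getAction(self, state):
        # Epsilon-greedy over the approximate Q values.
        if random.random() < self.epsilon:
            return random.choice(self.actions(state))
        return max(self.actions(state), key=lambda a: self.getQ(state, a))

    def incorporateFeedback(self, state, action, reward, newState):
        # Q-learning target: r + discount * max_a' Q(s', a'), with 0 at terminal states.
        V_next = 0 if newState is None else max(self.getQ(newState, a) for a in self.actions(newState))
        target = reward + self.discount * V_next
        prediction = self.getQ(state, action)
        # Gradient step on the squared error between prediction and target.
        for feature, value in self.featureExtractor(state, action):
            self.weights[feature] -= self.alpha * (prediction - target) * value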