Example #1
def GetValueMC(mdp, rl: MCRL, nIter=20):
    ''' Every-visit Monte Carlo evaluation: returns are accumulated at every
    time step of each sampled episode and averaged once at the end.
    '''
    V = defaultdict(int)  # running sum of returns per state
    N = defaultdict(int)  # visit count per state

    #for MC convergence Plot
    #Maps states to histo of V values
    Vhisto = defaultdict(list)

    for i in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            totalRewards = simulate(mdp, rl, start_state, numTrials=1)
            G_t = 0
            # Walk the episode backwards so the return can be accumulated
            # incrementally as G_t = R_t + gamma * G_t
            rl.RewardSequence.reverse()
            for state, R_t in rl.RewardSequence:
                G_t = R_t + rl.gamma * G_t
                N[state] += 1
                V[state] += G_t
                Vhisto[state].append(V[state] / N[state])

    for state in N:
        V[state] /= N[state]

    return V, Vhisto
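The same estimate can be maintained without the final averaging pass by folding the mean into the episode loop. Below is a minimal sketch of that variant, assuming the same MCRL interface as above (rl.reset(), rl.gamma, rl.RewardSequence holding (state, reward) pairs) and the simulate helper from my_utils.rl; the name GetValueMCIncremental is illustrative, not part of the original code.

from collections import defaultdict
from my_utils.rl import simulate

def GetValueMCIncremental(mdp, rl, nIter=20):
    ''' Sketch: every-visit MC with an incremental mean,
    V[s] <- V[s] + (G_t - V[s]) / N[s], instead of summing and dividing at the end.
    '''
    V = defaultdict(float)
    N = defaultdict(int)
    for _ in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            simulate(mdp, rl, start_state, numTrials=1)
            G_t = 0
            rl.RewardSequence.reverse()
            for state, R_t in rl.RewardSequence:
                G_t = R_t + rl.gamma * G_t
                N[state] += 1
                V[state] += (G_t - V[state]) / N[state]
    return V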
Example #2
def GetValueTDLambda(mdp,
                     rl: MCRL,
                     lbda=0.95,
                     online=True,
                     alpha=0.01,
                     max_T=10000,
                     nIter=20):
    V = defaultdict(int)

    #for convergence Plot
    #Maps states to histo of V values
    Vhisto = defaultdict(list)

    for i in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            totalRewards = simulate(mdp,
                                    rl,
                                    start_state,
                                    numTrials=1,
                                    maxIterations=max_T)
            if not online:
                episodeUpdate = defaultdict(int)

            T = len(rl.RewardSequence)
            for t in range(T - 1):
                curSumReward = 0
                G_t_lbda = 0
                totalDiscount = 1
                scaling = (1 - lbda) / lbda
                state_t = rl.RewardSequence[t][0]
                n = 0
                for state, R_t, nextState in rl.RewardSequence[t + 1:]:
                    n += 1
                    if n < T - t - 1:
                        scaling *= lbda  # intermediate n-step returns get weight (1 - lbda) * lbda**(n - 1)
                    else:
                        scaling = lbda**(T - t - 1)  # the final return keeps the residual weight
                    curSumReward += R_t * totalDiscount
                    # n-step return: discounted rewards so far plus the bootstrapped value of nextState
                    G_t_n = curSumReward + totalDiscount * rl.gamma * V[nextState]
                    totalDiscount *= rl.gamma
                    G_t_lbda += scaling * G_t_n

                if online:
                    V[state_t] += alpha * (G_t_lbda - V[state_t])
                    Vhisto[state_t].append(V[state_t])
                else:
                    episodeUpdate[state_t] += alpha * (G_t_lbda - V[state_t])

            if not online:
                for state, val in episodeUpdate.items():
                    V[state] += val
                    Vhisto[state].append(V[state])

    return V, Vhisto
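The geometric weighting held in scaling corresponds to the forward-view lambda-return. In the standard notation, with G_t^{(n)} the n-step return bootstrapped from the current estimate V and G_t the full return, the target being approximated is

\[
G_t^{\lambda} = (1-\lambda)\sum_{n=1}^{T-t-1} \lambda^{\,n-1}\, G_t^{(n)} \;+\; \lambda^{\,T-t-1}\, G_t,
\qquad
G_t^{(n)} = \sum_{k=1}^{n} \gamma^{\,k-1} R_{t+k} \;+\; \gamma^{\,n}\, V(S_{t+n}).
\]

The online branch applies alpha * (G_t_lbda - V[state_t]) immediately after each time step, while the offline branch accumulates the increments in episodeUpdate and applies them once per episode.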
Example #3
def GetValueTD(mdp, rl: TD, nIter=20):
    ''' Every time step update: the TD agent adjusts rl.V itself inside
    incorporateFeedback, so this driver only runs the episodes.
    '''

    for start_state in mdp.states:
        totalRewards = simulate(mdp, rl, start_state, numTrials=nIter)
        
    return rl.V, rl.Vhisto
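Here the actual TD(0) update lives inside the agent: simulate calls rl.incorporateFeedback after every transition, and GetValueTD only returns the rl.V and rl.Vhisto the agent maintains. The project's TD class is not shown in this snippet; the following is a minimal sketch of the interface GetValueTD relies on, assuming the RLAlgorithm base class from my_utils.rl and a fixed evaluation policy (the class name and constructor arguments are illustrative).

from collections import defaultdict
from my_utils.rl import RLAlgorithm

class TDZeroSketch(RLAlgorithm):
    ''' Hypothetical TD(0) evaluation agent; the real TD class is defined elsewhere. '''
    def __init__(self, policy, gamma, alpha=0.01):
        self.policy = policy              # assumed dict: state -> action of the policy being evaluated
        self.gamma = gamma
        self.alpha = alpha
        self.V = defaultdict(float)       # current value estimates
        self.Vhisto = defaultdict(list)   # per-state history of V for convergence plots

    def getAction(self, state):
        return self.policy[state]

    def incorporateFeedback(self, state, action, reward, newState):
        # TD(0) update toward the bootstrapped one-step target
        target = reward + (self.gamma * self.V[newState] if newState is not None else 0)
        self.V[state] += self.alpha * (target - self.V[state])
        self.Vhisto[state].append(self.V[state])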
Example #4
def GetValueBackwardTDLambda(mdp,
                             rl: MCRL,
                             lbda=0.97,
                             online=True,
                             alpha=0.01,
                             max_T=10000,
                             nIter=20):
    V = defaultdict(int)

    #for convergence Plot
    #Maps states to histo of V values
    Vhisto = defaultdict(list)

    for i in range(nIter):
        for start_state in mdp.states:
            rl.reset()
            totalRewards = simulate(mdp,
                                    rl,
                                    start_state,
                                    numTrials=1,
                                    maxIterations=max_T)
            #Eligibility Trace
            E_t = defaultdict(float)

            if not online:
                episodeUpdate = defaultdict(int)

            T = len(rl.RewardSequence)
            totalDiscount = 1
            for t in range(T):
                decay(E_t, lbda, rl.gamma)

                state_t, r_t, next_state = rl.RewardSequence[t]
                E_t[state_t] += 1
                delta_t = r_t + rl.gamma * V[next_state] - V[state_t]

                if online:
                    # Backward view: every state is nudged toward delta_t in
                    # proportion to its eligibility trace
                    for s, e in E_t.items():
                        V[s] += alpha * delta_t * e
                        Vhisto[s].append(V[s])
                else:
                    for s, e in E_t.items():
                        episodeUpdate[s] += alpha * delta_t * e

            if not online:
                for state, val in episodeUpdate.items():
                    V[state] += val
                    Vhisto[state].append(V[state])

    return V, Vhisto
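The decay helper called at the top of the time-step loop is not included in this snippet. A minimal sketch consistent with the backward view, where every accumulated trace is decayed by gamma * lambda in place before the current state's trace is bumped, could look like this (the body is an assumption; only the call signature comes from the code above).

def decay(E_t, lbda, gamma):
    # Assumed helper: decay all eligibility traces in place by gamma * lambda
    for state in E_t:
        E_t[state] *= gamma * lbda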
Example #5
from my_utils.frog_mdp import FrogMDP, f, generate_transitions_rewards, get_frog_mdp

from my_utils.rl import RLAlgorithm, FixedRLAlgorithm, simulate

class FrogRL(RLAlgorithm):
    # Return the Q function associated with the weights and features
    def getQ(self, state, action):
        score = 0
        for f, v in self.featureExtractor(state, action):
            score += self.weights[f] * v
        return score
    
    def getAction(self, state):
        pass

    # When simulating an MDP, update parameters.
    # If |state| is a terminal state, this function will be called with
    # (s, a, 0, None). When this function is called, it indicates that taking
    # action |action| in state |state| resulted in reward |reward| and a
    # transition to state |newState|.
    def incorporateFeedback(self, state, action, reward, newState):
        pass
        
if __name__ == "__main__":
    n = 10
    mdp, a_policy = get_frog_mdp(n)
    rl = FixedRLAlgorithm(a_policy)
    start_state = n//2
    totalRewards = simulate(mdp, rl, start_state, numTrials=10, maxIterations=1000)
    print(totalRewards)
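getAction and incorporateFeedback are left as stubs in FrogRL above. One way to fill them in is an epsilon-greedy policy with a Q-learning update on the linear weights used by getQ; the sketch below shows that, with the actions, featureExtractor, discount, eta and epsilon constructor arguments being assumptions for illustration, not part of the original class.

import random
from collections import defaultdict

class FrogQLearning(FrogRL):
    ''' Hypothetical completion of FrogRL: epsilon-greedy action selection and
    a Q-learning update on the linear weights consumed by getQ. '''
    def __init__(self, actions, featureExtractor, discount, eta=0.1, epsilon=0.2):
        self.actions = actions                    # assumed callable: state -> list of legal actions
        self.featureExtractor = featureExtractor  # (state, action) -> list of (feature, value) pairs
        self.discount = discount
        self.eta = eta                            # step size
        self.epsilon = epsilon                    # exploration rate
        self.weights = defaultdict(float)

    def getAction(self, state):
        # Epsilon-greedy over the current Q estimates
        if random.random() < self.epsilon:
            return random.choice(self.actions(state))
        return max(self.actions(state), key=lambda a: self.getQ(state, a))

    def incorporateFeedback(self, state, action, reward, newState):
        # Q-learning target: bootstrap from the best action in newState
        if newState is None:
            target = reward
        else:
            target = reward + self.discount * max(
                self.getQ(newState, a) for a in self.actions(newState))
        residual = self.getQ(state, action) - target
        for f, v in self.featureExtractor(state, action):
            self.weights[f] -= self.eta * residual * v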