class SemiGradientTDControl:
    def __init__(self,
                 nParams,
                 nActions,
                 alpha,
                 approximationFunctionArgs,
                 actionSelectionMethod="egreedy",
                 epsilon=0.01):
        self.name = "Generic SemiGradient TD Control Class"
        self.nParams = nParams
        self.nActions = nActions
        self.alpha = alpha
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.afd = getValueFromDict(self.af_kwargs, "afd")
        self.w = np.zeros([self.nParams], dtype=float)
        self.policy = FunctionApproximationPolicy(
            self.nParams,
            self.nActions,
            self.af_kwargs,
            actionSelectionMethod=actionSelectionMethod,
            epsilon=epsilon)

    def selectAction(self, state):
        return self.policy.selectAction(state)

    def getValue(self, state, action=None):
        if action is None:
            return np.array([
                self.af(self.w, state, a, **self.af_kwargs)
                for a in range(self.nActions)
            ])
        else:
            return self.af(self.w, state, action, **self.af_kwargs)

    def getName(self):
        return self.name

    def reset(self):
        self.w = np.zeros([self.nParams], dtype=float)
        self.policy.reset()

    def getGreedyAction(self, state, actionsAvailable=None):
        q = np.array([
            self.af(self.w, state, a, **self.af_kwargs)
            for a in range(self.nActions)
        ])
        if actionsAvailable is None:
            actionValues = q[:]
            actionList = np.array(range(self.nActions))
        else:
            actionValues = q[actionsAvailable]
            actionList = np.array(actionsAvailable)
        actionIdx = selectAction_greedy(actionValues)
        return actionList[actionIdx]
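
# Illustrative only: a minimal sketch of how a concrete agent could be built on top of
# SemiGradientTDControl, implementing the one-step semi-gradient SARSA update
# w <- w + alpha * [R + gamma * q(S', A', w) - q(S, A, w)] * grad_w q(S, A, w).
# It assumes the episode-buffer format (dicts with "state", "action", "reward", "done")
# used by the update() methods below, and that self.afd(w, state, action, ...) returns
# the gradient of self.af with respect to w; the subclass itself is hypothetical.
class SemiGradientSARSASketch(SemiGradientTDControl):
    def __init__(self, nParams, nActions, alpha, gamma, approximationFunctionArgs, **kwargs):
        super().__init__(nParams, nActions, alpha, approximationFunctionArgs, **kwargs)
        self.name = "SemiGradient SARSA (sketch)"
        self.gamma = gamma

    def update(self, episode):
        t = len(episode) - 2
        state, action = episode[t]["state"], episode[t]["action"]
        reward = episode[t + 1]["reward"]
        next_state, next_action = episode[t + 1]["state"], episode[t + 1]["action"]
        done = episode[t + 1]["done"]
        q = self.getValue(state, action)
        # Bootstrap target is zero on the terminal transition.
        q_next = 0.0 if done else self.getValue(next_state, next_action)
        td_error = reward + self.gamma * q_next - q
        # Semi-gradient step: only the current estimate q(S, A, w) is differentiated.
        self.w += self.alpha * td_error * self.afd(self.w, state, action, **self.af_kwargs)
        self.policy.update(self.w)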
class TrueOnlineSARSA:
    def __init__(self,
                 nParams,
                 nActions,
                 alpha,
                 gamma,
                 lambd,
                 approximationFunctionArgs,
                 actionSelectionMethod="egreedy",
                 epsilon=0.01):
        self.name = "True Online SARSA"
        self.nParams = nParams
        self.nActions = nActions
        self.alpha = alpha
        self.gamma = gamma
        self.lambd = lambd
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.ftf = getValueFromDict(self.af_kwargs, "ftf")
        self.w = np.zeros([self.nParams], dtype=float)
        self.z = np.zeros([self.nParams], dtype=float)
        self.q_old = 0.0
        self.policy = FunctionApproximationPolicy(
            self.nParams,
            self.nActions,
            self.af_kwargs,
            actionSelectionMethod=actionSelectionMethod,
            epsilon=epsilon)

    def update(self, episode):
        t = len(episode) - 2
        state = episode[t]["state"]
        action = episode[t]["action"]
        reward = episode[t + 1]["reward"]
        next_state = episode[t + 1]["state"]
        next_action = episode[t + 1]["action"]
        done = episode[t + 1]["done"]
        # Active-feature vector for (S, A); true online SARSA(lambda) assumes a linear
        # action-value estimate over these features.
        x = self.ftf(state, action, **self.af_kwargs)
        q = self.getValue(state, action)
        # Bootstrap with a value of 0 on the terminal transition.
        q_next = 0.0 if done else self.getValue(next_state, next_action)
        td_error = reward + self.gamma * q_next - q
        # Dutch-trace update.
        self.z = self.gamma * self.lambd * self.z + (
            1 - self.alpha * self.gamma * self.lambd * np.dot(self.z, x)) * x
        # True online weight update with the q_old correction term.
        self.w += self.alpha * (td_error + q - self.q_old
                                ) * self.z - self.alpha * (q - self.q_old) * x
        self.policy.update(self.w)
        self.q_old = q_next
        if done:
            self.z *= 0.0
            self.q_old = 0.0

    def getValue(self, state, action=None):
        if action is None:
            return np.array([
                self.af(self.w, state, a, **self.af_kwargs)
                for a in range(self.nActions)
            ])
        else:
            return self.af(self.w, state, action, **self.af_kwargs)

    def selectAction(self, state):
        return self.policy.selectAction(state)

    def reset(self):
        self.w = np.zeros([self.nParams], dtype=float)
        self.z = np.zeros([self.nParams], dtype=float)
        self.q_old = 0.0

    def getName(self):
        return self.name
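
# Illustrative only: a minimal sketch of the per-step training loop these agents expect,
# assuming a hypothetical environment with reset() -> state and step(action) ->
# (next_state, reward, done); the actual MountainCar / runExperiment interface used
# further below may differ. update() reads only the last two entries of the episode
# buffer: the (S, A) of the previous step and the (R, S', A', done) of the current one.
def runEpisodeSketch(env, agent, maxSteps=10000):
    episode = []
    state = env.reset()
    action = agent.selectAction(state)
    episode.append({"state": state, "action": action})
    for _ in range(maxSteps):
        state, reward, done = env.step(action)
        action = agent.selectAction(state)
        episode.append({"state": state, "action": action,
                        "reward": reward, "done": done})
        agent.update(episode)  # uses only the two most recent entries
        if done:
            break
    return len(episode) - 1  # number of environment steps taken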
class SARSALambda:
    def __init__(self,
                 nParams,
                 nActions,
                 alpha,
                 gamma,
                 lambd,
                 approximationFunctionArgs,
                 doAccumulateTraces=False,
                 doClearTraces=False,
                 actionSelectionMethod="egreedy",
                 epsilon=0.01):
        self.name = "SARSA(Lambda)"
        self.nParams = nParams
        self.nActions = nActions
        self.alpha = alpha
        self.gamma = gamma
        self.lambd = lambd
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.ftf = getValueFromDict(self.af_kwargs, "ftf")
        self.doAccumulateTraces = doAccumulateTraces
        self.doClearTraces = doClearTraces
        self.w = np.zeros([self.nParams], dtype=float)
        self.z = np.zeros([self.nParams], dtype=float)
        self.policy = FunctionApproximationPolicy(
            self.nParams,
            self.nActions,
            self.af_kwargs,
            actionSelectionMethod=actionSelectionMethod,
            epsilon=epsilon)

    def update(self, episode):
        t = len(episode) - 2
        state = episode[t]["state"]
        action = episode[t]["action"]
        reward = episode[t + 1]["reward"]
        next_state = episode[t + 1]["state"]
        next_action = episode[t + 1]["action"]
        done = episode[t + 1]["done"]
        # Active features for (S, A) and (S', A'); binary (e.g. tile-coding) features are
        # assumed, so q(S, A) is the sum of the weights at the active indices.
        x = self.ftf(state, action, **self.af_kwargs)
        xx = self.ftf(next_state, next_action, **self.af_kwargs)
        td_error = reward
        for i in np.nonzero(x)[0]:
            td_error -= self.w[i]
            if self.doAccumulateTraces:
                # Accumulating traces
                self.z[i] += 1
            else:
                # Replacing traces
                self.z[i] = 1
        if done:
            self.w += self.alpha * td_error * self.z
            self.policy.update(self.w)
            self.z *= 0.0
        else:
            for i in np.nonzero(xx)[0]:
                td_error += self.gamma * self.w[i]
            self.w += self.alpha * td_error * self.z
            self.policy.update(self.w)
            self.z = self.gamma * self.lambd * self.z
        if self.doClearTraces:
            # Clear the traces of all features that are not active in (S, A) or (S', A').
            idxToClear = np.ones(self.nParams, dtype=bool)
            idxToClear[np.nonzero(x)[0]] = False
            idxToClear[np.nonzero(xx)[0]] = False
            self.z[idxToClear] = 0.0

    def getValue(self, state, action=None):
        if action is None:
            return np.array([
                self.af(self.w, state, a, **self.af_kwargs)
                for a in range(self.nActions)
            ])
        else:
            return self.af(self.w, state, action, **self.af_kwargs)

    def selectAction(self, state):
        return self.policy.selectAction(state)

    def reset(self):
        self.w = np.zeros([self.nParams], dtype=float)
        self.z = np.zeros([self.nParams], dtype=float)

    def getName(self):
        return self.name
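
# Illustrative only: SARSALambda.update computes q(S, A) by summing the weights of the
# active (nonzero) feature indices, which implicitly assumes binary features such as
# tile coding. A quick self-consistency sketch, assuming the feature function returns a
# 0/1 vector x: the summed value should equal the linear value w . x used by getValue.
def checkBinaryFeatureValue(w, x):
    q_sum = np.sum(w[np.nonzero(x)[0]])  # value as accumulated in the TD-error loop
    q_dot = np.dot(w, x)                 # value as a linear function approximation
    assert np.isclose(q_sum, q_dot), "non-binary features break the summed-weight shortcut"
    return q_dot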
    'minStates': minStates, 'maxStates': maxStates, 'nTilings': nTilings,
    'tilingOffsets': tilingOffsets, 'tilingSize': tilingSize, 'nActions': nActions}

  alpha_expectedSARSA = 0.5/8
  gamma_expectedSARSA = 1.0
  epsilon_expectedSARSA = 0.0
  
  # Behaviour Policy
  epsilon_behaviourPolicy = 0.3
  doUpdateBehaviourPolicy = True
  
  env = MountainCar(positionBounds, velocityBounds, startPositionBounds)
  
  agent_TB = nStepSemiGradientTreeBackup(nParams, nActions, alpha_TB, gamma_TB, n_TB, 
    approximationFunctionArgs=approximationFunctionArgs, epsilon=epsilon_TB)
  behaviour_policy = FunctionApproximationPolicy(nParams, nActions, approximationFunctionArgs, 
    actionSelectionMethod="esoft", epsilon=epsilon_behaviourPolicy)
  nStepsPerEpisode_TB = runExperiment(nEpisodes, env, agent_TB, behaviour_policy, 
    doUpdateBehaviourPolicy, episodesToShowCostToGo, doShowNow=False)

  # Expected SARSA agent, in case you want to experiment with it
  agent_expectedSARSA = SemiGradientExpectedSARSA(nParams, nActions, alpha_expectedSARSA, gamma_expectedSARSA, 
    approximationFunctionArgs=approximationFunctionArgs, epsilon=epsilon_expectedSARSA)
  behaviour_policy = FunctionApproximationPolicy(nParams, nActions, approximationFunctionArgs, 
    actionSelectionMethod="esoft", epsilon=epsilon_behaviourPolicy)
  nStepsPerEpisode_expectedSARSA = runExperiment(nEpisodes, env, agent_expectedSARSA, behaviour_policy, 
    doUpdateBehaviourPolicy, episodesToShowCostToGo, doShowNow=False)
  
  pl.figure()
  pl.plot(nStepsPerEpisode_TB, label=agent_TB.getName())
  pl.plot(nStepsPerEpisode_expectedSARSA, label=agent_expectedSARSA.getName())
  pl.xlabel('Episodes')