Example #1
class nStepTreeBackup(nStepTDControlAgent):
    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step Tree Backup"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy=None):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            if (t + 1) >= T:
                G = self.bufferExperience[T]['reward']
            else:
                last_state = self.bufferExperience[t + 1]['state']
                last_reward = self.bufferExperience[t + 1]['reward']
                G = last_reward + self.gamma * np.dot(
                    self.policy.getProbability(last_state),
                    self.actionValueTable[last_state, :])
            for k in range(min(t, T - 1), tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                probActions = np.array(
                    self.policy.getProbability(sweeping_state))
                probAction = probActions[sweeping_action]
                probActions[sweeping_action] = 0.0
                G = (sweeping_reward
                     + self.gamma * np.dot(probActions,
                                           self.actionValueTable[sweeping_state, :])
                     + self.gamma * probAction * G)
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
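The backward loop in sweepBuffer builds the n-step tree-backup return (Sutton & Barto, Section 7.5): at each step it adds the expected value of the actions not taken and follows only the taken action's branch deeper. Below is a minimal standalone sketch of the same recursion, assuming pi and Q are dense (nStates, nActions) NumPy arrays and rewards[k] is the reward received on entering states[k] (the convention bufferExperience appears to use):

import numpy as np

def tree_backup_return(states, actions, rewards, Q, pi, gamma):
    # Bootstrap with the expected action value of the last state in the window.
    G = rewards[-1] + gamma * np.dot(pi[states[-1]], Q[states[-1]])
    # Sweep backwards over the interior steps of the window.
    for k in range(len(states) - 2, 0, -1):
        probs = pi[states[k]].copy()
        p_taken = probs[actions[k]]
        probs[actions[k]] = 0.0  # expectation over the actions NOT taken
        G = rewards[k] + gamma * np.dot(probs, Q[states[k]]) + gamma * p_taken * G
    return G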
Example #2
class nStepOffPolicySARSA(nStepTDControlAgent):
    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step off-policy SARSA"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            rewards = np.array([
                self.bufferExperience[i]['reward']
                for i in range(tau + 1,
                               min(tau + self.n, t + 1) + 1)
            ])
            gammas = np.array(
                [self.gamma**i for i in range(min(self.n, t + 1 - tau))])
            horizon = min(tau + self.n, t + 1) + 1
            p = [
                self.policy.getProbability(self.bufferExperience[i]['state'],
                                           self.bufferExperience[i]['action'])
                for i in range(tau + 1, horizon)
            ]
            b = [
                behaviour_policy.getProbability(
                    self.bufferExperience[i]['state'],
                    self.bufferExperience[i]['action'])
                for i in range(tau + 1, horizon)
            ]
            W = np.prod(np.array(p) / np.array(b))
            G = np.sum(rewards * gammas)
            if (tau + self.n) <= t + 1:
                next_state = self.bufferExperience[tau + self.n]['state']
                next_action = self.bufferExperience[tau + self.n]['action']
                G += self.gamma**self.n * self.actionValueTable[next_state,
                                                                next_action]
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * W * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
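The correction factor W above is the ordinary importance-sampling ratio over the sampled sub-trajectory; the e-soft behaviour policy keeps every b(a|s) > 0, so the ratio is always well defined. Written out on its own as a hedged sketch, where target_prob and behaviour_prob stand in for the two policies' getProbability methods:

import numpy as np

def importance_ratio(states, actions, target_prob, behaviour_prob):
    # rho = prod_k pi(a_k | s_k) / b(a_k | s_k)
    ratios = [target_prob(s, a) / behaviour_prob(s, a)
              for s, a in zip(states, actions)]
    return float(np.prod(ratios))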
Example #3
class ExpectedSARSA(TDControlAgent):
    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 actionSelectionMethod="esoft",
                 epsilon=0.01,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates, nActions, alpha, gamma, valueInit=valueInit)
        self.name = "Expected SARSA"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=actionSelectionMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def update(self, episode):
        T = len(episode)
        for t in range(0, T - 1):
            state = episode[t]["state"]
            action = episode[t]["action"]
            reward = episode[t + 1]["reward"]
            next_state = episode[t + 1]["state"]
            if ("allowedActions" in episode[t + 1].keys()):
                allowedActions = episode[t + 1]["allowedActions"]
                pdist = Numeric.normalize_sum(
                    self.policy.getProbability(next_state)[allowedActions])
            else:
                allowedActions = np.array(range(self.nActions))
                pdist = self.policy.getProbability(next_state)
            expectedVal = np.dot(
                pdist, self.actionValueTable[next_state, allowedActions])
            td_error = (reward + self.gamma * expectedVal -
                        self.actionValueTable[state, action])
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
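Unlike SARSA, the bootstrap term here averages over the policy instead of sampling the next action, which removes the variance contributed by exploratory action selection. The target, as a standalone sketch with dense (nStates, nActions) arrays pi and Q assumed:

import numpy as np

def expected_sarsa_target(reward, next_state, Q, pi, gamma):
    # E[Q(S', A') | S'] under the current policy, rather than Q(S', A')
    # for the single sampled A' that SARSA would use.
    return reward + gamma * np.dot(pi[next_state], Q[next_state])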
Example #4
class BanditGradient:
    def __init__(self, nStates, nActions, alpha, doUseBaseline=True):
        self.nStates = nStates
        self.nActions = nActions
        self.alpha = alpha
        self.doUseBaseline = doUseBaseline
        self.preferencesTable = np.zeros([self.nStates, self.nActions],
                                         dtype=float) + 0.0001
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod="softmax",
                                       tieBreakingMethod="consistent")
        self.count = 0
        self.avgReward = 0.0

    def update(self, state, action, reward):
        if self.doUseBaseline:
            baseline = self.avgReward
        else:
            baseline = 0.0
        for a in range(self.nActions):
            if (a == action):
                self.preferencesTable[state, a] += self.alpha * (
                    reward - baseline) * (1.0 -
                                          self.policy.getProbability(state, a))
            else:
                self.preferencesTable[state, a] -= self.alpha * (
                    reward - baseline) * self.policy.getProbability(state, a)
        self.policy.update(state, self.preferencesTable)
        self.count += 1
        self.avgReward = self.avgReward + (1.0 / self.count) * (reward -
                                                                self.avgReward)

    def selectAction(self, state):
        return self.policy.sampleAction(state)

    def reset(self):
        self.preferencesTable = np.zeros([self.nStates, self.nActions],
                                         dtype=float) + 0.0001
        self.count = 0
        self.avgReward = 0.0
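A minimal usage sketch for the gradient bandit above, assuming a hypothetical single-state Gaussian bandit; only the BanditGradient API shown here is taken as given:

import numpy as np

rng = np.random.default_rng(0)
true_means = rng.normal(0.0, 1.0, size=4)  # hypothetical 4-armed bandit
agent = BanditGradient(nStates=1, nActions=4, alpha=0.1, doUseBaseline=True)
for _ in range(1000):
    a = agent.selectAction(0)           # softmax over preferences
    r = rng.normal(true_means[a], 1.0)  # noisy reward for the chosen arm
    agent.update(0, a, r)               # gradient ascent on preferences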
Example #5
class nStepPerDecisionTDPrediction(nStepTDPredictionAgent):
    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 valueInit="zeros",
                 policyUpdateMethod="greedy",
                 epsilon=0.0,
                 tieBreakingMethod="consistent"):
        super().__init__(nStates, alpha, gamma, n, valueInit=valueInit)
        self.name = "n-step Per-Decision TD Prediction"
        self.nActions = nActions
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            horizon = min(T + 1, t + 1)
            G = self.valueTable[self.bufferExperience[horizon]['state']]
            for k in range(horizon - 1, tau - 1, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k + 1]['reward']
                p = self.policy.getProbability(sweeping_state, sweeping_action)
                b = behaviour_policy.getProbability(sweeping_state,
                                                    sweeping_action)
                W = p / b
                G = W * (sweeping_reward + self.gamma * G) + (
                    1.0 - W) * self.valueTable[sweeping_state]
            td_error = G - self.valueTable[state]
            self.valueTable[state] += self.alpha * td_error

    def reset(self):
        super().reset()
        self.policy.reset()
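The recursion in sweepBuffer is the per-decision importance-sampling return with a control variate (Sutton & Barto, Section 7.4): when the per-step ratio is small, the target falls back towards the current estimate V(S_k) instead of being scaled towards zero. A standalone sketch, assuming rho[k] holds the per-step ratios pi/b and rewards[k] is the reward received on entering states[k]:

import numpy as np

def per_decision_return(states, rewards, rho, V, gamma):
    G = V[states[-1]]  # bootstrap from the last state in the window
    for k in range(len(states) - 2, -1, -1):
        # Control variate: the (1 - rho) * V term keeps the target
        # unbiased while reducing variance when the policies disagree.
        G = rho[k] * (rewards[k + 1] + gamma * G) + (1.0 - rho[k]) * V[states[k]]
    return G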
Example #6
class MCControlAgent:

  def __init__(self, nStates, nActions, gamma, policyUpdateMethod="greedy", epsilon=0.0, tieBreakingMethod="arbitrary"):
    self.name = "Generic Monte Carlo Control Agent"
    self.nStates = nStates
    self.nActions = nActions
    self.gamma = gamma
    self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
    self.policy = StochasticPolicy(self.nStates, self.nActions, policyUpdateMethod=policyUpdateMethod,
      epsilon=epsilon, tieBreakingMethod=tieBreakingMethod)

  def selectAction(self, state, actionsAvailable=None):
    return self.policy.sampleAction(state, actionsAvailable)
    
  def getGreedyAction(self, state, actionsAvailable=None):
    if actionsAvailable is None:
      actionValues = self.actionValueTable[state,:]
      actionList = np.array(range(self.nActions))
    else:
      actionValues = self.actionValueTable[state, actionsAvailable]
      actionList = np.array(actionsAvailable)
    actionIdx = selectAction_greedy(actionValues)
    return actionList[actionIdx]
    
  def getValue(self, state):
    return np.dot(self.policy.getProbability(state), self.actionValueTable[state,:])
    
  def getActionValue(self, state, action):
    return self.actionValueTable[state,action]

  def getName(self):
    return self.name
    
  def reset(self):
    self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
    self.policy.reset()    
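MCControlAgent itself carries no learning rule; a concrete agent subclasses it and supplies one. A hedged sketch of what such a subclass might look like, using a hypothetical every-visit, constant-step-size MC update and the episode dict convention seen in the examples above:

class ConstantAlphaMCControl(MCControlAgent):

  def __init__(self, nStates, nActions, gamma, alpha, **kwargs):
    super().__init__(nStates, nActions, gamma, **kwargs)
    self.alpha = alpha

  def update(self, episode):
    G = 0.0
    # Sweep the episode backwards, accumulating the discounted return.
    for t in reversed(range(len(episode) - 1)):
      G = self.gamma * G + episode[t + 1]["reward"]
      state, action = episode[t]["state"], episode[t]["action"]
      td = G - self.actionValueTable[state, action]
      self.actionValueTable[state, action] += self.alpha * td
      self.policy.update(state, self.actionValueTable[state, :])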
Example #7
class nStepQSigma(nStepTDControlAgent):
    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 sigma,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step Q-sigma"
        self.sigma = sigma
        self.policy = StochasticPolicy(
            self.nStates,
            self.nActions,
            policyUpdateMethod=policyUpdateMethod,
            epsilon=epsilon,
            tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            if (t + 1) < T:
                G = self.actionValueTable[
                    self.bufferExperience[t + 1]['state'],
                    self.bufferExperience[t + 1]['action']]
            for k in range(min(t + 1, T), tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                if (k == T):
                    G = sweeping_reward
                else:
                    sigma = self.sigma
                    probActions = np.array(
                        self.policy.getProbability(sweeping_state))
                    p = probActions[sweeping_action]
                    b = behaviour_policy.getProbability(
                        sweeping_state, sweeping_action)
                    W = p / b
                    V = np.dot(probActions,
                               self.actionValueTable[sweeping_state, :])
                    G = (sweeping_reward
                         + self.gamma * (sigma * W + (1.0 - sigma) * p)
                         * (G - self.actionValueTable[sweeping_state,
                                                      sweeping_action])
                         + self.gamma * V)
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
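The sigma parameter interpolates between the two off-policy backups above (Sutton & Barto, Section 7.6): sigma = 1 gives a fully importance-sampled, SARSA-like backup, while sigma = 0 recovers tree backup. The per-step recursion, as a standalone sketch with dense pi and Q arrays assumed:

import numpy as np

def q_sigma_step(reward, s, a, G, Q, pi, rho, sigma, gamma):
    # V is the expected action value of s under the target policy.
    V = np.dot(pi[s], Q[s])
    # sigma blends the importance-sampling weight rho with pi(a|s),
    # i.e. between SARSA-style and tree-backup-style corrections.
    weight = sigma * rho + (1.0 - sigma) * pi[s, a]
    return reward + gamma * weight * (G - Q[s, a]) + gamma * V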