# Note: nStepTDPredictionAgent, StochasticPolicy and selectAction_greedy are assumed to
# be defined (or imported) elsewhere in this module.
import numpy as np


class nStepPerDecisionTDPrediction(nStepTDPredictionAgent):

    def __init__(self, nStates, nActions, alpha, gamma, n, valueInit="zeros",
                 policyUpdateMethod="greedy", epsilon=0.0, tieBreakingMethod="consistent"):
        super().__init__(nStates, alpha, gamma, n, valueInit=valueInit)
        self.name = "n-step Per-Decision TD Prediction"
        self.nActions = nActions
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            # Horizon of the n-step return: bootstrap from the state stored at
            # index l = min(T + 1, t + 1) in the experience buffer.
            l = min(T + 1, t + 1)
            G = self.valueTable[self.bufferExperience[l]['state']]
            # Build the per-decision importance-sampling return backwards:
            #   G = rho_k * (R_{k+1} + gamma * G) + (1 - rho_k) * V(S_k),
            # with rho_k = pi(A_k|S_k) / b(A_k|S_k)  (Sutton & Barto, Eq. 7.13).
            for k in range(l - 1, tau - 1, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k + 1]['reward']
                p = self.policy.getProbability(sweeping_state, sweeping_action)
                b = behaviour_policy.getProbability(sweeping_state, sweeping_action)
                W = p / b
                G = W * (sweeping_reward + self.gamma * G) + (1.0 - W) * self.valueTable[sweeping_state]
            # Standard TD update of the state being backed up.
            td_error = G - self.valueTable[state]
            self.valueTable[state] += self.alpha * td_error

    def reset(self):
        super().reset()
        self.policy.reset()
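# Illustration (not part of the agent above): a minimal, self-contained sketch of the
# per-decision importance-sampling return that sweepBuffer unrolls backwards. The
# list-based inputs here are illustrative assumptions; the agent reads the same
# quantities from bufferExperience and its policy objects.
def per_decision_return(rewards, rhos, state_values, bootstrap_value, gamma):
    """Compute G for one horizon, given per-step rewards R_{k+1}, ratios
    rho_k = pi(A_k|S_k)/b(A_k|S_k), predecessor state values V(S_k), and the
    bootstrap value V(S_h)."""
    G = bootstrap_value
    # Walk backwards from the horizon to the updated state, exactly as the
    # inner loop of sweepBuffer does.
    for reward, rho, v in zip(reversed(rewards), reversed(rhos), reversed(state_values)):
        G = rho * (reward + gamma * G) + (1.0 - rho) * v
    return G

# Example: two steps that are fully on-policy (rho = 1) reduce to the ordinary
# two-step return 1.0 + 0.9 * (0.0 + 0.9 * 0.5) = 1.405.
# per_decision_return([1.0, 0.0], [1.0, 1.0], [0.0, 0.0], 0.5, 0.9)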
class MCControlAgent:

    def __init__(self, nStates, nActions, gamma, policyUpdateMethod="greedy",
                 epsilon=0.0, tieBreakingMethod="arbitrary"):
        self.name = "Generic Monte Carlo Control Agent"
        self.nStates = nStates
        self.nActions = nActions
        self.gamma = gamma
        self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def selectAction(self, state, actionsAvailable=None):
        # Sample an action from the (possibly epsilon-soft) policy.
        return self.policy.sampleAction(state, actionsAvailable)

    def getGreedyAction(self, state, actionsAvailable=None):
        if actionsAvailable is None:
            actionValues = self.actionValueTable[state, :]
            actionList = np.array(range(self.nActions))
        else:
            actionValues = self.actionValueTable[state, actionsAvailable]
            actionList = np.array(actionsAvailable)
        actionIdx = selectAction_greedy(actionValues)
        return actionList[actionIdx]

    def getValue(self, state):
        # V(s) = sum_a pi(a|s) * Q(s, a)
        return np.dot(self.policy.getProbability(state), self.actionValueTable[state, :])

    def getActionValue(self, state, action):
        return self.actionValueTable[state, action]

    def getName(self):
        return self.name

    def reset(self):
        # np.float was removed in recent NumPy releases; use the built-in float
        # dtype, matching __init__.
        self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
        self.policy.reset()
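# Illustration (not part of the class above): MCControlAgent only holds the Q-table and
# the policy; a concrete subclass would supply the Monte Carlo update. Below is a
# minimal sketch of such an update against a plain numpy table so it runs on its own.
# The episode format [(state, action, reward), ...] and the constant step size `alpha`
# are assumptions for illustration; sample averaging (1/N step sizes) is equally common.
import numpy as np

def mc_control_episode_update(actionValueTable, episode, gamma, alpha):
    """Every-visit, constant-alpha Monte Carlo update of Q(s, a) from one episode."""
    G = 0.0
    # Sweep the episode backwards, accumulating the discounted return.
    for state, action, reward in reversed(episode):
        G = gamma * G + reward
        actionValueTable[state, action] += alpha * (G - actionValueTable[state, action])
    return actionValueTable

# Example: a 2-state, 2-action table updated from a single 2-step episode.
Q = np.zeros([2, 2], dtype=float)
mc_control_episode_update(Q, [(0, 1, 0.0), (1, 0, 1.0)], gamma=0.9, alpha=0.5)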