class nStepOffPolicySARSA(nStepTDControlAgent):

    def __init__(self, nStates, nActions, alpha, gamma, n,
                 policyUpdateMethod="esoft", epsilon=0.1,
                 tieBreakingMethod="arbitrary", valueInit="zeros"):
        super().__init__(nStates, nActions, alpha, gamma, n, valueInit=valueInit)
        self.name = "n-step off-policy SARSA"
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            # Discounted sum of rewards R_{tau+1} ... R_{min(tau+n, t+1)}
            rewards = np.array([self.bufferExperience[i]['reward']
                                for i in range(tau + 1, min(tau + self.n, t + 1) + 1)])
            gammas = np.array([self.gamma**i for i in range(min(self.n, t + 1 - tau))])
            # Importance sampling ratio over the same slice of the buffer
            l = min(tau + self.n, t + 1) + 1
            p = [self.policy.getProbability(self.bufferExperience[i]['state'],
                                            self.bufferExperience[i]['action'])
                 for i in range(tau + 1, l)]
            b = [behaviour_policy.getProbability(self.bufferExperience[i]['state'],
                                                 self.bufferExperience[i]['action'])
                 for i in range(tau + 1, l)]
            W = np.prod(np.array(p) / np.array(b))
            G = np.sum(rewards * gammas)
            # Bootstrap from Q(S_{tau+n}, A_{tau+n}) if that step is inside the buffer
            if (tau + self.n) <= t + 1:
                G += self.gamma**self.n * self.actionValueTable[
                    self.bufferExperience[tau + self.n]['state'],
                    self.bufferExperience[tau + self.n]['action']]
            td_error = G - self.actionValueTable[state, action]
            # Off-policy correction: scale the update by the importance sampling ratio W
            self.actionValueTable[state, action] += self.alpha * W * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
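# Minimal usage sketch (not part of the original file): how an agent of this class could
# be driven. The constructor, selectAction and the update(experiences, behaviour_policy)
# call are taken from the surrounding code; `env` with reset()/step() and the per-step
# update schedule are assumptions for illustration only.
def _example_nstep_offpolicy_sarsa_usage(env, nEpisodes=100):
    behaviour_policy = StochasticPolicy(env.nStates, env.nActions)  # equiprobable behaviour
    agent = nStepOffPolicySARSA(env.nStates, env.nActions, alpha=0.1, gamma=0.9, n=5)
    for _ in range(nEpisodes):
        state = env.reset()                                # hypothetical env API
        experiences = [{'state': state, 'done': False}]
        done = False
        while not done:
            action = agent.selectAction(state)
            new_state, reward, done = env.step(action)     # hypothetical env API
            experiences[-1]['action'] = action
            experiences.append({'state': new_state, 'reward': reward, 'done': done})
            agent.update(experiences, behaviour_policy)    # buffer grows, agent sweeps it
            state = new_state
    return agent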
class nStepTreeBackup(nStepTDControlAgent):

    def __init__(self, nStates, nActions, alpha, gamma, n,
                 policyUpdateMethod="esoft", epsilon=0.1,
                 tieBreakingMethod="arbitrary", valueInit="zeros"):
        super().__init__(nStates, nActions, alpha, gamma, n, valueInit=valueInit)
        self.name = "n-step Tree Backup"
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy=None):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            # Initialise the target: last reward if the episode has ended, otherwise the
            # expected action value of the last buffered state
            if (t + 1) >= T:
                G = self.bufferExperience[T]['reward']
            else:
                last_state = self.bufferExperience[t + 1]['state']
                last_reward = self.bufferExperience[t + 1]['reward']
                G = last_reward + self.gamma * np.dot(
                    self.policy.getProbability(last_state),
                    self.actionValueTable[last_state, :])
            # Work backwards through the buffer, expanding the tree-backup target:
            # leaf actions are weighted by their policy probability, while the branch
            # of the action actually taken carries the return accumulated so far
            for k in range(min(t, T - 1), tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                probActions = np.array(self.policy.getProbability(sweeping_state))
                probAction = probActions[sweeping_action]
                probActions[sweeping_action] = 0.0
                G = (sweeping_reward
                     + self.gamma * np.dot(probActions,
                                           self.actionValueTable[sweeping_state, :])
                     + self.gamma * probAction * G)
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
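# Note (not part of the original file): with n = 1 the backward sweep above has no
# intermediate states to expand, so the tree-backup target collapses to the one-step
# Expected SARSA target R + gamma * sum_a pi(a|S') * Q(S', a). The sizes and
# hyperparameters below are arbitrary placeholders.
def _example_one_step_tree_backup():
    return nStepTreeBackup(nStates=25, nActions=4, alpha=0.1, gamma=0.9, n=1)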
class ExpectedSARSA(TDControlAgent):

    def __init__(self, nStates, nActions, alpha, gamma,
                 actionSelectionMethod="esoft", epsilon=0.01,
                 tieBreakingMethod="arbitrary", valueInit="zeros"):
        super().__init__(nStates, nActions, alpha, gamma, valueInit=valueInit)
        self.name = "Expected SARSA"
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod="esoft",
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def update(self, episode):
        T = len(episode)
        for t in range(0, T - 1):
            state = episode[t]["state"]
            action = episode[t]["action"]
            reward = episode[t + 1]["reward"]
            next_state = episode[t + 1]["state"]
            # Expectation over next actions; restrict to the allowed actions if provided
            if "allowedActions" in episode[t + 1].keys():
                allowedActions = episode[t + 1]["allowedActions"]
                pdist = Numeric.normalize_sum(
                    self.policy.getProbability(next_state)[allowedActions])
            else:
                allowedActions = np.array(range(self.nActions))
                pdist = self.policy.getProbability(next_state)
            expectedVal = np.dot(pdist, self.actionValueTable[next_state, allowedActions])
            td_error = reward + self.gamma * expectedVal - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
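# Minimal usage sketch (not part of the original file): ExpectedSARSA.update expects a
# complete episode as a list of dicts keyed by 'state', 'action' and 'reward' (with an
# optional 'allowedActions' restriction), exactly as read by the loop above. The toy
# transitions and table sizes below are made-up placeholders.
def _example_expected_sarsa_usage():
    agent = ExpectedSARSA(nStates=25, nActions=4, alpha=0.1, gamma=0.9)
    toy_episode = [
        {'state': 0, 'action': 1},                   # S_0, A_0
        {'state': 5, 'action': 2, 'reward': -1.0},   # R_1, S_1, A_1
        {'state': 6, 'reward': 10.0},                # R_2, S_2 (terminal, no action needed)
    ]
    agent.update(toy_episode)
    return agent.selectAction(0)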
class BanditGradient():

    def __init__(self, nStates, nActions, alpha, doUseBaseline=True):
        self.nStates = nStates
        self.nActions = nActions
        self.alpha = alpha
        self.doUseBaseline = doUseBaseline
        self.preferencesTable = np.zeros([self.nStates, self.nActions], dtype=float) + 0.0001
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod="softmax",
                                       tieBreakingMethod="consistent")
        self.count = 0
        self.avgReward = 0.0

    def update(self, state, action, reward):
        # The running average reward serves as the baseline (if enabled)
        baseline = self.avgReward if self.doUseBaseline else 0.0
        # Gradient bandit update: raise the preference of the taken action and
        # lower the preferences of all other actions
        for a in range(self.nActions):
            if a == action:
                self.preferencesTable[state, a] += self.alpha * (reward - baseline) * (
                    1.0 - self.policy.getProbability(state, a))
            else:
                self.preferencesTable[state, a] -= self.alpha * (
                    reward - baseline) * self.policy.getProbability(state, a)
        self.policy.update(state, self.preferencesTable[state, :])
        self.count += 1
        self.avgReward += (1.0 / self.count) * (reward - self.avgReward)

    def selectAction(self, state):
        return self.policy.sampleAction(state)

    def reset(self):
        self.preferencesTable = np.zeros([self.nStates, self.nActions], dtype=float) + 0.0001
        self.count = 0
        self.avgReward = 0.0
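# Minimal usage sketch (not part of the original file): the gradient bandit is updated
# with one (state, action, reward) triple per pull. The Gaussian arm rewards below are
# a made-up stand-in for an actual bandit environment.
def _example_bandit_gradient_usage(nPulls=1000):
    rng = np.random.default_rng(0)
    bandit = BanditGradient(nStates=1, nActions=5, alpha=0.1, doUseBaseline=True)
    true_means = rng.normal(0.0, 1.0, size=5)         # hypothetical arm means
    for _ in range(nPulls):
        action = bandit.selectAction(0)
        reward = rng.normal(true_means[action], 1.0)  # hypothetical reward draw
        bandit.update(0, action, reward)
    return bandit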
class nStepQSigma(nStepTDControlAgent):

    def __init__(self, nStates, nActions, alpha, gamma, n, sigma,
                 policyUpdateMethod="esoft", epsilon=0.1,
                 tieBreakingMethod="arbitrary", valueInit="zeros"):
        super().__init__(nStates, nActions, alpha, gamma, n, valueInit=valueInit)
        self.name = "n-step Q-sigma"
        self.sigma = sigma
        self.policy = StochasticPolicy(self.nStates, self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)  # TODO

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            # Initialise the target with Q(S_{t+1}, A_{t+1}) if the episode has not ended
            if (t + 1) < T:
                G = self.actionValueTable[self.bufferExperience[t + 1]['state'],
                                          self.bufferExperience[t + 1]['action']]
            # Work backwards through the buffer, blending sampling and expectation per step
            for k in range(t + 1, tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                if k == T:
                    G = sweeping_reward
                else:
                    sigma = self.sigma
                    probActions = np.array(self.policy.getProbability(sweeping_state))
                    p = probActions[sweeping_action]
                    b = behaviour_policy.getProbability(sweeping_state, sweeping_action)
                    W = p / b  # importance sampling ratio for the sampled branch
                    V = np.dot(probActions, self.actionValueTable[sweeping_state, :])
                    G = (sweeping_reward
                         + self.gamma * (sigma * W + (1.0 - sigma) * p)
                         * (G - self.actionValueTable[sweeping_state, sweeping_action])
                         + self.gamma * V)
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        return self.policy.sampleAction(state, actionsAvailable)
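# Note (not part of the original file): sigma interpolates between full sampling and
# full expectation. With sigma = 1 every step is sampled and corrected by the importance
# ratio W (n-step off-policy SARSA-like); with sigma = 0 every step uses the policy
# expectation instead (n-step tree backup). Placeholder sizes and hyperparameters below.
def _example_q_sigma_extremes():
    full_sampling = nStepQSigma(nStates=25, nActions=4, alpha=0.1, gamma=0.9, n=5, sigma=1.0)
    full_expectation = nStepQSigma(nStates=25, nActions=4, alpha=0.1, gamma=0.9, n=5, sigma=0.0)
    return full_sampling, full_expectation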
xp = {}
xp['reward'] = reward
xp['state'] = new_state
xp['allowedActions'] = env.getAvailableActions(new_state)  # TODO check
xp['done'] = done
experiences.append(xp)

state = new_state

agent.update(experiences, behaviour_policy)

if doUpdateBehaviourPolicy:
    # Update the behaviour policy to be an e-soft version of the target policy
    for idx_state in range(env.nStates):
        behaviour_policy.update(idx_state, agent.actionValueTable[idx_state, :])

# Simulation after learning
# -------------------------
env.printEnv(agent)
input("Press any key to continue...")
env.p_actionFail = 0.0
agentHistory = runSimulation(env, agent)
print("Simulation:")
env.render(agentHistory)
thresh_convergence = 1e-30
n = 5
alpha_TDnOP = 0.001
alpha_TDnPD = 0.001

env = DeterministicGridWorld(sizeX, sizeY, defaultReward=defaultReward,
                             terminalStates=terminalStates)

# Behaviour policy is a simple stochastic policy with equiprobable actions
behaviour_policy = StochasticPolicy(env.nStates, env.nActions)

# Load the target policy Q-table.
# We will use the optimal policy learned via VI as the target policy.
# These are the values learned in chapter04/03_GridWorld_2_VI.py
with open('gridworld_2_qtable.npy', 'rb') as f:
    targetPolicy_qTable = np.load(f)
target_policy = StochasticPolicy(env.nStates, env.nActions)
for s in range(env.nStates):
    target_policy.update(s, targetPolicy_qTable[s, :])

# A policy evaluation agent will provide the ground truth
agent_PE = PolicyEvaluation(env.nStates, env.nActions, gamma,
                            thresh_convergence, env.computeExpectedValue)

env.printEnv()

# Policy evaluation for reference
for e in range(nEpisodes):
    deltaMax, isConverged = agent_PE.evaluate(target_policy)
    # print("Episode : ", e, " Delta: ", deltaMax)

printStr = ""
for y in range(sizeY):
    for x in range(sizeX):