Ejemplo n.º 1
0
class nStepOffPolicySARSA(nStepTDControlAgent):
    """n-step off-policy SARSA control agent.

    Learns action values for the target policy ``self.policy`` from
    experience generated by a separate behaviour policy, correcting each
    update with an importance-sampling ratio (Sutton & Barto, 2nd ed.,
    Sec. 7.3).
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step off-policy SARSA"
        # Target policy, re-derived from the action values after each update.
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        """Update Q(s, a) for buffered time steps tau in [tau_start, tau_stop).

        `t` is the latest time index available in ``self.bufferExperience``,
        `T` is the episode's terminal time, and `behaviour_policy` is the
        policy that generated the buffered experience.
        """
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            # Rewards R_{tau+1} ... R_{min(tau+n, t+1)} of the n-step return.
            rewards = np.array([
                self.bufferExperience[i]['reward']
                for i in range(tau + 1,
                               min(tau + self.n, t + 1) + 1)
            ])
            # Matching discounts gamma^0 ... gamma^{len(rewards)-1}
            # (min(n, t+1-tau) equals the number of rewards collected above).
            gammas = np.array(
                [self.gamma**i for i in range(min(self.n, t + 1 - tau))])
            # Exclusive upper index of the importance-sampling window.
            l = min(tau + self.n, t + 1) + 1
            # Target-policy probabilities of the actions actually taken.
            p = [
                self.policy.getProbability(self.bufferExperience[i]['state'],
                                           self.bufferExperience[i]['action'])
                for i in range(tau + 1, l)
            ]
            # Behaviour-policy probabilities of the same actions.
            b = [
                behaviour_policy.getProbability(
                    self.bufferExperience[i]['state'],
                    self.bufferExperience[i]['action'])
                for i in range(tau + 1, l)
            ]
            # Importance-sampling ratio rho_{tau+1 : min(tau+n, t+1)}.
            W = np.prod(np.array(p) / np.array(b))
            G = np.sum(rewards * gammas)
            # Bootstrap with gamma^n * Q(s_{tau+n}, a_{tau+n}) when the
            # episode has not terminated inside the n-step window.
            if (tau + self.n) <= t + 1:
                G += self.gamma**(self.n) * self.actionValueTable[
                    self.bufferExperience[tau + self.n]['state'],
                    self.bufferExperience[tau + self.n]['action']]
            td_error = G - self.actionValueTable[state, action]
            # Importance-weighted TD update of the action value.
            self.actionValueTable[state, action] = self.actionValueTable[
                state, action] + self.alpha * W * td_error
            # Re-derive the target policy at this state from the new values.
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for `state` from the current target policy."""
        return self.policy.sampleAction(state, actionsAvailable)
Ejemplo n.º 2
0
class nStepTreeBackup(nStepTDControlAgent):
    """n-step Tree Backup control agent (Sutton & Barto, 2nd ed., Sec. 7.5).

    Off-policy n-step method that needs no importance sampling: at each
    backup step, untaken actions contribute their current action value
    weighted by the target policy, while the taken action's branch is
    replaced by the return accumulated so far.
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step Tree Backup"
        # Target policy, re-derived from the action values after each update.
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy=None):
        """Update Q(s, a) for buffered time steps tau in [tau_start, tau_stop).

        `behaviour_policy` is accepted for interface compatibility with the
        other n-step agents but is unused: tree backup requires no
        importance-sampling correction.
        """
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            if (t + 1) >= T:
                # Window reaches episode end: seed with the terminal reward.
                G = self.bufferExperience[T]['reward']
            else:
                # Seed with the one-step expected value over all actions
                # at s_{t+1} under the target policy.
                last_state = self.bufferExperience[t + 1]['state']
                last_reward = self.bufferExperience[t + 1]['reward']
                G = last_reward + self.gamma * np.dot(
                    self.policy.getProbability(last_state),
                    self.actionValueTable[last_state, :])
            # Walk the backup tree backwards from k = min(t, T-1) to tau+1.
            for k in range(min(t, T - 1), tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                probActions = np.array(
                    self.policy.getProbability(sweeping_state))
                probAction = probActions[sweeping_action]
                # Zero out the taken action so the expectation below covers
                # only the untaken actions; its branch is carried by G.
                probActions[sweeping_action] = 0.0
                # G = R_k + gamma * sum_{a != A_k} pi(a|s_k) Q(s_k, a)
                #         + gamma * pi(A_k|s_k) * G
                G = sweeping_reward + self.gamma * np.dot(
                    probActions, self.actionValueTable[
                        sweeping_state, :]) + self.gamma * probAction * G
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] = self.actionValueTable[
                state, action] + self.alpha * td_error
            # Re-derive the target policy at this state from the new values.
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for `state` from the current target policy."""
        return self.policy.sampleAction(state, actionsAvailable)
Ejemplo n.º 3
0
class ExpectedSARSA(TDControlAgent):
    """Expected SARSA control agent.

    One-step TD control whose update target uses the expectation of the
    next state's action values under the current (e-soft) policy instead
    of a sampled next action.
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 actionSelectionMethod="esoft",
                 epsilon=0.01,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates, nActions, alpha, gamma, valueInit=valueInit)
        self.name = "Expected SARSA"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod="esoft",
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def update(self, episode):
        """Apply one Expected-SARSA update per transition of `episode`.

        `episode` is a sequence of experience dicts; transition i uses the
        state/action at index i and the reward/state at index i + 1.
        """
        for idx in range(len(episode) - 1):
            s = episode[idx]["state"]
            a = episode[idx]["action"]
            successor = episode[idx + 1]
            r = successor["reward"]
            s_next = successor["state"]
            if "allowedActions" in successor:
                # Restrict the expectation to the allowed actions and
                # renormalize the policy distribution over them.
                actions = successor["allowedActions"]
                dist = Numeric.normalize_sum(
                    self.policy.getProbability(s_next)[actions])
            else:
                actions = np.array(range(self.nActions))
                dist = self.policy.getProbability(s_next)
            # E[Q(s', .)] under the current policy.
            expectation = np.dot(dist, self.actionValueTable[s_next, actions])
            target = r + self.gamma * expectation
            self.actionValueTable[s, a] += self.alpha * (
                target - self.actionValueTable[s, a])
            # Refresh the e-soft policy at the updated state.
            self.policy.update(s, self.actionValueTable[s, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for `state` from the current policy."""
        return self.policy.sampleAction(state, actionsAvailable)
class BanditGradient():
    """Gradient bandit agent with per-state action preferences.

    Implements the gradient-bandit rule (Sutton & Barto, 2nd ed.,
    Sec. 2.8): the taken action's preference moves up and all others move
    down, scaled by (reward - baseline).  The baseline, when enabled, is
    the incremental average of all rewards seen so far.  A softmax policy
    over the preferences selects actions.
    """

    def __init__(self, nStates, nActions, alpha, doUseBaseline=True):
        self.nStates = nStates
        self.nActions = nActions
        self.alpha = alpha  # preference step size
        self.doUseBaseline = doUseBaseline
        self.preferencesTable = self._initialPreferences()
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod="softmax",
                                       tieBreakingMethod="consistent")
        self.count = 0        # number of updates performed so far
        self.avgReward = 0.0  # running mean reward (the baseline)

    def _initialPreferences(self):
        # Tiny non-zero offset keeps the initial preferences away from
        # exact zero ties; shared by __init__ and reset.
        return np.zeros([self.nStates, self.nActions], dtype=float) + 0.0001

    def update(self, state, action, reward):
        """Gradient-bandit update for one (state, action, reward) sample."""
        if self.doUseBaseline:
            baseline = self.avgReward
        else:
            baseline = 0.0
        for a in range(self.nActions):
            pi_a = self.policy.getProbability(state, a)
            if (a == action):
                # Taken action: H += alpha * (R - baseline) * (1 - pi(a|s))
                self.preferencesTable[state, a] += self.alpha * (
                    reward - baseline) * (1.0 - pi_a)
            else:
                # Other actions: H -= alpha * (R - baseline) * pi(a|s)
                self.preferencesTable[state, a] -= self.alpha * (
                    reward - baseline) * pi_a
        # Bug fix: pass this state's preference row, consistent with every
        # other policy.update(state, <row>) call in this file; the original
        # passed the entire table.
        self.policy.update(state, self.preferencesTable[state, :])
        # Incremental mean: avg += (reward - avg) / count.
        self.count += 1
        self.avgReward = self.avgReward + (1.0 / self.count) * (reward -
                                                                self.avgReward)

    def selectAction(self, state):
        """Sample an action from the softmax policy for `state`."""
        return self.policy.sampleAction(state)

    def reset(self):
        """Restore preferences and baseline statistics to initial values."""
        self.preferencesTable = self._initialPreferences()
        self.count = 0
        self.avgReward = 0.0
Ejemplo n.º 5
0
class nStepQSigma(nStepTDControlAgent):
    """n-step Q(sigma) control agent (Sutton & Barto, 2nd ed., Sec. 7.6).

    Unifies n-step SARSA and tree backup: at each backup step the update
    blends a sampled, importance-weighted term (weight sigma) with an
    expected, tree-backup-style term (weight 1 - sigma).  A single fixed
    `sigma` is used for every step.
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 sigma,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step Q-sigma"
        # Degree of sampling per backup step: 1 = full sampling (SARSA-like),
        # 0 = full expectation (tree-backup-like).
        self.sigma = sigma
        # Target policy, re-derived from the action values after each update.
        self.policy = StochasticPolicy(
            self.nStates,
            self.nActions,
            policyUpdateMethod=policyUpdateMethod,
            epsilon=epsilon,
            tieBreakingMethod=tieBreakingMethod)  # TODO

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        """Update Q(s, a) for buffered time steps tau in [tau_start, tau_stop).

        NOTE(review): assumes t + 1 <= T so buffer index t + 1 is valid.
        When t + 1 < T, G is seeded from Q(s_{t+1}, a_{t+1}); otherwise
        the k == T branch in the loop below seeds it with the final reward.
        Verify the caller always caps t accordingly.
        """
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']
            if ((t + 1) < T):
                # Seed the recursion with the bootstrap value at step t+1.
                G = self.actionValueTable[self.bufferExperience[t +
                                                                1]['state'],
                                          self.bufferExperience[t +
                                                                1]['action']]
            # Roll the return backwards from k = t+1 down to tau+1.
            for k in range(t + 1, tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                if (k == T):
                    # Terminal step contributes only its reward.
                    G = sweeping_reward
                else:
                    sigma = self.sigma
                    probActions = np.array(
                        self.policy.getProbability(sweeping_state))
                    # pi(A_k | s_k) under the target policy.
                    p = probActions[sweeping_action]
                    # b(A_k | s_k) under the behaviour policy.
                    b = behaviour_policy.getProbability(
                        sweeping_state, sweeping_action)
                    # One-step importance ratio for the sampled portion.
                    W = p / b
                    # Expected value of s_k under the target policy.
                    V = np.dot(probActions,
                               self.actionValueTable[sweeping_state, :])
                    # G = R_k + gamma*(sigma*W + (1-sigma)*pi(A_k))
                    #         * (G - Q(s_k, A_k)) + gamma*V
                    G = sweeping_reward + self.gamma * (
                        sigma * W +
                        (1.0 - sigma) * p) * (G - self.actionValueTable[
                            sweeping_state, sweeping_action]) + self.gamma * V
            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] = self.actionValueTable[
                state, action] + self.alpha * td_error
            # Re-derive the target policy at this state from the new values.
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for `state` from the current target policy."""
        return self.policy.sampleAction(state, actionsAvailable)
Ejemplo n.º 6
0
            xp = {}
            xp['reward'] = reward
            xp['state'] = new_state
            xp['allowedActions'] = env.getAvailableActions(
                new_state)  # TODO check
            xp['done'] = done
            experiences.append(xp)

            state = new_state

        agent.update(experiences, behaviour_policy)

        if (doUpdateBehaviourPolicy):
            # update behaviour policy to be e-soft version of the target policy
            for idx_state in range(env.nStates):
                behaviour_policy.update(idx_state,
                                        agent.actionValueTable[idx_state, :])

    # Simulation after learning
    # -------------------------
    env.printEnv(agent)

    input("Press any key to continue...")

    env.p_actionFail = 0.0
    agentHistory = runSimulation(env, agent)

    print("Simulation:")

    env.render(agentHistory)
Ejemplo n.º 7
0
  thresh_convergence = 1e-30
  n = 5
  alpha_TDnOP = 0.001
  alpha_TDnPD = 0.001
 
  env = DeterministicGridWorld(sizeX, sizeY, defaultReward=defaultReward, terminalStates=terminalStates)
  # Behaviour policy is a simple stochastic policy with equiprobable actions
  behaviour_policy = StochasticPolicy(env.nStates, env.nActions)
  # Load target policy q table
  # We will use the optimal policy learned via VI as target policy
  # These are the values learned in chapter04/03_GridWorld_2_VI.py
  with open('gridworld_2_qtable.npy', 'rb') as f:
    targetPolicy_qTable = np.load(f)  
  target_policy = StochasticPolicy(env.nStates, env.nActions)
  for s in range(env.nStates):
    target_policy.update(s, targetPolicy_qTable[s,:])
  # A policy evaluation agent will provide the ground truth
  agent_PE = PolicyEvaluation(env.nStates, env.nActions, gamma, thresh_convergence, env.computeExpectedValue)
  
  env.printEnv()
  
  # Policy evaluation for reference
  for e in range(nEpisodes):
      
    deltaMax, isConverged = agent_PE.evaluate(target_policy)
    
    #print("Episode : ", e, " Delta: ", deltaMax)
    
    printStr = ""
    for y in range(sizeY):
      for x in range(sizeX):