Example #1
0
class nStepOffPolicySARSA(nStepTDControlAgent):
    """n-step SARSA control agent that learns off-policy.

    The agent owns the target policy (a ``StochasticPolicy``). The policy that
    actually generated the experience is handed to ``sweepBuffer`` and is only
    used to build importance-sampling ratios.
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        """Build the agent and its target policy.

        Args:
            nStates: Forwarded to the base class.
            nActions: Forwarded to the base class.
            alpha: Step size, forwarded to the base class.
            gamma: Discount factor, forwarded to the base class.
            n: Number of steps of the n-step return, forwarded to the base
                class.
            policyUpdateMethod: Forwarded to ``StochasticPolicy``.
            epsilon: Forwarded to ``StochasticPolicy``.
            tieBreakingMethod: Forwarded to ``StochasticPolicy``.
            valueInit: Forwarded to the base class.
        """
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step off-policy SARSA"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        """Apply the off-policy n-step SARSA update for a range of time steps.

        For every ``tau`` in ``[tau_start, tau_stop)`` the discounted n-step
        return is built from ``self.bufferExperience``, weighted by the
        importance-sampling ratio between ``self.policy`` and
        ``behaviour_policy``, and used to update ``self.actionValueTable``;
        the target policy is then refreshed for the updated state.

        Args:
            tau_start: First time index to update (inclusive).
            tau_stop: Last time index to update (exclusive).
            t: Current time step; ``t + 1`` is the newest buffer entry read.
            T: Time index of the terminal transition.
                NOTE(review): presumably infinite (or very large) while the
                episode is still running -- confirm against the caller.
            behaviour_policy: Policy exposing ``getProbability(state, action)``
                that generated the buffered actions.
        """
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']

            # Rewards R_{tau+1}..R_{last}: never read past the newest buffer
            # entry (t + 1) nor past the terminal index T.
            last = int(min(tau + self.n, t + 1, T))
            rewards = np.array([
                self.bufferExperience[i]['reward']
                for i in range(tau + 1, last + 1)
            ])
            discounts = np.array(
                [self.gamma**i for i in range(last - tau)])
            G = np.sum(rewards * discounts)

            # The importance ratio covers the actions actually taken, i.e. it
            # stops at T - 1: no real action is taken in the terminal state.
            last_action_step = int(min(last, T - 1))
            steps = range(tau + 1, last_action_step + 1)
            target_probs = np.array([
                self.policy.getProbability(self.bufferExperience[i]['state'],
                                           self.bufferExperience[i]['action'])
                for i in steps
            ])
            behaviour_probs = np.array([
                behaviour_policy.getProbability(
                    self.bufferExperience[i]['state'],
                    self.bufferExperience[i]['action'])
                for i in steps
            ])
            # np.prod of an empty array is 1.0, which is the right ratio when
            # no action lies between tau and the end of the episode.
            W = np.prod(target_probs / behaviour_probs)

            # Bootstrap only from a non-terminal step that is already in the
            # buffer; the value of the terminal state must not be added.
            if (tau + self.n) <= t + 1 and (tau + self.n) < T:
                bootstrap = self.bufferExperience[tau + self.n]
                G += self.gamma**(self.n) * self.actionValueTable[
                    bootstrap['state'], bootstrap['action']]

            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * W * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for ``state`` from the target policy."""
        return self.policy.sampleAction(state, actionsAvailable)
Example #2
0
class nStepTreeBackup(nStepTDControlAgent):
    """n-step Tree Backup control agent.

    Returns are built purely from expectations under the agent's own
    ``StochasticPolicy``; no importance-sampling ratio is used.
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        """Build the agent and its policy.

        ``nStates``, ``nActions``, ``alpha``, ``gamma``, ``n`` and
        ``valueInit`` go to the base class; ``policyUpdateMethod``,
        ``epsilon`` and ``tieBreakingMethod`` go to ``StochasticPolicy``.
        """
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step Tree Backup"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod=policyUpdateMethod,
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy=None):
        """Apply the tree-backup update for each ``tau`` in the given range.

        Args:
            tau_start: First time index to update (inclusive).
            tau_stop: Last time index to update (exclusive).
            t: Current time step.
            T: Index of the terminal entry in ``self.bufferExperience``.
            behaviour_policy: Accepted for interface compatibility; not used.
        """
        buffer = self.bufferExperience
        for tau in range(tau_start, tau_stop):
            state = buffer[tau]['state']
            action = buffer[tau]['action']

            # Seed the return at the far end of the backup.
            if (t + 1) < T:
                newest = buffer[t + 1]
                tail_state = newest['state']
                tail_reward = newest['reward']
                G = tail_reward + self.gamma * np.dot(
                    self.policy.getProbability(tail_state),
                    self.actionValueTable[tail_state, :])
            else:
                G = buffer[T]['reward']

            # Walk back towards tau, mixing the sampled branch (weighted by
            # the probability of the action taken) with the expected value of
            # the actions that were not taken.
            for k in range(min(t, T - 1), tau, -1):
                step = buffer[k]
                s_k = step['state']
                a_k = step['action']
                r_k = step['reward']
                pi_k = np.array(self.policy.getProbability(s_k))
                taken_prob = pi_k[a_k]
                pi_k[a_k] = 0.0  # leave only the untaken actions
                G = r_k + self.gamma * np.dot(
                    pi_k, self.actionValueTable[s_k, :]
                ) + self.gamma * taken_prob * G

            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for ``state`` from the agent's policy."""
        return self.policy.sampleAction(state, actionsAvailable)
class BanditGradient():
    """Gradient-bandit agent that learns per-state action preferences.

    Preferences are kept in ``preferencesTable`` and turned into action
    probabilities by a ``StochasticPolicy`` configured with the "softmax"
    update method.
    """

    # Small positive value every preference starts from.
    _INITIAL_PREFERENCE = 0.0001

    def __init__(self, nStates, nActions, alpha, doUseBaseline=True):
        """Create the agent.

        Args:
            nStates: Number of rows of the preference table.
            nActions: Number of columns of the preference table.
            alpha: Step size of the preference update.
            doUseBaseline: When true, the running average reward is
                subtracted from the reward in ``update``; otherwise the
                baseline is 0.
        """
        self.nStates = nStates
        self.nActions = nActions
        self.alpha = alpha
        self.doUseBaseline = doUseBaseline
        self.preferencesTable = np.zeros(
            [self.nStates, self.nActions],
            dtype=float) + self._INITIAL_PREFERENCE
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod="softmax",
                                       tieBreakingMethod="consistent")
        self.count = 0
        self.avgReward = 0.0

    def update(self, state, action, reward):
        """Update preferences, policy and average reward from one sample.

        The preference of the taken action moves by
        ``alpha * (reward - baseline) * (1 - p)`` and every other action by
        ``-alpha * (reward - baseline) * p``, where ``p`` is the action's
        current probability under ``self.policy``.
        """
        baseline = self.avgReward if self.doUseBaseline else 0.0
        advantage = reward - baseline
        for a in range(self.nActions):
            prob = self.policy.getProbability(state, a)
            if a == action:
                self.preferencesTable[state, a] += self.alpha * advantage * (
                    1.0 - prob)
            else:
                self.preferencesTable[state, a] -= self.alpha * advantage * prob
        # Hand the policy only this state's preferences, like every other
        # agent does with its value row; the original passed the whole table.
        self.policy.update(state, self.preferencesTable[state, :])
        self.count += 1
        # Incremental mean of all rewards seen so far.
        self.avgReward += (reward - self.avgReward) / self.count

    def selectAction(self, state):
        """Sample an action for ``state`` from the policy."""
        return self.policy.sampleAction(state)

    def reset(self):
        """Restore initial preferences, reward statistics and policy."""
        self.preferencesTable = np.zeros(
            [self.nStates, self.nActions],
            dtype=float) + self._INITIAL_PREFERENCE
        self.count = 0
        self.avgReward = 0.0
        # Without this the policy would keep probabilities learned before the
        # reset while the preferences they came from are gone.
        self.policy.reset()
Example #4
0
class ExpectedSARSA(TDControlAgent):
    """Expected SARSA control agent backed by an epsilon-soft policy."""

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 actionSelectionMethod="esoft",
                 epsilon=0.01,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        """Build the agent and its policy.

        ``nStates``, ``nActions``, ``alpha``, ``gamma`` and ``valueInit`` go
        to the base class; ``epsilon`` and ``tieBreakingMethod`` go to
        ``StochasticPolicy``.

        NOTE(review): ``actionSelectionMethod`` is accepted but never read;
        the policy is always created with ``policyUpdateMethod="esoft"``.
        """
        super().__init__(nStates, nActions, alpha, gamma, valueInit=valueInit)
        self.name = "Expected SARSA"
        self.policy = StochasticPolicy(self.nStates,
                                       self.nActions,
                                       policyUpdateMethod="esoft",
                                       epsilon=epsilon,
                                       tieBreakingMethod=tieBreakingMethod)

    def update(self, episode):
        """Run one Expected SARSA update per consecutive pair in ``episode``.

        Each entry is a dict; ``state`` and ``action`` are read from the
        current entry, ``reward`` and ``state`` from the following one. When
        the following entry has an ``allowedActions`` key, the expectation is
        taken over those actions only, with their probabilities renormalised.
        """
        for t in range(len(episode) - 1):
            current = episode[t]
            following = episode[t + 1]
            state = current["state"]
            action = current["action"]
            reward = following["reward"]
            next_state = following["state"]

            if "allowedActions" in following:
                allowedActions = following["allowedActions"]
                # Restrict the policy to the allowed actions and renormalise.
                pdist = Numeric.normalize_sum(
                    self.policy.getProbability(next_state)[allowedActions])
            else:
                allowedActions = np.array(range(self.nActions))
                pdist = self.policy.getProbability(next_state)

            next_values = self.actionValueTable[next_state, allowedActions]
            expectedVal = np.dot(pdist, next_values)
            td_error = (reward + self.gamma * expectedVal -
                        self.actionValueTable[state, action])
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for ``state`` from the agent's policy."""
        return self.policy.sampleAction(state, actionsAvailable)
Example #5
0
class MCControlAgent:
  """Base Monte Carlo control agent: an action-value table plus a policy."""

  def __init__(self, nStates, nActions, gamma, policyUpdateMethod="greedy", epsilon=0.0, tieBreakingMethod="arbitrary"):
    """Create a zero-initialised action-value table and the policy.

    ``policyUpdateMethod``, ``epsilon`` and ``tieBreakingMethod`` are
    forwarded to ``StochasticPolicy``; ``gamma`` is stored on the agent.
    """
    self.name = "Generic Monte Carlo Control Agent"
    self.nStates = nStates
    self.nActions = nActions
    self.gamma = gamma
    self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
    self.policy = StochasticPolicy(self.nStates, self.nActions, policyUpdateMethod=policyUpdateMethod,
      epsilon=epsilon, tieBreakingMethod=tieBreakingMethod)

  def selectAction(self, state, actionsAvailable=None):
    """Sample an action for ``state`` from the policy."""
    return self.policy.sampleAction(state, actionsAvailable)

  def getGreedyAction(self, state, actionsAvailable=None):
    """Return the greedy action for ``state``.

    When ``actionsAvailable`` is given, only those actions are considered and
    the returned value is one of them; otherwise all actions are candidates.
    """
    if actionsAvailable is None:
      actionValues = self.actionValueTable[state, :]
      actionList = np.arange(self.nActions)
    else:
      actionValues = self.actionValueTable[state, actionsAvailable]
      actionList = np.array(actionsAvailable)
    # selectAction_greedy returns a position within actionValues; map it back
    # to the actual action id.
    actionIdx = selectAction_greedy(actionValues)
    return actionList[actionIdx]

  def getValue(self, state):
    """Return the policy-weighted average of the action values of ``state``."""
    return np.dot(self.policy.getProbability(state), self.actionValueTable[state, :])

  def getActionValue(self, state, action):
    """Return the stored value of taking ``action`` in ``state``."""
    return self.actionValueTable[state, action]

  def getName(self):
    """Return the agent's display name."""
    return self.name

  def reset(self):
    """Zero the action-value table and reset the policy."""
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # yields the same float64 table and matches ``__init__``.
    self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
    self.policy.reset()
        if (isConverged):
            print("Convergence achieved!")
            break

    # TD(n) prediction vs averaging TD errors
    values_sumTDerrors = np.zeros(env.nStates)

    for e in range(nEpisodes):

        if (e % int(nEpisodes * 0.1) == 0):
            print("Episode : ", e)

        experiences = [{}]
        state = env.reset()
        action = policy.sampleAction(state)
        done = False
        while not done:

            experiences[-1]['state'] = state
            experiences[-1]['action'] = action
            experiences[-1]['done'] = done

            new_state, reward, done = env.step(action)

            #print("Episode : ", e, " State : ", state, " Action : ", action, " Reward : ", reward, " Next state : ", new_state)

            new_action = policy.sampleAction(state)

            xp = {}
            xp['state'] = new_state
Example #7
0
class nStepQSigma(nStepTDControlAgent):
    """n-step Q(sigma) control agent.

    ``sigma`` blends two weights for the sampled branch of each backup step:
    the importance ratio between the agent's policy and the behaviour policy
    (``sigma = 1``) and the agent's own probability of the taken action
    (``sigma = 0``).
    """

    def __init__(self,
                 nStates,
                 nActions,
                 alpha,
                 gamma,
                 n,
                 sigma,
                 policyUpdateMethod="esoft",
                 epsilon=0.1,
                 tieBreakingMethod="arbitrary",
                 valueInit="zeros"):
        """Build the agent and its target policy.

        ``nStates``, ``nActions``, ``alpha``, ``gamma``, ``n`` and
        ``valueInit`` go to the base class; ``policyUpdateMethod``,
        ``epsilon`` and ``tieBreakingMethod`` go to ``StochasticPolicy``;
        ``sigma`` is stored and used for every step in ``sweepBuffer``.
        """
        super().__init__(nStates,
                         nActions,
                         alpha,
                         gamma,
                         n,
                         valueInit=valueInit)
        self.name = "n-step Q-sigma"
        self.sigma = sigma
        self.policy = StochasticPolicy(
            self.nStates,
            self.nActions,
            policyUpdateMethod=policyUpdateMethod,
            epsilon=epsilon,
            tieBreakingMethod=tieBreakingMethod)  # TODO

    def sweepBuffer(self, tau_start, tau_stop, t, T, behaviour_policy):
        """Apply the Q(sigma) update for each ``tau`` in the given range.

        Args:
            tau_start: First time index to update (inclusive).
            tau_stop: Last time index to update (exclusive).
            t: Current time step; ``t + 1`` is the newest buffer entry read.
            T: Index of the terminal entry in ``self.bufferExperience``; the
                step with ``k == T`` contributes only its reward.
            behaviour_policy: Policy exposing ``getProbability(state, action)``
                that generated the buffered actions.
        """
        for tau in range(tau_start, tau_stop):
            state = self.bufferExperience[tau]['state']
            action = self.bufferExperience[tau]['action']

            # Start every tau from a defined return so nothing leaks over
            # from the previous iteration.
            G = 0.0
            if (t + 1) < T:
                newest = self.bufferExperience[t + 1]
                G = self.actionValueTable[newest['state'], newest['action']]

            # Never sweep past the terminal entry: start at min(t + 1, T).
            k_start = t + 1 if (t + 1) < T else int(T)
            for k in range(k_start, tau, -1):
                sweeping_state = self.bufferExperience[k]['state']
                sweeping_action = self.bufferExperience[k]['action']
                sweeping_reward = self.bufferExperience[k]['reward']
                if k == T:
                    # Terminal step: the return is just the final reward.
                    G = sweeping_reward
                else:
                    sigma = self.sigma
                    probActions = np.array(
                        self.policy.getProbability(sweeping_state))
                    p = probActions[sweeping_action]
                    b = behaviour_policy.getProbability(
                        sweeping_state, sweeping_action)
                    W = p / b  # importance-sampling ratio for this step
                    # Expected action value of the state under the policy.
                    V = np.dot(probActions,
                               self.actionValueTable[sweeping_state, :])
                    G = sweeping_reward + self.gamma * (
                        sigma * W +
                        (1.0 - sigma) * p) * (G - self.actionValueTable[
                            sweeping_state, sweeping_action]) + self.gamma * V

            td_error = G - self.actionValueTable[state, action]
            self.actionValueTable[state, action] += self.alpha * td_error
            self.policy.update(state, self.actionValueTable[state, :])

    def selectAction(self, state, actionsAvailable=None):
        """Sample an action for ``state`` from the agent's policy."""
        return self.policy.sampleAction(state, actionsAvailable)
Example #8
0
    behaviour_policy = StochasticPolicy(env.nStates,
                                        env.nActions,
                                        policyUpdateMethod="esoft",
                                        epsilon=epsilon)

    for e in range(nEpochs):

        if (e % 1000 == 0):
            print("Epoch : ", e)

        experiences = [{}]
        state = env.reset()
        done = False
        while not done:

            action = behaviour_policy.sampleAction(state,
                                                   env.getAvailableActions())

            experiences[-1]['state'] = state
            experiences[-1]['action'] = action
            experiences[-1]['done'] = done
            experiences[-1]['allowedActions'] = env.getAvailableActions(
                state)  # TODO check

            new_state, reward, done = env.step(action)

            xp = {}
            xp['reward'] = reward
            xp['state'] = new_state
            xp['allowedActions'] = env.getAvailableActions(
                new_state)  # TODO check
            xp['done'] = done
	policy = StochasticPolicy(env.nStates, env.nActions)
	agent = MonteCarloPrediction(env.nStates, gamma, doUseAllVisits=False)
	
	#env.printEnv()
	
	for e in range(nEpisodes):
	
		if(e%1000==0):
			print("Episode : ", e)
			
		experiences = [{}]
		state = env.reset()
		done = False	
		while not done:
		
			action = policy.sampleAction(state, env.getAvailableActions())
			
			experiences[-1]['state'] = state
			experiences[-1]['action'] = action
			experiences[-1]['done'] = done
			
			new_state, reward, done = env.step(action)

			#print("Episode : ", e, " State : ", state, " Action : ", action, " Reward : ", reward, " Next state : ", new_state)
			
			xp = {}
			xp['reward'] = reward
			xp['state'] = new_state
			xp['done'] = done
			experiences.append(xp)
			
Example #10
0
                    actionProb[env.ACTION_STICK] = 1.0
                    agent.policy.update(idx_state, actionProb)

    #env.printEnv()

    for e in range(nEpisodes):

        if (e % 10000 == 0):
            print("Episode : ", e)

        experiences = [{}]
        state = env.reset()
        done = False
        while not done:

            action = policy_behaviour.sampleAction(state)

            experiences[-1]['state'] = state
            experiences[-1]['action'] = action
            experiences[-1]['done'] = done

            new_state, reward, done = env.step(action)

            #print("Episode : ", e, " State : ", state, " Action : ", action, " Reward : ", reward, " Next state : ", new_state)

            xp = {}
            xp['reward'] = reward
            xp['state'] = new_state
            xp['done'] = done
            experiences.append(xp)
Example #11
0
                    actionProb[env.ACTION_STICK] = 1.0
                    agent.policy.update(idx_state, actionProb)

    #env.printEnv()

    for e in range(nEpisodes):

        if (e % 10000 == 0):
            print("Episode : ", e)

        experiences = [{}]
        state = env.reset()
        done = False
        while not done:

            action = behaviour_policy.sampleAction(state)

            experiences[-1]['state'] = state
            experiences[-1]['action'] = action
            experiences[-1]['done'] = done

            new_state, reward, done = env.step(action)

            #print("Episode : ", e, " State : ", state, " Action : ", action, " Reward : ", reward, " Next state : ", new_state)

            xp = {}
            xp['reward'] = reward
            xp['state'] = new_state
            xp['done'] = done
            experiences.append(xp)