Example #4
from q_learning import QLearning
from SARSA import SARSALearning
from eligibility_traces import EligibilityTraces
from function_approximation import FApprox
from mountain_cart import run_methods, self_iterate
import pickle

if __name__ == "__main__":
    # Initialize the learning methods to compare
    methods = [
        QLearning("MountainCar-v0", print_progress=False),
        SARSALearning("MountainCar-v0", print_progress=False),
        FApprox("MountainCar-v0", print_progress=False),
        EligibilityTraces("MountainCar-v0", print_progress=False)
    ]

    # Run the tests
    run_methods(methods)

    method = methods[0]
    with open("Best_Method_" + str(type(method).__name__) + ".p", "rb") as f:
        method.q_table = pickle.load(f)
    method.evaluate()
    method.display()

    self_iterate(methods[0])
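
The script above assumes a pickled Q-table named "Best_Method_<ClassName>.p" already exists on disk. A minimal sketch of how such a file could be written, assuming each method exposes a q_table attribute as the loading code implies (the helper name below is hypothetical), might look like:

import pickle

def save_best_q_table(method):
    # persist a trained method's Q-table using the same file-naming
    # convention that the loading code above expects
    filename = "Best_Method_" + type(method).__name__ + ".p"
    with open(filename, "wb") as f:
        pickle.dump(method.q_table, f)
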
import numpy as np  # used for np.clip below

# EligibilityTraces, ValueLearningAlgorithm and MAX_FEATURE_WEIGHT_VALUE are
# assumed to come from the surrounding project and are not defined in this snippet.
class SARSALambdaLearningAlgorithm(ValueLearningAlgorithm):
    """
    :description: Class implementing the SARSA(lambda) algorithm. This class
        is equivalent to the SARSALearningAlgorithm class when the trace
        decay (lambda) is set to 0; it is kept separate because tracking
        eligibility traces adds overhead and because a standalone class makes
        the difference between the two algorithms easier to see.
    """
    def __init__(self, actions, discount, featureExtractor, explorationProb,
                 stepSize, threshold, decay, maxGradient,
                 num_consecutive_random_actions):
        """
        :note: please see parent class for params not described here
        """
        super(SARSALambdaLearningAlgorithm,
              self).__init__(actions, discount, featureExtractor,
                             explorationProb, stepSize, maxGradient,
                             num_consecutive_random_actions)
        self.eligibility_traces = EligibilityTraces(threshold, decay)

    def incorporateFeedback(self, state, action, reward, newState):
        """
        :description: performs a SARSA update

        :type state: dictionary
        :param state: the state of the game

        :type action: int
        :param action: the action for which to retrieve the Q-value

        :type reward: float
        :param reward: reward associated with being in newState

        :type newState: dictionary
        :param newState: the new state of the game

        :type rval: int or None
        :param rval: the next action to take (selected on-policy), or None if
            newState is terminal
        """
        stepSize = self.stepSize
        prediction = self.getQ(state, action)
        self.eligibility_traces.update_all()
        target = reward
        newAction = None
        for f, v in self.featureExtractor(state, action):
            # accumulate the feature value into this feature's trace
            # (v is typically 1 for binary/indicator features)
            self.eligibility_traces[f] += v
        if newState is not None:
            # SARSA differs from Q-learning in that it selects the next action
            # with its own exploration policy rather than taking the max over
            # actions, and it returns that action so the main training loop
            # can use it on the next iteration
            newAction = self.getAction(newState)
            target += self.discount * self.getQ(newState, newAction)

        update = stepSize * (prediction - target)
        update = np.clip(update, -self.maxGradient, self.maxGradient)

        for f, e in self.eligibility_traces.items():
            # apply the clipped update to each feature, scaled by its trace
            self.weights[f] -= update * e
            assert (self.weights[f] < MAX_FEATURE_WEIGHT_VALUE)
        # return newAction to denote that this is an on-policy algorithm
        return newAction
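
The EligibilityTraces container used by the SARSA(lambda) classes in this listing is not shown. Based on how it is used above (constructed with a threshold and a decay factor, decayed with update_all(), indexed like a dict, and iterated over as feature/trace pairs), a minimal sketch might look like the following; the prune-below-threshold behaviour is an assumption. Example #9 further down uses a different, project-specific variant of the class.

class EligibilityTraces(object):
    """Minimal sketch of the trace container assumed above (not the original code)."""

    def __init__(self, threshold, decay):
        self.threshold = threshold
        self.decay = decay
        self.traces = {}

    def __getitem__(self, feature):
        # unseen features start with a zero trace
        return self.traces.get(feature, 0.0)

    def __setitem__(self, feature, value):
        self.traces[feature] = value

    def update_all(self):
        # assumed behaviour: decay every trace and drop the negligible ones
        for feature in list(self.traces):
            self.traces[feature] *= self.decay
            if self.traces[feature] < self.threshold:
                del self.traces[feature]

    def items(self):
        return self.traces.items()
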
Example #7
class SARSALambdaLearningAlgorithm(ValueLearningAlgorithm):
    """
    :description: Class implementing the SARSA Lambda algorithm. This
        class is equivalent to the SARSALearningAlgorithm class when
        self.lambda is set to 0; however, we keep it separate here
        because it imposes an overhead of tracking eligibility
        traces and because it is nice to see the difference between
        the two clearly.
    """
    def __init__(self, actions, featureExtractor, discount, explorationProb,
                 stepSize, decay, threshold):
        super(SARSALambdaLearningAlgorithm,
              self).__init__(actions, featureExtractor, discount,
                             explorationProb, stepSize)
        self.threshold = threshold
        self.decay = decay
        self.eligibility_traces = EligibilityTraces(threshold, decay)
        self.name = "SARSALambda"
        self.maxFeatVectorNorm = 1
        self.firstReward = 0
        self.sawFirst = False

    def startEpisode(self, state):
        self.resetTraces()
        self.featureExtractor.extractFeatures(state)

    def resetTraces(self):
        self.eligibility_traces = EligibilityTraces(self.threshold, self.decay)

    def incorporateFeedback(self,
                            state,
                            action,
                            reward,
                            newState,
                            prediction=None,
                            target=None):
        """
        :description: performs a SARSA update

        :type state: dictionary
        :param state: the state of the game

        :type action: int
        :param action: the action for which to retrieve the Q-value

        :type reward: float
        :param reward: reward associated with being in newState

        :type newState: dictionary
        :param newState: the new state of the game

        :type rval: int or None
        :param rval: the next action to take (selected on-policy), or None if
            no new action was chosen
        """
        self.eligibility_traces.update_all()
        for f in self.featureExtractor.features:
            self.eligibility_traces[(f, action)] = 1

        if prediction is None:
            prediction = self.getQ(action)

        if reward != 0 and not self.sawFirst:
            self.sawFirst = True
            self.firstReward = abs(float(reward))

        scaledReward = reward
        if self.sawFirst:
            scaledReward = reward / self.firstReward

        newAction = None

        if target is None:
            target = scaledReward
            if newState is not None:
                # extract features of the new state
                self.featureExtractor.extractFeatures(newState)
                # SARSA differs from Q-learning in that it selects the next
                # action with its own exploration policy rather than taking
                # the max over actions, and it returns that action so the
                # main training loop can use it on the next iteration
                newAction = self.getAction()
                target += self.discount * self.getQ(newAction)

        if len(self.featureExtractor.features) > self.maxFeatVectorNorm:
            self.maxFeatVectorNorm = len(self.featureExtractor.features)

        update = self.stepSize / self.maxFeatVectorNorm * (prediction - target)
        for f, e in self.eligibility_traces.items():
            self.weights[f] -= update * e

        return newAction
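
The comments in incorporateFeedback note that the selected action is returned so the training loop can reuse it. A sketch of such an on-policy episode loop for the class above is shown below; the environment object and its reset()/step() methods are assumptions, not part of the original example.

def run_episode(agent, env):
    # hypothetical env API: reset() -> state, step(action) -> (state, reward, done)
    state = env.reset()
    agent.startEpisode(state)
    action = agent.getAction()
    while True:
        newState, reward, done = env.step(action)
        # incorporateFeedback performs the SARSA(lambda) update and returns
        # the next on-policy action (None once the episode is over)
        newAction = agent.incorporateFeedback(state, action, reward,
                                              None if done else newState)
        if done:
            break
        state, action = newState, newAction
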
Example #9
class Strategy:
    def __init__(self, γ, α, λ, ε, ε_decay, actions):
        self.γ = γ
        self.α = α
        self.λ = λ
        self.ε = ε
        self.ε_decay = ε_decay
        self.actions = actions
        self.eligibility_traces = None
        self.q_values = QValues(actions)
        self.scores = []  # TODO
        self.episode = 0
        self.episode_reward = 0
        self.episode_reward_total = 0  # TODO

    def new_episode(self):
        self.eligibility_traces = EligibilityTraces(1 - self.γ * self.λ)
        self.ε *= self.ε_decay
        self.episode += 1
        self.episode_reward = 0

    def next_action(self, state, ε=None):
        return self.q_values.get_greedy_action(state,
                                               self.ε if ε is None else ε)

    def update(self, state_before, action, reward, state_after):
        expected_reward = self.q_values.get_expected_reward(
            state_before, action)
        next_action = self.q_values.get_greedy_action(state_after, self.ε)
        next_expected_reward = self.q_values.get_expected_reward(
            state_after, next_action)

        # TD error: reward + γ * Q(s', a') - Q(s, a)
        td_error = reward + self.γ * next_expected_reward - expected_reward

        self.eligibility_traces.increment(state_before, action)
        self.q_values.ensure_exists(state_before, action)

        def update_q_values(state, action):
            old_expected_reward = self.q_values.get_expected_reward(
                state, action)
            new_expected_reward = old_expected_reward + self.α * td_error * self.eligibility_traces.get(
                state, action)
            self.q_values.set_expected_reward(state, action,
                                              new_expected_reward)
            self.eligibility_traces.decay(state, action)

        self.q_values.for_each(update_q_values)
        self.episode_reward += reward

    def load(self, values):
        self.q_values.set_all_values(values['q'])
        self.ε = values['ε']
        self.scores = values['scores']
        self.episode = values['episode']

    def dump(self):
        return {
            'q': self.q_values.get_all_values(),
            'ε': self.ε,
            'scores': self.scores,
            'episode': self.episode
        }
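
A Strategy instance like the one above is typically driven by an episode loop. The sketch below is illustrative only: the environment object and its reset()/step() methods are assumptions and are not part of the original example.

def train(strategy, env, episodes=100):
    # hypothetical env API: reset() -> state, step(action) -> (state, reward, done)
    for _ in range(episodes):
        strategy.new_episode()
        state = env.reset()
        done = False
        while not done:
            action = strategy.next_action(state)
            next_state, reward, done = env.step(action)
            # update() computes the TD error, bumps the trace for (state, action)
            # and applies the eligibility-weighted update to all Q-values
            strategy.update(state, action, reward, next_state)
            state = next_state
    return strategy.dump()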