Code Example #1
    def __init__(self, *args, **kwargs):
        # Create the agent info
        self.agentInfo = \
            mmlf.framework.protocol.AgentInfo(# Which communication protocol 
                                                 # version can the agent handle?
                                                 versionNumber = "0.3",
                                                 # Name of the agent (can be
                                                 # chosen arbitrarily)
                                                 agentName= "Actor Critic",
                                                 # Can the agent be used in
                                                 # environment with continuous
                                                 # state spaces?
                                                 continuousState = False,
                                                 # Can the agent be used in
                                                 # environment with continuous
                                                 # action spaces?
                                                 continuousAction = False,
                                                 # Can the agent be used in
                                                 # environment with discrete
                                                 # action spaces?
                                                 discreteAction = True,
                                                 # Can the agent be used in
                                                 # non-episodic environments
                                                 nonEpisodicCapable = True)

        # Calls constructor of base class
        # After this call, the agent has an attribute "self.configDict",
        # that contains the information from config['configDict'].
        super(ActorCriticAgent, self).__init__(*args, **kwargs)

        # An observable that can be used to monitor an agent's preferences
        self.preferencesObservable = \
            StateActionValuesObservable(title='%s Preferences' % self.__class__.__name__)
Code Example #2
File: td_lambda_agent.py  Project: mekruthi/mmlf
    def __init__(self, *args, **kwargs):

        if self.__class__ == TDLambdaAgent:
            # Create the agent info
            self.agentInfo = mmlf.framework.protocol.AgentInfo(
                                versionNumber = "0.3",
                                agentName = "TD(lambda)",
                                continuousState = True,
                                continuousAction = True,
                                discreteAction = True,
                                nonEpisodicCapable = True)

        # Call the constructor of the super class
        super(TDLambdaAgent, self).__init__(*args, **kwargs)

        if not 'update_rule' in self.configDict:
            self.configDict['update_rule'] = "SARSA"
        elif self.configDict['update_rule'] == "WatkinsQ" and self.configDict['lambda'] > 0.0:
            self.agentLog.warning("Using the update rule 'WatkinsQ' with lambda>0. "
                                  "MMLF uses neither WatkinsQ(lambda) nor "
                                  "PengQ(lambda) but the 'naive' Q(lambda) "
                                  "(compare Sutton/Barto, chapter 7.6)")

        # Create eligibility traces for all possible actions
        minTraceValue = self.configDict['minTraceValue']
        self.eligibilityTrace = EligibilityTrace(minTraceValue = minTraceValue)

        # An observable that can be used to monitor the agent's eligibility traces
        self.eligibilityTraceObservable = \
            StateActionValuesObservable(title='%s (eligibility trace)'
                                              % self.__class__.__name__)
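
The 'naive' Q(lambda) variant that the warning above refers to (Sutton & Barto, chapter 7.6) combines the off-policy Q-learning target with ordinary trace decay: every traced state-action pair is updated with the same TD error, and the traces are decayed without being cut after exploratory actions (Watkins's Q(lambda) would reset them to zero there). As a reference, the backup can be written as

.. math::

    \delta_t = r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t)

    Q(s, a) \leftarrow Q(s, a) + \alpha \, \delta_t \, e(s, a) \quad \text{for all } (s, a)

    e(s, a) \leftarrow \gamma \lambda \, e(s, a)
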
Code Example #3
File: fitted_r_max_agent.py  Project: mekruthi/mmlf
    def __init__(self, *args, **kwargs):

        # Create the agent info
        self.agentInfo = mmlf.framework.protocol.AgentInfo(
            versionNumber="0.3",
            agentName="Fitted R-Max",
            continuousState=True,
            continuousAction=False,
            discreteAction=True,
            nonEpisodicCapable=True)

        # Calls constructor of base class
        super(FittedRMaxAgent, self).__init__(*args, **kwargs)

        self.functionApproximator = None

        self.userDirObj.createPath(['model'],
                                   refName='modelDir',
                                   baseRef='agentlogs',
                                   force=True)

        # An observable that stores the exploration value
        self.explorationValueObservable = \
            FloatStreamObservable(title='%s Exploration Value' % self.__class__.__name__,
                                  time_dimension_name='Step',
                                  value_name='Exploration Value')

        # An observable that can be used to monitor an agent's greedy policy
        self.policyObservable = \
            FunctionOverStateSpaceObservable(title='%s (greedy policy)'
                                                     % self.__class__.__name__,
                                             discreteValues=True)
        # An observable that can be used to monitor an agent's optimal value function
        self.optimalValueFunctionObservable = \
            FunctionOverStateSpaceObservable(title='%s (optimal value function)'
                                                    % self.__class__.__name__,
                                                 discreteValues=False)
        # An observable that can be used to monitor an agent's Q-Function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s Q Function'
                                                   % self.__class__.__name__)
        # An observable that can be used to monitor the expected reward
        self.expectedRewardObservable = \
            StateActionValuesObservable(title='%s Reward Expectation'
                                                   % self.__class__.__name__)
Code Example #4
File: td_agent.py  Project: mekruthi/mmlf
    def __init__(self, *args, **kwargs):

        super(TDAgent, self).__init__(*args, **kwargs)

        self.epsilon = self.configDict.get('epsilon', 0.0)
        self.epsilonDecay = self.configDict.get('epsilonDecay', 1.0)

        # An observable that can be used to monitor an agent's greedy policy
        self.policyObservable = \
            FunctionOverStateSpaceObservable(title='%s (greedy policy)' % self.__class__.__name__,
                                             discreteValues=True)
        # An observable that can be used to monitor an agent's optimal value function
        self.optimalValueFunctionObservable = \
            FunctionOverStateSpaceObservable(title='%s (optimal value function)' % self.__class__.__name__,
                                                 discreteValues=False)
        # An observable that can be used to monitor an agent's Q-Function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s (value function)' % self.__class__.__name__)
Code Example #5
File: monte_carlo_agent.py  Project: mekruthi/mmlf
    def __init__(self, *args, **kwargs):

        self.agentInfo = mmlf.framework.protocol.AgentInfo(
            versionNumber="0.3",
            agentName="Monte Carlo",
            continuousState=False,
            continuousAction=False,
            discreteAction=True,
            nonEpisodicCapable=False)

        # Calls constructor of base class
        super(MonteCarloAgent, self).__init__(*args, **kwargs)

        self.hasContinuousActionSpace = False

        self.episode = {'states': [], 'actions': [], 'rewards': []}

        self.samples = defaultdict(lambda: 0)
        self.qvalues = defaultdict(lambda: self.configDict['defaultQ'])

        # An observable that can be used to monitor an agent's Q-Function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s Q Function' % self.__class__.__name__)
Code Example #6
File: td_agent.py  Project: mekruthi/mmlf
class TDAgent(AgentBase):
    """
    A base agent for all kind of agents based on temporal difference learning.
    Most of these agents can reuse most methods of this agents and 
    have to modify only small parts
    
    Note: The TDAgent cannot be instantiated by itself, it is a abstract base class!
    """
    def __init__(self, *args, **kwargs):

        super(TDAgent, self).__init__(*args, **kwargs)

        self.epsilon = self.configDict.get('epsilon', 0.0)
        self.epsilonDecay = self.configDict.get('epsilonDecay', 1.0)

        # An observable that can be used to monitor an agent's greedy policy
        self.policyObservable = \
            FunctionOverStateSpaceObservable(title='%s (greedy policy)' % self.__class__.__name__,
                                             discreteValues=True)
        # An observable that can be used to monitor an agent's optimal value function
        self.optimalValueFunctionObservable = \
            FunctionOverStateSpaceObservable(title='%s (optimal value function)' % self.__class__.__name__,
                                                 discreteValues=False)
        # An observable that can be used to monitor an agent's Q-Function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s (value function)' % self.__class__.__name__)

    ######################  BEGIN COMMAND-HANDLING METHODS ###############################

    def setStateSpace(self, stateSpace):
        """ Informs the agent about the state space of the environment
        
        More information about state spaces can be found in 
        :ref:`state_and_action_spaces`
        """
        super(TDAgent, self).setStateSpace(stateSpace)

        # Determine the resolution used for continuous state space dimensions
        # (if any exist):
        for dimName, dimDescr in self.stateSpace.iteritems():
            if dimDescr['dimensionType'] == "continuous":  # Set the resolution
                if isinstance(self.configDict['stateDimensionResolution'],
                              dict):
                    # If we have separate resolution for each dimension
                    assert ("stateDimensionResolution" in self.configDict and
                            dimName in self.configDict['stateDimensionResolution']), \
                        "State dimension %s has no specified resolution." % dimName
                    self.stateSpace[dimName]["resolution"] \
                            = self.configDict['stateDimensionResolution'][dimName]
                else:  # If we have the same value for all dimensions:
                    self.stateSpace[dimName]["resolution"] \
                            = self.configDict['stateDimensionResolution']

    def setActionSpace(self, actionSpace):
        """ Informs the agent about the action space of the environment
        
        More information about action spaces can be found in 
        :ref:`state_and_action_spaces`
        """
        if actionSpace.hasContinuousDimensions():
            assert ("actionDimensionResolution" in self.configDict), \
                " Continuous action spaces require that an action dimension resolution be defined."
            discreteActionsPerDimension = self.configDict[
                'actionDimensionResolution']
            self.actionSpace = actionSpace.discretizedActionSpace(
                discreteActionsPerDimension)
        else:
            self.actionSpace = actionSpace

        # Get a list of all actions this agent might take
        self.actions = self.actionSpace.getActionList()

        self.agentLog.info("%s got new action-space: %s" %
                           (self.__class__.__name__, self.actionSpace))

        # Since state and action space are now known, the learner can be initialized
        self._initialize()

    def setState(self, state):
        """ Informs the agent of the environment's current state 
        
        More information about (valid) states can be found in 
        :ref:`state_and_action_spaces`
        """
        super(TDAgent, self).setState(state)

    def giveReward(self, reward):
        """ Provides a reward to the agent """
        self.reward += reward

    def getAction(self):
        """ Request the next action the agent want to execute """
        # Choose the action, based on the current state
        self.action = self._chooseAction()

        # Create an action dictionary
        # that maps action dimension to chosen action
        actionDictionary = dict()
        for index, actionName in enumerate(self.actionSpace.iterkeys()):
            actionDictionary[actionName] = self.action[index]

        # Train the agent
        # NOTE: This has to happen in this method since a method like SARSA
        #       needs to know the action taken in the successor state to
        #       update the previous state
        self._train(terminalState=False)

        # Update all observables
        self._updateObservables()

        super(TDAgent, self).getAction()

        return self._generateActionObject(actionDictionary)

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started."""
        # Train the agent a last time for this episode
        self._train(terminalState=True)

        # Update all observables
        self._updateObservables()

        # Decay epsilon
        self.epsilon *= self.epsilonDecay

        super(TDAgent, self).nextEpisodeStarted()

    ######################  End COMMAND-HANDLING METHODS ###############################

    def getGreedyPolicy(self):
        """ Returns the optimal greedy policy the agent has found so far """
        return ValueFunctionPolicy(self.tdLearner.functionApproximator,
                                   self.tdLearner.actions)

    def _initialize(self):
        """ Initializes learner as soon as state and action space are known """
        # Create function approximator based on the configuration
        # one now may set a policy in advance, that shall be re-cycled during learning
        # as for transfer learning techniques
        if not hasattr(self, "functionApproximator") \
                or self.functionApproximator is None:
            self.functionApproximator = \
                FunctionApproximator.create(self.configDict["function_approximator"],
                                        self.stateSpace, self.actions)

        # The default feature computation is just to scale the state
        featureFct = lambda state: state

        #Create the main TD learner object which is responsible for learning
        self.tdLearner = TD_Learner(self.actions, self.functionApproximator,
                                    featureFct, self.configDict)

    def _chooseAction(self):
        """
        Choose action to perform for the given state based on the Q-value of the
        state, action pairs
        """
        # Apply epsilon-greedy action selection
        if random.random() < self.epsilon:
            #Choose random action
            action = random.choice(self.actions)
        else:
            # Compute the action with the optimal q-value
            action = self.functionApproximator.computeOptimalAction(self.state)

        return action

    def _updateObservables(self):
        super(TDAgent, self)._updateObservables()

        # Update policy observable
        self.policyObservable.updateFunction(
            lambda state: self.functionApproximator.computeOptimalAction(state
                                                                         )[0])

        # Update optimal value function observable
        self.optimalValueFunctionObservable.updateFunction(
            lambda state: self.functionApproximator.computeV(state))

        # Update Q-function observable
        valueAccessFunction = lambda state, action : \
                    self.functionApproximator.computeQ(state, action)
        self.stateActionValuesObservable.updateValues(valueAccessFunction,
                                                      self.actions)
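
The epsilon-greedy scheme in TDAgent._chooseAction and the per-episode decay in nextEpisodeStarted are easy to reproduce in isolation. The following standalone sketch mirrors that logic with a plain dict in place of the function approximator; the names (choose_action, q) are illustrative and not part of the MMLF API.

import random

def choose_action(q_values, state, actions, epsilon):
    """Pick a uniformly random action with probability epsilon, else the greedy one."""
    if random.random() < epsilon:
        return random.choice(actions)
    # Greedy choice: the action with the highest stored Q-value for this state
    return max(actions, key=lambda a: q_values.get((state, a), 0.0))

epsilon, epsilon_decay = 0.1, 0.99    # cf. configDict['epsilon'] and ['epsilonDecay']
q = {(("s0",), "left"): 0.2, (("s0",), "right"): 0.5}

for episode in range(3):
    action = choose_action(q, ("s0",), ["left", "right"], epsilon)
    epsilon *= epsilon_decay          # decayed once per episode, as in nextEpisodeStarted()
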
Code Example #7
File: monte_carlo_agent.py  Project: mekruthi/mmlf
class MonteCarloAgent(AgentBase):
    """ Agent that learns based on monte-carlo samples of the Q-function

    An agent which uses Monte Carlo policy evaluation to optimize its behavior
    in a given environment.
    
    **CONFIG DICT** 
        :gamma: : The discount factor for computing the return given the rewards
        :epsilon: : Exploration rate. The probability that an action is chosen non-greedily, i.e. uniformly at random among all available actions
        :visit: : Whether first ("first") or every visit ("every") is used in Monte-Carlo updates
        :defaultQ: : The initially assumed Q-value for each state-action pair. Allows controlling initial exploration via optimistic initialization
    """

    DEFAULT_CONFIG_DICT = {
        'gamma': 1.0,
        'epsilon': 0.1,
        'visit': "first",
        'defaultQ': 0.0
    }

    def __init__(self, *args, **kwargs):

        self.agentInfo = mmlf.framework.protocol.AgentInfo(
            versionNumber="0.3",
            agentName="Monte Carlo",
            continuousState=False,
            continuousAction=False,
            discreteAction=True,
            nonEpisodicCapable=False)

        # Calls constructor of base class
        super(MonteCarloAgent, self).__init__(*args, **kwargs)

        self.hasContinuousActionSpace = False

        self.episode = {'states': [], 'actions': [], 'rewards': []}

        self.samples = defaultdict(lambda: 0)
        self.qvalues = defaultdict(lambda: self.configDict['defaultQ'])

        # An observable that can be used to monitor an agent's Q-Function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s Q Function' % self.__class__.__name__)

    ######################  BEGIN COMMAND-HANDLING METHODS ###############################

    def setActionSpace(self, actionSpace):
        """ Informs the agent about the action space of the environment
        
        More information about action spaces can be found in 
        :ref:`state_and_action_spaces`
        """
        ##TODO: Extend to more than one action dimension?
        if len(actionSpace.keys()) > 1:
            raise UserWarning("Error: Currently, only one action dimension is "
                              "possible in Monte-Carlo!")

        super(MonteCarloAgent, self).setActionSpace(actionSpace)

        self.actions = copy.copy(self.actionSpace["action"]["dimensionValues"])

    def setState(self, state):
        """ Informs the agent of the environment's current state 
        
        More information about (valid) states can be found in 
        :ref:`state_and_action_spaces`
        """
        super(MonteCarloAgent, self).setState(state)

        self.episode['states'].append(self.state)

    def giveReward(self, reward):
        """ Provides a reward to the agent """
        self.episode['rewards'].append(reward)

    def getAction(self):
        """ Request the next action the agent want to execute """
        #Choose the action, based on the current state
        actions = copy.copy(self.actions)
        random.shuffle(actions)

        exploration = random.random() < self.configDict['epsilon']
        if exploration:
            maxAction = random.choice(actions)
        else:
            maxValue, foo, maxAction = max((
                self.qvalues[(self.state, action)],
                random.random(),  # break ties randomly!
                action) for action in actions)

        self.episode['actions'].append(maxAction)

        actionDictionary = {'action': maxAction}

        super(MonteCarloAgent, self).getAction()

        return self._generateActionObject(actionDictionary)

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started."""

        visited_state_actions = set()

        trajectory = zip(self.episode['states'], self.episode['actions'])

        for step, state_action in enumerate(trajectory):
            if self.configDict[
                    'visit'] == "first" and state_action in visited_state_actions:
                continue

            visited_state_actions.add(state_action)

            discReturn = sum(
                map(lambda x: self.configDict['gamma']**x[0] * x[1],
                    enumerate(self.episode['rewards'][step:])))

            self.qvalues[state_action] = float(self.qvalues[state_action] * self.samples[state_action] + discReturn) \
                                                / (self.samples[state_action] + 1)

            self.samples[state_action] = self.samples[state_action] + 1

        self.episode = {'states': [], 'actions': [], 'rewards': []}

        # Update Q-function observable
        def valueAccessFunction(state, action):
            if isinstance(action, tuple):
                action = action[0]
            return self.qvalues[(state, action)]

        self.stateActionValuesObservable.updateValues(valueAccessFunction,
                                                      self.actions)

        super(MonteCarloAgent, self).nextEpisodeStarted()
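
The update in nextEpisodeStarted above is first-visit Monte Carlo: for the first occurrence of each (state, action) pair in an episode, the discounted return from that step onward is folded into a running mean of the Q-value. A standalone sketch of that computation follows; plain dicts stand in for the agent's defaultdicts, and the episode data is made up for illustration.

gamma = 0.9
states  = [("s0",), ("s1",), ("s0",)]
actions = ["a", "b", "a"]
rewards = [0.0, 1.0, 10.0]

qvalues, samples = {}, {}
visited = set()
for step, state_action in enumerate(zip(states, actions)):
    if state_action in visited:      # first-visit: skip repeated occurrences
        continue
    visited.add(state_action)
    # Discounted return obtained from this step until the end of the episode
    discReturn = sum(gamma ** k * r for k, r in enumerate(rewards[step:]))
    n = samples.get(state_action, 0)
    # Incremental mean over all returns observed so far for this pair
    qvalues[state_action] = (qvalues.get(state_action, 0.0) * n + discReturn) / (n + 1)
    samples[state_action] = n + 1
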
Code Example #8
File: fitted_r_max_agent.py  Project: mekruthi/mmlf
class FittedRMaxAgent(AgentBase):
    """ Fitted R-Max agent
    
    Fitted R-Max is a model-based RL algorithm that uses the RMax heuristic for
    exploration control, uses a fitted function approximator (even though this can
    be configured differently), and uses Dynamic Programming (boosted by prioritized
    sweeping) for deriving a value function from the model. Fitted R-Max is
    usually very sample-efficient (meaning that a good policy is learned with
    only a few interactions with the environment) but requires a large amount
    of computational resources.
    
    .. seealso::
        Nicholas K. Jong and Peter Stone,
        "Model-based function approximation in reinforcement learning",
        in "Proceedings of the 6th International Joint Conference on Autonomous Agents and Multiagent Systems" 
        Honolulu, Hawaii: ACM, 2007, 1-8, http://portal.acm.org/citation.cfm?id=1329125.1329242.
    
    **CONFIG DICT**
        :gamma: : The discount factor for computing the return given the rewards
        :min_exploration_value: : The agent explores in a state until the given exploration value (approximately the number of exploratory actions in the proximity of a state-action pair) is reached for all actions
        :RMax: : An upper bound on the achievable return an agent can obtain in a single episode
        :planner: : The algorithm used for planning, i.e. for optimizing the policy based on a learned model
        :model: : The algorithm used for learning a model of the environment
        :function_approximator: : The function approximator used for representing the Q value function
        :actionDimensionResolution: : By default, the agent discretizes a continuous action space into this number of discrete actions
    """

    DEFAULT_CONFIG_DICT = {
        'gamma': 0.99,
        'min_exploration_value': 1.0,
        'RMax': 0.0,
        'planner': {
            'name': "PrioritizedSweeping",
            'updatesPerStep': 1000,
            'minSweepDelta': 0.1
        },
        'model': {
            'name': 'KNNModel',
            'k': 100,
            'b_Sa': 0.03,
            'exampleSetSize': 2500
        },
        'function_approximator': {
            'name': 'KNN',
            'k': 20,
            'b_X': 0.01
        },
        'actionDimensionResolution': 9
    }

    def __init__(self, *args, **kwargs):

        # Create the agent info
        self.agentInfo = mmlf.framework.protocol.AgentInfo(
            versionNumber="0.3",
            agentName="Fitted R-Max",
            continuousState=True,
            continuousAction=False,
            discreteAction=True,
            nonEpisodicCapable=True)

        # Calls constructor of base class
        super(FittedRMaxAgent, self).__init__(*args, **kwargs)

        self.functionApproximator = None

        self.userDirObj.createPath(['model'],
                                   refName='modelDir',
                                   baseRef='agentlogs',
                                   force=True)

        # An observable that stores the exploration value
        self.explorationValueObservable = \
            FloatStreamObservable(title='%s Exploration Value' % self.__class__.__name__,
                                  time_dimension_name='Step',
                                  value_name='Exploration Value')

        # An observable that can be used to monitor an agent's greedy policy
        self.policyObservable = \
            FunctionOverStateSpaceObservable(title='%s (greedy policy)'
                                                     % self.__class__.__name__,
                                             discreteValues=True)
        # An observable that can be used to monitor an agent's optimal value function
        self.optimalValueFunctionObservable = \
            FunctionOverStateSpaceObservable(title='%s (optimal value function)'
                                                    % self.__class__.__name__,
                                                 discreteValues=False)
        # An observable that can be used to monitor an agent's Q-Function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s Q Function'
                                                   % self.__class__.__name__)
        # An observable that can be used to monitor the expected reward
        self.expectedRewardObservable = \
            StateActionValuesObservable(title='%s Reward Expectation'
                                                   % self.__class__.__name__)

    ######################  BEGIN COMMAND-HANDLING METHODS ###############################
    def setActionSpace(self, actionSpace):
        """ Informs the agent about the action space of the environment
        
        More information about action spaces can be found in 
        :ref:`state_and_action_spaces`
        """
        if actionSpace.hasContinuousDimensions():
            assert ("actionDimensionResolution" in self.configDict), \
                """ Continuous action spaces require that an action dimension resolution be defined. """
            discreteActionsPerDimension = self.configDict[
                'actionDimensionResolution']
            self.actionSpace = actionSpace.discretizedActionSpace(
                discreteActionsPerDimension)
        else:
            self.actionSpace = actionSpace

        # Get a list of all actions this agent might take
        self.actions = self.actionSpace.getActionList()

        self.agentLog.info("%s got new action-space: %s" %
                           (self.__class__.__name__, self.actionSpace))

        # Since state and action space are now known, the learner can be initialized
        self._initialize()

    def setState(self, state):
        """ Informs the agent of the environment's current state 
        
        More information about (valid) states can be found in 
        :ref:`state_and_action_spaces`
        """
        super(FittedRMaxAgent, self).setState(state)

    def giveReward(self, reward):
        """ Provides a reward to the agent """
        self.reward = reward

    def getAction(self):
        """ Request the next action the agent want to execute """
        if self.lastState is not None:
            # Inform the model about the outcome of the last action
            self.model.addExperience(self.lastState, self.lastAction,
                                     self.state, self.reward)
            # Planning
            self._updatePolicy()


#            if self.stepCounter % 100 == 0:
#                import cProfile;
#                cProfile.runctx("self._updatePolicy()", globals(), locals(),
#                                "profiling_data_%s.dat" % self.stepCounter)
        else:
            # Inform the model about a new start state
            self.model.addStartState(self.state)

        if self.functionApproximator is not None:
            # Compute the action with the optimal q-value
            self.action = self.functionApproximator.computeOptimalAction(
                self.state)
        else:
            # Randomly choose an action
            self.action = random.choice(self.actions)

        # Create an action dictionary that maps
        # action dimension to chosen action
        actionDictionary = dict()
        for index, actionName in enumerate(self.actionSpace.iterkeys()):
            actionDictionary[actionName] = self.action[index]

        # Update explorationValueObservable
        self.explorationValueObservable.addValue(
            self.stepCounter,
            self.model.getExplorationValue(self.state, self.action))

        super(FittedRMaxAgent, self).getAction()

        self.agentLog.debug("Episode %s Step: %s State: %s Action: %s" %
                            (self.episodeCounter, self.stepCounter,
                             self.lastState, self.lastAction))

        return self._generateActionObject(actionDictionary)

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started."""
        # If the agent has actually reached a terminal state
        if self.state is not None:
            # Inform the model about the outcome of the last action
            self.model.addExperience(self.lastState, self.lastAction,
                                     self.state, self.reward)
            # We have reached a terminal state
            self.model.addTerminalState(self.state)
            # Update policy
            self._updatePolicy()

        super(FittedRMaxAgent, self).nextEpisodeStarted()

    def getGreedyPolicy(self):
        """ Returns the optimal greedy policy the agent has found so far """
        return ValueFunctionPolicy(self.functionApproximator, self.actions)

    def _initialize(self):
        """ Lazy initialization of the agent once state and action space are known """
        # Data structure to store value function of discrete MDP during planning
        self.tabularStorage = TabularStorage(
            stateSpace=None,
            actions=self.actions,
            **self.configDict["function_approximator"])

        assert  self.configDict["planner"]['name'] == "PrioritizedSweeping", \
                 "Fitted-RMax works currently only with Prioritized Sweeping " \
                 " planner"
        # The object that performs planning
        self.planner = Planner.create(self.configDict["planner"],
                                      self.stateSpace, self.tabularStorage,
                                      self.configDict['gamma'], self.actions)

        # Choose action model class based on conf
        self.model = Model.create(self.configDict["model"], self,
                                  self.stateSpace, self.actionSpace,
                                  self.userDirObj)

        # TODO: Just a hack, should be refactored soon
        if "plotting" in self.configDict:
            self.configDict["plotting"]["modelRasterPoints"] = \
                [self.configDict["plotting"]["modelRasterPoints"]
                    for i in range(self.stateSpace.getNumberOfDimensions())]

    def _updatePolicy(self):
        """ Update policy using prioritized sweeping based on the internal model
        
        Construct a discrete, finite MDP based on the state transitions and 
        reward expectations learned by the models.
        For state-action pairs that have not been explored sufficiently, 
        be optimistic. i.e. assume that these states have a value that is equal 
        to the maximal achievable reward (R-Max).
        """

        # The states of the constructed MDP
        states = self.model.getStates()
        states.extend([("s_term", ), ("s_opt", )])
        self.planner.setStates(states)

        # Generate the discrete, RMax-optimistic version of the MDP to be
        # solved. This MDP is derived from the learned model.
        discreteRMaxMDP = DiscreteRMaxMDP(
            self.model, self.configDict["RMax"],
            self.configDict["min_exploration_value"],
            self.stateSpace.hasContinuousDimensions())

        # Perform prioritized sweeping to compute optimal value function
        # for the MDP specified by the model starting from the MDP state "nnState"
        try:
            nnState = self.model.getNearestNeighbor(self.lastState)
            self.planner.plan(
                state=nnState,
                action=self.lastAction,
                sampleStartState=None,
                sampleSuccessorState=None,
                stateTransitionFct=lambda state, action: discreteRMaxMDP.
                stateTransitionFct(state, action),
                invStateTransitionFct=lambda state, action: discreteRMaxMDP.
                invStateTransitionFct(state, action),
                rewardFct=lambda state, action: discreteRMaxMDP.rewardFct(
                    state, action),
                isTerminalState=None)
        except (PlanningFailedException, ModelNotInitialized):
            self.agentLog.info("Planning failed!")

        # Get the computed qValues and remove the artificial states "s_term" and
        # "s_opt"
        qValues = self.planner.functionApproximator.getPlainValues()
        for action in self.actions:
            qValues.pop((("s_term", ), action), None)
            qValues.pop((("s_opt", ), action), None)

        # Create a function approximator that generalizes the compute q-values
        # to a value function over the whole continuous state space.
        self.configDict["function_approximator"]["learning_rate"] = 1.0
        functionApproximator = \
            FunctionApproximator.create(self.configDict["function_approximator"],
                                        self.stateSpace, self.actions)

        # Use the RMax function approximator wrapper to ensure that
        # underexplored states have value RMax
        self.functionApproximator = \
                 RMaxFunctionApproximatorWrapper(functionApproximator,
                                                 self.stateSpace,
                                                 self.actions,
                                                 self.model,
                                                 self.configDict["RMax"],
                                                 self.configDict["min_exploration_value"])

        self.functionApproximator.train(qValues)

        # Update observables
        self._updateObservables(discreteRMaxMDP)

    def _updateObservables(self, discreteRMaxMDP):
        super(FittedRMaxAgent, self)._updateObservables()

        # Update policy observable
        self.policyObservable.updateFunction(
            lambda state: self.functionApproximator.computeOptimalAction(state
                                                                         )[0])

        # Update optimal value function observable
        self.optimalValueFunctionObservable.updateFunction(
            lambda state: self.functionApproximator.computeV(state))

        # Update Q-function observable
        valueAccessFunction = lambda state, action : \
                    self.functionApproximator.computeQ(state, action)
        self.stateActionValuesObservable.updateValues(valueAccessFunction,
                                                      self.actions)

        # Update reward expectation observable
        valueAccessFunction = lambda state, action : \
                    discreteRMaxMDP.rewardFct(state, action)
        self.expectedRewardObservable.updateValues(valueAccessFunction,
                                                   self.actions)
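
The R-Max optimism enforced by RMaxFunctionApproximatorWrapper above can be summarized in a few lines: any state-action pair whose exploration value is still below min_exploration_value is assumed to be worth RMax, while sufficiently explored pairs use the learned Q-value. The function below is an illustrative stand-in under that assumption, not the MMLF implementation.

def rmax_q(state, action, learned_q, exploration_value,
           r_max=1.0, min_exploration_value=1.0):
    """Optimistic Q-value under the R-Max heuristic."""
    if exploration_value(state, action) < min_exploration_value:
        return r_max                     # underexplored -> assume the best possible return
    return learned_q(state, action)      # sufficiently explored -> trust the learned model

# An underexplored pair keeps the optimistic value, so the planner keeps
# steering the agent towards it until min_exploration_value is reached.
q_value = rmax_q(("s0",), "a",
                 learned_q=lambda s, a: 0.3,
                 exploration_value=lambda s, a: 0.2)   # barely explored -> q_value == r_max
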
Code Example #9
class ActorCriticAgent(TDLambdaAgent):
    """ Agent that learns based on the actor-critic architecture.

    This agent learns based on the actor-critic architecture.
    It uses standard TD(lambda) to learn the value function of
    the critic. For this reason, it subclasses TDLambdaAgent. The main
    difference to TD(lambda) is the means for action selection. Instead of
    deriving an epsilon-greedy policy from its Q-function, it learns an 
    explicit stochastic policy. To this end, it maintains preferences for each
    action in each state. These preferences are updated after each action
    execution according to the following rule:
    
    .. math::

        p(s,a) = p(s,a) + \delta, 
        
    where delta is the TD error

    .. math::
    
       \delta = r + \gamma V(s') - V(s)

    Action selection is based on a Gibbs softmax distribution:

    .. math::
    
       \pi(s,a) = \\frac{exp(\\tau^{-1}p(s,a))}{\sum_{b \in A} exp(\\tau^{-1}p(s,b))}

    where tau is a temperature parameter.
    
    Note that even though preferences are stored in a function approximator 
    such that in principle, action preferences could be generalized over the
    state space, continuous state spaces are not yet supported.
    
    .. versionadded:: 0.9.9
       Added Actor-Critic agent
    
    **CONFIG DICT** 
        :gamma: : The discount factor for computing the return given the rewards
        :lambda: : The eligibility trace decay rate
        :tau: : Temperature parameter used in the Gibbs softmax distribution for action selection
        :minTraceValue: : The minimum value of an entry in a trace that is considered to be relevant. If the eligibility falls below this value, it is set to 0 and the entry is thus no longer updated
        :update_rule: : Whether the learning is on-policy or off-policy. Can be either "SARSA" (on-policy) or "WatkinsQ" (off-policy)
        :stateDimensionResolution: : The default "resolution" the agent uses for every state dimension. Can be either an int (same resolution for each dimension) or a dict mapping dimension name to its resolution.
        :actionDimensionResolution: : By default, the agent discretizes a continuous action space into this number of discrete actions.
        :function_approximator: : The function approximator used for representing the Q value function
        :preferences_approximator: : The function approximator used for representing the action preferences (i.e. the policy)
    """

    DEFAULT_CONFIG_DICT = {
        'gamma': 0.0,
        'lambda': 0.9,
        'tau': 0.2,
        'minTraceValue': 0.5,
        'stateDimensionResolution': 5,
        'actionDimensionResolution': 7,
        'update_rule': "SARSA",
        'function_approximator': {
            'name': 'TabularStorage',
            'learning_rate': 0.1,
            'default': 0.0
        },
        'preferences_approximator': {
            'name': 'TabularStorage',
            'learning_rate': 1.0,
            'default': 0.0
        }
    }

    def __init__(self, *args, **kwargs):
        # Create the agent info
        self.agentInfo = \
            mmlf.framework.protocol.AgentInfo(# Which communication protocol 
                                                 # version can the agent handle?
                                                 versionNumber = "0.3",
                                                 # Name of the agent (can be
                                                 # chosen arbitrarily)
                                                 agentName= "Actor Critic",
                                                 # Can the agent be used in
                                                 # environment with continuous
                                                 # state spaces?
                                                 continuousState = False,
                                                 # Can the agent be used in
                                                 # environment with continuous
                                                 # action spaces?
                                                 continuousAction = False,
                                                 # Can the agent be used in
                                                 # environment with discrete
                                                 # action spaces?
                                                 discreteAction = True,
                                                 # Can the agent be used in
                                                 # non-episodic environments
                                                 nonEpisodicCapable = True)

        # Calls constructor of base class
        # After this call, the agent has an attribute "self.configDict",
        # that contains the information from config['configDict'].
        super(ActorCriticAgent, self).__init__(*args, **kwargs)

        # An observable that can be used to monitor an agent's preferences
        self.preferencesObservable = \
            StateActionValuesObservable(title='%s Preferences' % self.__class__.__name__)

    ######################  BEGIN COMMAND-HANDLING METHODS ###############################

    def getAction(self):
        """ Request the next action the agent want to execute """
        # We can now modify the preferences for the action taken in the last step
        if self.lastState is not None:
            self._updatePreferences()

        return super(ActorCriticAgent, self).getAction()

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started."""
        # We can now modify the preferences for the action taken in the last step
        if self.lastState is not None:
            self._updatePreferences()

        return super(ActorCriticAgent, self).nextEpisodeStarted()

    ########################  END COMMAND-HANDLING METHODS ###############################

    def _initialize(self):
        """ Initializes learner as soon as state and action space are known """
        # The actor learns preferences for actions in certain states
        # Create function approximator for this preference function
        # based on the configuration
        self.preferencesApproximator = \
            FunctionApproximator.create(self.configDict["preferences_approximator"],
                                        self.stateSpace, self.actions)

        super(ActorCriticAgent, self)._initialize()

    def _updatePreferences(self):
        """ Update the preferences of the agent for the last chosen action """
        # Compute the critic (in form of a TD error)
        tdError = self.reward + self.configDict['gamma'] * self.functionApproximator.computeV(self.state)  \
                        - self.functionApproximator.computeV(self.lastState)

        # Updates actor's preferences accordingly
        target = self.preferencesApproximator.computeQ(
            self.lastState, self.lastAction) + tdError
        trainDict = {(self.lastState, self.lastAction): target}
        self.preferencesApproximator.train(trainDict)

        # Update Q-function observable
        valueAccessFunction = lambda state, action : \
                    self.preferencesApproximator.computeQ(state, action)
        self.preferencesObservable.updateValues(valueAccessFunction,
                                                self.actions)

    def _chooseAction(self):
        "Chooses an action from the action space"
        preferences = [
            self.preferencesApproximator.computeQ(self.state, action)
            for action in self.actions
        ]
        probabilityMassFunction = computeProbabilityMasses(
            preferences, self.configDict['tau'])

        randValue = random.random()
        accumulator = 0.0
        for index, probabilityMass in enumerate(probabilityMassFunction):
            accumulator += probabilityMass
            if accumulator >= randValue:
                return self.actions[index]

        # For rare cases where the probability masses do not sum to 1 due to
        # numerical imprecision
        return self.actions[-1]
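
The helper computeProbabilityMasses used in _chooseAction above is not part of these excerpts. A plausible stand-in (an assumption about its behavior, not MMLF's actual code) is a plain Gibbs/Boltzmann softmax over the preferences with temperature tau, matching the formula from the class docstring:

import math

def compute_probability_masses(preferences, tau):
    """Return softmax probabilities exp(p/tau) / sum_b exp(p_b/tau)."""
    # Subtracting the maximum before exponentiating keeps the computation
    # numerically stable without changing the resulting distribution.
    max_pref = max(preferences)
    exps = [math.exp((p - max_pref) / tau) for p in preferences]
    total = sum(exps)
    return [e / total for e in exps]

# With preferences [1.0, 0.0] and tau = 0.2 the first action receives roughly
# 99% of the probability mass, i.e. lower temperatures act more greedily.
print(compute_probability_masses([1.0, 0.0], tau=0.2))
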
Code Example #10
File: td_lambda_agent.py  Project: mekruthi/mmlf
class TDLambdaAgent(TDAgent):
    """ Agent that implements TD(lambda) RL
    
    An agent that uses temporal difference learning (e.g. SARSA)
    with eligibility traces and function approximation (e.g. linear tile
    coding / CMAC) to optimize its behavior in a given environment.

    **CONFIG DICT** 
        :update_rule: : Whether the learning is on-policy or off-policy. Can be either "SARSA" (on-policy) or "WatkinsQ" (off-policy)
        :gamma: : The discount factor for computing the return given the rewards
        :epsilon: : Exploration rate. The probability that an action is chosen non-greedily, i.e. uniformly at random among all available actions
        :epsilonDecay: : Decay factor for the exploration rate. The exploration rate is multiplied by this value after each episode.
        :lambda: : The eligibility trace decay rate
        :minTraceValue: : The minimum value of an entry in a trace that is considered to be relevant. If the eligibility falls below this value, it is set to 0 and the entry is thus no longer updated
        :replacingTraces: : Whether replacing or accumulating traces are used.
        :stateDimensionResolution: : The default "resolution" the agent uses for every state dimension. Can be either an int (same resolution for each dimension) or a dict mapping dimension name to its resolution.
        :actionDimensionResolution: : By default, the agent discretizes a continuous action space into this number of discrete actions
        :function_approximator: : The function approximator used for representing the Q value function     
     """
    
    DEFAULT_CONFIG_DICT = {'update_rule' : "SARSA",
                           'gamma' : 0.9,
                           'epsilon' : 0.1,
                           'epsilonDecay' : 1.0,
                           'lambda' : 0.9,
                           'minTraceValue' : 0.1,
                           'replacingTraces' : True,
                           'stateDimensionResolution' : 5,
                           'actionDimensionResolution' : 7,
                           'function_approximator' :  {'name' : 'TabularStorage',
                                                       'learning_rate' : 1.0,
                                                       'default' : 0.0}}
    
    def __init__(self, *args, **kwargs):
        
        if self.__class__ == TDLambdaAgent:
            # Create the agent info
            self.agentInfo = mmlf.framework.protocol.AgentInfo(
                                versionNumber = "0.3",
                                agentName = "TD(lambda)",
                                continuousState = True,
                                continuousAction = True,
                                discreteAction = True,
                                nonEpisodicCapable = True)
        
        # Call the constructor of the super class
        super(TDLambdaAgent, self).__init__(*args, **kwargs)
        
        if not 'update_rule' in self.configDict:
            self.configDict['update_rule'] = "SARSA"        
        elif self.configDict['update_rule'] == "WatkinsQ" and self.configDict['lambda'] > 0.0:
            self.agentLog.warning("Using the update rule 'WatkinsQ' with lambda>0. "
                                  "MMLF uses neither WatkinsQ(lambda) nor "
                                  "PengQ(lambda) but the 'naive' Q(lambda) "
                                  "(compare Sutton/Barto, chapter 7.6)")
        
        # Create eligibility traces for all possible actions
        minTraceValue = self.configDict['minTraceValue']
        self.eligibilityTrace = EligibilityTrace(minTraceValue = minTraceValue)
                    
        # An observable that can be used to monitor the agent's eligibility traces
        self.eligibilityTraceObservable = \
            StateActionValuesObservable(title='%s (eligibility trace)' 
                                                    % self.__class__.__name__)
            
            
    ######################  BEGIN COMMAND-HANDLING METHODS #####################
    
    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started."""
        newEpisodeResponse = super(TDLambdaAgent, self).nextEpisodeStarted()

        # Before we continue, we have to reset the eligibility traces
        self.eligibilityTrace.traces.clear()
                    
        return newEpisodeResponse
    
    ######################  End COMMAND-HANDLING METHODS ###############################
      
    def _train(self, terminalState = False):
        """ Train agent on last experience and eligibility traces.
         
        Train the agent using the last (s,a,r,s',a') tuple and the stored
        eligibility traces 
        """
        # Since we are going to update the Q-table for the 
        # pair of lastState, lastAction, we have to check if they are 
        # not None       
        if self.lastState is not None and self.lastAction is not None:
            # We set the eligibility for the lastState, lastAction pair
            self.eligibilityTrace.setEligibility(self.lastState, 
                                                 self.lastAction, 
                                                 1)
            if terminalState: 
                # If we have reached a terminal state,
                # the target is simply the reward obtained after executing
                # lastAction in lastState
                target = self.reward
            else:
                # Otherwise we let the learner compute the delta
                target = self.tdLearner.computeTarget(self.lastState,
                                                      self.lastAction,
                                                      self.reward, self.state,
                                                      self.action)
            
            # Train the learner using the eligibility traces
            traces = self.eligibilityTrace.getTraces()
            self.tdLearner.trainOnTraces(self.lastState, self.lastAction,
                                         target, traces)
        
        # Update eligibility trace observable 
        valueAccessFunction = lambda state, action : \
                    self.eligibilityTrace.getEligibility(state, action)
        self.eligibilityTraceObservable.updateValues(valueAccessFunction,
                                                     self.actions)
        
        # Decay the eligibility by the factor lambda * gamma
        self.eligibilityTrace.decayAllEligibilities(self.configDict['lambda'] 
                                                    * self.configDict['gamma'])
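
The EligibilityTrace class itself is not included in these excerpts. The sketch below only illustrates the behavior that _train() relies on (setting an eligibility to 1, looking traces up, decaying them by lambda * gamma, and pruning entries below minTraceValue); the interface is inferred from the calls above and is not the actual MMLF implementation.

class SimpleEligibilityTrace(object):
    """Sparse eligibility trace keyed by (state, action) pairs."""

    def __init__(self, minTraceValue=0.1):
        self.minTraceValue = minTraceValue
        self.traces = {}                       # maps (state, action) -> eligibility

    def setEligibility(self, state, action, value):
        self.traces[(state, action)] = value   # replacing traces: overwrite with 1

    def getEligibility(self, state, action):
        return self.traces.get((state, action), 0.0)

    def getTraces(self):
        return self.traces

    def decayAllEligibilities(self, decayFactor):
        # Multiply every trace by lambda * gamma and drop entries that have
        # become too small to matter, keeping the dictionary sparse.
        for key in list(self.traces):
            self.traces[key] *= decayFactor
            if self.traces[key] < self.minTraceValue:
                del self.traces[key]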