class TDAgent(AgentBase):
    """ A base agent for all kinds of agents based on temporal difference learning.

    Most agents of this kind can reuse the methods of this class and only have
    to modify small parts.

    Note: The TDAgent cannot be instantiated by itself; it is an abstract base
    class!
    """

    def __init__(self, *args, **kwargs):
        super(TDAgent, self).__init__(*args, **kwargs)

        self.epsilon = self.configDict.get('epsilon', 0.0)
        self.epsilonDecay = self.configDict.get('epsilonDecay', 1.0)

        # An observable that can be used to monitor an agent's greedy policy
        self.policyObservable = \
            FunctionOverStateSpaceObservable(title='%s (greedy policy)'
                                             % self.__class__.__name__,
                                             discreteValues=True)
        # An observable that can be used to monitor an agent's optimal value function
        self.optimalValueFunctionObservable = \
            FunctionOverStateSpaceObservable(title='%s (optimal value function)'
                                             % self.__class__.__name__,
                                             discreteValues=False)
        # An observable that can be used to monitor an agent's Q-function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s (value function)'
                                        % self.__class__.__name__)

    ###################### BEGIN COMMAND-HANDLING METHODS ######################

    def setStateSpace(self, stateSpace):
        """ Informs the agent about the state space of the environment

        More information about state spaces can be found in
        :ref:`state_and_action_spaces`
        """
        super(TDAgent, self).setStateSpace(stateSpace)

        # Determine the resolution used for continuous state space dimensions
        # (if any exist):
        for dimName, dimDescr in self.stateSpace.iteritems():
            if dimDescr['dimensionType'] == "continuous":
                assert "stateDimensionResolution" in self.configDict, \
                    "Continuous state dimensions require that a state " \
                    "dimension resolution be defined."
                # Set the resolution
                if isinstance(self.configDict['stateDimensionResolution'], dict):
                    # If we have a separate resolution for each dimension
                    assert dimName in self.configDict['stateDimensionResolution'], \
                        "State dimension %s has no specified resolution." % dimName
                    self.stateSpace[dimName]["resolution"] = \
                        self.configDict['stateDimensionResolution'][dimName]
                else:
                    # If we have the same value for all dimensions:
                    self.stateSpace[dimName]["resolution"] = \
                        self.configDict['stateDimensionResolution']

    def setActionSpace(self, actionSpace):
        """ Informs the agent about the action space of the environment

        More information about action spaces can be found in
        :ref:`state_and_action_spaces`
        """
        if actionSpace.hasContinuousDimensions():
            assert "actionDimensionResolution" in self.configDict, \
                "Continuous action spaces require that an action dimension " \
                "resolution be defined."
            discreteActionsPerDimension = \
                self.configDict['actionDimensionResolution']
            self.actionSpace = \
                actionSpace.discretizedActionSpace(discreteActionsPerDimension)
        else:
            self.actionSpace = actionSpace

        # Get a list of all actions this agent might take
        self.actions = self.actionSpace.getActionList()

        self.agentLog.info("%s got new action space: %s"
                           % (self.__class__.__name__, self.actionSpace))

        # Since state and action space are now known, the learner can be initialized
        self._initialize()

    def setState(self, state):
        """ Informs the agent of the environment's current state

        More information about (valid) states can be found in
        :ref:`state_and_action_spaces`
        """
        super(TDAgent, self).setState(state)

    def giveReward(self, reward):
        """ Provides a reward to the agent """
        self.reward += reward

    def getAction(self):
        """ Request the next action the agent wants to execute """
        # Choose the action based on the current state
        self.action = self._chooseAction()

        # Create an action dictionary that maps action dimension to chosen action
        actionDictionary = dict()
        for index, actionName in enumerate(self.actionSpace.iterkeys()):
            actionDictionary[actionName] = self.action[index]

        # Train the agent
        # NOTE: This has to happen in this method since a method like SARSA
        #       needs to know the action taken in the successor state to
        #       update the previous state
        self._train(terminalState=False)

        # Update all observables
        self._updateObservables()

        super(TDAgent, self).getAction()

        return self._generateActionObject(actionDictionary)

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started. """
        # Train the agent a last time for this episode
        self._train(terminalState=True)

        # Update all observables
        self._updateObservables()

        # Decay epsilon
        self.epsilon *= self.epsilonDecay

        super(TDAgent, self).nextEpisodeStarted()

    ###################### END COMMAND-HANDLING METHODS ######################

    def getGreedyPolicy(self):
        """ Returns the optimal greedy policy the agent has found so far """
        return ValueFunctionPolicy(self.tdLearner.functionApproximator,
                                   self.tdLearner.actions)

    def _initialize(self):
        """ Initializes the learner as soon as state and action space are known """
        # Create the function approximator based on the configuration.
        # A policy may have been set in advance that shall be reused during
        # learning, as in transfer learning techniques.
        if not hasattr(self, "functionApproximator") \
                or self.functionApproximator is None:
            self.functionApproximator = \
                FunctionApproximator.create(self.configDict["function_approximator"],
                                            self.stateSpace, self.actions)

        # The default feature computation is just to scale the state
        featureFct = lambda state: state

        # Create the main TD learner object which is responsible for learning
        self.tdLearner = TD_Learner(self.actions, self.functionApproximator,
                                    featureFct, self.configDict)

    def _chooseAction(self):
        """ Choose an action for the given state based on the Q-values of the
        state-action pairs """
        # Apply epsilon-greedy action selection
        if random.random() < self.epsilon:
            # Choose a random action
            action = random.choice(self.actions)
        else:
            # Compute the action with the optimal Q-value
            action = self.functionApproximator.computeOptimalAction(self.state)

        return action

    def _updateObservables(self):
        super(TDAgent, self)._updateObservables()

        # Update policy observable
        self.policyObservable.updateFunction(
            lambda state: self.functionApproximator.computeOptimalAction(state)[0])
        # Update optimal value function observable
        self.optimalValueFunctionObservable.updateFunction(
            lambda state: self.functionApproximator.computeV(state))
        # Update Q-function observable
        valueAccessFunction = lambda state, action: \
            self.functionApproximator.computeQ(state, action)
        self.stateActionValuesObservable.updateValues(valueAccessFunction,
                                                      self.actions)
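
# Illustrative sketch (not part of the MMLF API): the epsilon-greedy scheme used
# by TDAgent._chooseAction, together with the per-episode epsilon decay performed
# in nextEpisodeStarted, written as a standalone function over a plain Q-value
# dict. All names below are chosen for illustration only.
def _epsilon_greedy_sketch(qValues, actions, state, epsilon):
    """ Return a random action with probability epsilon, else a greedy one. """
    import random
    if random.random() < epsilon:
        # Explore: pick uniformly among all available actions
        return random.choice(actions)
    # Exploit: pick an action with the largest Q(state, action)
    return max(actions, key=lambda action: qValues.get((state, action), 0.0))

# Example usage: after each episode, epsilon shrinks geometrically, mirroring
# self.epsilon *= self.epsilonDecay in TDAgent.nextEpisodeStarted:
#     epsilon = 0.1
#     for episode in range(100):
#         ...  # run one episode, choosing actions via _epsilon_greedy_sketch
#         epsilon *= 0.99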
class MonteCarloAgent(AgentBase):
    """ Agent that learns based on Monte Carlo samples of the Q-function

    An agent which uses Monte Carlo policy evaluation to optimize its behavior
    in a given environment.

    **CONFIG DICT**
        :gamma: : The discount factor for computing the return given the rewards
        :epsilon: : Exploration rate. The probability that an action is chosen
                    non-greedily, i.e. uniformly at random among all available actions
        :visit: : Whether first-visit ("first") or every-visit ("every")
                  Monte Carlo updates are used
        :defaultQ: : The initially assumed Q-value for each state-action pair.
                     Allows controlling initial exploration via optimistic
                     initialization
    """

    DEFAULT_CONFIG_DICT = {'gamma': 1.0,
                           'epsilon': 0.1,
                           'visit': "first",
                           'defaultQ': 0.0}

    def __init__(self, *args, **kwargs):
        self.agentInfo = mmlf.framework.protocol.AgentInfo(
            versionNumber="0.3",
            agentName="Monte Carlo",
            continuousState=False,
            continuousAction=False,
            discreteAction=True,
            nonEpisodicCapable=False)

        # Calls constructor of base class
        super(MonteCarloAgent, self).__init__(*args, **kwargs)

        self.hasContinuousActionSpace = False

        self.episode = {'states': [], 'actions': [], 'rewards': []}

        self.samples = defaultdict(lambda: 0)
        self.qvalues = defaultdict(lambda: self.configDict['defaultQ'])

        # An observable that can be used to monitor an agent's Q-function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s Q Function'
                                        % self.__class__.__name__)

    ###################### BEGIN COMMAND-HANDLING METHODS ######################

    def setActionSpace(self, actionSpace):
        """ Informs the agent about the action space of the environment

        More information about action spaces can be found in
        :ref:`state_and_action_spaces`
        """
        # TODO: Extend to more than one action dimension?
        if len(actionSpace.keys()) > 1:
            raise UserWarning("Error: Currently, only one action dimension is "
                              "possible in Monte Carlo!")

        super(MonteCarloAgent, self).setActionSpace(actionSpace)

        self.actions = copy.copy(self.actionSpace["action"]["dimensionValues"])

    def setState(self, state):
        """ Informs the agent of the environment's current state

        More information about (valid) states can be found in
        :ref:`state_and_action_spaces`
        """
        super(MonteCarloAgent, self).setState(state)

        self.episode['states'].append(self.state)

    def giveReward(self, reward):
        """ Provides a reward to the agent """
        self.episode['rewards'].append(reward)

    def getAction(self):
        """ Request the next action the agent wants to execute """
        # Choose the action based on the current state (epsilon-greedy)
        actions = copy.copy(self.actions)
        random.shuffle(actions)
        exploration = random.random() < self.configDict['epsilon']
        if exploration:
            maxAction = random.choice(actions)
        else:
            maxValue, foo, maxAction = \
                max((self.qvalues[(self.state, action)],
                     random.random(),  # break ties randomly!
                     action) for action in actions)

        self.episode['actions'].append(maxAction)

        actionDictionary = {'action': maxAction}

        super(MonteCarloAgent, self).getAction()

        return self._generateActionObject(actionDictionary)

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started. """
        visited_state_actions = set()
        trajectory = zip(self.episode['states'], self.episode['actions'])
        for step, state_action in enumerate(trajectory):
            if self.configDict['visit'] == "first" \
                    and state_action in visited_state_actions:
                continue
            visited_state_actions.add(state_action)

            # Discounted return obtained from this step onward
            discReturn = sum(map(lambda x: self.configDict['gamma'] ** x[0] * x[1],
                                 enumerate(self.episode['rewards'][step:])))

            # Incremental update of the mean Q-value of this state-action pair
            self.qvalues[state_action] = \
                float(self.qvalues[state_action] * self.samples[state_action]
                      + discReturn) / (self.samples[state_action] + 1)
            self.samples[state_action] = self.samples[state_action] + 1

        self.episode = {'states': [], 'actions': [], 'rewards': []}

        # Update Q-function observable
        def valueAccessFunction(state, action):
            if isinstance(action, tuple):
                action = action[0]
            return self.qvalues[(state, action)]

        self.stateActionValuesObservable.updateValues(valueAccessFunction,
                                                      self.actions)

        super(MonteCarloAgent, self).nextEpisodeStarted()
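
# Illustrative sketch (not part of the MMLF API): the first-visit Monte Carlo
# update performed in MonteCarloAgent.nextEpisodeStarted, reduced to plain dicts.
# qvalues holds the running mean return per (state, action) pair and samples the
# number of returns averaged so far; all names are illustrative only.
def _first_visit_mc_update_sketch(episode, qvalues, samples, gamma=1.0):
    """ episode is a dict with 'states', 'actions' and 'rewards' lists. """
    visited = set()
    trajectory = list(zip(episode['states'], episode['actions']))
    for step, state_action in enumerate(trajectory):
        if state_action in visited:
            continue  # first-visit: only the first occurrence is updated
        visited.add(state_action)
        # Discounted return from this step until the end of the episode
        discReturn = sum(gamma ** k * r
                         for k, r in enumerate(episode['rewards'][step:]))
        n = samples.get(state_action, 0)
        q = qvalues.get(state_action, 0.0)
        # Incremental mean: new mean = (old mean * n + new sample) / (n + 1)
        qvalues[state_action] = (q * n + discReturn) / float(n + 1)
        samples[state_action] = n + 1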
class ActorCriticAgent(TDLambdaAgent):
    """ Agent that learns based on the actor-critic architecture

    This agent learns based on the actor-critic architecture. It uses standard
    TD(lambda) to learn the value function of the critic; for this reason, it
    subclasses TDLambdaAgent. The main difference to TD(lambda) is the means of
    action selection. Instead of deriving an epsilon-greedy policy from its
    Q-function, it learns an explicit stochastic policy. To this end, it
    maintains preferences for each action in each state. These preferences are
    updated after each action execution according to the following rule:

    .. math::

        p(s,a) = p(s,a) + \delta,

    where delta is the TD error

    .. math::

        \delta = r + \gamma V(s') - V(s).

    Action selection is based on a Gibbs softmax distribution:

    .. math::

        \pi(s,a) = \\frac{exp(\\tau^{-1}p(s,a))}{\sum_{b \in A} exp(\\tau^{-1}p(s,b))}

    where tau is a temperature parameter.

    Note that even though preferences are stored in a function approximator
    (such that, in principle, action preferences could be generalized over the
    state space), continuous state spaces are not yet supported.

    .. versionadded:: 0.9.9
       Added Actor-Critic agent

    **CONFIG DICT**
        :gamma: : The discount factor for computing the return given the rewards
        :lambda: : The eligibility trace decay rate
        :tau: : Temperature parameter used in the Gibbs softmax distribution
                for action selection
        :minTraceValue: : The minimum value of an entry in a trace that is
                          considered to be relevant. If the eligibility falls
                          below this value, it is set to 0 and the entry is thus
                          no longer updated
        :update_rule: : Whether the learning is on-policy or off-policy.
                        Can be either "SARSA" (on-policy) or "WatkinsQ" (off-policy)
        :stateDimensionResolution: : The default "resolution" the agent uses for
                                     every state dimension. Can be either an int
                                     (same resolution for each dimension) or a
                                     dict mapping dimension name to its resolution
        :actionDimensionResolution: : Per default, the agent discretizes a
                                      continuous action space into this number
                                      of discrete actions
        :function_approximator: : The function approximator used for representing
                                  the Q value function
        :preferences_approximator: : The function approximator used for
                                     representing the action preferences
                                     (i.e. the policy)
    """

    DEFAULT_CONFIG_DICT = {'gamma': 0.0,
                           'lambda': 0.9,
                           'tau': 0.2,
                           'minTraceValue': 0.5,
                           'stateDimensionResolution': 5,
                           'actionDimensionResolution': 7,
                           'update_rule': "SARSA",
                           'function_approximator': {'name': 'TabularStorage',
                                                     'learning_rate': 0.1,
                                                     'default': 0.0},
                           'preferences_approximator': {'name': 'TabularStorage',
                                                        'learning_rate': 1.0,
                                                        'default': 0.0}}

    def __init__(self, *args, **kwargs):
        # Create the agent info
        self.agentInfo = \
            mmlf.framework.protocol.AgentInfo(
                # Which communication protocol version can the agent handle?
                versionNumber="0.3",
                # Name of the agent (can be chosen arbitrarily)
                agentName="Actor Critic",
                # Can the agent be used in environments with continuous state spaces?
                continuousState=False,
                # Can the agent be used in environments with continuous action spaces?
                continuousAction=False,
                # Can the agent be used in environments with discrete action spaces?
                discreteAction=True,
                # Can the agent be used in non-episodic environments?
                nonEpisodicCapable=True)

        # Calls constructor of base class.
        # After this call, the agent has an attribute "self.configDict"
        # that contains the information from config['configDict'].
        super(ActorCriticAgent, self).__init__(*args, **kwargs)

        # An observable that can be used to monitor an agent's preferences
        self.preferencesObservable = \
            StateActionValuesObservable(title='%s Preferences'
                                        % self.__class__.__name__)

    ###################### BEGIN COMMAND-HANDLING METHODS ######################

    def getAction(self):
        """ Request the next action the agent wants to execute """
        # We can now modify the preferences for the action taken in the last step
        if self.lastState is not None:
            self._updatePreferences()

        return super(ActorCriticAgent, self).getAction()

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started. """
        # We can now modify the preferences for the action taken in the last step
        if self.lastState is not None:
            self._updatePreferences()

        return super(ActorCriticAgent, self).nextEpisodeStarted()

    ###################### END COMMAND-HANDLING METHODS ######################

    def _initialize(self):
        """ Initializes the learner as soon as state and action space are known """
        # The actor learns preferences for actions in certain states.
        # Create a function approximator for this preference function
        # based on the configuration.
        self.preferencesApproximator = \
            FunctionApproximator.create(self.configDict["preferences_approximator"],
                                        self.stateSpace, self.actions)

        super(ActorCriticAgent, self)._initialize()

    def _updatePreferences(self):
        """ Update the preferences of the agent for the last chosen action """
        # Compute the critic (in form of a TD error)
        tdError = self.reward \
            + self.configDict['gamma'] * self.functionApproximator.computeV(self.state) \
            - self.functionApproximator.computeV(self.lastState)

        # Update the actor's preferences accordingly
        target = self.preferencesApproximator.computeQ(self.lastState,
                                                       self.lastAction) + tdError
        trainDict = {(self.lastState, self.lastAction): target}
        self.preferencesApproximator.train(trainDict)

        # Update preferences observable
        valueAccessFunction = lambda state, action: \
            self.preferencesApproximator.computeQ(state, action)
        self.preferencesObservable.updateValues(valueAccessFunction,
                                                self.actions)

    def _chooseAction(self):
        """ Chooses an action from the action space """
        preferences = [self.preferencesApproximator.computeQ(self.state, action)
                       for action in self.actions]

        probabilityMassFunction = computeProbabilityMasses(preferences,
                                                           self.configDict['tau'])

        randValue = random.random()
        accumulator = 0.0
        for index, probabilityMass in enumerate(probabilityMassFunction):
            accumulator += probabilityMass
            if accumulator >= randValue:
                return self.actions[index]

        # For rare cases where the probability masses do not sum to 1 due to
        # numerical imprecision
        return self.actions[-1]
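
# Illustrative sketch (not part of the MMLF API): Gibbs/Boltzmann softmax action
# selection from action preferences with temperature tau, as described in the
# ActorCriticAgent docstring and used by its _chooseAction. The helper below is
# an assumption about what computeProbabilityMasses effectively computes; all
# names are illustrative only.
def _gibbs_softmax_sketch(preferences, actions, tau):
    """ Sample an action with probability proportional to exp(p(s,a) / tau). """
    import math
    import random
    # Subtract the maximum preference before exponentiating for numerical stability
    maxPref = max(preferences)
    expPrefs = [math.exp((p - maxPref) / tau) for p in preferences]
    total = sum(expPrefs)
    randValue = random.random()
    accumulator = 0.0
    for action, expPref in zip(actions, expPrefs):
        accumulator += expPref / total
        if accumulator >= randValue:
            return action
    # Guard against probability masses not summing exactly to 1
    return actions[-1]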
class FittedRMaxAgent(AgentBase):
    """ Fitted R-Max agent

    Fitted R-Max is a model-based RL algorithm that uses the R-Max heuristic
    for exploration control, uses a fitted function approximator (even though
    this can be configured differently), and uses dynamic programming (boosted
    by prioritized sweeping) for deriving a value function from the model.
    Fitted R-Max usually learns very sample-efficiently (meaning that a good
    policy is learned with only a few interactions with the environment) but
    requires a huge amount of computational resources.

    .. seealso::
        Nicholas K. Jong and Peter Stone,
        "Model-based function approximation in reinforcement learning",
        in "Proceedings of the 6th International Joint Conference on Autonomous
        Agents and Multiagent Systems", Honolulu, Hawaii: ACM, 2007, 1-8,
        http://portal.acm.org/citation.cfm?id=1329125.1329242

    **CONFIG DICT**
        :gamma: : The discount factor for computing the return given the rewards
        :min_exploration_value: : The agent explores in a state until the given
                                  exploration value (approx. number of exploratory
                                  actions in proximity of a state-action pair)
                                  is reached for all actions
        :RMax: : An upper bound on the achievable return an agent can obtain
                 in a single episode
        :planner: : The algorithm used for planning, i.e. for optimizing the
                    policy based on a learned model
        :model: : The algorithm used for learning a model of the environment
        :function_approximator: : The function approximator used for representing
                                  the Q value function
        :actionDimensionResolution: : Per default, the agent discretizes a
                                      continuous action space into this number
                                      of discrete actions
    """

    DEFAULT_CONFIG_DICT = {'gamma': 0.99,
                           'min_exploration_value': 1.0,
                           'RMax': 0.0,
                           'planner': {'name': "PrioritizedSweeping",
                                       'updatesPerStep': 1000,
                                       'minSweepDelta': 0.1},
                           'model': {'name': 'KNNModel',
                                     'k': 100,
                                     'b_Sa': 0.03,
                                     'exampleSetSize': 2500},
                           'function_approximator': {'name': 'KNN',
                                                     'k': 20,
                                                     'b_X': 0.01},
                           'actionDimensionResolution': 9}

    def __init__(self, *args, **kwargs):
        # Create the agent info
        self.agentInfo = mmlf.framework.protocol.AgentInfo(
            versionNumber="0.3",
            agentName="Fitted R-Max",
            continuousState=True,
            continuousAction=False,
            discreteAction=True,
            nonEpisodicCapable=True)

        # Calls constructor of base class
        super(FittedRMaxAgent, self).__init__(*args, **kwargs)

        self.functionApproximator = None

        self.userDirObj.createPath(['model'], refName='modelDir',
                                   baseRef='agentlogs', force=True)

        # An observable that stores the exploration value
        self.explorationValueObservable = \
            FloatStreamObservable(title='%s Exploration Value'
                                  % self.__class__.__name__,
                                  time_dimension_name='Step',
                                  value_name='Exploration Value')
        # An observable that can be used to monitor an agent's greedy policy
        self.policyObservable = \
            FunctionOverStateSpaceObservable(title='%s (greedy policy)'
                                             % self.__class__.__name__,
                                             discreteValues=True)
        # An observable that can be used to monitor an agent's optimal value function
        self.optimalValueFunctionObservable = \
            FunctionOverStateSpaceObservable(title='%s (optimal value function)'
                                             % self.__class__.__name__,
                                             discreteValues=False)
        # An observable that can be used to monitor an agent's Q-function
        self.stateActionValuesObservable = \
            StateActionValuesObservable(title='%s Q Function'
                                        % self.__class__.__name__)
        # An observable that can be used to monitor the expected reward
        self.expectedRewardObservable = \
            StateActionValuesObservable(title='%s Reward Expectation'
                                        % self.__class__.__name__)

    ###################### BEGIN COMMAND-HANDLING METHODS ######################

    def setActionSpace(self, actionSpace):
        """ Informs the agent about the action space of the environment

        More information about action spaces can be found in
        :ref:`state_and_action_spaces`
        """
        if actionSpace.hasContinuousDimensions():
            assert "actionDimensionResolution" in self.configDict, \
                "Continuous action spaces require that an action dimension " \
                "resolution be defined."
            discreteActionsPerDimension = \
                self.configDict['actionDimensionResolution']
            self.actionSpace = \
                actionSpace.discretizedActionSpace(discreteActionsPerDimension)
        else:
            self.actionSpace = actionSpace

        # Get a list of all actions this agent might take
        self.actions = self.actionSpace.getActionList()

        self.agentLog.info("%s got new action space: %s"
                           % (self.__class__.__name__, self.actionSpace))

        # Since state and action space are now known, the learner can be initialized
        self._initialize()

    def setState(self, state):
        """ Informs the agent of the environment's current state

        More information about (valid) states can be found in
        :ref:`state_and_action_spaces`
        """
        super(FittedRMaxAgent, self).setState(state)

    def giveReward(self, reward):
        """ Provides a reward to the agent """
        self.reward = reward

    def getAction(self):
        """ Request the next action the agent wants to execute """
        if self.lastState is not None:
            # Inform the model about the outcome of the last action
            self.model.addExperience(self.lastState, self.lastAction,
                                     self.state, self.reward)
            # Planning
            self._updatePolicy()
            # if self.stepCounter % 100 == 0:
            #     import cProfile
            #     cProfile.runctx("self._updatePolicy()", globals(), locals(),
            #                     "profiling_data_%s.dat" % self.stepCounter)
        else:
            # Inform the model about a new start state
            self.model.addStartState(self.state)

        if self.functionApproximator is not None:
            # Compute the action with the optimal Q-value
            self.action = \
                self.functionApproximator.computeOptimalAction(self.state)
        else:
            # Randomly choose an action
            self.action = random.choice(self.actions)

        # Create an action dictionary that maps action dimension to chosen action
        actionDictionary = dict()
        for index, actionName in enumerate(self.actionSpace.iterkeys()):
            actionDictionary[actionName] = self.action[index]

        # Update explorationValueObservable
        self.explorationValueObservable.addValue(
            self.stepCounter,
            self.model.getExplorationValue(self.state, self.action))

        super(FittedRMaxAgent, self).getAction()

        self.agentLog.debug("Episode %s Step: %s State: %s Action: %s"
                            % (self.episodeCounter, self.stepCounter,
                               self.lastState, self.lastAction))

        return self._generateActionObject(actionDictionary)

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started. """
        # If the agent has actually reached a terminal state
        if self.state is not None:
            # Inform the model about the outcome of the last action
            self.model.addExperience(self.lastState, self.lastAction,
                                     self.state, self.reward)
            # We have reached a terminal state
            self.model.addTerminalState(self.state)

            # Update policy
            self._updatePolicy()

        super(FittedRMaxAgent, self).nextEpisodeStarted()

    def getGreedyPolicy(self):
        """ Returns the optimal greedy policy the agent has found so far """
        return ValueFunctionPolicy(self.functionApproximator, self.actions)

    def _initialize(self):
        """ Lazy initialization of the agent once state and action space are known """
        # Data structure to store the value function of the discrete MDP during planning
        self.tabularStorage = \
            TabularStorage(stateSpace=None, actions=self.actions,
                           **self.configDict["function_approximator"])

        assert self.configDict["planner"]['name'] == "PrioritizedSweeping", \
            "Fitted R-Max currently works only with the Prioritized Sweeping planner"

        # The object that performs planning
        self.planner = Planner.create(self.configDict["planner"],
                                      self.stateSpace, self.tabularStorage,
                                      self.configDict['gamma'], self.actions)

        # Choose action model class based on configuration
        self.model = Model.create(self.configDict["model"], self,
                                  self.stateSpace, self.actionSpace,
                                  self.userDirObj)

        # TODO: Just a hack, should be refactored soon
        if "plotting" in self.configDict:
            self.configDict["plotting"]["modelRasterPoints"] = \
                [self.configDict["plotting"]["modelRasterPoints"]
                 for i in range(self.stateSpace.getNumberOfDimensions())]

    def _updatePolicy(self):
        """ Update the policy using prioritized sweeping based on the internal model

        Construct a discrete, finite MDP based on the state transitions and
        reward expectations learned by the model. For state-action pairs that
        have not been explored sufficiently, be optimistic, i.e. assume that
        these states have a value that is equal to the maximal achievable
        reward (R-Max).
        """
        # The states of the constructed MDP
        states = self.model.getStates()
        states.extend([("s_term",), ("s_opt",)])
        self.planner.setStates(states)

        # Generate the discrete, R-Max-optimistic version of the MDP to be
        # solved. This MDP is derived from the learned model.
        discreteRMaxMDP = DiscreteRMaxMDP(
            self.model, self.configDict["RMax"],
            self.configDict["min_exploration_value"],
            self.stateSpace.hasContinuousDimensions())

        # Perform prioritized sweeping to compute the optimal value function for
        # the MDP specified by the model, starting from the MDP state "nnState"
        try:
            nnState = self.model.getNearestNeighbor(self.lastState)
            self.planner.plan(
                state=nnState,
                action=self.lastAction,
                sampleStartState=None,
                sampleSuccessorState=None,
                stateTransitionFct=lambda state, action:
                    discreteRMaxMDP.stateTransitionFct(state, action),
                invStateTransitionFct=lambda state, action:
                    discreteRMaxMDP.invStateTransitionFct(state, action),
                rewardFct=lambda state, action:
                    discreteRMaxMDP.rewardFct(state, action),
                isTerminalState=None)
        except (PlanningFailedException, ModelNotInitialized):
            self.agentLog.info("Planning failed!")

        # Get the computed Q-values and remove the artificial states
        # "s_term" and "s_opt"
        qValues = self.planner.functionApproximator.getPlainValues()
        for action in self.actions:
            qValues.pop((("s_term",), action), None)
            qValues.pop((("s_opt",), action), None)

        # Create a function approximator that generalizes the computed Q-values
        # to a value function over the whole continuous state space.
        self.configDict["function_approximator"]["learning_rate"] = 1.0
        functionApproximator = \
            FunctionApproximator.create(self.configDict["function_approximator"],
                                        self.stateSpace, self.actions)
        # Use the R-Max function approximator wrapper to ensure that
        # underexplored states have value RMax
        self.functionApproximator = \
            RMaxFunctionApproximatorWrapper(
                functionApproximator, self.stateSpace, self.actions,
                self.model, self.configDict["RMax"],
                self.configDict["min_exploration_value"])
        self.functionApproximator.train(qValues)

        # Update observables
        self._updateObservables(discreteRMaxMDP)

    def _updateObservables(self, discreteRMaxMDP):
        super(FittedRMaxAgent, self)._updateObservables()

        # Update policy observable
        self.policyObservable.updateFunction(
            lambda state: self.functionApproximator.computeOptimalAction(state)[0])
        # Update optimal value function observable
        self.optimalValueFunctionObservable.updateFunction(
            lambda state: self.functionApproximator.computeV(state))
        # Update Q-function observable
        valueAccessFunction = lambda state, action: \
            self.functionApproximator.computeQ(state, action)
        self.stateActionValuesObservable.updateValues(valueAccessFunction,
                                                      self.actions)
        # Update reward expectation observable
        valueAccessFunction = lambda state, action: \
            discreteRMaxMDP.rewardFct(state, action)
        self.expectedRewardObservable.updateValues(valueAccessFunction,
                                                   self.actions)
class TDLambdaAgent(TDAgent):
    """ Agent that implements TD(lambda) RL

    An agent that uses temporal difference learning (e.g. SARSA) with
    eligibility traces and function approximation (e.g. linear tile coding /
    CMAC) to optimize its behavior in a given environment.

    **CONFIG DICT**
        :update_rule: : Whether the learning is on-policy or off-policy.
                        Can be either "SARSA" (on-policy) or "WatkinsQ" (off-policy)
        :gamma: : The discount factor for computing the return given the rewards
        :epsilon: : Exploration rate. The probability that an action is chosen
                    non-greedily, i.e. uniformly at random among all available actions
        :epsilonDecay: : Decay factor for the exploration rate. The exploration
                         rate is multiplied with this value after each episode
        :lambda: : The eligibility trace decay rate
        :minTraceValue: : The minimum value of an entry in a trace that is
                          considered to be relevant. If the eligibility falls
                          below this value, it is set to 0 and the entry is thus
                          no longer updated
        :replacingTraces: : Whether replacing or accumulating traces are used
        :stateDimensionResolution: : The default "resolution" the agent uses for
                                     every state dimension. Can be either an int
                                     (same resolution for each dimension) or a
                                     dict mapping dimension name to its resolution
        :actionDimensionResolution: : Per default, the agent discretizes a
                                      continuous action space into this number
                                      of discrete actions
        :function_approximator: : The function approximator used for representing
                                  the Q value function
    """

    DEFAULT_CONFIG_DICT = {'update_rule': "SARSA",
                           'gamma': 0.9,
                           'epsilon': 0.1,
                           'epsilonDecay': 1.0,
                           'lambda': 0.9,
                           'minTraceValue': 0.1,
                           'replacingTraces': True,
                           'stateDimensionResolution': 5,
                           'actionDimensionResolution': 7,
                           'function_approximator': {'name': 'TabularStorage',
                                                     'learning_rate': 1.0,
                                                     'default': 0.0}}

    def __init__(self, *args, **kwargs):
        if self.__class__ == TDLambdaAgent:
            # Create the agent info
            self.agentInfo = mmlf.framework.protocol.AgentInfo(
                versionNumber="0.3",
                agentName="TD(lambda)",
                continuousState=True,
                continuousAction=True,
                discreteAction=True,
                nonEpisodicCapable=True)

        # Call the constructor of the super class
        super(TDLambdaAgent, self).__init__(*args, **kwargs)

        if 'update_rule' not in self.configDict:
            self.configDict['update_rule'] = "SARSA"
        elif self.configDict['update_rule'] == "WatkinsQ" \
                and self.configDict['lambda'] > 0.0:
            self.agentLog.warning("Using the update rule 'WatkinsQ' with lambda>0. "
                                  "MMLF does neither use WatkinsQ(lambda) nor "
                                  "PengQ(lambda) but the 'naive' Q(lambda) "
                                  "(compare Sutton/Barto, chapter 7.6)")

        # Create eligibility traces for all possible actions
        minTraceValue = self.configDict['minTraceValue']
        self.eligibilityTrace = EligibilityTrace(minTraceValue=minTraceValue)

        # An observable that can be used to monitor the agent's eligibility traces
        self.eligibilityTraceObservable = \
            StateActionValuesObservable(title='%s (eligibility trace)'
                                        % self.__class__.__name__)

    ###################### BEGIN COMMAND-HANDLING METHODS ######################

    def nextEpisodeStarted(self):
        """ Informs the agent that a new episode has started. """
        newEpisodeResponse = super(TDLambdaAgent, self).nextEpisodeStarted()

        # Before we continue, we have to reset the eligibility traces
        self.eligibilityTrace.traces.clear()

        return newEpisodeResponse

    ###################### END COMMAND-HANDLING METHODS ######################

    def _train(self, terminalState=False):
        """ Train the agent on the last experience and the eligibility traces

        Train the agent using the last (s, a, r, s', a') tuple and the stored
        eligibility traces.
        """
        # Since we are going to update the Q-table for the pair of lastState
        # and lastAction, we have to check that they are not None
        if self.lastState is not None and self.lastAction is not None:
            # We set the eligibility for the lastState, lastAction pair
            self.eligibilityTrace.setEligibility(self.lastState,
                                                 self.lastAction, 1)

            if terminalState:
                # If we have reached a terminal state, the target is simply the
                # reward obtained after executing lastAction in lastState
                target = self.reward
            else:
                # Otherwise we let the learner compute the target
                target = self.tdLearner.computeTarget(self.lastState,
                                                      self.lastAction,
                                                      self.reward, self.state,
                                                      self.action)

            # Train the learner using the eligibility traces
            traces = self.eligibilityTrace.getTraces()
            self.tdLearner.trainOnTraces(self.lastState, self.lastAction,
                                         target, traces)

            # Update eligibility trace observable
            valueAccessFunction = lambda state, action: \
                self.eligibilityTrace.getEligibility(state, action)
            self.eligibilityTraceObservable.updateValues(valueAccessFunction,
                                                         self.actions)

            # Decay the eligibility by the factor lambda * gamma
            self.eligibilityTrace.decayAllEligibilities(
                self.configDict['lambda'] * self.configDict['gamma'])
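
# Illustrative sketch (not part of the MMLF API): one tabular SARSA(lambda)
# update with replacing eligibility traces, i.e. the kind of update that
# TDLambdaAgent._train delegates to its TD_Learner and EligibilityTrace objects.
# qValues and traces are plain dicts keyed by (state, action); alpha is the
# learning rate; all names are illustrative only.
def _sarsa_lambda_update_sketch(qValues, traces, lastState, lastAction, reward,
                                state, action, alpha, gamma, lambda_,
                                terminal=False, minTraceValue=0.1):
    # Replacing traces: the eligibility of the visited pair is reset to 1
    traces[(lastState, lastAction)] = 1.0
    # TD target: just the reward in a terminal state, otherwise r + gamma * Q(s', a')
    if terminal:
        target = reward
    else:
        target = reward + gamma * qValues.get((state, action), 0.0)
    tdError = target - qValues.get((lastState, lastAction), 0.0)
    # Update every traced pair in proportion to its eligibility, then decay the
    # traces by lambda * gamma and prune entries below minTraceValue
    for stateAction, eligibility in list(traces.items()):
        qValues[stateAction] = qValues.get(stateAction, 0.0) \
            + alpha * tdError * eligibility
        decayed = eligibility * gamma * lambda_
        if decayed < minTraceValue:
            del traces[stateAction]
        else:
            traces[stateAction] = decayed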