def getDiscreteStateSpace(N):
    """ Return discrete state space for mountain car with N values per dimension. """
    import numpy
    from mmlf.framework.spaces import StateSpace
    stateSpaceDict = {
        "position": ("discrete", numpy.linspace(-1.2, 0.6, N)),
        "velocity": ("discrete", numpy.linspace(-0.07, 0.07, N))
    }
    stateSpace = StateSpace()
    stateSpace.addOldStyleSpace(stateSpaceDict, limitType="soft")

    return stateSpace
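
# A minimal usage sketch (assumes numpy and the MMLF framework are
# available; the resolution 10 is an arbitrary example value):
#
#     space = getDiscreteStateSpace(10)
#     # -> a StateSpace with 10 discrete values per dimension, covering
#     #    position in [-1.2, 0.6] and velocity in [-0.07, 0.07]
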
class LinearMarkovChainEnvironment(SingleAgentEnvironment):
    """ A linear markov chain.
    
    The agent starts in the middle of this linear markov chain. He can either
    move right or left. The chain is not stochastic, i.e. when the agent 
    wants to move right, the state is decreased with probability 1 by 1.  
    When the agent wants to move left, the state is increased with probability 1
    by 1 accordingly.
    
    .. versionadded:: 0.9.10
       Added LinearMarkovChain environment
    
    **CONFIG DICT**
        :length: : The number of states of the linear Markov chain
    
    """

    # Add default configuration for this environment to this static dict
    # This specific parameter controls how long the linear markov chain is
    # (i.e. how many states there are)
    DEFAULT_CONFIG_DICT = {"length": 21}

    def __init__(self, useGUI, *args, **kwargs):
        # Create the environment info
        self.environmentInfo = \
            EnvironmentInfo(# Which communication protocol version can the 
                            # environment handle?
                            versionNumber="0.3",
                            # Name of the environment (can be chosen arbitrarily)
                            environmentName="LinearMarkovChain",
                            # Is the action space of this environment discrete?
                            discreteActionSpace=True,
                            # Is the environment episodic?
                            episodic=True,
                            # Is the state space of environment continuous?
                            continuousStateSpace=False,
                            # Is the action space of environment continuous?
                            continuousActionSpace=False,
                            # Is the environment stochastic?
                            stochastic=False)

        # Calls the constructor of the base class.
        # After this call, the environment has an attribute "self.configDict".
        # Its values have already been evaluated, i.e. a configured value
        # '100' (string) is stored under the key 'length' as the integer 100.
        super(LinearMarkovChainEnvironment, self).__init__(useGUI=useGUI,
                                                           *args,
                                                           **kwargs)

        # The state space of the linear markov chain
        oldStyleStateSpace = {
            "field": ("discrete", range(self.configDict["length"]))
        }

        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the linear markov chain
        oldStyleActionSpace = {"action": ("discrete", ["left", "right"])}

        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace,
                                          limitType="soft")

        # The initial state of the environment
        self.initialState = {"field": self.configDict["length"] / 2}
        # The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)

    ########################## Interface Functions #####################################
    def getInitialState(self):
        """ Returns the initial state of the environment """
        self.environmentLog.debug("Episode starts in state '%s'." %
                                  (self.initialState['field']))
        return self.initialState

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue or end
        after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the
                               episode to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        action = actionObject['action']
        previousState = self.currentState['field']

        # Change state of environment deterministically
        if action == 'left':
            self.currentState['field'] -= 1
        else:
            self.currentState['field'] += 1

        self.environmentLog.debug(
            "Agent chose action '%s' which caused a transition from '%s' to '%s'."
            % (action, previousState, self.currentState['field']))

        #Check if the episode is finished (i.e. the goal is reached)
        episodeFinished = self._checkEpisodeFinished()

        terminalState = self.currentState if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           -self.stepCounter)
            self.environmentLog.debug("Terminal state '%s' reached." %
                                      self.currentState['field'])
            self.environmentLog.info(
                "Episode %s lasted for %s steps." %
                (self.episodeCounter, self.stepCounter + 1))

            reward = 10 if self.currentState['field'] != 0 else -10

            self.stepCounter = 0
            self.episodeCounter += 1

            # Reset the simulation to the initial state (always the same)
            self.currentState = deepcopy(self.initialState)
        else:
            reward = -1
            self.stepCounter += 1

        resultsDict = {
            "reward": reward,
            "terminalState": terminalState,
            "nextState": self.currentState,
            "startNewEpisode": episodeFinished
        }
        return resultsDict

    def _checkEpisodeFinished(self):
        """ Checks whether the episode is finished.
        
        An episode is finished whenever the leftmost or rightmost state of the
        chain is reached.
        """
        return self.currentState['field'] in [0, self.configDict['length'] - 1]
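
# Interaction sketch (hypothetical; "env" stands for an instantiated
# LinearMarkovChainEnvironment obtained from the MMLF framework):
#
#     state = env.getInitialState()              # e.g. {'field': 10} for length 21
#     result = env.evaluateAction({'action': 'right'})
#     result['reward']           # -1 per step; +10/-10 on reaching a chain end
#     result['startNewEpisode']  # True once field 0 or length-1 is reached
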
class SinglePoleBalancingEnvironment(SingleAgentEnvironment):
    """ The single pole balancing environment.
    
    In the single pole balancing environment, the task of the agent is to control
    a cart such that a pole which is mounted on the cart stays in a nearly
    vertical position (to balance it). At the same time, the cart has to stay
    in a confined region.
    
    In every time step, the agent can apply a force between -2N and 2N in
    order to accelerate the cart. Thus the action space is one-dimensional and continuous.
    The state consists of the cart's current position and velocity as well as the
    pole's angle and angular velocity. Thus, the state space is four-dimensional
    and continuous.
    
    **CONFIG DICT** 
        :GRAVITY: : The gravity force. Benchmark default "9.8"
        :MASSCART: : The mass of the cart. Benchmark default "1.0"
        :MASSPOLE: : The mass of the pole. Benchmark default "0.1"
        :TOTAL_MASS: : The total mass (pole + cart). Benchmark default "1.1"
        :LENGTH: : The length of the pole. Benchmark default "0.5"
        :POLEMASS_LENGTH: : The product of the pole's mass and length (MASSPOLE * LENGTH). Benchmark default "0.05"
        :TAU: : The time step between two commands of the agent. Benchmark default "0.02"                         
        :MAXCARTPOSITION: : The maximal distance the cart is allowed to move away
                            from its start position. Benchmark default "7.5"
        :MAXPOLEANGULARPOSITION: : Maximal angle the pole is allowed to take on. Benchmark default "0.7"
        :MAXSTEPS: : The number of steps the agent must balance the pole. Benchmark default "100000"
    """
    
    DEFAULT_CONFIG_DICT = {'GRAVITY' : 9.8,    
                           'MASSCART' : 1.0,
                           'MASSPOLE' : 0.1,
                           'TOTAL_MASS' : 1.1,
                           'LENGTH' : 0.5,
                           'POLEMASS_LENGTH' : 0.05,
                           'TAU' : 0.02,                         
                           'MAXCARTPOSITION' : 7.5,
                           'MAXPOLEANGULARPOSITION' : 0.7,
                           'MAXSTEPS' : 100000}
    
    def __init__(self, useGUI, *args, **kwargs):
        
        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Single Pole Balancing",
                                               discreteActionSpace=False,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=True,
                                               stochastic=False)

        super(SinglePoleBalancingEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)
        
        #The state space of the Single Pole Balancing Simulation
        oldStyleStateSpace = {"cartPosition": ("continuous", [(-3.125, 3.125)]),
                              "cartVelocity": ("continuous", [(-0.5, 0.5)]),
                              "poleAngularPosition": ("continuous", [(-1.13, 1.13)]),
                              "poleAngularVelocity": ("continuous", [(-0.80, 0.80)]),
                              }
        
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")
        
        #The action space of the Single Pole Balancing Simulation
        oldStyleActionSpace =  {"force": ("continuous", [(-2, 2)])}
        
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        #The initial state of the simulation
        #Note that the values of this dict can be accessed directly as 
        #attributes of the class (see the __getattr__ and __setattr__ methods)
        self.initialState =  { 
                     "cartPosition": 0.0,
                     "poleAngularPosition": 0.1,
                     "cartVelocity": 0.0,
                     "poleAngularVelocity": 0.0,
                  }
        #The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)
        
        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.single_pole_balancing.environments.spb_trajectory_viewer import SPBTrajectoryViewer
            
            # Add general trajectory viewer
            VIEWERS.addViewer(lambda : TrajectoryViewer(self.stateSpace), 
                              'TrajectoryViewer')
            
            VIEWERS.addViewer(lambda : SPBTrajectoryViewer(),
                              'SPB Cart Viewer')
             
    def __setattr__(self, attrName, attrValue):
        """
        Sets the attribute with name attrName to the value attrValue.
        
        If there is no such attribute but a key with this name exists in
        self.currentState, this entry of the dictionary is updated instead.
        """
        if attrName in self.__dict__.iterkeys(): 
            self.__dict__[attrName] = attrValue
        elif attrName != 'currentState' \
             and hasattr(self,'currentState') \
             and attrName in self.currentState.iterkeys():
            self.currentState[attrName] = attrValue
        else:
            self.__dict__[attrName] = attrValue
                
    def __getattr__(self, attrName):
        """
        Returns the value of the attribute specified by attrName. If there is no such attribute,
        it checks if such an attribute is contained in the self.currentState dict.
        """
        if attrName in self.__dict__.iterkeys(): 
            return self.__dict__[attrName]
        elif attrName != 'currentState' and attrName in self.currentState.iterkeys():
            return self.currentState[attrName]
        else:
            raise AttributeError("%s object has no attribute %s" % (self.__class__.__name__, attrName))
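
    # Usage sketch of the attribute delegation above (hypothetical "env"
    # instance of this environment):
    #
    #     env.poleAngularPosition        # reads env.currentState["poleAngularPosition"]
    #     env.cartVelocity = 0.0         # writes env.currentState["cartVelocity"]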
    
    ########################## Interface Functions #####################################
    def getInitialState(self):
        """ Returns the initial state of the environment """
        return self._createStateForAgent(self.initialState)
    
    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue or end
        after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the
                               episode to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        # Remember state before executing action
        previousState = self.currentState
        
        # Determine force applied to the cart
        force = actionObject['force'] # the force desired by the agent
        force = self.actionSpace.chopContinuousAction(force) # actual force
        
        self._stateTransition(force)
        
        episodeFinished = self._checkEpisodeFinished()
        terminalState = self._createStateForAgent(self.currentState) \
                                 if episodeFinished else None
        
        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           self.stepCounter)
            self.environmentLog.info("Episode %s lasted for %s steps." 
                                     % (self.episodeCounter, self.stepCounter  + 1))
            
            self.stepCounter = 0
            self.episodeCounter += 1
            #Reset the environment to the initial state (always the same)
            self.currentState = deepcopy(self.initialState)
            
            self.trajectoryObservable.addTransition(self._createStateForAgent(previousState),
                                                    actionObject, 1,
                                                    terminalState, 
                                                    episodeTerminated=True)
        else:
            self.stepCounter += 1
            if self.stepCounter in [10000, 20000, 30000, 40000, 50000, 60000, 
                                    70000, 80000, 90000]:
                self.environmentLog.info("Balanced for %s steps!" % self.stepCounter)
            
            self.trajectoryObservable.addTransition(self._createStateForAgent(previousState),
                                                    actionObject, 1,
                                                    self._createStateForAgent(self.currentState), 
                                                    episodeTerminated=False)
        
        resultsDict = {"reward" : 1, # we always give a reward of 1
                       "terminalState" : terminalState,
                       "nextState" : self._createStateForAgent(self.currentState),
                       "startNewEpisode" : episodeFinished}
        return resultsDict
        
    ########################## Helper Functions #####################################
    def _stateTransition(self, force):
        "Update self.currentState with new values based on the current values and the force applied"
        costheta = math.cos(self.currentState["poleAngularPosition"])
        sintheta = math.sin(self.currentState["poleAngularPosition"])
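
        # Standard cart-pole dynamics (cf. Barto, Sutton & Anderson, 1983),
        # with F = force, m = MASSPOLE, M = TOTAL_MASS, l = LENGTH,
        # g = GRAVITY, theta the pole angle (m*l is stored as POLEMASS_LENGTH):
        #   temp      = (F + m*l*theta_dot^2*sin(theta)) / M
        #   theta_acc = (g*sin(theta) - cos(theta)*temp)
        #               / (l*(4/3 - m*cos^2(theta)/M))
        #   x_acc     = temp - m*l*theta_acc*cos(theta) / M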
        
        temp = (force + self.configDict["POLEMASS_LENGTH"] * self.currentState["poleAngularPosition"] * \
               self.currentState["poleAngularPosition"] * sintheta)/ self.configDict["TOTAL_MASS"]

        thetaacc = (self.configDict["GRAVITY"] * sintheta - costheta* temp)/ \
                   (self.configDict["LENGTH"] * (1.333333333333 \
                   - self.configDict["MASSPOLE"] * costheta * costheta / self.configDict["TOTAL_MASS"]))

        xacc  = temp - self.configDict["POLEMASS_LENGTH"] * thetaacc* costheta / self.configDict["TOTAL_MASS"]
        
        #Update the four state variables, using Euler's method. 
        
        self.currentState["cartPosition"] = self.currentState["cartPosition"] + self.configDict["TAU"] * self.currentState["cartVelocity"]
        self.currentState["cartVelocity"] = self.currentState["cartVelocity"] + self.configDict["TAU"] * xacc
        self.currentState["poleAngularPosition"] = self.currentState["poleAngularPosition"] + self.configDict["TAU"] * self.currentState["poleAngularVelocity"]
        self.currentState["poleAngularVelocity"] = self.currentState["poleAngularVelocity"] + self.configDict["TAU"] * thetaacc
    
    def _checkTerminalState(self):
        """
        Returns whether the simulation has reached a terminal state.
        
        A terminal state is reached if the cart or the pole exceed certain boundaries
        """
        return ((math.fabs(self.currentState["cartPosition"]) > self.configDict["MAXCARTPOSITION"]) \
                    or (math.fabs(self.currentState["poleAngularPosition"]) > self.configDict["MAXPOLEANGULARPOSITION"]))
        
    def _checkEpisodeFinished(self):
        """
        Returns whether an episode is finished. 
        
        An episode is finished if a terminal state is reached or the maximum number of steps is exceeded.
        """
        return self._checkTerminalState() or self.stepCounter >= self.configDict["MAXSTEPS"]-1
    
    def _createStateForAgent(self, state):
        """
        Creates a subset of the state which can be communicated to an agent
        """
        # Scale each state variable into roughly [-1, 1] for the agent
        stateForAgent =   {
                         "cartPosition": state['cartPosition']/2.4,
                         "cartVelocity": state['cartVelocity']/10.0,
                         "poleAngularPosition": state['poleAngularPosition']/0.62,
                         "poleAngularVelocity": state['poleAngularVelocity']/5.0,
                        }
        return stateForAgent
class PODoublePoleBalancingEnvironment(DoublePoleBalancingEnvironment):
    """ The partially observable double pole balancing environment
    
    In the partially observable double pole balancing environment, 
    the task of the agent is to control a cart such that two poles which are mounted
    on the cart stay in a nearly vertical position (to balance them). At the same 
    time, the cart has to stay in a confined region. In contrast to the fully
    observable double pole balancing environment, the agent only observes the
    current position of the cart and the two poles, but not their velocities.
    This renders the problem non-Markovian.
    
    In every time step, the agent can apply a force between -10N and 10N in
    order to accelerate the cart. Thus the action space is one-dimensional
    and continuous. The internal state consists of the cart's current position
    and velocity as well as the poles' angles and angular velocities; the
    agent, however, observes only the three position dimensions.
    
    The config dict of the environment expects the following parameters: 
    
    **CONFIG DICT** 
        :GRAVITY: : The gravity force. Benchmark default "-9.8".    
        :MASSCART: : The mass of the cart. Benchmark default "1.0".
        :TAU: : The time step between two commands of the agent. 
                Benchmark default "0.02"                         
        :MASSPOLE_1: : The mass of pole 1. Benchmark default "0.1"
        :MASSPOLE_2: : The mass of pole 2. Benchmark default "0.01"
        :LENGTH_1: : The length of pole 1. Benchmark default "0.5"
        :LENGTH_2: : The length of pole 2. Benchmark default "0.05"
        :MUP: : Coefficient of friction of the poles' hinges.
                Benchmark default "0.000002"
        :MUC: : Coefficient of friction of the cart on the track. Benchmark default "0.0005"
        :INITIALPOLEANGULARPOSITION1: : Initial angle of pole 1. 
                                        Benchmark default "4.0"
        :MAXCARTPOSITION: : The maximal distance the cart is allowed to move away
                            from its start position. Benchmark default "2.4"
        :MAXPOLEANGULARPOSITION1: : Maximal angle pole 1 is allowed to take on. 
                                    Benchmark default "36.0"
        :MAXPOLEANGULARPOSITION2: : Maximal angle pole 2 is allowed to take on. 
                                    Benchmark default "36.0"
        :MAXSTEPS: : The number of steps the agent must balance the poles. 
                     Benchmark default "100000"
    """
    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = \
            EnvironmentInfo(versionNumber="0.3",
                            environmentName="Partially Observable Double Pole Balancing",
                            discreteActionSpace=False, episodic=True,
                            continuousStateSpace=True,
                            continuousActionSpace=True, stochastic=False)

        super(PODoublePoleBalancingEnvironment, self).__init__(useGUI=useGUI,
                                                               *args,
                                                               **kwargs)

        #The state space of partially observable double pole balancing
        oldStyleStateSpace = {
            "cartPosition": ("continuous", [(-1.0, 1.0)]),
            "poleAngularPosition1": ("continuous", [(-1.0, 1.0)]),
            "poleAngularPosition2": ("continuous", [(-1.0, 1.0)])
        }

        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The names of the state dimensions that are sent to the agent.
        # NOTE: The ordering of the state dimensions is important!
        self.stateNameList = [
            "cartPosition", "poleAngularPosition1", "poleAngularPosition2"
        ]
class DoublePoleBalancingEnvironment(SingleAgentEnvironment):
    """ The double pole balancing environment
    
    In the double pole balancing environment, the task of the agent is to control
    a cart such that two poles which are mounted on the cart stay in a nearly
    vertical position (to balance them). At the same time, the cart has to stay
    in a confined region.
    
    In every time step, the agent can apply a force between -10N and 10N in
    order to accelerate the cart. Thus the action space is one-dimensional and continuous.
    The state consists of the cart's current position and velocity as well as the
    poles' angles and angular velocities. Thus, the state space is six-dimensional
    and continuous.
    
    The config dict of the environment expects the following parameters:
    
    **CONFIG DICT** 
        :GRAVITY: : The gravity force. Benchmark default "-9.8".    
        :MASSCART: : The mass of the cart. Benchmark default "1.0".
        :TAU: : The time step between two commands of the agent. 
                Benchmark default "0.02"                         
        :MASSPOLE_1: : The mass of pole 1. Benchmark default "0.1"
        :MASSPOLE_2: : The mass of pole 2. Benchmark default "0.01"
        :LENGTH_1: : The length of pole 1. Benchmark default "0.5"
        :LENGTH_2: : The length of pole 2. Benchmark default "0.05"
        :MUP: : Coefficient of friction of the poles' hinges.
                Benchmark default "0.000002"
        :MUC: : Coefficient of friction of the cart on the track. Benchmark default "0.0005"
        :INITIALPOLEANGULARPOSITION1: : Initial angle of pole 1. 
                                        Benchmark default "4.0"
        :MAXCARTPOSITION: : The maximal distance the cart is allowed to move away
                            from its start position. Benchmark default "2.4"
        :MAXPOLEANGULARPOSITION1: : Maximal angle pole 1 is allowed to take on. 
                                    Benchmark default "36.0"
        :MAXPOLEANGULARPOSITION2: : Maximal angle pole 2 is allowed to take on. 
                                    Benchmark default "36.0"
        :MAXSTEPS: : The number of steps the agent must balance the poles. 
                     Benchmark default "100000"
    """

    DEFAULT_CONFIG_DICT = {
        'GRAVITY': -9.8,
        'MASSCART': 1.0,
        'TAU': 0.02,
        'MASSPOLE_1': 0.1,
        'MASSPOLE_2': 0.01,
        'LENGTH_1': 0.5,
        'LENGTH_2': 0.05,
        'MUP': 0.000002,
        'MUC': 0.0005,
        'INITIALPOLEANGULARPOSITION1': 4.0,
        'MAXCARTPOSITION': 2.4,
        'MAXPOLEANGULARPOSITION1': 36.0,
        'MAXPOLEANGULARPOSITION2': 36.0,
        'MAXSTEPS': 100000
    }

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = \
            EnvironmentInfo(versionNumber="0.3",
                            environmentName="Double Pole Balancing",
                            discreteActionSpace=False, episodic=True,
                            continuousStateSpace=True,
                            continuousActionSpace=True, stochastic=False)

        super(DoublePoleBalancingEnvironment, self).__init__(useGUI=useGUI,
                                                             *args,
                                                             **kwargs)

        # Convert from degrees to radians
        self.configDict["INITIALPOLEANGULARPOSITION1"] *= pi / 180.0
        self.configDict['MAXPOLEANGULARPOSITION1'] *= pi / 180.0
        self.configDict['MAXPOLEANGULARPOSITION2'] *= pi / 180.0

        # The object which computes the dpb dynamics
        self.dpbDynamics = DoublePoleBalancingDynamics(self.configDict)

        #The state space of the Double Pole Balancing Simulation
        oldStyleStateSpace = {
            "cartPosition": ("continuous", [(-1.0, 1.0)]),
            "cartVelocity": ("continuous", [(-0.1, 0.1)]),
            "poleAngularPosition1": ("continuous", [(-1.0, 1.0)]),
            "poleAngularVelocity1": ("continuous", [(-0.5, 0.5)]),
            "poleAngularPosition2": ("continuous", [(-1.0, 1.0)]),
            "poleAngularVelocity2": ("continuous", [(-0.5, 0.5)])
        }
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        #The action space of the Double Pole Balancing Simulation
        oldStyleActionSpace = {"force": ("continuous", [(-10, 10)])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace,
                                          limitType="soft")

        # The names of the state dimensions that are sent to the agent.
        # NOTE: The ordering of the state dimensions is important!
        self.stateNameList = [
            "cartPosition", "cartVelocity", "poleAngularPosition1",
            "poleAngularVelocity1", "poleAngularPosition2",
            "poleAngularVelocity2"
        ]

        # The vector used for normalization of the state for the agent
        self.normalizationVector = array([
            1.0 / self.configDict['MAXCARTPOSITION'], 0.1,
            1.0 / self.configDict['MAXPOLEANGULARPOSITION1'], 0.2,
            1.0 / self.configDict['MAXPOLEANGULARPOSITION2'], 0.1
        ])
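
        # Positions are divided by their configured maxima (mapping them into
        # [-1, 1]); the fixed factors 0.1 and 0.2 damp the velocities so the
        # values sent to the agent match the declared state space bounds.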

        #The initial state of the simulation
        self.initialState = array([
            0.0, 0.0, self.configDict["INITIALPOLEANGULARPOSITION1"], 0.0, 0.0,
            0.0
        ])
        #The current state is initially set to the initial state
        self.currentState = array(self.initialState)

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer

            # Add general trajectory viewer
            VIEWERS.addViewer(lambda: TrajectoryViewer(self.stateSpace),
                              'TrajectoryViewer')

    ########################## Interface Functions ##########################
    def getInitialState(self):
        """Returns the initial state of the environment
        
        More information about (valid) states can be found in 
        :ref:`state_and_action_spaces`
        """
        return self._createStateForAgent(self.initialState)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue or end
        after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the
                               episode to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        # Remember state before executing action
        previousState = self.currentState

        force = actionObject['force']
        minForce, maxForce = self.actionSpace['force']['dimensionValues'][0]

        # Force has to be within the allowed range (minForce, maxForce)
        force = min(max(force, minForce), maxForce)

        # If the magnitude of the force is below 10/256 N, bump it to this
        # minimal level (preserving its sign)
        if fabs(force) < 10.0 / 256:
            force = 10.0 / 256 if force >= 0 else -10.0 / 256
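
        # Worked example: a requested force of 25 N is first clipped to the
        # maximum of 10 N; a request of 0.001 N falls inside the dead zone
        # and is bumped to +10/256 N (~0.039 N).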

        # Compute the successor state
        self.currentState = self.dpbDynamics.stateTransition(
            self.currentState, force)

        episodeFinished = self._checkEpisodeFinished()

        terminalState = self._createStateForAgent(self.currentState) \
                             if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           self.stepCounter)
            self.environmentLog.info(
                "Episode %s lasted for %s steps." %
                (self.episodeCounter, self.stepCounter + 1))

            self.stepCounter = 0
            self.episodeCounter += 1
            # Reset the simulation to the initial state (always the same)
            self.currentState = array(self.initialState)

            self.trajectoryObservable.addTransition(
                self._createStateForAgent(previousState),
                actionObject,
                1,
                terminalState,
                episodeTerminated=True)
        else:
            self.stepCounter += 1
            if self.stepCounter in [
                    10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000,
                    90000
            ]:
                self.environmentLog.info(
                    "Episode %s. Balanced for %s steps!" %
                    (self.episodeCounter, self.stepCounter))

            self.trajectoryObservable.addTransition(
                self._createStateForAgent(previousState),
                actionObject,
                1,
                self._createStateForAgent(self.currentState),
                episodeTerminated=False)

        resultsDict = {
            "reward": 1,  #we give always reward 1
            "terminalState": terminalState,
            "nextState": self._createStateForAgent(self.currentState),
            "startNewEpisode": episodeFinished
        }
        return resultsDict

    ########################## Helper Functions #####################################

    def _checkTerminalState(self):
        """ Returns whether the simulation has reached a terminal state.
        
        A terminal state is reached if the cart or the pole exceed certain 
        boundaries
        """
        return (
            (fabs(self.currentState[0]) > self.configDict['MAXCARTPOSITION'])
            or (fabs(self.currentState[2]) >
                self.configDict['MAXPOLEANGULARPOSITION1'])
            or (fabs(self.currentState[4]) >
                self.configDict['MAXPOLEANGULARPOSITION2']))

    def _checkEpisodeFinished(self):
        """ Returns whether an episode is finished. 
        
        An episode is finished if a terminal state is reached or the 
        maximum number of steps is exceeded.
        """
        return self._checkTerminalState() \
                or self.stepCounter >= self.configDict["MAXSTEPS"]

    def _createStateForAgent(self, state):
        """ Returns the representation of the given *state* for the agent."""
        stateForAgent = dict(
            zip(self.stateNameList, state * self.normalizationVector))
        return stateForAgent
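
    # Sketch of the normalization above (hypothetical values): with
    # MAXCARTPOSITION = 2.4, a raw state array like
    #     [1.2, 0.0, 0.1, 0.0, 0.0, 0.0]
    # is mapped to {'cartPosition': 0.5, 'cartVelocity': 0.0,
    # 'poleAngularPosition1': 0.1/MAXPOLEANGULARPOSITION1, ...} before
    # being sent to the agent.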
class MazeCliffEnvironment(SingleAgentEnvironment):
    """ The two-dimensional maze cliff environment.
    
    In this maze, there are two alternative ways from the start to the goal 
    state: a short way which leads along a dangerous cliff and a long 
    but secure way. If the agent happens to step into the cliff area, it
    gets a large negative reward (configurable via *cliffPenalty*) and is
    reset to the start state. By default, the maze is deterministic, i.e. the
    agent always moves in the direction it chooses. However, the parameter
    *stochasticity* allows controlling the stochasticity of the environment:
    for instance, when stochasticity is set to 0.01, the agent performs a
    random move instead of the chosen one with probability 0.01. 
    
    The maze structure is as follows where "S" is the start state, "G" the goal 
    state and "C" is a cliff field:
    **************
    *            *
    *            *
    *            *
    *SCCCCCCCCCCG*
    **************
    
    **CONFIG DICT**
        :cliffPenalty: : The reward an agent obtains when stepping into the cliff area
        :stochasticity: : The stochasticity of the state transition matrix. With probability 1-*stochasticity* the desired transition is made; otherwise a random transition is performed.
    
    """
    
    DEFAULT_CONFIG_DICT = {"cliffPenalty" : -100,
                           "stochasticity" : 0.0}
    
    def __init__(self, useGUI, *args, **kwargs):
        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Maze Cliff",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=False,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(MazeCliffEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)
        
        # A string which describes the structure of the maze
        # A * indicates a wall, an S the start position of the agent
        # and a G the goal. A blank indicates a free cell.
        mazeDescriptionString =  """**************
                                    *            *
                                    *            *
                                    *            *
                                    *S          G*
                                    **************
                                    """                            
                                    
        #The maze object is created from the description
        self.maze = Maze.createMazeFromString(mazeDescriptionString,
                                              cliffPenalty=self.configDict["cliffPenalty"],
                                              stochasticity=self.configDict["stochasticity"])
        
        #The state space of the Maze Cliff Simulation
        oldStyleStateSpace =   {
                                "column": ("discrete", range(self.maze.getColumns())),
                                "row": ("discrete", range(self.maze.getRows())),
                            }
        
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")
        
        #The action space of the Maze Cliff Simulation
        oldStyleActionSpace =  {
                                "action": ("discrete", ["up", "down", "left", "right"])
                            }
        
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")
        
        
        # Note: self.configDict was already populated by the base-class
        # constructor; it contains all configuration options (cliffPenalty,
        # stochasticity) which uniquely determine the behavior of this
        # environment, so it must not be reset here.
               
        #The initial state of the simulation
        self.initialState =  { 
                     "row": self.maze.getStartPosition()[0],
                     "column": self.maze.getStartPosition()[1],
                  }
        #The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)
        
        #A counter which stores the number of steps which have been performed in this episode
        self.stepCounter = 0
        self.episodeCounter = 0
        
        #The accumulated reward
        self.reward = 0.0
             
        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.maze2d.environments.maze2d_viewer import Maze2DDetailedViewer
            from mmlf.worlds.maze2d.environments.maze2d_function_viewer import Maze2DFunctionViewer
            
            # Create customized trajectory viewer
            class MazeCliffTrajectoryViewer(TrajectoryViewer):
                def __init__(self, stateSpace, plotStateSpaceStructure):
                    super(MazeCliffTrajectoryViewer, self).__init__(stateSpace)
                    plotStateSpaceStructure(self.axisTrajectory)
                
            
            VIEWERS.addViewer(lambda : \
                                MazeCliffTrajectoryViewer(self.stateSpace,
                                                          lambda ax : self.plotStateSpaceStructure(ax)), 
                              'MazeCliffTrajectoryViewer')
            
            # Add viewers for the maze world
            VIEWERS.addViewer(lambda : Maze2DDetailedViewer(self.maze,
                                                            self.stateSpace,
                                                            ["left", "right", "up", "down"]),
                              'MazeCliffDetailedViewer')
            VIEWERS.addViewer(lambda : Maze2DFunctionViewer(self.maze,
                                                            self.stateSpace),
                              'MazeCliffFunctionViewer')
    
    ########################## Interface Functions #####################################
    def getInitialState(self):
        """
        Returns the initial state of this environment
        """
        return self._createStateForAgent(self.initialState)
    
    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue or end
        after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the
                               episode to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """  
        # The state before executing the action
        previousState = dict(self.currentState)
        
        action = actionObject['action']
        
        # Execute the action which was chosen by the agent
        reward = self._stateTransition(action)
        
        self.reward += reward
           
        #Check if the episode is finished (i.e. the goal is reached)
        episodeFinished = self._checkEpisodeFinished()
        
        terminalState = self.currentState if episodeFinished else None
        
        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter, self.reward)
            self.environmentLog.info("Episode %s. Length: %s steps, "
                                     "Accumulated reward: %s."
                                         % (self.episodeCounter, 
                                            self.stepCounter+1, self.reward))
            #Reset the simulation to the initial state (always the same)
            self.stepCounter = 0
            self.reward = 0.0
            self.currentState = deepcopy(self.initialState)
            self.episodeCounter += 1
            
            self.trajectoryObservable.addTransition(previousState, action, 
                                                    reward, terminalState, 
                                                    episodeTerminated=episodeFinished)
        else:
            self.stepCounter += 1
            self.trajectoryObservable.addTransition(previousState, action, 
                                                    reward, self.currentState, 
                                                    episodeTerminated=episodeFinished)

        resultsDict = {"reward" : reward,
                       "terminalState" : terminalState,
                       "nextState" : self._createStateForAgent(self.currentState),
                       "startNewEpisode" : episodeFinished}
        return resultsDict
    
    def _stateTransition(self, action):
        "Execute the specified action and store the resulting state"
        # Determine the agent's new position and the resulting reward
        currentPos = (self.currentState['row'],self.currentState['column'])
        nextPos, reward = self.maze.tryToMove(currentPos,action)
    
        self.currentState['row'] = nextPos[0]
        self.currentState['column'] = nextPos[1]
                
        return reward
        
    def _checkEpisodeFinished(self):
        "Checks whether the episode is finished, i. e. the goal is reached"        
        currentPos = (self.currentState['row'],self.currentState['column'])
        return self.maze.isGoalReached(currentPos)
    
    def _createStateForAgent(self, state):
        "Create a state description for the agent"
        return state
    
    def plotStateSpaceStructure(self, axis):
        """ Plot structure of state space into given axis. 
        
        Just a helper function for viewers and graphic logging.
        """
        self.maze.drawIntoAxis(axis)
class Maze2dEnvironment(SingleAgentEnvironment):
    """ The two-dimensional maze environment for an agent without orientation.
    
    A 2d maze world, in which the agent is situated at each moment in time in a 
    certain field (specified by its (row,column) coordinate) and can move
    either upwards, downwards, left or right. The structure of the maze can be
    configured via a text-based config file. 

    **CONFIG DICT**
        :episodesUntilDoorChange: : Number of episodes the doors will remain in their initial state. After this number of episodes, the door state is inverted.
        :MAZE: : Name of the config file, where the maze is defined. These files are located in folder 'worlds/maze2d'
    
    """

    DEFAULT_CONFIG_DICT = {
        "episodesUntilDoorChange": 25,
        "MAZE": "maze_simple.cfg"
    }

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Maze2D",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=False,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(Maze2dEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)

        # Reading string which describes the structure of the maze
        mazeDescriptionString = open(mmlf.getRWPath() + os.sep + "config" +
                                     os.sep + "maze2d" + os.sep +
                                     self.configDict['MAZE']).read()
        # Remove comment lines and superfluous whitespace
        lines = map(lambda line: line.strip(),
                    mazeDescriptionString.split("\n"))
        lines = filter(lambda line: not line.startswith("#"), lines)
        mazeDescriptionString = "\n".join(lines)

        #The maze object is created from the description
        self.maze = Maze.createMazeFromString(mazeDescriptionString)

        #The state space of the Maze2d Simulation
        oldStyleStateSpace = {
            "column": ("discrete", range(self.maze.getColumns())),
            "row": ("discrete", range(self.maze.getRows()))
        }

        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        #The action space of the Maze2d Simulation
        oldStyleActionSpace = {
            "action": ("discrete", ["left", "right", "up", "down"])
        }

        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace,
                                          limitType="soft")

        #The initial state of the simulation
        self.initialState = {
            "row": self.maze.getStartPosition()[0],
            "column": self.maze.getStartPosition()[1]
        }
        #The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.maze2d.environments.maze2d_viewer import Maze2DDetailedViewer
            from mmlf.worlds.maze2d.environments.maze2d_function_viewer import Maze2DFunctionViewer

            # Create customized trajectory viewer
            class Maze2dTrajectoryViewer(TrajectoryViewer):
                def __init__(self, stateSpace, plotStateSpaceStructure):
                    super(Maze2dTrajectoryViewer, self).__init__(stateSpace)
                    plotStateSpaceStructure(self.axisTrajectory)


            VIEWERS.addViewer(lambda : \
                                Maze2dTrajectoryViewer(self.stateSpace,
                                                       lambda ax : self.plotStateSpaceStructure(ax)),
                              'Maze2dTrajectoryViewer')

            # Add viewers for the maze world
            VIEWERS.addViewer(
                lambda: Maze2DDetailedViewer(self.maze, self.stateSpace,
                                             ["left", "right", "up", "down"]),
                'Maze2DDetailedViewer')
            VIEWERS.addViewer(
                lambda: Maze2DFunctionViewer(self.maze, self.stateSpace),
                'Maze2DFunctionViewer')

    ########################## Interface Functions #####################################
    def getInitialState(self):
        """ Returns the initial state of the environment """
        return self._createStateForAgent(self.initialState)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue or end
        after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the
                               episode to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        # The state before executing the action
        previousState = dict(self.currentState)

        action = actionObject['action']
        # Execute the action which was chosen by the agent
        self._stateTransition(action)

        #Check if the episode is finished (i.e. the goal is reached)
        episodeFinished = self._checkEpisodeFinished()

        terminalState = self.currentState if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           -self.stepCounter)
            self.environmentLog.info(
                "Episode %s lasted for %s steps." %
                (self.episodeCounter, self.stepCounter + 1))

            self.stepCounter = 0
            self.episodeCounter += 1
            # Check whether the doors' blocking state should be switched
            if self.episodeCounter == self.configDict[
                    "episodesUntilDoorChange"]:
                self.maze.switchBlocking()

            # Reset the simulation to the initial state (always the same)
            self.currentState = deepcopy(self.initialState)

            self.trajectoryObservable.addTransition(
                previousState,
                action,
                -1,
                terminalState,
                episodeTerminated=episodeFinished)
        else:
            self.stepCounter += 1

            self.trajectoryObservable.addTransition(
                previousState,
                action,
                -1,
                self.currentState,
                episodeTerminated=episodeFinished)

        resultsDict = {
            "reward": -1,  # we always give a reward of -1
            "terminalState": terminalState,
            "nextState": self._createStateForAgent(self.currentState),
            "startNewEpisode": episodeFinished
        }
        return resultsDict

    def _stateTransition(self, action):
        "Execute the specified action and store the resulting state"
        # Determine the agent's new position in the maze
        currentPos = (self.currentState['row'], self.currentState['column'])
        nextPos = self.maze.tryToMove(currentPos, action)

        # Store the new position in the current state
        self.currentState['row'] = nextPos[0]
        self.currentState['column'] = nextPos[1]

    def _checkEpisodeFinished(self):
        "Checks whether the episode is finished, i. e. the goal is reached"
        currentPos = (self.currentState['row'], self.currentState['column'])
        return self.maze.isGoalReached(currentPos)

    def _createStateForAgent(self, state):
        "Create a state description for the agent"
        return state

    def plotStateSpaceStructure(self, axis):
        """ Plot structure of state space into given axis. 
        
        Just a helper function for viewers and graphic logging.
        """
        self.maze.drawIntoAxis(axis)
class SeventeenAndFourEnvironment(SingleAgentEnvironment):
    """ The seventeen & four environment
    
    This environment implements a simplified form of the card game seventeen & four,
    in which the agent takes the role of the player and plays against a hard-coded 
    dealer.
    
    The player initially starts with two randomly drawn cards with values of
    2, 3, 4, 7, 8, 9, 10 or 11. The goal is to get a set of cards whose sum is
    as close as possible to 21. The agent can stick with two cards or draw
    arbitrarily many additional cards sequentially. If the sum of cards
    becomes greater than 21, the agent loses and gets a reward of -1. If the
    agent stops with a card sum of at most 21, a hard-coded dealer policy
    starts playing against the agent. This dealer draws cards until it has
    either equal/more points than the agent or more than 21. In the first
    case, the dealer wins and the agent gets a reward of -1; otherwise the
    player wins and gets a reward of 0.
    """

    DEFAULT_CONFIG_DICT = {}

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="17 and 4",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=False,
                                               continuousActionSpace=False,
                                               stochastic=True)

        super(SeventeenAndFourEnvironment, self).__init__(useGUI=useGUI,
                                                          *args,
                                                          **kwargs)

        # State and action space definition
        oldStyleStateSpace = {"count": ("discrete", range(23))}

        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        oldStyleActionSpace = {"action": ("discrete", ["continue", "stop"])}

        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace,
                                          limitType="hard")

        # The available cards
        self.cards = [
            2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9,
            9, 9, 10, 10, 10, 10, 11, 11, 11, 11
        ]
        # Initialize first game
        self.getInitialState()

        # Some observables
        self.pointsObservable = \
                FloatStreamObservable(title='%s Points' % self.__class__.__name__,
                                      time_dimension_name='Episode',
                                      value_name='Points')

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.worlds.seventeen_and_four.environments.seventeen_and_four_viewers\
                        import SeventeenAndFourValuefunctionViewer
            # Add a Q-value viewer for this world
            VIEWERS.addViewer(
                lambda: SeventeenAndFourValuefunctionViewer(self.stateSpace),
                'SeventeenAndFourValuefunctionViewer')

    ########################## Interface Functions #####################################
    def getInitialState(self):
        """ Returns the initial state of the environment """
        self.remainingCards = list(self.cards)
        self.drawnCards = []
        # Player starts with two cards
        self._drawCard(self.drawnCards)
        self._drawCard(self.drawnCards)
        return self._createState()

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue,
        or end after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the episode
                               to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        if actionObject['action'] == 'stop':
            # Agent stopped
            self.episodeCounter += 1
            sumOfCards = sum(self.drawnCards)
            self.pointsObservable.addValue(self.episodeCounter, sumOfCards)
            # Determine the dealer's outcome
            dealersCards = []
            self._drawCard(dealersCards)  # Dealer starts with two cards
            self._drawCard(dealersCards)

            # Dealer draws until it has more points than the agent or busts (>= 22)
            while sum(dealersCards) <= sumOfCards and sum(dealersCards) < 22:
                self._drawCard(dealersCards)

            self.environmentLog.info(
                "Episode %s: Agent %s Dealer %s" %
                (self.episodeCounter, sumOfCards, sum(dealersCards)))
            if sum(dealersCards) > sumOfCards and sum(dealersCards) < 22:
                # Agent lost against dealer
                self.returnObservable.addValue(self.episodeCounter, -1)
                return {
                    "reward": -1,  # lost
                    "terminalState": {
                        'count': 22
                    },
                    "nextState": self.getInitialState(),
                    "startNewEpisode": True
                }
            else:
                # Agent won since the dealer's cards exceed 21
                self.returnObservable.addValue(self.episodeCounter, 0)
                return {
                    "reward": 0,  # won
                    "terminalState": {
                        'count': 22
                    },
                    "nextState": self.getInitialState(),
                    "startNewEpisode": True
                }

        # Draw a card
        self._drawCard(self.drawnCards)

        if sum(self.drawnCards) > 21:
            # Agent lost since its cards exceed 21
            self.environmentLog.info(
                "Episode %s: Agent %s" %
                (self.episodeCounter, sum(self.drawnCards)))
            self.episodeCounter += 1
            self.pointsObservable.addValue(self.episodeCounter,
                                           sum(self.drawnCards))
            self.returnObservable.addValue(self.episodeCounter, -1)
            return {
                "reward": -1,  # lost
                "terminalState": {
                    'count': 22
                },
                "nextState": self.getInitialState(),
                "startNewEpisode": True
            }
        else:
            return {
                "reward": 0,  # game still running
                "terminalState": None,
                "nextState": self._createState(),
                "startNewEpisode": False
            }

    def _createState(self):
        return {'count': min(22, sum(self.drawnCards))}

    def _drawCard(self, listOfCards):
        """ Draw a card randomly """
        card = random.choice(self.remainingCards)
        self.remainingCards.remove(card)
        listOfCards.append(card)
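
# Hedged usage sketch (not part of MMLF): a simple fixed-threshold player for
# the environment above. It draws while the card sum is below the threshold,
# then sticks; the returned reward is -1 for a loss and 0 for a win. The
# helper name and the threshold value are illustrative assumptions.
def playThresholdEpisode(env, threshold=17):
    """Play one seventeen & four episode, sticking at *threshold* points."""
    state = env.getInitialState()
    while state["count"] < threshold:
        result = env.evaluateAction({"action": "continue"})
        if result["startNewEpisode"]:  # busted: card sum exceeded 21
            return result["reward"]
        state = result["nextState"]
    # Stick and let the hard-coded dealer play out its policy
    return env.evaluateAction({"action": "stop"})["reward"]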
Example 9
class MountainCarEnvironment(SingleAgentEnvironment):
    """ The mountain car environment. 
    
    In the mountain car environment, the agent has to control a car which is
    situated somewhere in a valley between two hills. The goal of the agent
    is to reach the top of the right hill. Unfortunately, the engine of the
    car is not strong enough to reach the top of the hill directly from many
    start states. Thus, it first has to drive in the opposite direction to
    gather enough potential energy.
    
    The agent can accelerate to the left, accelerate to the right, or coast.
    Thus, the action space is discrete with three actions. The agent observes
    two continuous state components: the current position and velocity of the
    car. The start state of the car is initialised stochastically.
    
    **CONFIG DICT**
        :maxStepsPerEpisode: : The maximum number of steps the agent has to reach the goal. Benchmark default is "500".
        :accelerationFactor: : A factor that determines how strong the car's engine is relative to the slope of the hill. Benchmark default is "0.001".
        :maxGoalVelocity: : Maximum velocity the agent may have when reaching the goal. If smaller than 0.07, this effectively turns the task into MountainPark instead of MountainCar. Benchmark default is "0.07".
        :positionNoise: : Noise that is added to the agent's observation of the position. Benchmark default is "0.0".
        :velocityNoise: : Noise that is added to the agent's observation of the velocity. Benchmark default is "0.0".

    """

    DEFAULT_CONFIG_DICT = {
        'maxStepsPerEpisode': 500,
        'accelerationFactor': 0.001,
        'maxGoalVelocity': 0.07,
        'positionNoise': 0.0,
        'velocityNoise': 0.0
    }

    def __init__(self, config, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Mountain Car",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=False,
                                               stochastic=True)

        # Add value for N to config dict (required for discretization
        # in optimal policy computation)
        if "N" not in config["configDict"]:
            config["configDict"]["N"] = "50"

        super(MountainCarEnvironment, self).__init__(config,
                                                     useGUI=useGUI,
                                                     *args,
                                                     **kwargs)

        # configuration
        self.randomStarts = True

        # Some constants
        self.minPosition = -1.2  # Minimum car position
        self.maxPosition = 0.6  # Maximum car position (past goal)
        self.maxVelocity = 0.07  # Maximum velocity of car
        self.goalPosition = 0.5  # Goal position - how to tell we are done

        # If "maxGoalVelocity" is not set in configDict, set it to maximal
        # velocity
        if not "maxGoalVelocity" in self.configDict:
            self.configDict["maxGoalVelocity"] = self.maxVelocity

        # The current state of the system
        self.state = None

        # Some counters
        self.overallStepCounter = 0

        # State and action space definition
        oldStyleStateSpace = {
            "position": ("continuous", [(self.minPosition, self.maxPosition)]),
            "velocity": ("continuous", [(-self.maxVelocity, self.maxVelocity)])
        }

        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        self.actions = ["left", "right", "none"]
        oldStyleActionSpace = {"thrust": ("discrete", self.actions)}

        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace,
                                          limitType="hard")

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.mountain_car.environments.mcar_policy_viewer \
                    import MountainCarPolicyViewer
            from mmlf.worlds.mountain_car.environments.mcar_valuefunction_viewer \
                    import MountainCarValueFunctionViewer
            # Add general trajectory viewer
            VIEWERS.addViewer(lambda: TrajectoryViewer(self.stateSpace),
                              'TrajectoryViewer')
            VIEWERS.addViewer(lambda: MountainCarPolicyViewer(self.stateSpace),
                              'MountainCar PolicyViewer')
            VIEWERS.addViewer(
                lambda: MountainCarValueFunctionViewer(self.stateSpace),
                'MountainCar ValueFunctionViewer')

    ########################## Interface Functions #####################################
    def getInitialState(self):
        """ Returns the initial state of the environment
        
        More information about (valid) states can be found in 
        :ref:`state_and_action_spaces`
        """
        if self.randomStarts:  # random start state

            def randomInInterval(minVal, maxVal):
                "Returns a random number between minVal and maxVal"
                return minVal + (random.random() * (maxVal - minVal))

            position = randomInInterval(self.minPosition, self.goalPosition)
            velocity = randomInInterval(-self.maxVelocity, self.maxVelocity)
        else:  # deterministically start in (-0.5, 0.0)
            position = -0.5
            velocity = 0.0
        self.state = {"position": position, "velocity": velocity}
        return self._stateForAgent(self.state)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue,
        or end after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the episode
                               to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        # Remember state before executing action
        previousState = self.state

        # Execute the action which was chosen by the agent
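        # stateTransitionFct is a generator over (successorState, probability)
        # pairs; the mountain-car dynamics themselves are deterministic, so
        # exactly one successor is yielded, with probability 1.0.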
        self.state, prob = list(
            self.stateTransitionFct(self.state, actionObject['thrust']))[0]
        self.stepCounter += 1

        # Check if the episode is finished (goal reached or step limit exceeded)
        episodeFinished = False
        terminalState = None

        if self.isTerminalState(self.state):
            episodeFinished = True
            terminalState = self._stateForAgent(self.state)
            self.environmentLog.info(
                "Episode %s: Goal reached after %s steps." %
                (self.episodeCounter, self.stepCounter))
        elif self.stepCounter >= self.configDict["maxStepsPerEpisode"]:
            episodeFinished = True
            self.environmentLog.info(
                "Episode %s: No goal reached but %s steps expired!" %
                (self.episodeCounter, self.stepCounter))

        # Compute reward
        reward = self.rewardFct(self.state, actionObject['thrust'])

        self.trajectoryObservable.addTransition(
            self._stateForAgent(previousState),
            actionObject,
            reward,
            self._stateForAgent(self.state),
            episodeTerminated=episodeFinished)

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           -self.stepCounter)

            self.stepCounter = 0
            self.episodeCounter += 1
            # Reset the simulation to some initial state; getInitialState()
            # stores the new true state in self.state itself, so assigning
            # its (potentially noisy) return value back would corrupt it.
            self.getInitialState()

        resultsDict = {
            "reward": reward,
            "terminalState": terminalState,
            "nextState": self._stateForAgent(self.state),
            "startNewEpisode": episodeFinished
        }
        return resultsDict

    def stateTransitionFct(self, state, action):
        """ Returns iterator of the successor states of *action* in *state*."""

        # Apply the action and calculate the new position and velocity
        def minmax(item, limit1, limit2):
            "Clamps item to the interval [limit1, limit2]"
            return max(limit1, min(limit2, item))

        # Get position and velocity
        position = state["position"]
        velocity = state["velocity"]

        # Determine acceleration factor
        if action == 'left':  # action is backward thrust
            factor = -1
        elif action == 'none':  # action is coast
            factor = 0
        else:  # action is forward thrust
            factor = 1

        # Do the actual state update
        velocityChange = self.configDict["accelerationFactor"] * factor \
                                - 0.0025 * cos(3 * position)
        velocity = minmax(velocity + velocityChange, -self.maxVelocity,
                          self.maxVelocity)
        position += velocity
        position = minmax(position, self.minPosition, self.maxPosition)

        if (position <= self.minPosition) and (velocity < 0):
            velocity = 0.0

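        # If maxGoalVelocity is configured below the velocity limit (the
        # "MountainPark" variant), crossing the goal position too fast is not
        # terminal: the car bounces back with inverted velocity instead.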
        if position >= self.goalPosition \
                    and abs(velocity) > self.configDict["maxGoalVelocity"]:
            velocity = -velocity

        yield State(
            [position, velocity],
            [self.stateSpace["position"], self.stateSpace["velocity"]]), 1.0

    def rewardFct(self, state, action):
        """ Returns the reward obtained after executing *action* in *state*. """
        # We always give reward -1
        return -1

    def isTerminalState(self, state):
        """ Returns whether *state* is a terminal state. """
        # Returns whether the car has reached the goal
        return state["position"] >= self.goalPosition \
                and abs(state["velocity"]) <= self.configDict["maxGoalVelocity"]

    def _stateForAgent(self, state):
        return {
            "position":
            state["position"] +
            random.normalvariate(0.0, self.configDict["positionNoise"]),
            "velocity":
            state["velocity"] +
            random.normalvariate(0.0, self.configDict["velocityNoise"])
        }
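
# Hedged standalone sketch of the dynamics implemented in stateTransitionFct
# above (the classic mountain-car update). All names here are local
# assumptions; acc plays the role of configDict["accelerationFactor"] and
# factor is -1, 0, or +1 for the left/none/right thrust actions.
from math import cos

def mountainCarStep(position, velocity, factor, acc=0.001,
                    minPos=-1.2, maxPos=0.6, maxVel=0.07):
    """One step: v' = clip(v + acc*factor - 0.0025*cos(3p)), p' = clip(p + v')."""
    velocity = min(maxVel, max(-maxVel, velocity + acc * factor
                               - 0.0025 * cos(3 * position)))
    position = min(maxPos, max(minPos, position + velocity))
    if position <= minPos and velocity < 0:
        velocity = 0.0  # inelastic collision with the left boundary
    return position, velocity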
Example 10
class PinballMazeEnvironment(SingleAgentEnvironment):
    """ The pinball maze environment class.

    .. seealso::
        George Konidaris and Andrew G Barto
        "Skill Discovery in Continuous Reinforcement Learning Domains using Skill Chaining"
        in "Advances in Neural Information Processing Systems", 2009

    .. versionadded:: 0.9.9

    **CONFIG DICT** 
        :DRAG: : Factor that slows down the ball in each time step (multiplied with the velocity after each step)
        :NOISE: : Gaussian noise with mean MU_POS for the position [x, y] and mean MU_VEL for the velocity [xdot, ydot]; as a simplification, the covariance matrix is the identity matrix scaled by SIGMA
        :THRUST_PENALTY: : Reward the agent gains each time it accelerates the ball
        :STEP_PENALTY: : Reward the agent gains in each time step in which it does not thrust
        :END_EPISODE_REWARD: : Reward the agent gains if the ball reaches the goal
        :SUBSTEPS: : Number of dynamics substeps the environment simulates per agent action
        :MAZE: : Name of the config file in which the maze is defined. These files are located in the folder 'worlds/pinball_maze'
    """
    DEFAULT_CONFIG_DICT = {
        "DRAG": 0.995,
        "NOISE": {
            "MU_POS": [0.0, 0.0],
            "MU_VEL": [0.0, 0.0],
            "SIGMA": 0.0
        },
        "THRUST_PENALTY": -5,
        "STEP_PENALTY": -1,
        "END_EPISODE_REWARD": 10000,
        "SUBSTEPS": 20,
        "MAZE": "pinball_simple_single.cfg"
    }

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="PinballMaze",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(PinballMazeEnvironment, self).__init__(useGUI=useGUI,
                                                     *args,
                                                     **kwargs)

        # Read the maze definition from the file in this module's parent dir
        mazePath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                os.pardir, self.configDict['MAZE'])
        with open(mazePath, 'r') as mazeFile:
            mazeString = mazeFile.read()

        # The maze object is created from the description
        self.maze = PinballMaze.createMazeFromString(mazeString)

        # The state space of the pinball maze simulation
        oldStyleStateSpace = {
            "x": ("continuous", [(0.0, 1.0)]),
            "y": ("continuous", [(0.0, 1.0)]),
            "xdot": ("continuous", [(-1.0, 1.0)]),
            "ydot": ("continuous", [(-1.0, 1.0)]),
        }

        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the pinball maze simulation
        oldStyleActionSpace = {
            "action": ("discrete", ["xinc", "xdec", "yinc", "ydec", "none"])
        }

        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace,
                                          limitType="soft")

        # The current state is initially set to the initial state
        self.currentState = self.getInitialState()

        if useGUI:
            # Add viewer specific for the pinball world
            from mmlf.gui.viewers import VIEWERS
            from mmlf.worlds.pinball_maze.environments.pinball_maze_trajectory_viewer \
                        import PinballMazeTrajectoryViewer
            from mmlf.worlds.pinball_maze.environments.pinball_maze_function_viewer \
                        import PinballMazeFunctionViewer

            VIEWERS.addViewer(
                lambda: PinballMazeTrajectoryViewer(self, self.stateSpace),
                'PinballMaze TrajectoryViewer')
            VIEWERS.addViewer(
                lambda: PinballMazeFunctionViewer(self, self.stateSpace),
                'PinballMaze FunctionViewer')

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment
        
        More information about (valid) states can be found in 
        :ref:`state_and_action_spaces`
        """
        startPos = self.maze.getStartPos()
        return {"x": startPos[0], "y": startPos[1], "xdot": 0.0, "ydot": 0.0}

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.
        
        Take an actionObject containing the action of an agent, and evaluate 
        this action, calculating the next state, and the reward the agent 
        should receive for having taken this action.
        
        Additionally, decide whether the episode should continue,
        or end after the reward has been issued to the agent.
        
        This method returns a dictionary with the following keys:
           :reward: : An integer or float representing the agent's reward.
                      If reward == None, then no reward is given to the agent.
           :startNewEpisode: : True if the agent's action has caused the episode
                               to finish.
           :nextState: : A State object which contains the state the environment
                         takes on after executing the action. This might be the
                         initial state of the next episode if a new episode
                         has just started (startNewEpisode == True)
           :terminalState: : A State object which contains the terminal state 
                             of the environment in the last episode if a new 
                             episode has just started (startNewEpisode == True). 
                             Otherwise None.        
        """
        # The state before executing the action
        previousState = deepcopy(self.currentState)

        # Fetch action and do state transition and compute reward
        action = actionObject['action']
        reward, episodeFinished = self._stateTransition(action)

        self.accumulatedReward += reward

        self.trajectoryObservable.addTransition(
            previousState,
            action,
            reward,
            self.currentState,
            episodeTerminated=episodeFinished)

        terminalState = self.currentState if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           self.accumulatedReward)
            self.environmentLog.info(
                "Episode %s lasted for %s steps. Accumulated reward: %s" %
                (self.episodeCounter, self.stepCounter + 1,
                 self.accumulatedReward))

            self.stepCounter = 0
            self.accumulatedReward = 0.0
            self.episodeCounter += 1

            # Reset to the initial state
            self.currentState = self.getInitialState()
        else:
            self.stepCounter += 1
            if self.stepCounter % 250 == 0:  # Keep user up-to-date
                self.environmentLog.info(
                    "Episode %s: Agent active for %s steps "
                    "without reaching goal yet." %
                    (self.episodeCounter, self.stepCounter))

        resultsDict = {
            "reward": reward,
            "terminalState": terminalState,
            "nextState": self._createStateForAgent(self.currentState),
            "startNewEpisode": episodeFinished
        }
        return resultsDict

    def _stateTransition(self, action):
        # Determine action effect
        if action == "xinc":
            self.currentState["xdot"] = min(self.currentState["xdot"] + 0.2,
                                            1.0)
        elif action == "xdec":
            self.currentState["xdot"] = max(self.currentState["xdot"] - 0.2,
                                            -1.0)
        elif action == "yinc":
            self.currentState["ydot"] = min(self.currentState["ydot"] + 0.2,
                                            1.0)
        elif action == "ydec":
            self.currentState["ydot"] = max(self.currentState["ydot"] - 0.2,
                                            -1.0)

        # Do state transition, split into SUBSTEPS substeps in order to deal
        # with collisions on a more fine-granular basis.
        for j in range(self.configDict["SUBSTEPS"]):
            # Compute next would-be position
            factor = self.maze.ballRadius / self.configDict["SUBSTEPS"]
            posX = self.currentState["x"] + self.currentState["xdot"] * factor
            posY = self.currentState["y"] + self.currentState["ydot"] * factor

            # Check for collision with obstacle
            collided, obstacle = self.maze.collide([posX, posY])
            if collided:
                # Determine collision effect
                self.currentState["xdot"], self.currentState["ydot"] = \
                    self.maze.collisionEffect(oldPos=(self.currentState["x"],
                                                      self.currentState["y"]),
                                              newPos=(posX, posY),
                                              vel=(self.currentState["xdot"],
                                                   self.currentState["ydot"]),
                                              obstacle=obstacle)
            else:
                # No collision, go to would-be position
                self.currentState["x"] = posX
                self.currentState["y"] = posY

            # Check if target reached
            if self.maze.goalReached(pos=(self.currentState["x"],
                                          self.currentState["y"])):
                return self.configDict["END_EPISODE_REWARD"], True

        # Apply drag
        self.currentState["xdot"] *= self.configDict["DRAG"]
        self.currentState["ydot"] *= self.configDict["DRAG"]

        if action == "none":
            return self.configDict["STEP_PENALTY"], False
        else:
            return self.configDict["THRUST_PENALTY"], False

    def _createStateForAgent(self, state):
        "Create a state description for the agent"
        mu_pos = self.configDict["NOISE"]["MU_POS"]
        mu_vel = self.configDict["NOISE"]["MU_VEL"]
        sigma = self.configDict["NOISE"]["SIGMA"]
        # Copy the state so that adding observation noise does not perturb
        # the true environment state
        noisyState = dict(state)
        noisyState['x'] += random.gauss(mu_pos[0], sigma)
        noisyState['y'] += random.gauss(mu_pos[1], sigma)
        noisyState['xdot'] += random.gauss(mu_vel[0], sigma)
        noisyState['ydot'] += random.gauss(mu_vel[1], sigma)
        return noisyState

    def plotStateSpaceStructure(self, axis):
        """ Plot structure of state space into given axis. 
        
        Just a helper function for viewers and graphic logging.
        """
        self.maze.draw(axis)
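
# Hedged sketch (illustrative, not part of MMLF) of the substep integration
# used in _stateTransition above: each agent action is integrated in SUBSTEPS
# increments scaled by ballRadius / SUBSTEPS, so per substep the ball moves
# only a fraction of its radius and cannot tunnel through thin obstacles
# between collision checks. collide stands in for the maze's collision test.
def integrateSubsteps(pos, vel, ballRadius, substeps, collide):
    """Advance *pos* along *vel* in small increments, stopping on collision."""
    factor = ballRadius / float(substeps)
    for _ in range(substeps):
        candidate = (pos[0] + vel[0] * factor, pos[1] + vel[1] * factor)
        if collide(candidate):  # caller-provided collision predicate
            break
        pos = candidate
    return pos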