    def __init__(self, useGUI, *args, **kwargs):
        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="PinballMaze",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(PinballMazeEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)

        mazeString = open(os.path.dirname(os.path.abspath(__file__))
                          + os.sep + os.pardir + os.sep
                          + self.configDict['MAZE'], 'r').read()

        # The maze object is created from the description
        self.maze = PinballMaze.createMazeFromString(mazeString)

        # The state space of the pinball maze simulation
        oldStyleStateSpace = {"x": ("continuous", [(0.0, 1.0)]),
                              "y": ("continuous", [(0.0, 1.0)]),
                              "xdot": ("continuous", [(-1.0, 1.0)]),
                              "ydot": ("continuous", [(-1.0, 1.0)])}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the pinball maze simulation
        oldStyleActionSpace = {"action": ("discrete",
                                          ["xinc", "xdec", "yinc", "ydec", "none"])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # The current state is initially set to the initial state
        self.currentState = self.getInitialState()

        if useGUI:
            # Add viewers specific to the pinball world
            from mmlf.gui.viewers import VIEWERS
            from mmlf.worlds.pinball_maze.environments.pinball_maze_trajectory_viewer \
                import PinballMazeTrajectoryViewer
            from mmlf.worlds.pinball_maze.environments.pinball_maze_function_viewer \
                import PinballMazeFunctionViewer

            VIEWERS.addViewer(lambda: PinballMazeTrajectoryViewer(self, self.stateSpace),
                              'PinballMaze TrajectoryViewer')
            VIEWERS.addViewer(lambda: PinballMazeFunctionViewer(self, self.stateSpace),
                              'PinballMaze FunctionViewer')
def getDiscreteStateSpace(N):
    """ Return discrete state space for mountain car with N values per dimension. """
    from mmlf.framework.spaces import StateSpace

    stateSpaceDict = {"position": ("discrete", numpy.linspace(-1.2, 0.6, N)),
                      "velocity": ("discrete", numpy.linspace(-0.07, 0.07, N))}
    stateSpace = StateSpace()
    stateSpace.addOldStyleSpace(stateSpaceDict, limitType="soft")

    return stateSpace
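# Illustrative sketch (standalone, not part of the framework code): the helper
# above lays an N x N grid over the continuous mountain car state space. The
# snippet below only shows the grid values produced by numpy.linspace for a
# small N; it does not depend on mmlf and makes no assumptions about
# StateSpace internals.
import numpy

N = 5
positions = numpy.linspace(-1.2, 0.6, N)     # [-1.2, -0.75, -0.3, 0.15, 0.6]
velocities = numpy.linspace(-0.07, 0.07, N)  # [-0.07, -0.035, 0.0, 0.035, 0.07]
# Each of the N*N (position, velocity) pairs is one discrete state.
print(len(positions) * len(velocities))      # 25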
class LinearMarkovChainEnvironment(SingleAgentEnvironment):
    """ A linear Markov chain.

    The agent starts in the middle of this linear Markov chain. It can move
    either left or right. The chain is not stochastic: when the agent moves
    right, the state index is increased by 1 with probability 1; when it moves
    left, the state index is decreased by 1 accordingly.

    .. versionadded:: 0.9.10
       Added LinearMarkovChain environment

    **CONFIG DICT**

    :length: : The number of states of the linear Markov chain
    """

    # Add default configuration for this environment to this static dict
    # This specific parameter controls how long the linear Markov chain is
    # (i.e. how many states there are)
    DEFAULT_CONFIG_DICT = {"length": 21}

    def __init__(self, useGUI, *args, **kwargs):
        # Create the environment info
        self.environmentInfo = \
            EnvironmentInfo(# Which communication protocol version can the
                            # environment handle?
                            versionNumber="0.3",
                            # Name of the environment (can be chosen arbitrarily)
                            environmentName="LinearMarkovChain",
                            # Is the action space of this environment discrete?
                            discreteActionSpace=True,
                            # Is the environment episodic?
                            episodic=True,
                            # Is the state space of environment continuous?
                            continuousStateSpace=False,
                            # Is the action space of environment continuous?
                            continuousActionSpace=False,
                            # Is the environment stochastic?
                            stochastic=False)

        # Calls constructor of base class.
        # After this call, the environment has an attribute "self.configDict".
        # The values of this dict are evaluated, i.e. instead of '100' (string),
        # the key 'length' will have the value 100 (int).
        super(LinearMarkovChainEnvironment, self).__init__(useGUI=useGUI,
                                                           *args, **kwargs)

        # The state space of the linear Markov chain
        oldStyleStateSpace = {"field": ("discrete",
                                        range(self.configDict["length"]))}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the linear Markov chain
        oldStyleActionSpace = {"action": ("discrete", ["left", "right"])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # The initial state of the environment
        self.initialState = {"field": self.configDict["length"] / 2}
        # The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment """
        self.environmentLog.debug("Episode starts in state '%s'."
                                  % (self.initialState['field']))
        return self.initialState

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:

        :rewardValue: : An integer or float representing the agent's reward.
                        If rewardValue == None, then no reward is given to the agent.
        :startNewEpisode: : True if the agent's action has caused an episode
                            to get finished.
        :nextState: : A State object which contains the state the environment
                      takes on after executing the action. This might be the
                      initial state of the next episode if a new episode has
                      just started (startNewEpisode == True)
        :terminalState: : A State object which contains the terminal state of
                          the environment in the last episode if a new episode
                          has just started (startNewEpisode == True).
                          Otherwise None.
        """
        action = actionObject['action']
        previousState = self.currentState['field']

        # Change state of environment deterministically
        if action == 'left':
            self.currentState['field'] -= 1
        else:
            self.currentState['field'] += 1

        self.environmentLog.debug("Agent chose action '%s' which caused a "
                                  "transition from '%s' to '%s'."
                                  % (action, previousState,
                                     self.currentState['field']))

        # Check if the episode is finished (i.e. a chain end is reached)
        episodeFinished = self._checkEpisodeFinished()

        terminalState = self.currentState if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           -self.stepCounter)
            self.environmentLog.debug("Terminal state '%s' reached."
                                      % self.currentState['field'])
            self.environmentLog.info("Episode %s lasted for %s steps."
                                     % (self.episodeCounter,
                                        self.stepCounter + 1))

            reward = 10 if self.currentState['field'] != 0 else -10

            self.stepCounter = 0
            self.episodeCounter += 1

            # Reset the simulation to the initial state (always the same)
            self.currentState = deepcopy(self.initialState)
        else:
            reward = -1
            self.stepCounter += 1

        resultsDict = {"reward": reward,
                       "terminalState": terminalState,
                       "nextState": self.currentState,
                       "startNewEpisode": episodeFinished}
        return resultsDict

    def _checkEpisodeFinished(self):
        """ Checks whether the episode is finished.

        An episode is finished whenever the leftmost or rightmost state of the
        chain is reached.
        """
        return self.currentState['field'] in [0, self.configDict['length'] - 1]
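# Illustrative sketch (standalone, not part of the MMLF code base): a plain
# Python random walk on the same chain, reproducing the reward scheme of
# evaluateAction above (-1 per ordinary step, +10 at the rightmost field,
# -10 at the leftmost field). The direct re-implementation below is only for
# illustration; inside the framework the environment is driven by an MMLF agent.
import random

def sampleEpisodeReturn(length=21):
    """ Return the undiscounted return of one episode under a random policy. """
    field = length // 2                      # start in the middle of the chain
    episodeReturn = 0
    while field not in (0, length - 1):      # terminal states: both chain ends
        field += 1 if random.choice(["left", "right"]) == "right" else -1
        if field == length - 1:
            episodeReturn += 10              # reached the rightmost field
        elif field == 0:
            episodeReturn += -10             # reached the leftmost field
        else:
            episodeReturn += -1              # ordinary step
    return episodeReturn

print(sampleEpisodeReturn())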
class DoublePoleBalancingEnvironment(SingleAgentEnvironment):
    """ The double pole balancing environment

    In the double pole balancing environment, the task of the agent is to
    control a cart such that two poles which are mounted on the cart stay in a
    nearly vertical position (to balance them). At the same time, the cart has
    to stay in a confined region.

    The agent can apply in every time step a force between -10N and 10N in
    order to accelerate the cart. Thus the action space is one-dimensional and
    continuous. The state consists of the cart's current position and velocity
    as well as the poles' angles and angular velocities. Thus, the state space
    is six-dimensional and continuous.

    The config dict of the environment expects the following parameters:

    **CONFIG DICT**

    :GRAVITY: : The gravity force. Benchmark default "-9.8".
    :MASSCART: : The mass of the cart. Benchmark default "1.0".
    :TAU: : The time step between two commands of the agent.
            Benchmark default "0.02"
    :MASSPOLE_1: : The mass of pole 1. Benchmark default "0.1"
    :MASSPOLE_2: : The mass of pole 2. Benchmark default "0.01"
    :LENGTH_1: : The length of pole 1. Benchmark default "0.5"
    :LENGTH_2: : The length of pole 2. Benchmark default "0.05"
    :MUP: : Coefficient of friction of the poles' hinges.
            Benchmark default "0.000002"
    :MUC: : Coefficient that controls friction. Benchmark default "0.0005"
    :INITIALPOLEANGULARPOSITION1: : Initial angle of pole 1.
                                    Benchmark default "4.0"
    :MAXCARTPOSITION: : The maximal distance the cart is allowed to move away
                        from its start position. Benchmark default "2.4"
    :MAXPOLEANGULARPOSITION1: : Maximal angle pole 1 is allowed to take on.
                                Benchmark default "36.0"
    :MAXPOLEANGULARPOSITION2: : Maximal angle pole 2 is allowed to take on.
                                Benchmark default "36.0"
    :MAXSTEPS: : The number of steps the agent must balance the poles.
                 Benchmark default "100000"
    """

    DEFAULT_CONFIG_DICT = {'GRAVITY': -9.8,
                           'MASSCART': 1.0,
                           'TAU': 0.02,
                           'MASSPOLE_1': 0.1,
                           'MASSPOLE_2': 0.1,
                           'LENGTH_1': 0.5,
                           'LENGTH_2': 0.05,
                           'MUP': 0.000002,
                           'MUC': 0.0005,
                           'INITIALPOLEANGULARPOSITION1': 4.0,
                           'MAXCARTPOSITION': 2.4,
                           'MAXPOLEANGULARPOSITION1': 36.0,
                           'MAXPOLEANGULARPOSITION2': 36.0,
                           'MAXSTEPS': 100000}

    def __init__(self, useGUI, *args, **kwargs):
        self.environmentInfo = \
            EnvironmentInfo(versionNumber="0.3",
                            environmentName="Double Pole Balancing",
                            discreteActionSpace=False,
                            episodic=True,
                            continuousStateSpace=True,
                            continuousActionSpace=True,
                            stochastic=False)

        super(DoublePoleBalancingEnvironment, self).__init__(useGUI=useGUI,
                                                             *args, **kwargs)

        # Convert from degrees to radians
        self.configDict["INITIALPOLEANGULARPOSITION1"] *= pi / 180.0
        self.configDict['MAXPOLEANGULARPOSITION1'] *= pi / 180.0
        self.configDict['MAXPOLEANGULARPOSITION2'] *= pi / 180.0

        # The object which computes the dpb dynamics
        self.dpbDynamics = DoublePoleBalancingDynamics(self.configDict)

        # The state space of the Double Pole Balancing Simulation
        oldStyleStateSpace = {"cartPosition": ("continuous", [(-1.0, 1.0)]),
                              "cartVelocity": ("continuous", [(-0.1, 0.1)]),
                              "poleAngularPosition1": ("continuous", [(-1.0, 1.0)]),
                              "poleAngularVelocity1": ("continuous", [(-0.5, 0.5)]),
                              "poleAngularPosition2": ("continuous", [(-1.0, 1.0)]),
                              "poleAngularVelocity2": ("continuous", [(-0.5, 0.5)])}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the Double Pole Balancing Simulation
        oldStyleActionSpace = {"force": ("continuous", [(-10, 10)])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # The names of the state dimensions that are sent to the agent.
        # NOTE: The ordering of the state dimensions is important!
        self.stateNameList = ["cartPosition", "cartVelocity",
                              "poleAngularPosition1", "poleAngularVelocity1",
                              "poleAngularPosition2", "poleAngularVelocity2"]

        # The vector used for normalization of the state for the agent
        self.normalizationVector = array([1.0 / self.configDict['MAXCARTPOSITION'],
                                          0.1,
                                          1.0 / self.configDict['MAXPOLEANGULARPOSITION1'],
                                          0.2,
                                          1.0 / self.configDict['MAXPOLEANGULARPOSITION2'],
                                          0.1])

        # The initial state of the simulation
        self.initialState = array([0.0, 0.0,
                                   self.configDict["INITIALPOLEANGULARPOSITION1"],
                                   0.0, 0.0, 0.0])
        # The current state is initially set to the initial state
        self.currentState = array(self.initialState)

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer

            # Add general trajectory viewer
            VIEWERS.addViewer(lambda: TrajectoryViewer(self.stateSpace),
                              'TrajectoryViewer')

    ########################## Interface Functions ##########################

    def getInitialState(self):
        """ Returns the initial state of the environment

        More information about (valid) states can be found in
        :ref:`state_and_action_spaces`
        """
        return self._createStateForAgent(self.initialState)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:

        :rewardValue: : An integer or float representing the agent's reward.
                        If rewardValue == None, then no reward is given to the agent.
        :startNewEpisode: : True if the agent's action has caused an episode
                            to get finished.
        :nextState: : A State object which contains the state the environment
                      takes on after executing the action. This might be the
                      initial state of the next episode if a new episode has
                      just started (startNewEpisode == True)
        :terminalState: : A State object which contains the terminal state of
                          the environment in the last episode if a new episode
                          has just started (startNewEpisode == True).
                          Otherwise None.
        """
        # Remember state before executing action
        previousState = self.currentState

        force = actionObject['force']
        minForce, maxForce = self.actionSpace['force']['dimensionValues'][0]
        # Force has to be within the allowed range (minForce, maxForce)
        force = min(max(force, minForce), maxForce)
        # If force is less than +/- 1/256 * 10N we set it to this level
        if fabs(force) < 10.0 / 256:
            force = 10.0 / 256 if force >= 0 else -10.0 / 256

        # Compute the successor state
        self.currentState = self.dpbDynamics.stateTransition(self.currentState,
                                                             force)

        episodeFinished = self._checkEpisodeFinished()
        terminalState = self._createStateForAgent(self.currentState) \
            if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           self.stepCounter)
            self.environmentLog.info("Episode %s lasted for %s steps."
                                     % (self.episodeCounter,
                                        self.stepCounter + 1))

            self.stepCounter = 0
            self.episodeCounter += 1
            # Reset the simulation to the initial state (always the same)
            self.currentState = array(self.initialState)

            self.trajectoryObservable.addTransition(
                self._createStateForAgent(previousState), actionObject,
                1, terminalState, episodeTerminated=True)
        else:
            self.stepCounter += 1
            if self.stepCounter in [10000, 20000, 30000, 40000, 50000,
                                    60000, 70000, 80000, 90000]:
                self.environmentLog.info("Episode %s. Balanced for %s steps!"
                                         % (self.episodeCounter,
                                            self.stepCounter))

            self.trajectoryObservable.addTransition(
                self._createStateForAgent(previousState), actionObject,
                1, self._createStateForAgent(self.currentState),
                episodeTerminated=False)

        resultsDict = {"reward": 1,  # we always give reward 1
                       "terminalState": terminalState,
                       "nextState": self._createStateForAgent(self.currentState),
                       "startNewEpisode": episodeFinished}
        return resultsDict

    ########################## Helper Functions #####################################

    def _checkTerminalState(self):
        """ Returns whether the simulation has reached a terminal state.

        A terminal state is reached if the cart or the poles exceed certain
        boundaries.
        """
        return ((fabs(self.currentState[0]) > self.configDict['MAXCARTPOSITION'])
                or (fabs(self.currentState[2]) > self.configDict['MAXPOLEANGULARPOSITION1'])
                or (fabs(self.currentState[4]) > self.configDict['MAXPOLEANGULARPOSITION2']))

    def _checkEpisodeFinished(self):
        """ Returns whether an episode is finished.

        An episode is finished if a terminal state is reached or the maximum
        number of steps is exceeded.
        """
        return self._checkTerminalState() \
            or self.stepCounter >= self.configDict["MAXSTEPS"]

    def _createStateForAgent(self, state):
        """ Returns the representation of the given *state* for the agent. """
        stateForAgent = dict(zip(self.stateNameList,
                                 state * self.normalizationVector))
        return stateForAgent
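# Illustrative sketch (standalone, not part of the framework): how
# _createStateForAgent above maps the raw 6-dimensional state vector onto the
# named, normalized observation sent to the agent. The concrete numbers are
# the benchmark defaults; inside the environment the normalization vector is
# built from configDict instead of being hard-coded as it is here.
from math import pi
from numpy import array

stateNameList = ["cartPosition", "cartVelocity",
                 "poleAngularPosition1", "poleAngularVelocity1",
                 "poleAngularPosition2", "poleAngularVelocity2"]
normalizationVector = array([1.0 / 2.4, 0.1,
                             1.0 / (36.0 * pi / 180.0), 0.2,
                             1.0 / (36.0 * pi / 180.0), 0.1])

rawState = array([0.0, 0.0, 4.0 * pi / 180.0, 0.0, 0.0, 0.0])  # initial state
observation = dict(zip(stateNameList, rawState * normalizationVector))
# e.g. observation["poleAngularPosition1"] == 4.0/36.0, roughly 0.111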
class PODoublePoleBalancingEnvironment(DoublePoleBalancingEnvironment):
    """ The partially observable double pole balancing environment

    In the partially observable double pole balancing environment, the task of
    the agent is to control a cart such that two poles which are mounted on
    the cart stay in a nearly vertical position (to balance them). At the same
    time, the cart has to stay in a confined region.

    In contrast to the fully observable double pole balancing environment, the
    agent only observes the current position of the cart and the two poles but
    not their velocities. This renders the problem non-Markovian.

    The agent can apply in every time step a force between -10N and 10N in
    order to accelerate the cart. Thus the action space is one-dimensional and
    continuous. The observation sent to the agent consists of the cart's
    current position and the two poles' angles. Thus, the observed state space
    is three-dimensional and continuous.

    The config dict of the environment expects the following parameters:

    **CONFIG DICT**

    :GRAVITY: : The gravity force. Benchmark default "-9.8".
    :MASSCART: : The mass of the cart. Benchmark default "1.0".
    :TAU: : The time step between two commands of the agent.
            Benchmark default "0.02"
    :MASSPOLE_1: : The mass of pole 1. Benchmark default "0.1"
    :MASSPOLE_2: : The mass of pole 2. Benchmark default "0.01"
    :LENGTH_1: : The length of pole 1. Benchmark default "0.5"
    :LENGTH_2: : The length of pole 2. Benchmark default "0.05"
    :MUP: : Coefficient of friction of the poles' hinges.
            Benchmark default "0.000002"
    :MUC: : Coefficient that controls friction. Benchmark default "0.0005"
    :INITIALPOLEANGULARPOSITION1: : Initial angle of pole 1.
                                    Benchmark default "4.0"
    :MAXCARTPOSITION: : The maximal distance the cart is allowed to move away
                        from its start position. Benchmark default "2.4"
    :MAXPOLEANGULARPOSITION1: : Maximal angle pole 1 is allowed to take on.
                                Benchmark default "36.0"
    :MAXPOLEANGULARPOSITION2: : Maximal angle pole 2 is allowed to take on.
                                Benchmark default "36.0"
    :MAXSTEPS: : The number of steps the agent must balance the poles.
                 Benchmark default "100000"
    """

    def __init__(self, useGUI, *args, **kwargs):
        self.environmentInfo = \
            EnvironmentInfo(versionNumber="0.3",
                            environmentName="Partially Observable Double Pole Balancing",
                            discreteActionSpace=False,
                            episodic=True,
                            continuousStateSpace=True,
                            continuousActionSpace=True,
                            stochastic=False)

        super(PODoublePoleBalancingEnvironment, self).__init__(useGUI=useGUI,
                                                               *args, **kwargs)

        # The state space of partially observable double pole balancing
        oldStyleStateSpace = {"cartPosition": ("continuous", [(-1.0, 1.0)]),
                              "poleAngularPosition1": ("continuous", [(-1.0, 1.0)]),
                              "poleAngularPosition2": ("continuous", [(-1.0, 1.0)])}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The names of the state dimensions that are sent to the agent.
        # NOTE: The ordering of the state dimensions is important!
        self.stateNameList = ["cartPosition",
                              "poleAngularPosition1",
                              "poleAngularPosition2"]
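# Illustrative sketch (standalone): partial observability here simply means
# that only three of the six state dimensions are exposed to the agent. The
# projection below mirrors the shortened stateNameList above; it is an
# illustration only and does not reproduce the parent class' normalization.
fullState = {"cartPosition": 0.05, "cartVelocity": 0.01,
             "poleAngularPosition1": 0.11, "poleAngularVelocity1": -0.02,
             "poleAngularPosition2": 0.0, "poleAngularVelocity2": 0.03}

observedNames = ["cartPosition", "poleAngularPosition1", "poleAngularPosition2"]
observation = {name: fullState[name] for name in observedNames}
# The velocities are hidden, so the agent needs consecutive observations to
# estimate them -- this is what makes the task non-Markovian for the agent.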
class Maze2dEnvironment(SingleAgentEnvironment):
    """ The two-dimensional maze environment for an agent without orientation.

    A 2d maze world, in which the agent is situated at each moment in time in
    a certain field (specified by its (row, column) coordinate) and can move
    either upwards, downwards, left or right. The structure of the maze can be
    configured via a text-based config file.

    **CONFIG DICT**

    :episodesUntilDoorChange: : Episodes that the doors will remain in their
                                initial state. After this number of episodes,
                                the door state is inverted.
    :MAZE: : Name of the config file, where the maze is defined. These files
             are located in folder 'worlds/maze2d'
    """

    DEFAULT_CONFIG_DICT = {"episodesUntilDoorChange": 25,
                           "MAZE": "maze_simple.cfg"}

    def __init__(self, useGUI, *args, **kwargs):
        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Maze2D",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=False,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(Maze2dEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)

        # Read the string which describes the structure of the maze
        mazeDescriptionString = open(mmlf.getRWPath() + os.sep + "config"
                                     + os.sep + "maze2d" + os.sep
                                     + self.configDict['MAZE']).read()

        # Remove comment lines and superfluous whitespace
        lines = map(lambda line: line.strip(), mazeDescriptionString.split("\n"))
        lines = filter(lambda line: not line.startswith("#"), lines)
        mazeDescriptionString = "\n".join(lines)

        # The maze object is created from the description
        self.maze = Maze.createMazeFromString(mazeDescriptionString)

        # The state space of the Maze2d simulation
        oldStyleStateSpace = {"column": ("discrete", range(self.maze.getColumns())),
                              "row": ("discrete", range(self.maze.getRows()))}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the Maze2d simulation
        oldStyleActionSpace = {"action": ("discrete",
                                          ["left", "right", "up", "down"])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # The initial state of the simulation
        self.initialState = {"row": self.maze.getStartPosition()[0],
                             "column": self.maze.getStartPosition()[1]}
        # The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.maze2d.environments.maze2d_viewer import Maze2DDetailedViewer
            from mmlf.worlds.maze2d.environments.maze2d_function_viewer import Maze2DFunctionViewer

            # Create customized trajectory viewer
            class Maze2dTrajectoryViewer(TrajectoryViewer):
                def __init__(self, stateSpace, plotStateSpaceStructure):
                    super(Maze2dTrajectoryViewer, self).__init__(stateSpace)
                    plotStateSpaceStructure(self.axisTrajectory)

            VIEWERS.addViewer(lambda: Maze2dTrajectoryViewer(
                                  self.stateSpace,
                                  lambda ax: self.plotStateSpaceStructure(ax)),
                              'Maze2dTrajectoryViewer')

            # Add viewers for the maze world
            VIEWERS.addViewer(lambda: Maze2DDetailedViewer(self.maze,
                                                           self.stateSpace,
                                                           ["left", "right", "up", "down"]),
                              'Maze2DDetailedViewer')
            VIEWERS.addViewer(lambda: Maze2DFunctionViewer(self.maze,
                                                           self.stateSpace),
                              'Maze2DFunctionViewer')

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment """
        return self._createStateForAgent(self.initialState)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:

        :rewardValue: : An integer or float representing the agent's reward.
                        If rewardValue == None, then no reward is given to the agent.
        :startNewEpisode: : True if the agent's action has caused an episode
                            to get finished.
        :nextState: : A State object which contains the state the environment
                      takes on after executing the action. This might be the
                      initial state of the next episode if a new episode has
                      just started (startNewEpisode == True)
        :terminalState: : A State object which contains the terminal state of
                          the environment in the last episode if a new episode
                          has just started (startNewEpisode == True).
                          Otherwise None.
        """
        # The state before executing the action
        previousState = dict(self.currentState)

        action = actionObject['action']

        # Execute the action which was chosen by the agent
        self._stateTransition(action)

        # Check if the episode is finished (i.e. the goal is reached)
        episodeFinished = self._checkEpisodeFinished()

        terminalState = self.currentState if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           -self.stepCounter)
            self.environmentLog.info("Episode %s lasted for %s steps."
                                     % (self.episodeCounter,
                                        self.stepCounter + 1))

            self.stepCounter = 0
            self.episodeCounter += 1

            # Check if the maze doors should be switched
            if self.episodeCounter == self.configDict["episodesUntilDoorChange"]:
                self.maze.switchBlocking()

            # Reset the simulation to the initial state (always the same)
            self.currentState = deepcopy(self.initialState)

            self.trajectoryObservable.addTransition(previousState, action,
                                                    -1, terminalState,
                                                    episodeTerminated=episodeFinished)
        else:
            self.stepCounter += 1
            self.trajectoryObservable.addTransition(previousState, action,
                                                    -1, self.currentState,
                                                    episodeTerminated=episodeFinished)

        resultsDict = {"reward": -1,  # we always give a reward of -1
                       "terminalState": terminalState,
                       "nextState": self._createStateForAgent(self.currentState),
                       "startNewEpisode": episodeFinished}
        return resultsDict

    def _stateTransition(self, action):
        """ Execute the specified action and store the resulting state """
        currentPos = (self.currentState['row'], self.currentState['column'])
        nextPos = self.maze.tryToMove(currentPos, action)
        self.currentState['row'] = nextPos[0]
        self.currentState['column'] = nextPos[1]

    def _checkEpisodeFinished(self):
        """ Checks whether the episode is finished, i.e. the goal is reached """
        currentPos = (self.currentState['row'], self.currentState['column'])
        return self.maze.isGoalReached(currentPos)

    def _createStateForAgent(self, state):
        """ Create a state description for the agent """
        return state

    def plotStateSpaceStructure(self, axis):
        """ Plot structure of state space into given axis.

        Just a helper function for viewers and graphic logging.
        """
        self.maze.drawIntoAxis(axis)
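# Illustrative sketch (standalone, hypothetical helper): how a text-based maze
# description of the kind read above can be turned into wall/start/goal
# information. This is *not* the actual Maze.createMazeFromString
# implementation, only a minimal stand-in that shows the file format:
# '*' marks a wall, 'S' the start field, 'G' the goal, ' ' a free field.
def parseMazeString(mazeString):
    grid = [list(line) for line in mazeString.splitlines() if line]
    start = goal = None
    for row, line in enumerate(grid):
        for column, char in enumerate(line):
            if char == 'S':
                start = (row, column)
            elif char == 'G':
                goal = (row, column)
    return grid, start, goal

exampleMaze = ("*****\n"
               "*S G*\n"
               "*****")
grid, start, goal = parseMazeString(exampleMaze)
print("start=%s goal=%s" % (start, goal))   # start=(1, 1) goal=(1, 3)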
class SeventeenAndFourEnvironment(SingleAgentEnvironment):
    """ The seventeen & four environment

    This environment implements a simplified form of the card game seventeen &
    four, in which the agent takes the role of the player and plays against a
    hard-coded dealer.

    The player starts initially with two randomly drawn cards with values of
    2, 3, 4, 7, 8, 9, 10 or 11. The goal is to get a set of cards whose sum is
    as close as possible to 21. The agent can stick with two cards or draw
    arbitrarily many cards sequentially. If the sum of cards becomes greater
    than 21, the agent loses and gets a reward of -1. If the agent stops with
    a card sum of less than 22, a hard-coded dealer policy starts playing
    against the agent. This dealer draws cards until it has either equal/more
    points than the agent or more than 21. In the first case, the dealer wins
    and the agent gets a reward of -1; otherwise the player wins and gets a
    reward of 0.
    """

    DEFAULT_CONFIG_DICT = {}

    def __init__(self, useGUI, *args, **kwargs):
        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="17 and 4",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=False,
                                               continuousActionSpace=False,
                                               stochastic=True)

        super(SeventeenAndFourEnvironment, self).__init__(useGUI=useGUI,
                                                          *args, **kwargs)

        # State and action space definition
        oldStyleStateSpace = {"count": ("discrete", range(23))}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        oldStyleActionSpace = {"action": ("discrete", ["continue", "stop"])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="hard")

        # The available cards
        self.cards = [2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 7, 7, 7, 7,
                      8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11]

        # Initialize first game
        self.getInitialState()

        # Some observables
        self.pointsObservable = \
            FloatStreamObservable(title='%s Points' % self.__class__.__name__,
                                  time_dimension_name='Episode',
                                  value_name='Points')

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.worlds.seventeen_and_four.environments.seventeen_and_four_viewers \
                import SeventeenAndFourValuefunctionViewer

            # Add a Q-value viewer for this world
            VIEWERS.addViewer(lambda: SeventeenAndFourValuefunctionViewer(self.stateSpace),
                              'SeventeenAndFourValuefunctionViewer')

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment """
        self.remainingCards = list(self.cards)
        self.drawnCards = []
        # Player starts with two cards
        self._drawCard(self.drawnCards)
        self._drawCard(self.drawnCards)
        return self._createState()

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:

        :rewardValue: : An integer or float representing the agent's reward.
                        If rewardValue == None, then no reward is given to the agent.
        :startNewEpisode: : True if the agent's action has caused an episode
                            to get finished.
        :nextState: : A State object which contains the state the environment
                      takes on after executing the action. This might be the
                      initial state of the next episode if a new episode has
                      just started (startNewEpisode == True)
        :terminalState: : A State object which contains the terminal state of
                          the environment in the last episode if a new episode
                          has just started (startNewEpisode == True).
                          Otherwise None.
        """
        if actionObject['action'] == 'stop':
            # Agent stopped
            self.episodeCounter += 1
            sumOfCards = sum(self.drawnCards)
            self.pointsObservable.addValue(self.episodeCounter, sumOfCards)

            # Determine dealer's outcome
            dealersCards = []
            self._drawCard(dealersCards)  # Dealer starts with two cards
            self._drawCard(dealersCards)
            # Dealer draws until it has at least as many points as the agent
            # or more than 21
            while sum(dealersCards) <= sumOfCards and sum(dealersCards) < 22:
                self._drawCard(dealersCards)

            self.environmentLog.info("Episode %s: Agent %s Dealer %s"
                                     % (self.episodeCounter, sumOfCards,
                                        sum(dealersCards)))

            if sum(dealersCards) > sumOfCards and sum(dealersCards) < 22:
                # Agent lost against dealer
                self.returnObservable.addValue(self.episodeCounter, -1)
                return {"reward": -1,  # lost
                        "terminalState": {'count': 22},
                        "nextState": self.getInitialState(),
                        "startNewEpisode": True}
            else:
                # Agent won since the dealer exceeded 21
                self.returnObservable.addValue(self.episodeCounter, 0)
                return {"reward": 0,  # won
                        "terminalState": {'count': 22},
                        "nextState": self.getInitialState(),
                        "startNewEpisode": True}

        # Draw a card
        self._drawCard(self.drawnCards)

        if sum(self.drawnCards) > 21:
            # Agent lost since its cards exceed 21
            self.environmentLog.info("Episode %s: Agent %s"
                                     % (self.episodeCounter,
                                        sum(self.drawnCards)))
            self.episodeCounter += 1
            self.pointsObservable.addValue(self.episodeCounter,
                                           sum(self.drawnCards))
            self.returnObservable.addValue(self.episodeCounter, -1)
            return {"reward": -1,  # lost
                    "terminalState": {'count': 22},
                    "nextState": self.getInitialState(),
                    "startNewEpisode": True}
        else:
            return {"reward": 0,  # game still running
                    "terminalState": None,
                    "nextState": self._createState(),
                    "startNewEpisode": False}

    def _createState(self):
        """ Returns the current state (the capped card sum) """
        return {'count': min(22, sum(self.drawnCards))}

    def _drawCard(self, listOfCards):
        """ Draw a card randomly from the remaining deck """
        card = random.choice(self.remainingCards)
        self.remainingCards.remove(card)
        listOfCards.append(card)
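# Illustrative sketch (standalone, not part of the environment): a quick
# Monte-Carlo estimate of how a fixed "stick once the card sum reaches a
# threshold" policy fares against the hard-coded dealer described above.
# The rules are re-implemented here only for illustration; the deck and the
# dealer's stopping rule follow the environment code.
import random

DECK = 4 * [2, 3, 4, 7, 8, 9, 10, 11]   # same 32-card deck as self.cards

def playHand(stickThreshold, rng=random):
    remaining = list(DECK)

    def draw(hand):
        card = rng.choice(remaining)
        remaining.remove(card)
        hand.append(card)

    player = []
    draw(player)                         # player starts with two cards
    draw(player)
    while sum(player) < stickThreshold:
        draw(player)
    if sum(player) > 21:
        return -1                        # player busts

    dealer = []
    draw(dealer)                         # dealer starts with two cards
    draw(dealer)
    while sum(dealer) <= sum(player) and sum(dealer) < 22:
        draw(dealer)
    return -1 if sum(dealer) < 22 else 0  # dealer wins unless it busts

returns = [playHand(stickThreshold=17) for _ in range(10000)]
print(sum(returns) / float(len(returns)))  # average return of the policy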
class MountainCarEnvironment(SingleAgentEnvironment):
    """ The mountain car environment.

    In the mountain car environment, the agent has to control a car which is
    situated somewhere in a valley between two hills. The goal of the agent is
    to reach the top of the right hill. Unfortunately, the engine of the car
    is not strong enough to reach the top of the hill directly from many start
    states. Thus, it first has to drive in the wrong direction to gather
    enough potential energy.

    The agent can either accelerate left, right, or coast. Thus, the action
    space is discrete with three discrete actions. The agent observes two
    continuous state components: the current position and velocity of the
    car. The start state of the car is stochastically initialised.

    **CONFIG DICT**

    :maxStepsPerEpisode: : The maximum number of steps the agent has to reach
                           the goal. Benchmark default is "500".
    :accelerationFactor: : A factor that influences how strong the car's
                           engine is relative to the slope of the hill.
                           Benchmark default is "0.001".
    :maxGoalVelocity: : Maximum velocity the agent might have when reaching
                        the goal. If smaller than 0.07, this effectively makes
                        the task MountainPark instead of MountainCar.
                        Benchmark default is "0.07"
    :positionNoise: : Noise that is added to the agent's observation of the
                      position. Benchmark default is "0.0"
    :velocityNoise: : Noise that is added to the agent's observation of the
                      velocity. Benchmark default is "0.0"
    """

    DEFAULT_CONFIG_DICT = {'maxStepsPerEpisode': 500,
                           'accelerationFactor': 0.001,
                           'maxGoalVelocity': 0.07,
                           'positionNoise': 0.0,
                           'velocityNoise': 0.0}

    def __init__(self, config, useGUI, *args, **kwargs):
        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Mountain Car",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=False,
                                               stochastic=True)

        # Add value for N to config dict (required for discretization
        # in optimal policy computation)
        if "N" not in config["configDict"]:
            config["configDict"]["N"] = "50"

        super(MountainCarEnvironment, self).__init__(config, useGUI=useGUI,
                                                     *args, **kwargs)

        # Configuration
        self.randomStarts = True

        # Some constants
        self.minPosition = -1.2   # Minimum car position
        self.maxPosition = 0.6    # Maximum car position (past goal)
        self.maxVelocity = 0.07   # Maximum velocity of car
        self.goalPosition = 0.5   # Goal position - how to tell we are done

        # If "maxGoalVelocity" is not set in configDict, set it to maximal
        # velocity
        if not "maxGoalVelocity" in self.configDict:
            self.configDict["maxGoalVelocity"] = self.maxVelocity

        # The current state of the system
        self.state = None

        # Some counters
        self.overallStepCounter = 0

        # State and action space definition
        oldStyleStateSpace = {"position": ("continuous",
                                           [(self.minPosition, self.maxPosition)]),
                              "velocity": ("continuous",
                                           [(-self.maxVelocity, self.maxVelocity)])}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        self.actions = ["left", "right", "none"]
        oldStyleActionSpace = {"thrust": ("discrete", self.actions)}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="hard")

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.mountain_car.environments.mcar_policy_viewer \
                import MountainCarPolicyViewer
            from mmlf.worlds.mountain_car.environments.mcar_valuefunction_viewer \
                import MountainCarValueFunctionViewer

            # Add general trajectory viewer
            VIEWERS.addViewer(lambda: TrajectoryViewer(self.stateSpace),
                              'TrajectoryViewer')
            VIEWERS.addViewer(lambda: MountainCarPolicyViewer(self.stateSpace),
                              'MountainCar PolicyViewer')
            VIEWERS.addViewer(lambda: MountainCarValueFunctionViewer(self.stateSpace),
                              'MountainCar ValueFunctionViewer')

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment

        More information about (valid) states can be found in
        :ref:`state_and_action_spaces`
        """
        if self.randomStarts:
            # Random start state
            def randomInInterval(min, max):
                """ Returns a random number between min and max """
                return min + (random.random() * (max - min))

            position = randomInInterval(self.minPosition, self.goalPosition)
            velocity = randomInInterval(-self.maxVelocity, self.maxVelocity)
        else:
            # Deterministically start in (-0.5, 0.0)
            position = -0.5
            velocity = 0.0

        self.state = {"position": position, "velocity": velocity}

        return self._stateForAgent(self.state)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:

        :rewardValue: : An integer or float representing the agent's reward.
                        If rewardValue == None, then no reward is given to the agent.
        :startNewEpisode: : True if the agent's action has caused an episode
                            to get finished.
        :nextState: : A State object which contains the state the environment
                      takes on after executing the action. This might be the
                      initial state of the next episode if a new episode has
                      just started (startNewEpisode == True)
        :terminalState: : A State object which contains the terminal state of
                          the environment in the last episode if a new episode
                          has just started (startNewEpisode == True).
                          Otherwise None.
        """
        # Remember state before executing action
        previousState = self.state

        # Execute the action which was chosen by the agent
        self.state, prob = list(self.stateTransitionFct(self.state,
                                                        actionObject['thrust']))[0]
        self.stepCounter += 1

        # Check if the episode is finished (i.e. the goal is reached)
        episodeFinished = False
        terminalState = None
        if self.isTerminalState(self.state):
            episodeFinished = True
            terminalState = self._stateForAgent(self.state)
            self.environmentLog.info("Episode %s: Goal reached after %s steps."
                                     % (self.episodeCounter, self.stepCounter))
        elif self.stepCounter >= self.configDict["maxStepsPerEpisode"]:
            episodeFinished = True
            self.environmentLog.info("Episode %s: No goal reached but %s steps expired!"
                                     % (self.episodeCounter, self.stepCounter))

        # Compute reward
        reward = self.rewardFct(self.state, actionObject['thrust'])

        self.trajectoryObservable.addTransition(self._stateForAgent(previousState),
                                                actionObject, reward,
                                                self._stateForAgent(self.state),
                                                episodeTerminated=episodeFinished)

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           -self.stepCounter)
            self.stepCounter = 0
            self.episodeCounter += 1

            # Reset the simulation to some initial state
            self.state = self.getInitialState()

        resultsDict = {"reward": reward,
                       "terminalState": terminalState,
                       "nextState": self._stateForAgent(self.state),
                       "startNewEpisode": episodeFinished}
        return resultsDict

    def stateTransitionFct(self, state, action):
        """ Returns iterator of the successor states of *action* in *state*. """
        # Applies the action and calculates the new position and velocity
        def minmax(item, limit1, limit2):
            """ Bounds item to the interval [limit1, limit2] """
            return max(limit1, min(limit2, item))

        # Get position and velocity
        position = state["position"]
        velocity = state["velocity"]

        # Determine acceleration factor
        if action == 'left':      # action is backward thrust
            factor = -1
        elif action == 'none':    # action is coast
            factor = 0
        else:                     # action is forward thrust
            factor = 1

        # Do the actual state update
        velocityChange = self.configDict["accelerationFactor"] * factor \
            - 0.0025 * cos(3 * position)
        velocity = minmax(velocity + velocityChange,
                          -self.maxVelocity, self.maxVelocity)
        position += velocity
        position = minmax(position, self.minPosition, self.maxPosition)

        if (position <= self.minPosition) and (velocity < 0):
            velocity = 0.0

        if position >= self.goalPosition \
                and abs(velocity) > self.configDict["maxGoalVelocity"]:
            velocity = -velocity

        yield State([position, velocity],
                    [self.stateSpace["position"],
                     self.stateSpace["velocity"]]), 1.0

    def rewardFct(self, state, action):
        """ Returns the reward obtained after executing *action* in *state*. """
        # We always give reward -1
        return -1

    def isTerminalState(self, state):
        """ Returns whether *state* is a terminal state. """
        # Returns whether the car has reached the goal
        return state["position"] >= self.goalPosition \
            and abs(state["velocity"]) <= self.configDict["maxGoalVelocity"]

    def _stateForAgent(self, state):
        # Add observation noise to position and velocity
        return {"position": state["position"]
                            + random.normalvariate(0.0, self.configDict["positionNoise"]),
                "velocity": state["velocity"]
                            + random.normalvariate(0.0, self.configDict["velocityNoise"])}
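# Illustrative sketch (standalone): a single mountain car dynamics step with
# the benchmark constants used above, to make the update in stateTransitionFct
# explicit. Only the deterministic core update is shown; the left-boundary
# velocity reset and the goal-velocity handling are omitted.
from math import cos

ACCELERATION_FACTOR = 0.001
MAX_VELOCITY = 0.07
MIN_POSITION, MAX_POSITION = -1.2, 0.6

def step(position, velocity, action):
    """ action is -1 (left thrust), 0 (coast) or +1 (right thrust). """
    velocity += ACCELERATION_FACTOR * action - 0.0025 * cos(3 * position)
    velocity = max(-MAX_VELOCITY, min(MAX_VELOCITY, velocity))
    position = max(MIN_POSITION, min(MAX_POSITION, position + velocity))
    return position, velocity

print(step(-0.5, 0.0, +1))   # accelerating right from near the valley bottom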
class SinglePoleBalancingEnvironment(SingleAgentEnvironment):
    """ The single pole balancing environment.

    In the single pole balancing environment, the task of the agent is to
    control a cart such that a pole which is mounted on the cart stays in a
    nearly vertical position (to balance it). At the same time, the cart has
    to stay in a confined region. The agent can apply a force between -2N and
    2N to the cart in every time step in order to accelerate it. Thus the
    action space is one-dimensional and continuous. The state consists of the
    cart's current position and velocity as well as the pole's angle and
    angular velocity. Thus, the state space is four-dimensional and continuous.

    **CONFIG DICT**

    :GRAVITY: : The gravity force. Benchmark default "9.8"
    :MASSCART: : The mass of the cart. Benchmark default "1.0"
    :MASSPOLE: : The mass of the pole. Benchmark default "0.1"
    :TOTAL_MASS: : The total mass (pole + cart). Benchmark default "1.1"
    :LENGTH: : The length of the pole. Benchmark default "0.5"
    :POLEMASS_LENGTH: : The center of mass of the pole. Benchmark default "0.05"
    :TAU: : The time step between two commands of the agent. Benchmark default "0.02"
    :MAXCARTPOSITION: : The maximal distance the cart is allowed to move away
                        from its start position. Benchmark default "7.5"
    :MAXPOLEANGULARPOSITION: : Maximal angle the pole is allowed to take on.
                               Benchmark default "0.7"
    :MAXSTEPS: : The number of steps the agent must balance the pole.
                 Benchmark default "100000"
    """

    DEFAULT_CONFIG_DICT = {'GRAVITY': 9.8,
                           'MASSCART': 1.0,
                           'MASSPOLE': 0.1,
                           'TOTAL_MASS': 1.1,
                           'LENGTH': 0.5,
                           'POLEMASS_LENGTH': 0.05,
                           'TAU': 0.02,
                           'MAXCARTPOSITION': 7.5,
                           'MAXPOLEANGULARPOSITION': 0.7,
                           'MAXSTEPS': 100000}

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Single Pole Balancing",
                                               discreteActionSpace=False,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=True,
                                               stochastic=False)

        super(SinglePoleBalancingEnvironment, self).__init__(useGUI=useGUI,
                                                             *args, **kwargs)

        # The state space of the Single Pole Balancing Simulation
        oldStyleStateSpace = {"cartPosition": ("continuous", [(-3.125, 3.125)]),
                              "cartVelocity": ("continuous", [(-0.5, 0.5)]),
                              "poleAngularPosition": ("continuous", [(-1.13, 1.13)]),
                              "poleAngularVelocity": ("continuous", [(-0.80, 0.80)])}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the Single Pole Balancing Simulation
        oldStyleActionSpace = {"force": ("continuous", [(-2, 2)])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # The initial state of the simulation.
        # Note that the values of this dict can be accessed directly as
        # attributes of the class (see the __getattr__ and __setattr__ methods).
        self.initialState = {"cartPosition": 0.0,
                             "poleAngularPosition": 0.1,
                             "cartVelocity": 0.0,
                             "poleAngularVelocity": 0.0}
        # The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.single_pole_balancing.environments.spb_trajectory_viewer \
                import SPBTrajectoryViewer

            # Add general trajectory viewer
            VIEWERS.addViewer(lambda: TrajectoryViewer(self.stateSpace),
                              'TrajectoryViewer')
            VIEWERS.addViewer(lambda: SPBTrajectoryViewer(),
                              'SPB Cart Viewer')

    def __setattr__(self, attrName, attrValue):
        """ Sets the attribute with name attrName to the value attrValue.

        If there is no such attribute but a key with this name exists in
        self.currentState, this entry of the dictionary is updated instead.
        """
        if attrName in self.__dict__.iterkeys():
            self.__dict__[attrName] = attrValue
        elif attrName != 'currentState' \
                and hasattr(self, 'currentState') \
                and attrName in self.currentState.iterkeys():
            self.currentState[attrName] = attrValue
        else:
            self.__dict__[attrName] = attrValue

    def __getattr__(self, attrName):
        """ Returns the value of the attribute specified by attrName.

        If there is no such attribute, it checks whether such an attribute is
        contained in the self.currentState dict.
        """
        if attrName in self.__dict__.iterkeys():
            return self.__dict__[attrName]
        elif attrName != 'currentState' \
                and attrName in self.currentState.iterkeys():
            return self.currentState[attrName]
        else:
            raise AttributeError("%s object has no attribute %s"
                                 % (self.__class__.__name__, attrName))

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment """
        return self._createStateForAgent(self.initialState)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:
            :rewardValue: : An integer or float representing the agent's reward.
                            If rewardValue == None, then no reward is given to the agent.
            :startNewEpisode: : True if the agent's action has caused an episode
                                to get finished.
            :nextState: : A State object which contains the state the environment
                          takes on after executing the action. This might be the
                          initial state of the next episode if a new episode has
                          just started (startNewEpisode == True)
            :terminalState: : A State object which contains the terminal state of
                              the environment in the last episode if a new episode
                              has just started (startNewEpisode == True).
                              Otherwise None.
        """
        # Remember state before executing action
        previousState = self.currentState

        # Determine force applied to the cart
        force = actionObject['force']  # the force wished by the agent
        force = self.actionSpace.chopContinuousAction(force)  # the actual force

        self._stateTransition(force)

        episodeFinished = self._checkEpisodeFinished()
        terminalState = self._createStateForAgent(self.currentState) \
            if episodeFinished else None

        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           self.stepCounter)
            self.environmentLog.info("Episode %s lasted for %s steps."
                                     % (self.episodeCounter, self.stepCounter + 1))

            self.stepCounter = 0
            self.episodeCounter += 1
            # Reset the environment to the initial state (always the same)
            self.currentState = deepcopy(self.initialState)

            self.trajectoryObservable.addTransition(self._createStateForAgent(previousState),
                                                    actionObject, 1, terminalState,
                                                    episodeTerminated=True)
        else:
            self.stepCounter += 1
            if self.stepCounter in [10000, 20000, 30000, 40000, 50000,
                                    60000, 70000, 80000, 90000]:
                self.environmentLog.info("Balanced for %s steps!"
                                         % self.stepCounter)
            self.trajectoryObservable.addTransition(self._createStateForAgent(previousState),
                                                    actionObject, 1,
                                                    self._createStateForAgent(self.currentState),
                                                    episodeTerminated=False)

        resultsDict = {"reward": 1,  # we always give a reward of 1
                       "terminalState": terminalState,
                       "nextState": self._createStateForAgent(self.currentState),
                       "startNewEpisode": episodeFinished}
        return resultsDict

    ########################## Helper Functions #####################################

    def _stateTransition(self, force):
        """ Update self.currentState based on the current values and the applied force """
        costheta = math.cos(self.currentState["poleAngularPosition"])
        sintheta = math.sin(self.currentState["poleAngularPosition"])

        temp = (force + self.configDict["POLEMASS_LENGTH"]
                        * self.currentState["poleAngularPosition"]
                        * self.currentState["poleAngularPosition"]
                        * sintheta) / self.configDict["TOTAL_MASS"]
        thetaacc = (self.configDict["GRAVITY"] * sintheta - costheta * temp) \
            / (self.configDict["LENGTH"]
               * (1.333333333333
                  - self.configDict["MASSPOLE"] * costheta * costheta
                    / self.configDict["TOTAL_MASS"]))
        xacc = temp - self.configDict["POLEMASS_LENGTH"] * thetaacc * costheta \
                      / self.configDict["TOTAL_MASS"]

        # Update the four state variables, using Euler's method.
        self.currentState["cartPosition"] = \
            self.currentState["cartPosition"] + self.configDict["TAU"] * self.currentState["cartVelocity"]
        self.currentState["cartVelocity"] = \
            self.currentState["cartVelocity"] + self.configDict["TAU"] * xacc
        self.currentState["poleAngularPosition"] = \
            self.currentState["poleAngularPosition"] + self.configDict["TAU"] * self.currentState["poleAngularVelocity"]
        self.currentState["poleAngularVelocity"] = \
            self.currentState["poleAngularVelocity"] + self.configDict["TAU"] * thetaacc

    def _checkTerminalState(self):
        """ Returns whether the simulation has reached a terminal state.

        A terminal state is reached if the cart or the pole exceed certain
        boundaries.
        """
        return ((math.fabs(self.currentState["cartPosition"])
                 > self.configDict["MAXCARTPOSITION"])
                or (math.fabs(self.currentState["poleAngularPosition"])
                    > self.configDict["MAXPOLEANGULARPOSITION"]))

    def _checkEpisodeFinished(self):
        """ Returns whether an episode is finished.

        An episode is finished if a terminal state is reached or the maximum
        number of steps is exceeded.
        """
        return self._checkTerminalState() \
            or self.stepCounter >= self.configDict["MAXSTEPS"] - 1

    def _createStateForAgent(self, state):
        """ Creates the state description which is communicated to the agent """
        # Rescale the state dimensions so that typical values fall roughly
        # into the interval [-1, 1]
        stateForAgent = {"cartPosition": state['cartPosition'] / 2.4,
                         "cartVelocity": state['cartVelocity'] / 10.0,
                         "poleAngularPosition": state['poleAngularPosition'] / 0.62,
                         "poleAngularVelocity": state['poleAngularVelocity'] / 5.0}
        return stateForAgent
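Note that `_createStateForAgent` does not pass the simulation state through unchanged: each dimension is divided by a fixed constant so that typical values fall roughly into [-1, 1] before they are sent to the agent. The following standalone sketch illustrates this mapping (the helper name `normalize_state` is illustrative only and not part of the MMLF API):

def normalize_state(state):
    """Rescale a raw cart-pole state the way _createStateForAgent does."""
    scaling = {"cartPosition": 2.4, "cartVelocity": 10.0,
               "poleAngularPosition": 0.62, "poleAngularVelocity": 5.0}
    return dict((key, value / scaling[key]) for key, value in state.items())

# Example: the initial state with the pole tilted by 0.1 rad
print(normalize_state({"cartPosition": 0.0, "cartVelocity": 0.0,
                       "poleAngularPosition": 0.1, "poleAngularVelocity": 0.0}))
# -> poleAngularPosition is reported as approximately 0.161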
class MazeCliffEnvironment(SingleAgentEnvironment):
    """ The two-dimensional maze cliff environment.

    In this maze, there are two alternative ways from the start to the goal
    state: one short way which leads along a dangerous cliff and one long but
    secure way. If the agent happens to step into the cliff area, it gets a
    huge negative reward (configurable via *cliffPenalty*) and is reset into
    the start state.

    Per default, the maze is deterministic, i.e. the agent always moves in the
    direction it chooses. However, the parameter *stochasticity* allows to
    control the stochasticity of the environment. For instance, when
    stochasticity is set to 0.01, the agent performs a random move instead of
    the chosen one with probability 0.01.

    The maze structure is as follows, where "S" is the start state, "G" the
    goal state and "C" is a cliff field:

        **************
        *            *
        *            *
        *            *
        *SCCCCCCCCCCG*
        **************

    **CONFIG DICT**

    :cliffPenalty: : The reward an agent obtains when stepping into the cliff area
    :stochasticity: : The stochasticity of the state transition matrix.
                      With probability 1-*stochasticity* the desired transition
                      is made, otherwise a random transition is performed.
    """

    DEFAULT_CONFIG_DICT = {"cliffPenalty": -100,
                           "stochasticity": 0.0}

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="Maze Cliff",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=False,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(MazeCliffEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)

        # A string which describes the structure of the maze.
        # A * indicates a wall, an S the start position of the agent
        # and a G the goal. A blank indicates a free cell.
        mazeDescriptionString = """**************
*            *
*            *
*            *
*S          G*
**************
"""
        # The maze object is created from the description
        self.maze = Maze.createMazeFromString(mazeDescriptionString,
                                              cliffPenalty=self.configDict["cliffPenalty"],
                                              stochasticity=self.configDict["stochasticity"])

        # The state space of the Maze Cliff Simulation
        oldStyleStateSpace = {"column": ("discrete", range(self.maze.getColumns())),
                              "row": ("discrete", range(self.maze.getRows()))}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the Maze Cliff Simulation
        oldStyleActionSpace = {"action": ("discrete", ["up", "down", "left", "right"])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # Dictionary which contains all configuration options specific to this
        # environment. It is VERY important to put ALL configuration options
        # which uniquely determine the behavior of the environment in this
        # dictionary.
        self.configDict = {}

        # The initial state of the simulation
        self.initialState = {"row": self.maze.getStartPosition()[0],
                             "column": self.maze.getStartPosition()[1]}
        # The current state is initially set to the initial state
        self.currentState = deepcopy(self.initialState)

        # A counter which stores the number of steps which have been performed
        # in this episode
        self.stepCounter = 0
        self.episodeCounter = 0
        # The accumulated reward
        self.reward = 0.0

        if useGUI:
            from mmlf.gui.viewers import VIEWERS
            from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
            from mmlf.worlds.maze2d.environments.maze2d_viewer import Maze2DDetailedViewer
            from mmlf.worlds.maze2d.environments.maze2d_function_viewer import Maze2DFunctionViewer

            # Create customized trajectory viewer
            class MazeCliffTrajectoryViewer(TrajectoryViewer):
                def __init__(self, stateSpace, plotStateSpaceStructure):
                    super(MazeCliffTrajectoryViewer, self).__init__(stateSpace)
                    plotStateSpaceStructure(self.axisTrajectory)

            VIEWERS.addViewer(lambda: MazeCliffTrajectoryViewer(self.stateSpace,
                                                                lambda ax: self.plotStateSpaceStructure(ax)),
                              'MazeCliffTrajectoryViewer')

            # Add viewers for the maze world
            VIEWERS.addViewer(lambda: Maze2DDetailedViewer(self.maze,
                                                           self.stateSpace,
                                                           ["left", "right", "up", "down"]),
                              'MazeCliffDetailedViewer')
            VIEWERS.addViewer(lambda: Maze2DFunctionViewer(self.maze,
                                                           self.stateSpace),
                              'MazeCliffFunctionViewer')

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of this environment """
        return self._createStateForAgent(self.initialState)

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:
            :rewardValue: : An integer or float representing the agent's reward.
                            If rewardValue == None, then no reward is given to the agent.
            :startNewEpisode: : True if the agent's action has caused an episode
                                to get finished.
            :nextState: : A State object which contains the state the environment
                          takes on after executing the action. This might be the
                          initial state of the next episode if a new episode has
                          just started (startNewEpisode == True)
            :terminalState: : A State object which contains the terminal state of
                              the environment in the last episode if a new episode
                              has just started (startNewEpisode == True).
                              Otherwise None.
        """
        # The state before executing the action
        previousState = dict(self.currentState)

        action = actionObject['action']
        # Execute the action which was chosen by the agent
        reward = self._stateTransition(action)
        self.reward += reward

        # Check if the episode is finished (i.e. the goal is reached)
        episodeFinished = self._checkEpisodeFinished()

        terminalState = self.currentState if episodeFinished else None
        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter, self.reward)
            self.environmentLog.info("Episode %s. Length: %s steps, "
                                     "Accumulated reward: %s."
                                     % (self.episodeCounter, self.stepCounter + 1,
                                        self.reward))

            # Reset the simulation to the initial state (always the same)
            self.stepCounter = 0
            self.reward = 0.0
            self.currentState = deepcopy(self.initialState)
            self.episodeCounter += 1

            self.trajectoryObservable.addTransition(previousState, action,
                                                    reward, terminalState,
                                                    episodeTerminated=episodeFinished)
        else:
            self.stepCounter += 1
            self.trajectoryObservable.addTransition(previousState, action,
                                                    reward, self.currentState,
                                                    episodeTerminated=episodeFinished)

        resultsDict = {"reward": reward,
                       "terminalState": terminalState,
                       "nextState": self._createStateForAgent(self.currentState),
                       "startNewEpisode": episodeFinished}
        return resultsDict

    def _stateTransition(self, action):
        """ Execute the specified action and store the resulting state """
        currentPos = (self.currentState['row'], self.currentState['column'])
        nextPos, reward = self.maze.tryToMove(currentPos, action)
        self.currentState['row'] = nextPos[0]
        self.currentState['column'] = nextPos[1]

        return reward

    def _checkEpisodeFinished(self):
        """ Checks whether the episode is finished, i.e. the goal is reached """
        currentPos = (self.currentState['row'], self.currentState['column'])
        return self.maze.isGoalReached(currentPos)

    def _createStateForAgent(self, state):
        """ Create a state description for the agent """
        return state

    def plotStateSpaceStructure(self, axis):
        """ Plot structure of state space into given axis.

        Just a helper function for viewers and graphic logging.
        """
        self.maze.drawIntoAxis(axis)
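The *stochasticity* option of the maze cliff environment boils down to the following transition rule (a minimal standalone sketch of the idea; the actual handling is done inside the `Maze` class, whose internals are not shown here):

import random

ACTIONS = ["up", "down", "left", "right"]

def effective_action(chosen_action, stochasticity):
    """With probability `stochasticity`, a uniformly random action is
    executed instead of the action chosen by the agent."""
    if random.random() < stochasticity:
        return random.choice(ACTIONS)
    return chosen_action

# With stochasticity 0.01, roughly 1 in 100 moves is random:
moves = [effective_action("right", 0.01) for _ in range(10000)]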
def __init__(self, useGUI, *args, **kwargs):

    self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                           environmentName="Maze Cliff",
                                           discreteActionSpace=True,
                                           episodic=True,
                                           continuousStateSpace=False,
                                           continuousActionSpace=False,
                                           stochastic=False)

    super(MazeCliffEnvironment, self).__init__(useGUI=useGUI, *args, **kwargs)

    # A string which describes the structure of the maze.
    # A * indicates a wall, an S the start position of the agent
    # and a G the goal. A blank indicates a free cell.
    mazeDescriptionString = """**************
*            *
*            *
*            *
*S          G*
**************
"""
    # The maze object is created from the description
    self.maze = Maze.createMazeFromString(mazeDescriptionString,
                                          cliffPenalty=self.configDict["cliffPenalty"],
                                          stochasticity=self.configDict["stochasticity"])

    # The state space of the Maze Cliff Simulation
    oldStyleStateSpace = {"column": ("discrete", range(self.maze.getColumns())),
                          "row": ("discrete", range(self.maze.getRows()))}
    self.stateSpace = StateSpace()
    self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

    # The action space of the Maze Cliff Simulation
    oldStyleActionSpace = {"action": ("discrete", ["up", "down", "left", "right"])}
    self.actionSpace = ActionSpace()
    self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

    # Dictionary which contains all configuration options specific to this
    # environment. It is VERY important to put ALL configuration options
    # which uniquely determine the behavior of the environment in this
    # dictionary.
    self.configDict = {}

    # The initial state of the simulation
    self.initialState = {"row": self.maze.getStartPosition()[0],
                         "column": self.maze.getStartPosition()[1]}
    # The current state is initially set to the initial state
    self.currentState = deepcopy(self.initialState)

    # A counter which stores the number of steps which have been performed
    # in this episode
    self.stepCounter = 0
    self.episodeCounter = 0
    # The accumulated reward
    self.reward = 0.0

    if useGUI:
        from mmlf.gui.viewers import VIEWERS
        from mmlf.gui.viewers.trajectory_viewer import TrajectoryViewer
        from mmlf.worlds.maze2d.environments.maze2d_viewer import Maze2DDetailedViewer
        from mmlf.worlds.maze2d.environments.maze2d_function_viewer import Maze2DFunctionViewer

        # Create customized trajectory viewer
        class MazeCliffTrajectoryViewer(TrajectoryViewer):
            def __init__(self, stateSpace, plotStateSpaceStructure):
                super(MazeCliffTrajectoryViewer, self).__init__(stateSpace)
                plotStateSpaceStructure(self.axisTrajectory)

        VIEWERS.addViewer(lambda: MazeCliffTrajectoryViewer(self.stateSpace,
                                                            lambda ax: self.plotStateSpaceStructure(ax)),
                          'MazeCliffTrajectoryViewer')

        # Add viewers for the maze world
        VIEWERS.addViewer(lambda: Maze2DDetailedViewer(self.maze,
                                                       self.stateSpace,
                                                       ["left", "right", "up", "down"]),
                          'MazeCliffDetailedViewer')
        VIEWERS.addViewer(lambda: Maze2DFunctionViewer(self.maze,
                                                       self.stateSpace),
                          'MazeCliffFunctionViewer')
class PinballMazeEnvironment(SingleAgentEnvironment):
    """ The pinball maze environment class.

    .. seealso::
        George Konidaris and Andrew G. Barto,
        "Skill Discovery in Continuous Reinforcement Learning Domains using
        Skill Chaining", in "Advances in Neural Information Processing
        Systems", 2009

    .. versionadded:: 0.9.9

    **CONFIG DICT**

    :DRAG: : Factor that slows down the ball in each time step (multiplied with
             the velocity after each step)
    :NOISE: : Gaussian noise with MU_POS for the position [x, y] and MU_VEL for
              the velocity [xdot, ydot]; as a simplification the covariance
              matrix is just a unit matrix multiplied with SIGMA
    :THRUST_PENALTY: : Reward the agent obtains each time it accelerates the ball
    :STEP_PENALTY: : Reward the agent obtains in each time step in which it
                     neither thrusts nor terminates
    :END_EPISODE_REWARD: : Reward the agent obtains if the ball reaches the goal
    :SUBSTEPS: : Number of dynamic steps of the environment between two of the
                 agent's actions
    :MAZE: : Name of the config file in which the maze is defined. These files
             are located in the folder 'worlds/pinball_maze'
    """

    DEFAULT_CONFIG_DICT = {"DRAG": 0.995,
                           "NOISE": {"MU_POS": [0.0, 0.0],
                                     "MU_VEL": [0.0, 0.0],
                                     "SIGMA": 0.0},
                           "THRUST_PENALTY": -5,
                           "STEP_PENALTY": -1,
                           "END_EPISODE_REWARD": 10000,
                           "SUBSTEPS": 20,
                           "MAZE": "pinball_simple_single.cfg"}

    def __init__(self, useGUI, *args, **kwargs):

        self.environmentInfo = EnvironmentInfo(versionNumber="0.3",
                                               environmentName="PinballMaze",
                                               discreteActionSpace=True,
                                               episodic=True,
                                               continuousStateSpace=True,
                                               continuousActionSpace=False,
                                               stochastic=False)

        super(PinballMazeEnvironment, self).__init__(useGUI=useGUI,
                                                     *args, **kwargs)

        mazeString = open(os.path.dirname(os.path.abspath(__file__))
                          + os.sep + os.pardir + os.sep
                          + self.configDict['MAZE'], 'r').read()

        # The maze object is created from the description
        self.maze = PinballMaze.createMazeFromString(mazeString)

        # The state space of the Pinball Maze Simulation
        oldStyleStateSpace = {"x": ("continuous", [(0.0, 1.0)]),
                              "y": ("continuous", [(0.0, 1.0)]),
                              "xdot": ("continuous", [(-1.0, 1.0)]),
                              "ydot": ("continuous", [(-1.0, 1.0)])}
        self.stateSpace = StateSpace()
        self.stateSpace.addOldStyleSpace(oldStyleStateSpace, limitType="soft")

        # The action space of the Pinball Maze Simulation
        oldStyleActionSpace = {"action": ("discrete",
                                          ["xinc", "xdec", "yinc", "ydec", "none"])}
        self.actionSpace = ActionSpace()
        self.actionSpace.addOldStyleSpace(oldStyleActionSpace, limitType="soft")

        # The current state is initially set to the initial state
        self.currentState = self.getInitialState()

        if useGUI:
            # Add viewers specific to the pinball world
            from mmlf.gui.viewers import VIEWERS
            from mmlf.worlds.pinball_maze.environments.pinball_maze_trajectory_viewer \
                import PinballMazeTrajectoryViewer
            from mmlf.worlds.pinball_maze.environments.pinball_maze_function_viewer \
                import PinballMazeFunctionViewer

            VIEWERS.addViewer(lambda: PinballMazeTrajectoryViewer(self,
                                                                  self.stateSpace),
                              'PinballMaze TrajectoryViewer')
            VIEWERS.addViewer(lambda: PinballMazeFunctionViewer(self,
                                                                self.stateSpace),
                              'PinballMaze FunctionViewer')

    ########################## Interface Functions #####################################

    def getInitialState(self):
        """ Returns the initial state of the environment.

        More information about (valid) states can be found in
        :ref:`state_and_action_spaces`
        """
        startPos = self.maze.getStartPos()
        return {"x": startPos[0], "y": startPos[1], "xdot": 0.0, "ydot": 0.0}

    def evaluateAction(self, actionObject):
        """ Execute an agent's action in the environment.

        Take an actionObject containing the action of an agent, and evaluate
        this action, calculating the next state, and the reward the agent
        should receive for having taken this action.

        Additionally, decide whether the episode should continue, or end after
        the reward has been issued to the agent.

        This method returns a dictionary with the following keys:
            :rewardValue: : An integer or float representing the agent's reward.
                            If rewardValue == None, then no reward is given to the agent.
            :startNewEpisode: : True if the agent's action has caused an episode
                                to get finished.
            :nextState: : A State object which contains the state the environment
                          takes on after executing the action. This might be the
                          initial state of the next episode if a new episode has
                          just started (startNewEpisode == True)
            :terminalState: : A State object which contains the terminal state of
                              the environment in the last episode if a new episode
                              has just started (startNewEpisode == True).
                              Otherwise None.
        """
        # The state before executing the action
        previousState = deepcopy(self.currentState)

        # Fetch the action, do the state transition, and compute the reward
        action = actionObject['action']
        reward, episodeFinished = self._stateTransition(action)
        self.accumulatedReward += reward

        self.trajectoryObservable.addTransition(previousState, action, reward,
                                                self.currentState,
                                                episodeTerminated=episodeFinished)

        terminalState = self.currentState if episodeFinished else None
        if episodeFinished:
            self.episodeLengthObservable.addValue(self.episodeCounter,
                                                  self.stepCounter + 1)
            self.returnObservable.addValue(self.episodeCounter,
                                           self.accumulatedReward)
            self.environmentLog.info("Episode %s lasted for %s steps. "
                                     "Accumulated reward: %s"
                                     % (self.episodeCounter, self.stepCounter + 1,
                                        self.accumulatedReward))

            self.stepCounter = 0
            self.accumulatedReward = 0.0
            self.episodeCounter += 1

            # Reset to the initial state
            self.currentState = self.getInitialState()
        else:
            self.stepCounter += 1
            if self.stepCounter % 250 == 0:
                # Keep the user up-to-date
                self.environmentLog.info("Episode %s: Agent active for %s steps "
                                         "without reaching the goal yet."
                                         % (self.episodeCounter, self.stepCounter))

        resultsDict = {"reward": reward,
                       "terminalState": terminalState,
                       "nextState": self._createStateForAgent(self.currentState),
                       "startNewEpisode": episodeFinished}
        return resultsDict

    ########################## Helper Functions #####################################

    def _stateTransition(self, action):
        # Determine the effect of the action on the ball's velocity
        if action == "xinc":
            self.currentState["xdot"] = min(self.currentState["xdot"] + 0.2, 1.0)
        elif action == "xdec":
            self.currentState["xdot"] = max(self.currentState["xdot"] - 0.2, -1.0)
        elif action == "yinc":
            self.currentState["ydot"] = min(self.currentState["ydot"] + 0.2, 1.0)
        elif action == "ydec":
            self.currentState["ydot"] = max(self.currentState["ydot"] - 0.2, -1.0)

        # Do the state transition, split into SUBSTEPS substeps in order to
        # deal with collisions on a finer-grained basis.
        for j in range(self.configDict["SUBSTEPS"]):
            # Compute the next would-be position
            factor = self.maze.ballRadius / self.configDict["SUBSTEPS"]
            posX = self.currentState["x"] + self.currentState["xdot"] * factor
            posY = self.currentState["y"] + self.currentState["ydot"] * factor

            # Check for collision with an obstacle
            collided, obstacle = self.maze.collide([posX, posY])
            if collided:
                # Determine the collision effect
                self.currentState["xdot"], self.currentState["ydot"] = \
                    self.maze.collisionEffect(oldPos=(self.currentState["x"],
                                                      self.currentState["y"]),
                                              newPos=(posX, posY),
                                              vel=(self.currentState["xdot"],
                                                   self.currentState["ydot"]),
                                              obstacle=obstacle)
            else:
                # No collision, go to the would-be position
                self.currentState["x"] = posX
                self.currentState["y"] = posY

            # Check if the target has been reached
            if self.maze.goalReached(pos=(self.currentState["x"],
                                          self.currentState["y"])):
                return self.configDict["END_EPISODE_REWARD"], True

        # Apply drag
        self.currentState["xdot"] *= self.configDict["DRAG"]
        self.currentState["ydot"] *= self.configDict["DRAG"]

        if action == "none":
            return self.configDict["STEP_PENALTY"], False
        else:
            return self.configDict["THRUST_PENALTY"], False

    def _createStateForAgent(self, state):
        """ Create a state description for the agent """
        mu_pos = self.configDict["NOISE"]["MU_POS"]
        mu_vel = self.configDict["NOISE"]["MU_VEL"]
        sigma = self.configDict["NOISE"]["SIGMA"]

        # Work on a copy so that the observation noise does not accumulate
        # in the environment's true state
        noisyState = dict(state)
        noisyState['x'] = noisyState['x'] + random.gauss(mu_pos[0], sigma)
        noisyState['y'] = noisyState['y'] + random.gauss(mu_pos[1], sigma)
        noisyState['xdot'] = noisyState['xdot'] + random.gauss(mu_vel[0], sigma)
        noisyState['ydot'] = noisyState['ydot'] + random.gauss(mu_vel[1], sigma)

        return noisyState

    def plotStateSpaceStructure(self, axis):
        """ Plot structure of state space into given axis.

        Just a helper function for viewers and graphic logging.
        """
        self.maze.draw(axis)
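The NOISE option of the pinball maze environment is purely observation noise: `_createStateForAgent` adds independent Gaussian samples with means MU_POS / MU_VEL and a shared standard deviation SIGMA to the state that is passed to the agent. A minimal standalone sketch under these assumptions (with the default SIGMA of 0.0 the observation equals the true state; the value 0.05 below is chosen only for illustration, and `noisy_observation` is not part of the MMLF API):

import random

NOISE = {"MU_POS": [0.0, 0.0], "MU_VEL": [0.0, 0.0], "SIGMA": 0.05}

def noisy_observation(state, noise=NOISE):
    """Add Gaussian observation noise to a pinball maze state dict."""
    return {"x": state["x"] + random.gauss(noise["MU_POS"][0], noise["SIGMA"]),
            "y": state["y"] + random.gauss(noise["MU_POS"][1], noise["SIGMA"]),
            "xdot": state["xdot"] + random.gauss(noise["MU_VEL"][0], noise["SIGMA"]),
            "ydot": state["ydot"] + random.gauss(noise["MU_VEL"][1], noise["SIGMA"])}

# Example: observing the ball at rest in the middle of the maze
observation = noisy_observation({"x": 0.5, "y": 0.5, "xdot": 0.0, "ydot": 0.0})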