Example #1
def _setup(self, conditions):
    """ Tells the agent whether the environment is discrete or continuous and the
        number/dimensionality of states and actions. This function is called
        just before the first state is integrated.
    """
    self.conditions_ = conditions

    # create history to store experiences
    self.history = History(conditions['stateDim'], conditions['actionDim'])
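
For orientation, the conditions dictionary used in these examples describes the environment's interface. A minimal sketch of what it might contain and how the setup call could look; the concrete numbers and the calling code are assumptions made only for illustration:

# hypothetical conditions for a continuous environment with 2 state
# variables and 1 action variable (values chosen only for illustration)
conditions = {
    'stateDim': 2,
    'actionDim': 1,
    'discreteStates': False,
    'discreteActions': False,
}

agent = Agent()            # Agent is defined in Example #3 below
agent._setup(conditions)   # creates agent.history = History(2, 1)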
Example #2
def _setup(self, conditions):
    """ Make sure the environment is continuous in states and actions,
        then create the function approximation estimator.
    """
    Agent._setup(self, conditions)
    if self.conditions['discreteStates'] or self.conditions['discreteActions']:
        raise AgentException('BASAgent expects continuous states and actions. Use adapter or a different environment.')

    self.estimator = FAEstimator(self.conditions['stateDim'] + self.conditions['actionDim'], 2**self.conditions['actionDim'], self.faClass)

    # change history to store bas-extended experiences
    self.history = History(conditions['stateDim'] + self.conditions['actionDim'], 1)
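
To make the estimator dimensions concrete, here is a worked illustration for assumed environment sizes (the numbers are hypothetical and chosen only for this example):

# for stateDim=3, actionDim=2:
#   FAEstimator inputs : 3 + 2 = 5    state concatenated with the partially built action
#   FAEstimator actions: 2**2 = 4     one binary up/down decision per action dimension
#   History entries    : state vector of length 5, a single decision index as the "action"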
Example #3
from copy import copy
from numpy import zeros

# History and AgentException are provided by the surrounding library (not shown in these examples).
class Agent(object):
    
    def __init__(self):        
        # current observation, action, reward
        self.state = None
        self.action = None
        self.reward = None
        
        # agent conditions, inherited from environment (plus adapters)
        self.conditions_ = {}
        
        # progress counter:
        # 0 = reward was given (experience stored in history), before observation is integrated
        # 1 = observation was integrated, before action is returned
        # 2 = action was returned, before reward is given
        self.progressCnt = 0
        
        # enable or disable logging to dataset (for testing)
        self.loggingEnabled = True
        
        # tell agent that evaluation takes place (no internal exploration)
        self.evaluation = False
        
        # agent can access the experiment it is part of
        self.experiment = None
        
        
    def _setup(self, conditions):
        """ Tells the agent, if the environment is discrete or continuous and the
            number/dimensionalty of states and actions. This function is called
            just before the first state is integrated.
        """
        self.conditions_ = conditions
        
        # create history to store experiences
        self.history = History(conditions['stateDim'], conditions['actionDim'])
    
    @property
    def conditions(self):
        return self.conditions_
        
    @property
    def episode(self):
        """ returns the last (current) episode. """
        if len(self.history) > 0:
            return self.history[-1]
        else:
            return None
        
    def integrateState(self, state):
        if self.progressCnt == 0:
            self.state = copy(state)
            self.progressCnt = 1
        else:
            raise AgentException('observation was given twice before action was requested.')
                
    def getAction(self):
        if self.progressCnt == 1:
            self._calculate()
            self.progressCnt = 2
            return self.action
        elif self.progressCnt == 0:
            raise AgentException('action was requested before observation was integrated.')
        else:
            raise AgentException('action was requested twice before reward was given.')
        
    def giveReward(self, reward):
        if self.progressCnt == 2:
            self.reward = reward
            self.progressCnt = 0
            if self.loggingEnabled:
                self.history.append(self.state, self.action, self.reward)
        else:
            raise AgentException('reward was given before action was returned.')
        
    def newEpisode(self):
        self.history.newEpisode()
            
    def learn(self):
        pass
    
    def forget(self):
        """ deletes the entire history. """
        self.history.clear()
        
    def _calculate(self):
        """ this method needs to be overwritten by subclasses to return a non-zero action. """
        self.action = zeros(self.conditions['actionDim'])
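
To make the progressCnt bookkeeping concrete, an interaction loop might look like the following minimal sketch. The environment object and its getState/performAction/getReward methods are placeholders chosen for illustration; only the agent side comes from the class above:

agent = Agent()
agent._setup(conditions)                    # conditions as provided by the environment

for step in range(100):
    agent.integrateState(env.getState())    # progressCnt 0 -> 1
    action = agent.getAction()              # progressCnt 1 -> 2
    env.performAction(action)
    agent.giveReward(env.getReward())       # progressCnt 2 -> 0, experience logged

agent.newEpisode()
agent.learn()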
Example #4
from numpy import array, ones, r_, random

# Agent (Example #3), History, AgentException, FAEstimator and the Linear
# function approximator are provided by the surrounding library.
class BASAgent(Agent):
    
    alpha = 1.0
    gamma = 0.9
    
    def __init__(self, faClass=Linear):
        """ initialize the agent with the estimatorClass. """
        Agent.__init__(self)
        
        self.amin = -1.
        self.amax = 1.
        self.nres = 3
        
        # store (decision,action) tuples for one action in the list
        self.decisions = []
        
        self.faClass = faClass
    
    def _setup(self, conditions):
        """ if agent is discrete in states and actions create Q-Table. """
        Agent._setup(self, conditions)
        if not (self.conditions['discreteStates'] == False and self.conditions['discreteActions'] == False):
            raise AgentException('BASAgent expects continuous states and actions. Use adapter or a different environment.')
            
        self.estimator = FAEstimator(self.conditions['stateDim'] + self.conditions['actionDim'], 2**self.conditions['actionDim'], self.faClass)
        
        # change history to store bas-extended experiences
        self.history = History(conditions['stateDim'] + self.conditions['actionDim'], 1)
    
    
    def giveReward(self, reward):
        """ override function to store the internal actions in the history. """
        if self.progressCnt == 2:
            self.reward = reward
            self.progressCnt = 0
            if self.loggingEnabled:
                # go through internal decisions and transform them to states, actions, rewards
                olda = array([(self.amax + self.amin) / 2.]*self.conditions['actionDim'])
                for i, (d,a) in enumerate(self.decisions):
                    state = r_[self.state, olda]
                    action = d
                    
                    if i < self.nres-1:
                        reward = 0.
                    else:
                        reward = self.reward
                    
                    self.history.append(state, action, reward)
                    olda = a

        else:
            raise AgentException('reward was given before action was returned.')
    
    def _internalDecisions(self, state):
        """ takes a state and queries the estimator several times as a binary search.
            generates (binary) decision and action at each timestep. """
        
        self.decisions = []
        
        a = array([(self.amax + self.amin) / 2.]*self.conditions['actionDim'])
        delta = (self.amax - self.amin) * float(2**(self.nres-1)) / (2**self.nres -1)
        for i in range(self.nres):
            delta = delta/2.
            decision = self.estimator.getBestAction(r_[state, a])
            
            # internal epsilon-greedy exploration
            if random.random() < 0.1:
                decision = array([random.randint(2**self.conditions['actionDim'])])

            # turn into binary list
            blist = -1.*ones(self.conditions['actionDim'])
            for j, bit in enumerate(reversed(bin(decision)[2:])):
                if bit == '1':
                    blist[-j-1] = 1.
            
            # update action
            a = a + delta*blist
            self.decisions.append((decision, a))
            
        return a
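
        # Worked illustration of the binary search above for a single action
        # dimension with nres=3, amin=-1, amax=1 (the +1/-1 decisions are
        # hypothetical, chosen only to show how the action is refined):
        #   start    a = 0.0,   delta = 2 * 4/7 ~ 1.143
        #   step 1   delta -> 4/7 ~ 0.571,  decision +1  ->  a ~ 0.571
        #   step 2   delta -> 2/7 ~ 0.286,  decision -1  ->  a ~ 0.286
        #   step 3   delta -> 1/7 ~ 0.143,  decision +1  ->  a ~ 0.429
        # After nres steps the reachable actions form a uniform grid of
        # 2**nres points over [amin, amax].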
                
    def _calculate(self):
        """ Return the action with the maximal value for the given state. """
        self.action = self._internalDecisions(self.state)


    def learn(self):
        """ go through whole episode and make Q-value updates. """  
        for i in range(1):
            
            self.estimator.reset()

            for episode in self.history:
                for state, action, reward, nextstate in episode:
                    # # don't consider last state
                    # if equal(state, nextstate).all():
                    #     break

                    qvalue = self.estimator.getValue(state, action)
                    bestnext = self.estimator.getValue(nextstate, self.estimator.getBestAction(nextstate))
                    target = (1-self.alpha) * qvalue + self.alpha * (reward + self.gamma * bestnext)

                    self.estimator.updateValue(state, action, target)

            self.estimator.train()
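
The update target in learn() is the usual Q-learning target written as a convex combination; with alpha set to 1.0, as in this class, it reduces to the reward plus the discounted value of the best next action:

# target = (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max over a' of Q(s', a'))
# with alpha = 1.0:   target = reward + gamma * max over a' of Q(s', a')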