from copy import copy
from numpy import zeros

# History and AgentException are provided elsewhere in this package.


class Agent(object):

    def __init__(self):
        # current observation, action, reward
        self.state = None
        self.action = None
        self.reward = None

        # agent conditions, inherited from environment (plus adapters)
        self.conditions_ = {}

        # progress counter:
        # 0 = reward was given and experience stored, before observation is integrated
        # 1 = observation was integrated, before action is returned
        # 2 = action was returned, before reward is given
        self.progressCnt = 0

        # enable or disable logging to dataset (for testing)
        self.loggingEnabled = True

        # tell agent that evaluation takes place (no internal exploration)
        self.evaluation = False

        # agent can access the experiment it is part of
        self.experiment = None

    def _setup(self, conditions):
        """ Tells the agent whether the environment is discrete or continuous
            and the number/dimensionality of states and actions. This function
            is called just before the first state is integrated.
        """
        self.conditions_ = conditions

        # create history to store experiences
        self.history = History(conditions['stateDim'], conditions['actionDim'])

    @property
    def conditions(self):
        return self.conditions_

    @property
    def episode(self):
        """ Returns the last (current) episode. """
        if len(self.history) > 0:
            return self.history[-1]
        else:
            return None

    def integrateState(self, state):
        if self.progressCnt == 0:
            self.state = copy(state)
            self.progressCnt = 1
        else:
            raise AgentException('observation was given twice before action was requested.')

    def getAction(self):
        if self.progressCnt == 1:
            self._calculate()
            self.progressCnt = 2
            return self.action
        else:
            if self.progressCnt == 0:
                raise AgentException('action was requested before observation was integrated.')
            if self.progressCnt > 1:
                raise AgentException('action was requested after reward was given.')

    def giveReward(self, reward):
        if self.progressCnt == 2:
            self.reward = reward
            self.progressCnt = 0
            if self.loggingEnabled:
                self.history.append(self.state, self.action, self.reward)
        else:
            raise AgentException('reward was given before action was returned.')

    def newEpisode(self):
        self.history.newEpisode()

    def learn(self):
        pass

    def forget(self):
        """ Deletes the entire history. """
        self.history.clear()

    def _calculate(self):
        """ This method needs to be overridden by subclasses to return a
            meaningful action; the default implementation returns a zero action.
        """
        self.action = zeros(self.conditions['actionDim'])
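
# --- Usage sketch (illustrative, not part of the original file) ---
# A minimal sketch of the interaction protocol that progressCnt enforces:
# integrateState (0 -> 1), getAction (1 -> 2), giveReward (2 -> 0). The
# RandomAgent subclass, the dummy conditions dict and the hand-rolled loop
# below are assumptions for illustration; in this package an experiment
# object normally drives the cycle and calls _setup().

from numpy import random as nprandom

class RandomAgent(Agent):
    """ Toy subclass for illustration: returns a uniform random action. """
    def _calculate(self):
        self.action = nprandom.uniform(-1., 1., self.conditions['actionDim'])

conditions = {'stateDim': 2, 'actionDim': 1,
              'discreteStates': False, 'discreteActions': False}
agent = RandomAgent()
agent._setup(conditions)                 # normally called by the experiment

for _ in range(3):                       # 3 episodes
    agent.newEpisode()
    for _ in range(5):                   # 5 steps per episode
        agent.integrateState(zeros(2))   # progressCnt: 0 -> 1
        action = agent.getAction()       # progressCnt: 1 -> 2
        agent.giveReward(0.)             # progressCnt: 2 -> 0, logs to history
    agent.learn()                        # no-op in the base class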
from numpy import array, ones, r_, random

# Agent, AgentException, FAEstimator, Linear and History are provided
# elsewhere in this package.


class BASAgent(Agent):

    alpha = 1.0
    gamma = 0.9

    def __init__(self, faClass=Linear):
        """ Initialize the agent with the function approximator class. """
        Agent.__init__(self)

        self.amin = -1.
        self.amax = 1.
        self.nres = 3

        # store (decision, action) tuples for one action in the list
        self.decisions = []
        self.faClass = faClass

    def _setup(self, conditions):
        """ Ensure the environment is continuous in states and actions,
            then create the function approximator.
        """
        Agent._setup(self, conditions)
        if self.conditions['discreteStates'] or self.conditions['discreteActions']:
            raise AgentException('BASAgent expects continuous states and actions. '
                                 'Use adapter or a different environment.')

        self.estimator = FAEstimator(self.conditions['stateDim'] + self.conditions['actionDim'],
                                     2**self.conditions['actionDim'], self.faClass)

        # change history to store bas-extended experiences
        self.history = History(conditions['stateDim'] + self.conditions['actionDim'], 1)

    def giveReward(self, reward):
        """ Override function to store the internal decisions in the history. """
        if self.progressCnt == 2:
            self.reward = reward
            self.progressCnt = 0
            if self.loggingEnabled:
                # go through internal decisions and transform them to states,
                # actions, rewards
                olda = array([(self.amax + self.amin) / 2.] * self.conditions['actionDim'])
                for i, (d, a) in enumerate(self.decisions):
                    state = r_[self.state, olda]
                    action = d
                    # only the last internal decision receives the external reward
                    if i < self.nres - 1:
                        r = 0.
                    else:
                        r = self.reward
                    self.history.append(state, action, r)
                    olda = a
        else:
            raise AgentException('reward was given before action was returned.')

    def _internalDecisions(self, state):
        """ Takes a state and queries the estimator several times in a binary
            search. Generates a (binary) decision and an action at each step.
        """
        self.decisions = []
        a = array([(self.amax + self.amin) / 2.] * self.conditions['actionDim'])
        delta = (self.amax - self.amin) * float(2**(self.nres - 1)) / (2**self.nres - 1)

        for i in range(self.nres):
            delta = delta / 2.
            decision = self.estimator.getBestAction(r_[self.state, a])

            # internal epsilon-greedy exploration
            if random.random() < 0.1:
                decision = array([random.randint(2**self.conditions['actionDim'])])

            # turn decision into a binary list of -1/+1 entries;
            # int() handles both scalar and 1-element array decisions
            blist = -1. * ones(self.conditions['actionDim'])
            for j, bit in enumerate(reversed(bin(int(decision))[2:])):
                if bit == '1':
                    blist[-j - 1] = 1.

            # update action
            a = a + delta * blist
            self.decisions.append((decision, a))

        return a

    def _calculate(self):
        """ Return the action with the maximal value for the given state. """
        self.action = self._internalDecisions(self.state)

    def learn(self):
        """ Go through the whole history once and make Q-value updates. """
        self.estimator.reset()

        for episode in self.history:
            for state, action, reward, nextstate in episode:
                # # don't consider last state
                # if equal(state, nextstate).all():
                #     break
                qvalue = self.estimator.getValue(state, action)
                bestnext = self.estimator.getValue(nextstate,
                                                   self.estimator.getBestAction(nextstate))
                target = (1 - self.alpha) * qvalue + self.alpha * (reward + self.gamma * bestnext)
                self.estimator.updateValue(state, action, target)

        self.estimator.train()
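
# --- Binary action search, worked numerically (illustrative) ---
# A standalone sketch of what _internalDecisions computes for one action
# dimension with the class defaults amin=-1, amax=1, nres=3. The fixed
# decision sequence [1, 0, 1] stands in for estimator.getBestAction(); no
# estimator is needed here. The unusual initial delta is chosen so that the
# 2**nres reachable endpoints exactly tile [amin, amax].

from numpy import array, ones

amin, amax, nres, actionDim = -1., 1., 3, 1

a = array([(amax + amin) / 2.])                                # midpoint: [0.]
delta = (amax - amin) * float(2**(nres - 1)) / (2**nres - 1)   # 8/7

for decision in [1, 0, 1]:        # stand-in for the estimator's choices
    delta = delta / 2.
    blist = -1. * ones(actionDim)
    for j, bit in enumerate(reversed(bin(decision)[2:])):
        if bit == '1':
            blist[-j - 1] = 1.
    a = a + delta * blist
    print(delta, a)

# prints (rounded): 0.571 [0.571], 0.286 [0.286], 0.143 [0.429]
# each decision bit halves the step size, homing in on one of the
# 2**nres = 8 reachable actions in [amin, amax]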
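
# Note on learn(): with the class defaults alpha = 1.0 and gamma = 0.9 the
# blended target
#     target = (1 - alpha) * qvalue + alpha * (reward + gamma * bestnext)
# reduces to the plain Q-learning target reward + 0.9 * bestnext. For
# example, qvalue = 0.5, bestnext = 1.0, reward = 0.0 gives
#     target = 0.0 * 0.5 + 1.0 * (0.0 + 0.9 * 1.0) = 0.9.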