        # Update the weights
        for layer in range(len(self.traces)):
            self.net.layers[layer].np['b'] -= self.alpha * delta * self.traces[layer]['b']
            self.net.layers[layer].np['w'] -= self.alpha * delta * self.traces[layer]['w']
        #newQ = self.net.sim([x_t]).flatten()
        #print Q_t[a_t], deltaQ[a_t], newQ[a_t]

    def agent_end(self, reward):
        lastState = self.lastObservation.doubleArray
        lastAction = self.lastAction.intArray[0]

        # Update eligibility traces
        self.decayTraces()
        self.update(lastState, lastAction, None, 0, reward)

    def agent_cleanup(self):
        pass

    def has_diverged(self):
        value = self.net.layers[0].np['w'].sum()
        return numpy.isnan(value) or numpy.isinf(value)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(sarsa_lambda_ann)
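# Illustrative sketch (not part of pyrl): the same trace-weighted TD(lambda) step
# as the per-layer loop above, applied to a plain linear value function so the
# roles of alpha, delta, and the eligibility traces are easy to see. All names
# and constants below are assumptions made for the example.
import numpy

alpha, gamma, lmbda = 0.1, 0.99, 0.9

def td_lambda_step(weights, traces, phi_t, phi_tp, reward):
    """Decay the traces, accumulate the current features, then apply the TD error in place."""
    v_t = weights.dot(phi_t)
    v_tp = 0.0 if phi_tp is None else weights.dot(phi_tp)
    delta = reward + gamma * v_tp - v_t   # target minus prediction
    traces *= gamma * lmbda               # decay, as decayTraces() does above
    traces += phi_t
    # '+=' because delta here is target minus prediction; the fragment above
    # presumably defines its delta with the opposite sign, hence its '-='.
    weights += alpha * delta * traces
    return delta

w = numpy.zeros(4)   # plays the role of one layer's np['w']
e = numpy.zeros(4)   # plays the role of self.traces[layer]['w']
td_lambda_step(w, e, numpy.array([1.0, 0.0, 0.5, 0.0]),
               numpy.array([0.0, 1.0, 0.0, 0.5]), reward=1.0)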
        Args:
            inMessage: A string message sent by either the environment or experiment to the agent.

        Returns:
            A string response message.
        """
        if inMessage.lower() == "agent_diverged?":
            # If we find that this is needed, we can fill it in later
            return "False"  # str(self.has_diverged(self.weights))
        else:
            return self.name + " does not understand your message."


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(ModelBasedAgent)

# If executed as a standalone script this will default to RLGlue network mode.
# Some parameters can be passed at the command line to customize behavior.
#
# if __name__ == "__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description='Run ModelBasedAgent in network mode')
#     parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
#     parser.add_argument("--model", type=str, default="knn", help="What model class to use", choices=["knn", "randforest", "svm", "gp"])
#     parser.add_argument("--planner", type=str, default="fittedq", help="What planner class to use", choices=["fittedq"])
#     parser.add_argument("--svmde", action='store_true', help="Use the one class SVM density estimator for known/unknown distinctions.")
#     args = parser.parse_args()
#
#     model_params = {}
#     planner_params = {}
#     model_class = None
    From the paper:
    Least-Squares Policy Iteration. 2003.
    Michail Lagoudakis and Ronald Parr.
    """

    name = "LSPI"

    @classmethod
    def agent_parameters(cls):
        param_set = super(LSPI, cls).agent_parameters()
        add_parameter(param_set, "lspi_threshold", default=0.001)
        return param_set

    def init_parameters(self):
        super(LSPI, self).init_parameters()
        self.threshold = self.params.setdefault('lspi_threshold', 0.001)  # Threshold for convergence

    def updateWeights(self):
        # Outer loop of LSPI algorithm, repeat until policy converges
        prev_weights = None
        while (prev_weights is None) or numpy.linalg.norm(prev_weights - self.weights.ravel()) >= self.threshold:
            prev_weights = self.weights.flatten()
            super(LSPI, self).updateWeights()


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(LSTD)
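# Illustrative sketch (not pyrl's implementation): the LSPI loop that updateWeights()
# expresses above, written out for a fixed batch of samples. The sample format
# (s, a, r, s_next) and the feature function phi(s, a) are assumptions made for
# the example, not pyrl's API.
import numpy

def lstdq_weights(samples, phi, n_actions, weights, gamma=0.99):
    """Solve the LSTD-Q system A w = b for the policy that is greedy w.r.t. `weights`."""
    k = phi(samples[0][0], 0).size
    A = numpy.eye(k) * 1e-6              # small ridge term keeps A invertible
    b = numpy.zeros(k)
    for s, a, r, s_next in samples:
        phi_sa = phi(s, a)
        if s_next is None:               # terminal transition: no bootstrap term
            phi_next = numpy.zeros(k)
        else:
            a_next = max(range(n_actions), key=lambda act: weights.dot(phi(s_next, act)))
            phi_next = phi(s_next, a_next)
        A += numpy.outer(phi_sa, phi_sa - gamma * phi_next)
        b += phi_sa * r
    return numpy.linalg.solve(A, b)

def lspi(samples, phi, n_actions, k, threshold=0.001, max_iters=50):
    """Alternate evaluation and improvement until the weights move less than threshold."""
    weights = numpy.zeros(k)
    for _ in range(max_iters):
        new_weights = lstdq_weights(samples, phi, n_actions, weights)
        if numpy.linalg.norm(new_weights - weights) < threshold:
            return new_weights
        weights = new_weights
    return weights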
        sarsa_lambda.sarsa_lambda.agent_init(self, taskSpec)
        self.traces = numpy.zeros((numpy.prod(self.weights.shape[:-1]) + self.weights.size,))  # combined e_t^w and e_t^v
        self.value_weights = numpy.zeros((numpy.prod(self.weights.shape[:-1]),))
        self.advantage_weights = numpy.zeros((self.weights.size,))

    def update(self, phi_t, phi_tp, reward, compatFeatures):
        phi_hat = numpy.zeros(self.traces.shape)
        phi_hat[:phi_t.size] = phi_t.flatten()
        phi_hat[phi_t.size:] = compatFeatures.flatten()

        self.traces *= self.lmbda
        self.traces += phi_hat

        delta = numpy.dot(self.value_weights, (self.gamma * phi_tp - phi_t).flatten()) + reward
        self.advantage_weights += self.beta * (delta - numpy.dot(self.advantage_weights, compatFeatures.flatten())) * self.traces[self.value_weights.size:]
        self.value_weights += self.beta * delta * self.traces[:self.value_weights.size]

        if self.step_count % self.nac_freq == 0:
            # Update the weights with both a scalar and vector stepsize used
            self.weights += self.step_sizes * self.advantage_weights.reshape(self.weights.shape) / numpy.linalg.norm(self.advantage_weights)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(nac_sarsa)
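# Illustrative sketch (not pyrl's code): the periodic actor step from the branch
# above, in isolation. In NAC-SARSA the advantage weights estimate the natural
# gradient, so the policy weights move a normalized step in that direction.
# Shapes, names, and the zero-norm guard below are assumptions for the example.
import numpy

def natural_actor_step(policy_weights, advantage_weights, step_sizes):
    """Move the policy weights along the normalized natural-gradient estimate, in place."""
    direction = advantage_weights.reshape(policy_weights.shape)
    norm = numpy.linalg.norm(advantage_weights)
    if norm > 0.0:   # guard a division the fragment above performs unconditionally
        policy_weights += step_sizes * direction / norm
    return policy_weights

# Example: a 3-feature, 2-action weight matrix nudged toward the advantage estimate.
w = numpy.zeros((3, 2))
adv = numpy.array([0.5, -0.2, 0.1, 0.0, 0.3, -0.4])
natural_actor_step(w, adv, step_sizes=0.05)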
        # Update the weights with both a scalar and vector stepsize used
        # (Maybe we should actually make them both work together naturally)
        self.weights += self.rescale_update(phi_t, phi_tp, delta, reward, delta * self.traces)

    def agent_end(self, reward):
        """Receive the final reward in an episode, also signaling the end of the episode.

        Args:
            reward: The reward received for taking the last action from the previous state.
        """
        lastState = numpy.array(list(self.last_observation.doubleArray))
        lastAction = self.last_action.intArray[0]
        lastDiscState = self.getDiscState(self.last_observation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, None, 0, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(QlearningAgent)
""" return numpy.dot(self.weights[discState,:,:].T, self.basis.computeFeatures(state)).argmax() def update(self, phi_t, state, discState, reward): reward = (reward - self.reward_range[0]) / (self.reward_range[1] - self.reward_range[0]) self.step_count += 1 state_action = numpy.where(phi_t != 0) if self.LEARN[state_action]: # If Learn[s,a] qvalues = self.getActionValues(state, discState) self.updates[state_action] += reward + self.gamma * qvalues.max() self.visit_count[state_action] += 1 if self.visit_count[state_action] == self.m: if self.weights[state_action] - self.updates[state_action]/self.m >= 2. * self.epsilon: self.weights[state_action] = self.updates[state_action]/self.m + self.epsilon self.last_update = self.step_count #print (self.weights.ravel() < self.weights.max()).sum(), self.weights.size elif self.update_time[state_action] >= self.last_update: self.LEARN[state_action] = False self.update_time[state_action] = self.step_count self.updates[state_action] = 0 self.visit_count[state_action] = 0 elif self.update_time[state_action] < self.last_update: self.LEARN[state_action] = True if __name__=="__main__": from pyrl.agents.skeleton_agent import runAgent runAgent(delayed_qlearning)
        delta = self.gamma * qvalues[a_tp] + reward - numpy.dot(self.weights.flatten(), phi_t.flatten())

        # Update the weights with both a scalar and vector stepsize used
        # (Maybe we should actually make them both work together naturally)
        self.weights += self.rescale_update(phi_t, phi_tp, delta, reward, delta * self.traces)

    def agent_end(self, reward):
        """Receive the final reward in an episode, also signaling the end of the episode.

        Args:
            reward: The reward received for taking the last action from the previous state.
        """
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, None, 0, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(qlearning_agent)
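# Illustrative sketch (not pyrl's code): how a (discrete state, feature, action)
# weight array like the one indexed above maps basis features to per-action
# Q-values, and how the terminal step in agent_end (no next features, zero
# bootstrap) enters the TD error. Sizes and names are assumptions for the example.
import numpy

n_disc, n_feats, n_actions = 2, 4, 3
gamma, alpha = 0.99, 0.1
weights = numpy.zeros((n_disc, n_feats, n_actions))

def action_values(features, disc_state):
    """Q(s, .) = W[disc_state].T dot phi(s): one value per action."""
    return numpy.dot(weights[disc_state, :, :].T, features)

def td_error(features_t, disc_t, action_t, features_tp, disc_tp, reward):
    """delta = r + gamma * max_a Q(s', a) - Q(s, a); a terminal s' contributes 0."""
    q_sa = action_values(features_t, disc_t)[action_t]
    q_next = 0.0 if features_tp is None else action_values(features_tp, disc_tp).max()
    return reward + gamma * q_next - q_sa

# Terminal transition, as in agent_end: next features are None, bootstrap value is 0.
phi = numpy.array([1.0, 0.5, 0.0, 0.0])
delta = td_error(phi, 0, 1, None, None, reward=1.0)
weights[0, :, 1] += alpha * delta * phi   # plain gradient step; the agent above routes this through rescale_update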
            approxMaxGrad = numpy.exp(qvalues - logSumExp)

            # Compute gradient of smoothed TD error
            fa_grad = self.basisGradient(state)
            for a in range(self.numActions):
                deltaGrad += approxMaxGrad[a] * (fa_grad * self.weights[discState, :, a])

            fa_grad = self.basisGradient(lastState)
            deltaGrad = self.gamma * deltaGrad - (fa_grad * self.weights[lastDiscState, :, lastAction])

            # Compute the update to the basis scale features
            update_fs = self.beta * delta * deltaGrad

            # Do MDA update for weights
            md_qlearn.update(self, phi_t, state, discState, reward)

            # Update frequency scaling
            update_fs += self.freq_scale

            # Change scaling on multipliers
            self.basis.multipliers = numpy.dot(numpy.diag(update_fs / self.freq_scale), self.basis.multipliers)
            self.freq_scale = update_fs
        else:
            md_qlearn.update(self, phi_t, state, discState, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(mdba_qlearn)
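# Illustrative sketch (not pyrl's code): the smoothed max used above. log-sum-exp
# is a differentiable stand-in for max(q), and its gradient with respect to q is
# the softmax vector, which is what numpy.exp(qvalues - logSumExp) computes in
# the fragment. Only the names below are assumptions for the example.
import numpy

def smoothed_max(qvalues):
    """Numerically stable log-sum-exp upper bound on max(qvalues)."""
    shift = qvalues.max()
    return shift + numpy.log(numpy.exp(qvalues - shift).sum())

def smoothed_max_grad(qvalues):
    """Gradient of log-sum-exp: softmax weights in (0, 1) that sum to 1."""
    return numpy.exp(qvalues - smoothed_max(qvalues))

q = numpy.array([1.0, 2.0, 0.5])
grad = smoothed_max_grad(q)
assert abs(grad.sum() - 1.0) < 1e-12   # softmax weights sum to 1
assert smoothed_max(q) >= q.max()      # the smoothed value upper-bounds the hard max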