import random

import numpy as np

from rlglue.agent.Agent import Agent
from rlglue.types import Action
from rlglue.utils import TaskSpecVRLGLUE3

# Assumed import path: FourierBasis is the linear Fourier-basis feature
# generator used by this project. Adjust the path to match your layout.
from fourier_basis import FourierBasis


class IntraOptionLearning(Agent):
    """ This class implements intra-option learning with linear function
    approximation.

    R. S. Sutton, D. Precup, and S. Singh, "Intra-option learning about
    temporally abstract actions," in Proceedings of the Fifteenth
    International Conference on Machine Learning (ICML 1998), 1998,
    pp. 556-564.

    """
    def __init__(self, options, alpha, gamma, epsilon, fa_order):
        """
        :param options: A set of options with learnt policies
        :type options: list of Option
        :param alpha: The learning rate
        :param gamma: The discount factor
        :param epsilon: The exploration rate of the epsilon-greedy
                        behavior policy
        :param fa_order: The order of the Fourier basis

        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.fa_order = fa_order
        self.options = options
        self.current_option = None
        self.finished_learning = False

    def intraoption_update(self, reward, features, observation):
        """ Perform a step of intra-option learning.

        :param reward: The reward just obtained
        :param features: The feature representation of the current state
        :param observation: The raw observation of the current state

        """
        for i in self.consistent_options(self.last_observation,
                                         self.last_action):
            if self.options[i].terminate(observation):
                # Upon termination, bootstrap from the best option
                # that can be initiated in the current state.
                initializable_options = self.initializable_options(observation)
                current_value = np.dot(
                    self.weights[:, initializable_options].T, features).max()
            else:
                current_value = np.dot(self.weights[:, i].T, features)

            delta = reward + self.gamma*current_value \
                - np.dot(self.weights[:, i].T, self.last_features)
            # Semi-gradient TD update for linear function approximation:
            # the step is scaled by the features of the previous state.
            self.weights[:, i] += self.alpha*delta*self.last_features

    def consistent_options(self, observation, action):
        """ Find the options whose policy agrees with the last action.

        :returns: the indices of the options for which pi_o(s) = a
        :rtype: list of int

        """
        return [idx for idx in range(len(self.options))
                if self.options[idx].pi(observation) == action]

    def initializable_options(self, observation):
        """ Find the options available in the current state.

        :returns: the indices of the options that can be initiated
                  in the current state
        :rtype: list of int

        """
        return [idx for idx in range(len(self.options))
                if self.options[idx].initiate(observation)]

    def egreedy(self, observation, features):
        """ Epsilon-greedy exploration for the behavior policy.

        :param observation: The raw observation
        :param features: The feature representation of the observation
        :returns: A random option with probability epsilon, or the option
                  with the highest value with probability 1 - epsilon
        :rtype: int

        """
        initializable_options = self.initializable_options(observation)
        if not self.finished_learning and random.random() < self.epsilon:
            return random.choice(initializable_options)

        return initializable_options[np.dot(
            self.weights[:, initializable_options].T, features).argmax()]

    def mu(self, observation, features=None):
        """ The semi-Markov deterministic policy that follows an option
        to completion before starting another one.

        :param observation: The raw observation
        :param features: The feature representation of the observation
        :returns: the best option according to the current policy
        :rtype: Option

        """
        if self.current_option is None \
                or self.current_option.terminate(observation):
            self.current_option = self.options[
                self.egreedy(observation, features)]

        return self.current_option

    def agent_init(self, taskspec):
        """ This function is called once, before the first episode starts.
        It performs sanity checks on the task specification.

        :param taskspec: The task specification
        :type taskspec: str

        """
        spec = TaskSpecVRLGLUE3.TaskSpecParser(taskspec)
        if len(spec.getIntActions()) != 1:
            raise Exception("Expecting 1-dimensional discrete actions")
        if len(spec.getDoubleActions()) != 0:
            raise Exception("Expecting no continuous actions")
        if spec.isSpecial(spec.getIntActions()[0][0]):
            raise Exception(
                "Expecting min action to be a number not a special value")
        if spec.isSpecial(spec.getIntActions()[0][1]):
            raise Exception(
                "Expecting max action to be a number not a special value")

        observation_ranges = spec.getDoubleObservations()
        self.basis = FourierBasis(len(observation_ranges), self.fa_order,
                                  observation_ranges)
        self.weights = np.zeros((self.basis.numTerms, len(self.options)))

        self.last_action = 0
        self.last_features = []
        self.last_observation = []

    def agent_start(self, obs):
        """ This function is called by the environment in the initial state.

        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment
                  according to the behavior policy
        :rtype: :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation,
                                   current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action

    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode
        lasts. If learning is not frozen, the option-value function Q(s, o)
        is updated using intra-option learning.

        :param reward: The reward obtained as a result of the last transition
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment
                  according to the behavior policy
        :rtype: :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation,
                                   current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action

    def agent_end(self, reward):
        """ This function is called by the environment when the episode
        finishes. If learning is not frozen, the option-value function
        Q(s, o) is updated using intra-option learning.

        :param reward: The reward obtained as a result of the last transition

        """
        if not self.finished_learning:
            for i in self.consistent_options(self.last_observation,
                                             self.last_action):
                # Terminal update: there is no next state to bootstrap from.
                delta = reward - np.dot(self.weights[:, i].T,
                                        self.last_features)
                self.weights[:, i] += self.alpha*delta*self.last_features

    def agent_cleanup(self):
        pass

    def agent_message(self, msg):
        return "Intra-Option Learning does not understand your message."
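
# The agent above relies on exactly three methods of each option, resolved by
# duck typing: pi(observation) for the option's policy, initiate(observation)
# for its initiation set, and terminate(observation) for its termination
# condition. The class below is a minimal, hypothetical sketch of that
# interface (PrimitiveOption and its fields are assumptions, not part of the
# original code): an option wrapping a single primitive action, which can be
# initiated everywhere and terminates after one step.
class PrimitiveOption(object):
    """ A minimal option wrapping a single primitive action. """

    def __init__(self, action):
        self.action = action

    def pi(self, observation):
        # The option's policy always selects its primitive action.
        return self.action

    def initiate(self, observation):
        # A primitive option can be initiated in every state.
        return True

    def terminate(self, observation):
        # A primitive option terminates after a single step.
        return True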
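
# A minimal usage sketch, assuming the standard RL-Glue Python codec is
# installed and an rl_glue core plus a matching environment are already
# running. The option set and the hyperparameter values below are
# illustrative, not tuned.
if __name__ == "__main__":
    from rlglue.agent import AgentLoader

    # One primitive option per discrete action (3 is an arbitrary example).
    options = [PrimitiveOption(a) for a in range(3)]
    agent = IntraOptionLearning(options, alpha=0.01, gamma=0.99,
                                epsilon=0.05, fa_order=3)
    AgentLoader.loadAgent(agent)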