Example #1
        for layer in range(len(self.traces)):
            self.net.layers[layer].np['b'] -= self.alpha * delta * self.traces[layer]['b']
            self.net.layers[layer].np['w'] -= self.alpha * delta * self.traces[layer]['w']

        #newQ = self.net.sim([x_t]).flatten()
        #print Q_t[a_t], deltaQ[a_t], newQ[a_t]

    def agent_end(self,reward):
        lastState = self.lastObservation.doubleArray
        lastAction = self.lastAction.intArray[0]

        # Update eligibility traces
        self.decayTraces()
        self.update(lastState, lastAction, None, 0, reward)

    def agent_cleanup(self):
        pass

    def has_diverged(self):
        value = self.net.layers[0].np['w'].sum()
        return numpy.isnan(value) or numpy.isinf(value)


if __name__=="__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(sarsa_lambda_ann)
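
The loop above applies a SARSA(λ) step to every layer of the neural network: each bias and weight array moves by alpha times the TD error times its eligibility trace. The `decayTraces` helper called in `agent_end` is not shown here; below is a minimal sketch of what such a helper is usually assumed to do with accumulating traces (illustrative names and shapes, not the pyrl API):

import numpy

def decay_traces(traces, gamma, lmbda):
    # Decay accumulating eligibility traces by gamma * lambda before the next step.
    for layer_traces in traces:
        for key in ('b', 'w'):
            layer_traces[key] *= gamma * lmbda
    return traces

# Two-layer trace store matching the {'b': ..., 'w': ...} layout used above.
traces = [{'b': numpy.ones(3), 'w': numpy.ones((3, 2))},
          {'b': numpy.ones(1), 'w': numpy.ones((1, 3))}]
decay_traces(traces, gamma=0.99, lmbda=0.9)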




Example #2
        # Update the weights with both a scalar and vector stepsize used
        # (Maybe we should actually make them both work together naturally)
        self.weights += self.rescale_update(phi_t, phi_tp, delta, reward,
                                            delta * self.traces)

    def agent_end(self, reward):
        """Receive the final reward in an episode, also signaling the end of the episode.

        Args:
            reward: The reward received for taking the last action from the previous state.
        """

        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, None, 0, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(qlearning_agent)
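
`agent_end` feeds the last transition back through `update` with `phi_tp=None` and a next-state value of 0, so the TD error collapses to the reward minus the current estimate of the last state-action value. A minimal sketch of that terminal-step error under a flattened linear value function (names and shapes are illustrative):

import numpy

def terminal_td_error(weights, phi_t, reward):
    # On the last transition of an episode there is no successor state,
    # so the bootstrap term gamma * Q(s', a') is zero.
    return reward - numpy.dot(weights.flatten(), phi_t.flatten())

weights = numpy.zeros((2, 4, 3))    # (discrete states, features, actions)
phi_t = numpy.zeros_like(weights)
phi_t[0, :, 1] = 1.0                # features of the last state/action pair
print(terminal_td_error(weights, phi_t, reward=1.0))   # -> 1.0
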
Example #3
        Args:
            inMessage: A string message sent by either the environment or experiment to the agent.

        Returns:
            A string response message.
        """
        if inMessage.lower() == "agent_diverged?":  # If we find that this is needed, we can fill it in later
            return "False"  # str(self.has_diverged(self.weights))
        else:
            return self.name + " does not understand your message."


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(ModelBasedAgent)

# If executed as a standalone script this will default to RLGlue network mode.
# Some parameters can be passed at the command line to customize behavior.
# if __name__=="__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description='Run ModelBasedAgent in network mode')
#     parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
#     parser.add_argument("--model", type=str, default="knn", help="What model class to use", choices=["knn", "randforest", "svm", "gp"])
#     parser.add_argument("--planner", type=str, default="fittedq", help="What planner class to use", choices=["fittedq"])
#     parser.add_argument("--svmde",  action='store_true', help="Use the one class SVM density estimator for known/unknown distinctions.")
#     args = parser.parse_args()

#     model_params = {}
#     planner_params = {}
#     model_class = None
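
`agent_message` is the RL-Glue side channel: the experiment or environment sends an arbitrary string and the agent answers with a string. A hypothetical usage sketch, assuming an agent instance has already been constructed (construction arguments are omitted because they are not shown in this snippet):

# agent = ModelBasedAgent(...)                     # construction not shown here
# agent.agent_message("agent_diverged?")           # -> "False"
# agent.agent_message("something unexpected")      # -> "<agent name> does not understand your message."
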
Example #4
    def agent_end(self, reward):
        """Receive the final reward in an episode, also signaling the end of the episode.

        Args:
            reward: The reward received for taking the last action from the previous state.
        """

        lastState = numpy.array(list(self.last_observation.doubleArray))
        lastAction = self.last_action.intArray[0]

        lastDiscState = self.getDiscState(self.last_observation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, None, 0, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent

    runAgent(QlearningAgent)





Example #5
    From the paper:
    Least-Squares Policy Iteration. 2003.
    Michail Lagoudakis and Ronald Parr.
    """

    name = "LSPI"

    @classmethod
    def agent_parameters(cls):
        param_set = super(LSPI, cls).agent_parameters()
        add_parameter(param_set, "lspi_threshold", default=0.001)
        return param_set

    def init_parameters(self):
        super(LSPI, self).init_parameters()
        self.threshold = self.params.setdefault('lspi_threshold', 0.001) # Threshold for convergence

    def updateWeights(self):
        # Outer loop of LSPI algorithm, repeat until policy converges
        prev_weights = None
        while (prev_weights is None) or numpy.linalg.norm(prev_weights - self.weights.ravel()) >= self.threshold:
            prev_weights = self.weights.flatten()
            super(LSPI, self).updateWeights()


if __name__=="__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(LSTD)
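
`updateWeights` wraps the inherited LSTD solve in the LSPI outer loop from Lagoudakis and Parr: keep re-solving under the greedy policy induced by the current weights until the weight vector moves by less than `lspi_threshold`. A standalone sketch of that convergence pattern, where `solve_lstd` is a stand-in for the inherited solver rather than pyrl code:

import numpy

def lspi_outer_loop(weights, solve_lstd, threshold=0.001, max_iters=100):
    # Re-solve the LSTD system under the greedy policy implied by the current
    # weights until successive weight vectors differ by less than threshold.
    for _ in range(max_iters):
        prev = weights.copy()
        weights = solve_lstd(weights)
        if numpy.linalg.norm(prev - weights) < threshold:
            break
    return weights

# Dummy "solver" that contracts toward a fixed point, just to exercise the loop.
target = numpy.array([1.0, -2.0, 0.5])
w = lspi_outer_loop(numpy.zeros(3), lambda w: 0.5 * (w + target))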

Example #6
        sarsa_lambda.sarsa_lambda.agent_init(self, taskSpec)
        self.traces = numpy.zeros((numpy.prod(self.weights.shape[:-1]) + self.weights.size,)) # combined e_t^w and e_t^v
        self.value_weights = numpy.zeros((numpy.prod(self.weights.shape[:-1]),))
        self.advantage_weights = numpy.zeros((self.weights.size,))

    def update(self, phi_t, phi_tp, reward, compatFeatures):
        phi_hat = numpy.zeros(self.traces.shape)
        phi_hat[:phi_t.size] = phi_t.flatten()
        phi_hat[phi_t.size:] = compatFeatures.flatten()

        self.traces *= self.lmbda
        self.traces += phi_hat

        delta = numpy.dot(self.value_weights, (self.gamma * phi_tp - phi_t).flatten()) + reward
        self.advantage_weights += self.beta * (delta - numpy.dot(self.advantage_weights, compatFeatures.flatten())) * self.traces[self.value_weights.size:]
        self.value_weights += self.beta * delta * self.traces[:self.value_weights.size]

        if self.step_count % self.nac_freq == 0:
            # Update the weights with both a scalar and vector stepsize used
            self.weights += self.step_sizes * self.advantage_weights.reshape(self.weights.shape) / numpy.linalg.norm(self.advantage_weights)


if __name__=="__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(nac_sarsa)
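
The last statement in `update` is the natural-gradient actor step of NAC-SARSA: every `nac_freq` steps the policy weights move along the advantage (compatible-feature) weights, normalized so that only the direction matters and `step_sizes` controls the magnitude. A minimal sketch of that step in isolation (illustrative names and shapes):

import numpy

def natural_actor_step(policy_weights, advantage_weights, step_size):
    # Move the policy weights along the unit-norm direction of the
    # compatible-feature (advantage) weights; only the direction is used.
    direction = advantage_weights / numpy.linalg.norm(advantage_weights)
    return policy_weights + step_size * direction.reshape(policy_weights.shape)

w = numpy.zeros((4, 2))
adv = numpy.arange(1.0, 9.0)        # pretend advantage weights, one per policy weight
w = natural_actor_step(w, adv, step_size=0.1)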





Example #7
        # Update the weights with both a scalar and vector stepsize used
        # (Maybe we should actually make them both work together naturally)
        self.weights += self.rescale_update(phi_t, phi_tp, delta, reward,
                                            delta * self.traces)

    def agent_end(self, reward):
        """Receive the final reward in an episode, also signaling the end of the episode.

        Args:
            reward: The reward received for taking the last action from the previous state.
        """

        lastState = numpy.array(list(self.last_observation.doubleArray))
        lastAction = self.last_action.intArray[0]

        lastDiscState = self.getDiscState(self.last_observation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, None, 0, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent

    runAgent(QlearningAgent)
Example #8
        # Update the weights
        for layer in range(len(self.traces)):
            self.net.layers[layer].np['b'] -= self.alpha * delta * self.traces[layer]['b']
            self.net.layers[layer].np['w'] -= self.alpha * delta * self.traces[layer]['w']

        #newQ = self.net.sim([x_t]).flatten()
        #print Q_t[a_t], deltaQ[a_t], newQ[a_t]

    def agent_end(self, reward):
        lastState = self.lastObservation.doubleArray
        lastAction = self.lastAction.intArray[0]

        # Update eligibility traces
        self.decayTraces()
        self.update(lastState, lastAction, None, 0, reward)

    def agent_cleanup(self):
        pass

    def has_diverged(self):
        value = self.net.layers[0].np['w'].sum()
        return numpy.isnan(value) or numpy.isinf(value)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(sarsa_lambda_ann)

Example #9
        self.value_weights = numpy.zeros((numpy.prod(self.weights.shape[:-1]),))
        self.advantage_weights = numpy.zeros((self.weights.size,))

    def update(self, phi_t, phi_tp, reward, compatFeatures):
        phi_hat = numpy.zeros(self.traces.shape)
        phi_hat[:phi_t.size] = phi_t.flatten()
        phi_hat[phi_t.size:] = compatFeatures.flatten()

        self.traces *= self.lmbda
        self.traces += phi_hat

        delta = numpy.dot(self.value_weights, (self.gamma * phi_tp - phi_t).flatten()) + reward
        self.advantage_weights += self.beta * (delta - numpy.dot(self.advantage_weights, compatFeatures.flatten())) * self.traces[self.value_weights.size:]
        self.value_weights += self.beta * delta * self.traces[:self.value_weights.size]

        if self.step_count % self.nac_freq == 0:
            # Update the weights with both a scalar and vector stepsize used
            self.weights += self.step_sizes * self.advantage_weights.reshape(
                self.weights.shape) / numpy.linalg.norm(self.advantage_weights)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(nac_sarsa)
Example #10
        """
        return numpy.dot(self.weights[discState,:,:].T, self.basis.computeFeatures(state)).argmax()

    def update(self, phi_t, state, discState, reward):
        reward = (reward - self.reward_range[0]) / (self.reward_range[1] - self.reward_range[0])
        self.step_count += 1
        state_action = numpy.where(phi_t != 0)
        if self.LEARN[state_action]: # If Learn[s,a]
            qvalues = self.getActionValues(state, discState)
            self.updates[state_action] += reward + self.gamma * qvalues.max()
            self.visit_count[state_action] += 1
            if self.visit_count[state_action] == self.m:
                if self.weights[state_action] - self.updates[state_action]/self.m >= 2. * self.epsilon:
                    self.weights[state_action] = self.updates[state_action]/self.m + self.epsilon
                    self.last_update = self.step_count
                    #print (self.weights.ravel() < self.weights.max()).sum(), self.weights.size
                elif self.update_time[state_action] >= self.last_update:
                    self.LEARN[state_action] = False
                self.update_time[state_action] = self.step_count
                self.updates[state_action] = 0
                self.visit_count[state_action] = 0
        elif self.update_time[state_action] < self.last_update:
            self.LEARN[state_action] = True

if __name__=="__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(delayed_qlearning)
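
Example #10 is the heart of delayed Q-learning: each state-action pair accumulates `m` sampled backups in `self.updates`, and the stored value is only lowered when the averaged target undershoots it by at least 2 * epsilon; otherwise the pair may stop learning until some other pair changes. A minimal sketch of that attempted-update test for a single pair, with scalar bookkeeping and illustrative names:

def attempted_update(q_sa, update_sum, m, epsilon):
    # After m sampled backups for one (s, a) pair, lower the stored value only
    # if the averaged target undershoots it by at least 2 * epsilon.
    target = update_sum / m
    if q_sa - target >= 2.0 * epsilon:
        return target + epsilon, True     # successful update
    return q_sa, False                    # keep the old value; learning for this pair may pause

print(attempted_update(q_sa=1.0, update_sum=3.0, m=10, epsilon=0.1))   # -> (0.4, True)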



Example #11
        Args:
            inMessage: A string message sent by either the environment or experiment to the agent.

        Returns:
            A string response message.
        """
        if inMessage.lower() == "agent_diverged?": # If we find that this is needed, we can fill it in later
            return "False" #str(self.has_diverged(self.weights))
        else:
            return self.name + " does not understand your message."


if __name__=="__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(ModelBasedAgent)

# If executed as a standalone script this will default to RLGlue network mode.
# Some parameters can be passed at the command line to customize behavior.
# if __name__=="__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description='Run ModelBasedAgent in network mode')
#     parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
#     parser.add_argument("--model", type=str, default="knn", help="What model class to use", choices=["knn", "randforest", "svm", "gp"])
#     parser.add_argument("--planner", type=str, default="fittedq", help="What planner class to use", choices=["fittedq"])
#     parser.add_argument("--svmde",  action='store_true', help="Use the one class SVM density estimator for known/unknown distinctions.")
#     args = parser.parse_args()

#     model_params = {}
#     planner_params = {}
#     model_class = None
Example #12
        delta = self.gamma * qvalues[a_tp] + reward - numpy.dot(self.weights.flatten(), phi_t.flatten())

        # Update the weights with both a scalar and vector stepsize used
        # (Maybe we should actually make them both work together naturally)
        self.weights += self.rescale_update(phi_t, phi_tp, delta, reward, delta * self.traces)

    def agent_end(self, reward):
        """Receive the final reward in an episode, also signaling the end of the episode.

        Args:
            reward: The reward received for taking the last action from the previous state.
        """

        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, None, 0, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent

    runAgent(qlearning_agent)
Example #13
            approxMaxGrad = numpy.exp(qvalues - logSumExp)

            # Compute gradient of smoothed TD error
            fa_grad = self.basisGradient(state)
            for a in range(self.numActions):
                deltaGrad += approxMaxGrad[a] * (fa_grad *
                                                 self.weights[discState, :, a])
            fa_grad = self.basisGradient(lastState)
            deltaGrad = self.gamma * deltaGrad - (
                fa_grad * self.weights[lastDiscState, :, lastAction])

            # Compute the update to the basis scale features
            update_fs = self.beta * delta * deltaGrad
            # Do MDA update for weights
            md_qlearn.update(self, phi_t, state, discState, reward)

            # Update frequency scaling
            update_fs += self.freq_scale
            # Change scaling on multipliers
            self.basis.multipliers = numpy.dot(
                numpy.diag(update_fs / self.freq_scale),
                self.basis.multipliers)
            self.freq_scale = update_fs
        else:
            md_qlearn.update(self, phi_t, state, discState, reward)


if __name__ == "__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(mdba_qlearn)
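
The line `approxMaxGrad = numpy.exp(qvalues - logSumExp)` is the gradient of the log-sum-exp smoothed maximum with respect to the action values, i.e. a numerically stable softmax. A self-contained sketch of that computation, assuming `logSumExp` was computed with the usual max-shift trick:

import numpy

def smoothed_max_gradient(qvalues):
    # Gradient of log-sum-exp(q) with respect to q: a numerically stable softmax.
    shift = qvalues.max()
    log_sum_exp = shift + numpy.log(numpy.exp(qvalues - shift).sum())
    return numpy.exp(qvalues - log_sum_exp)

grad = smoothed_max_gradient(numpy.array([1.0, 2.0, 3.0]))
print(grad, grad.sum())   # the components sum to 1
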
Example #14
            approxMaxGrad = numpy.exp(qvalues - logSumExp)

            # Compute gradient of smoothed TD error
            fa_grad = self.basisGradient(state)
            for a in range(self.numActions):
                deltaGrad += approxMaxGrad[a] * (fa_grad * self.weights[discState,:,a])
            fa_grad = self.basisGradient(lastState)
            deltaGrad = self.gamma * deltaGrad - (fa_grad * self.weights[lastDiscState, :,lastAction])

            # Compute the update to the basis scale features
            update_fs = self.beta * delta * deltaGrad
            # Do MDA update for weights
            md_qlearn.update(self, phi_t, state, discState, reward)

            # Update frequency scaling
            update_fs += self.freq_scale
            # Change scaling on multipliers
            self.basis.multipliers = numpy.dot(numpy.diag(update_fs/self.freq_scale), self.basis.multipliers)
            self.freq_scale = update_fs
        else:
            md_qlearn.update(self, phi_t, state, discState, reward)


if __name__=="__main__":
    from pyrl.agents.skeleton_agent import runAgent
    runAgent(mdba_qlearn)