Example #1

import time

import numpy as np
import rospy
from geometry_msgs.msg import PoseArray
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import spsolve
from visualization_msgs.msg import Marker

# Project-local modules; exact import paths are assumed.
from markov_model import MarkovModel
from action import Action

class MDP(object):
    def __init__(self, num_positions=1000, num_orientations=10, map=None, grid_debug=False, seed=False):
        # Build the markov model
        self.markov_model = MarkovModel(num_positions=num_positions, num_orientations=num_orientations, map=map)
        self.markov_model.make_states(grid_debug=grid_debug, seed=seed)
        self.markov_model.build_roadmap()

        self.num_states = num_positions * num_orientations
        if grid_debug:
            self.num_states = self.markov_model.num_states

        self.goal_state_idx = 0
        self.reward_radius = 0
        self.rewards = np.empty(self.num_states)

        self.policy = []  # Action to take at each state idx in markov_model.model_states
        self.value_function = []  # Expected long-run reward of being in each state under the policy

        # Visualization
        self.turn_right_pub = rospy.Publisher('/right_pose_array', PoseArray, queue_size=10)
        self.turn_left_pub = rospy.Publisher('/left_pose_array', PoseArray, queue_size=10)
        self.forward_pub = rospy.Publisher('/forward_pose_array', PoseArray, queue_size=10)
        self.goal_state_pub = rospy.Publisher('/goal_state_marker', Marker, queue_size=10)

    """
        Function: set_goal_state
        Inputs: State state
                float reward_radius

        Sets the state that the robot wants to end up in, as well as the
        radius around this state which will count as close enough.
    """
    def set_goal_state(self, state, reward_radius=0.5):
        self.goal_state_idx = self.markov_model.get_closest_state_idx(state)
        # print(self.goal_state_idx)
        # print(self.markov_model.model_states[self.goal_state_idx])
        self.reward_radius = reward_radius

    """
        Function: set_rewards
        Inputs:

        Generates a vector assigning a low reward to all states outside the
        reward radius around the goal and a high reward to all states within
        it, which can include multiple orientations at the same position.

    """
    def set_rewards(self):
        high_reward = 10
        low_reward = -1

        goal_state = self.markov_model.model_states[self.goal_state_idx]
        for state_idx in range(self.num_states):
            s = self.markov_model.model_states[state_idx]
            if goal_state.distance_to(s) < self.reward_radius:
                self.rewards[state_idx] = high_reward
            else:
                self.rewards[state_idx] = low_reward

    """
        Function: get_policy
        Inputs: State goal_state - new goal state to be set.

        Calculates the optimal policy for a given (or saved) goal state by
        iteratively solving a value function (how much payoff each action at
        a state will have in the long run) to update the action to take in each
        state (policy).
        Returns the policy, num of iterations, time of compuation (modelled after
        mdptoolbox outputs)

    """
    def get_policy(self, goal_state=None):
        if goal_state is not None:
            self.set_goal_state(goal_state)

        # Generate reward vector
        self.set_rewards()

        # Generate a starting, random policy from all available actions
        start_time = time.time()
        self.policy = self.get_random_policy()

        change = -1
        iteration_count = 0
        while change != 0 and iteration_count < 30:
            print("iteration {}".format(iteration_count))
            iteration_count += 1
            # Find V of this policy.
            solved = self.solve_value_function()
            if solved:
                # Calculate new policy.
                change = self.get_new_policy()
                print(change)
            else:
                print("Failed to solve for a value function, trying a new random policy")
                # The value function system was singular; restart from a random policy.
                self.policy = self.get_random_policy()
        elapsed = time.time() - start_time
        print("Converged in {} iterations".format(iteration_count))
        print("MDP solved in {} s\n".format(elapsed))
        # print(self.policy)
        return (self.policy, iteration_count, elapsed)

    """
        Function: get_random_policy
        Inputs:

        Returns a randomly-assigned policy, or action to be taken in each state.

    """
    def get_random_policy(self):
        # np.random.choice accepts an int directly, so no range/xrange is needed.
        return np.random.choice(len(Action.get_all_actions()), self.num_states, replace=True)

    """
        Function: get_new_policy
        Inputs:

        Computes a new policy by looping through each start state and assigning
        the action that generates the highest reward long-term.
        Returns 0 if no changes to the policy were made. Any other integer
        means that different actions were chosen as compared to the previous
        policy.

    """
    def get_new_policy(self):
        gamma = 0.999
        total_change = 0
        for state_idx in range(self.num_states):
            ps_matrix = self.build_ps_matrix(state_idx)
            # Expected long-run reward of each action from this state (Bellman backup).
            state_rewards = self.rewards[state_idx] + gamma * ps_matrix.dot(self.value_function)
            idx_action = state_rewards.argmax()
            # Count changed actions; summing signed differences could cancel out.
            total_change += int(self.policy[state_idx] != idx_action)
            self.policy[state_idx] = idx_action
        return total_change

    """
        Function: solve_value_function
        Inputs:

        Solves for the value function which calculates the infinite horizon or
        payoof at infinite time at each state if following the current policy.
        Returns false if the matrix is non-invertible.

    """
    def solve_value_function(self, gamma=0.999):
        print("Solving value function")
        # Build P matrix: transition probabilities under the current policy.
        p_matrix = self.build_p_matrix()

        # Solve the Bellman system (I - gamma * P) V = R for the value function V.
        A = csr_matrix(np.identity(self.num_states) - gamma * p_matrix)
        b = self.rewards

        self.value_function = spsolve(A, b)
        # spsolve signals a singular system by returning NaNs (with a
        # MatrixRankWarning) instead of raising, so check for that here.
        if np.isnan(self.value_function).any():
            return False
        return True

    """
        Function: build_p_matrix
        Inputs:

        Returns a num_states x num_states matrix with each element representing
        the probability of transitioning from a start_state to end_state with
        the action defined in the policy.

    """
    def build_p_matrix(self):
        p_matrix = np.empty([self.num_states, self.num_states])
        for start_state_idx in range(self.num_states):
            action_idx = self.policy[start_state_idx]
            p_matrix[start_state_idx] = self.markov_model.roadmap[action_idx, start_state_idx, :]
        return p_matrix

    """
        Function: build_ps_matrix
        Inputs: int start_state_idx

        Returns a num_actions x num_states matrix with each element representing
        the probability of transitioning from a start_state to end_state with
        the action specified in each row.

    """
    def build_ps_matrix(self, start_state_idx):
        # print("Building PS matrix")
        return self.markov_model.roadmap[:, start_state_idx, :]

    def get_goal_state_idx(self, state):
        return self.markov_model.get_closest_state_idx(state)

    def visualize_policy(self, policy, goal_state):
        print("Visualizing policy")
        goal_state_idx = self.get_goal_state_idx(goal_state)
        # Visualize the goal state as sphere.
        self.goal_state_pub.publish(self.markov_model.model_states[goal_state_idx].get_marker())

        turn_left_array = PoseArray()
        turn_right_array = PoseArray()
        forward_array = PoseArray()

        turn_left_array.header.frame_id = "map"
        turn_right_array.header.frame_id = "map"
        forward_array.header.frame_id = "map"

        all_actions = Action.get_all_actions()
        # Add the pose of each state to the array for its policy action
        for state_idx in range(len(policy)):
            action = all_actions[policy[state_idx]]
            state_pose = self.markov_model.model_states[state_idx].get_pose()

            if action == Action.LEFT:
                turn_left_array.poses.append(state_pose)
            elif action == Action.RIGHT:
                turn_right_array.poses.append(state_pose)
            else:
                forward_array.poses.append(state_pose)

        self.turn_left_pub.publish(turn_left_array)
        self.turn_right_pub.publish(turn_right_array)
        self.forward_pub.publish(forward_array)
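
For reference, the policy-iteration core above alternates two steps: solve_value_function() solves the Bellman linear system V = R + gamma * P * V, i.e. (I - gamma * P) V = R, for the value of the current policy, and get_new_policy() then greedily improves the policy against V. Below is a minimal, self-contained sketch of that same linear solve on a hypothetical two-state chain; every name in it is illustrative and not part of the class above.

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import spsolve

# Hypothetical 2-state, single-action chain: state 0 -> state 1, state 1 absorbing.
P = np.array([[0.0, 1.0],
              [0.0, 1.0]])
R = np.array([-1.0, 10.0])   # same low/high reward scheme as set_rewards()
gamma = 0.999

# The system solve_value_function() builds: (I - gamma * P) V = R.
V = spsolve(csr_matrix(np.identity(2) - gamma * P), R)

# Closed form for comparison: V[1] = R[1] / (1 - gamma), V[0] = R[0] + gamma * V[1].
assert np.allclose(V, [-1.0 + gamma * 10.0 / (1.0 - gamma), 10.0 / (1.0 - gamma)])

And a hedged usage sketch of the class itself; the State constructor and the map handle are assumptions, since neither is shown in this example.

mdp = MDP(num_positions=500, num_orientations=8, map=my_map)  # my_map: hypothetical map handle
goal = State(x=1.0, y=2.0, theta=0.0)                         # hypothetical State construction
policy, iterations, elapsed = mdp.get_policy(goal_state=goal)
mdp.visualize_policy(policy, goal)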
Example #2

import mdptoolbox.mdp
import numpy as np
import rospy
from geometry_msgs.msg import PoseArray
from visualization_msgs.msg import Marker

# Project-local modules; exact import paths are assumed.
from markov_model import MarkovModel
from action import Action

class MDPToolbox(object):
    def __init__(self, num_positions=1000, num_orientations=10, map=None, grid_debug=False):
        # Build the markov model
        self.markov_model = MarkovModel(num_positions=num_positions, num_orientations=num_orientations, map=map)
        self.markov_model.make_states(grid_debug=grid_debug)
        self.markov_model.build_roadmap()
        self.num_states = num_positions * num_orientations # S
        if grid_debug:
            self.num_states = self.markov_model.num_states
        # Visualization
        self.turn_right_pub = rospy.Publisher('/right_pose_array', PoseArray, queue_size=10)
        self.turn_left_pub = rospy.Publisher('/left_pose_array', PoseArray, queue_size=10)
        self.forward_pub = rospy.Publisher('/forward_pose_array', PoseArray, queue_size=10)
        self.goal_state_pub = rospy.Publisher('/goal_state_marker', Marker, queue_size=10)

    """
        Function: get_policy
        Inputs: State goal_state
                float reward_radius

        Uses mdptoolbox policy iteration to generate a policy to reach the
        specified goal, within the specified radius.
        Returns the policy (a tuple of length S), the number of iterations,
        and the CPU time.

    """
    def get_policy(self, goal_state, reward_radius=0.15):
        transition_probabilities = self.markov_model.roadmap # Shape (A, S, S)
        goal_state_idx = self.get_goal_state_idx(goal_state)
        print("Goal state idx: {}".format(goal_state_idx))
        rewards = self.get_rewards(goal_state_idx, reward_radius) # Shape (S,)

        discount = 0.9 # How much weight to give to future values
        pi = mdptoolbox.mdp.PolicyIteration(transition_probabilities, rewards, discount)
        pi.run()

        return pi.policy, pi.iter, pi.time

    """
        Function: get_goal_state_idx
        Inputs: State state

        Gets the closest existing state in the roadmap to the goal.
        Returns the goal_state_idx.
    """
    def get_goal_state_idx(self, state):
        goal_state_idx = self.markov_model.get_closest_state_idx(state)
        print(goal_state_idx)
        return goal_state_idx

    """
        Function: get_rewards
        Inputs: int goal_state_idx
                float reward_radius

        Generates a vector assigning a low reward to all states outside the
        reward radius around the goal and a high reward to all states within
        it, which can include multiple orientations at the same position.
        Returns an array of shape (S,) assigning a reward to being in each state.

    """
    def get_rewards(self, goal_state_idx, reward_radius=0.15):
        high_reward = 10
        low_reward = -1

        rewards = np.empty((self.num_states,))
        goal_state = self.markov_model.model_states[goal_state_idx]

        for state_idx in range(self.num_states):
            s = self.markov_model.model_states[state_idx]
            if goal_state.distance_to(s) < reward_radius:
                rewards[state_idx] = high_reward
            else:
                rewards[state_idx] = low_reward
        return rewards

    """
        Function: visualize_policy
        Inputs: array policy
                State goal_state

        Publishes a sphere marker at the goal state.
        Generates three pose arrays corresponding to the three different actions.
        Depending on which action the policy says to execute at a state, the
        pose of this state is added to the corresponding pose array.

    """
    def visualize_policy(self, policy, goal_state):
        print("Visualizing policy")
        goal_state_idx = self.get_goal_state_idx(goal_state)
        # Visualize the goal state as sphere.
        self.goal_state_pub.publish(self.markov_model.model_states[goal_state_idx].get_marker())

        turn_left_array = PoseArray()
        turn_right_array = PoseArray()
        forward_array = PoseArray()

        turn_left_array.header.frame_id = "map"
        turn_right_array.header.frame_id = "map"
        forward_array.header.frame_id = "map"

        all_actions = Action.get_all_actions()
        # Add the pose of each state to the array for its policy action
        for state_idx in range(len(policy)):
            action = all_actions[policy[state_idx]]
            state_pose = self.markov_model.model_states[state_idx].get_pose()

            if action == Action.LEFT:
                turn_left_array.poses.append(state_pose)
            elif action == Action.RIGHT:
                turn_right_array.poses.append(state_pose)
            else:
                forward_array.poses.append(state_pose)

        self.turn_left_pub.publish(turn_left_array)
        self.turn_right_pub.publish(turn_right_array)
        self.forward_pub.publish(forward_array)
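
To sanity-check the mdptoolbox call pattern used in get_policy() above, here is a minimal self-contained run against the library's built-in forest-management example; mdptoolbox.example.forest() ships with pymdptoolbox and returns transitions in the same (A, S, S) layout as the roadmap.

import mdptoolbox.example
import mdptoolbox.mdp

P, R = mdptoolbox.example.forest()   # P: shape (A, S, S), R: shape (S, A)
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
pi.run()
print(pi.policy, pi.iter, pi.time)   # the same (policy, iterations, time) triple returned above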