Example #1
def logLikelihoodOfTrajectory(trajectory, theta, feature_matrix, gw, discount):
    """Log-likelihood of a trajectory under the stochastic policy induced by theta."""
    n_states, d_states = feature_matrix.shape
    r = feature_matrix.dot(theta)
    transition_probability = gw.transition_probability
    n_actions = gw.n_actions
    # Per-state action probabilities for the reward implied by theta.
    Q = value_iteration.find_policy(n_states, n_actions,
                                    transition_probability, r, discount)

    logLike = 0.0
    for i in range(np.size(trajectory, 0) - 1):
        start_state = trajectory[i][0]
        next_state = trajectory[i + 1][0]

        # Infer the action most likely to have produced the observed transition.
        mostProbableAction = 0
        mostProbableActionsProbability = transition_probability[start_state][0][next_state]
        for action in range(n_actions):
            currentActionsProbability = transition_probability[start_state][action][next_state]
            if currentActionsProbability > mostProbableActionsProbability:
                mostProbableActionsProbability = currentActionsProbability
                mostProbableAction = action

        actProb = Q[start_state][mostProbableAction]
        logLike += math.log(actProb)

    return logLike
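The inner loop above is just an argmax over the transition tensor for the observed transition. A minimal self-contained sketch of that step with toy NumPy arrays (the 4-state, 3-action tensor here is arbitrary, not the snippet's gridworld):

import numpy as np

# Toy transition tensor T[s, a, s'] = P(s' | s, a); the values are arbitrary.
rng = np.random.default_rng(0)
T = rng.dirichlet(np.ones(4), size=(4, 3))   # 4 states, 3 actions

start_state, next_state = 0, 2
# Most probable action for the observed start_state -> next_state transition,
# equivalent to the explicit loop in logLikelihoodOfTrajectory.
most_probable_action = int(np.argmax(T[start_state, :, next_state]))
print(most_probable_action)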
Example #2
def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy to how good a
    recovered reward function is.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    reward: Reward vector mapping state int to reward. Shape (N,).
    discount: Discount factor. float.
    p_start_state: Probability vector with the ith component as the probability
        that the ith state is the start state. Shape (N,).
    optimal_value: Value vector for the ground reward with optimal policy.
        The ith component is the value of the ith state. Shape (N,).
    true_reward: True reward vector. Shape (N,).
    -> Expected value difference. float.
    """

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)

    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
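Because optimal_value, value, and p_start_state are all length-N vectors, the EVD reduces to a difference of two dot products. A toy check with hand-picked numbers (purely illustrative, not produced by the functions above):

import numpy as np

# Hypothetical 3-state values under the optimal policy and under the policy
# recovered from the learned reward, plus a start-state distribution.
optimal_value = np.array([10.0, 8.0, 6.0])
value = np.array([9.5, 7.0, 6.0])
p_start_state = np.array([0.5, 0.25, 0.25])

evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
print(evd)  # 0.5: expected return lost by following the recovered-reward policy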
Example #3
    def calculate_policy(self,
                         n_actions,
                         transition_probability,
                         feature_matrix,
                         trajectories,
                         valid_actions={},
                         return_rewards=False):

        ##IRL rewards
        rewards, _, feature_rewards = self.irl(n_actions,
                                               transition_probability,
                                               feature_matrix, trajectories,
                                               valid_actions)

        ##Reconstruct policy based on learned rewards
        policy = find_policy(self.n_states,
                             self.n_actions,
                             self.transition_probability,
                             rewards,
                             self.discount,
                             stochastic=False,
                             valid_actions=self.valid_actions,
                             consider_valid_only=True)

        if return_rewards:
            return policy, feature_rewards
        else:
            return policy
Example #4
def find_expected_svf(n_states, r, n_actions, discount, transition_probability,
                      trajectories):
    """
    Find the expected state visitation frequencies (algorithm 1 from
    Ziebart et al. 2008).
    """

    n_trajectories = trajectories.shape[0]
    trajectory_length = trajectories.shape[1]

    # policy = find_policy(n_states, r, n_actions, discount,
    #                                 transition_probability)
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)

    start_state_count = np.zeros(n_states)
    for trajectory in trajectories:
        start_state_count[trajectory[0, 0]] += 1
    p_start_state = start_state_count / n_trajectories

    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T
    for t in range(1, trajectory_length):
        expected_svf[:, t] = 0
        for i, j, k in product(range(n_states), range(n_actions),
                               range(n_states)):
            expected_svf[k, t] += (
                expected_svf[i, t - 1] * policy[i, j] *  # Stochastic policy
                transition_probability[i, j, k])

    return expected_svf.sum(axis=1)
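The i, j, k loop above runs in pure Python and touches N*A*N terms per time step; the same update can be written as a single einsum over the policy and transition arrays. A sketch with random toy arrays (shapes match the snippet, but the data is made up):

import numpy as np

rng = np.random.default_rng(0)
N, A, L = 5, 3, 4                              # states, actions, horizon
T = rng.dirichlet(np.ones(N), size=(N, A))     # T[i, j, k] = P(k | i, j)
policy = rng.dirichlet(np.ones(A), size=N)     # stochastic policy, shape (N, A)
p_start = np.full(N, 1.0 / N)

expected_svf = np.tile(p_start, (L, 1)).T
for t in range(1, L):
    # Same sum as the i, j, k loop: sum_i sum_j svf[i, t-1] * policy[i, j] * T[i, j, k]
    expected_svf[:, t] = np.einsum('i,ij,ijk->k', expected_svf[:, t - 1], policy, T)
print(expected_svf.sum(axis=1))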
Example #5
	def get_policy(self):

		##Calculate optimal policy for gridworld using value iteration
		rewards = np.dot(self.colors, self.reward_weights)

		policy, multiple_state_indices = find_policy(self.n_states, self.n_actions, self.transition_probas, rewards,
													 discount=0.9, stochastic=False, valid_actions=self.valid_actions, return_multiple=True)
		return policy, multiple_state_indices
Example #6
    def find_expected_svf_MC(self, r, p_start_state):

        ##Calculates probability for each action and state - stochastic policy (lines 1-3 alg 1)
        policy = find_policy(self.n_states,
                             self.n_actions,
                             self.transition_probability,
                             r,
                             self.discount,
                             valid_actions=self.valid_actions,
                             consider_valid_only=True)

        ##Initialize svf matrix and lock
        expected_svf = np.zeros((self.n_states, self.rollout_horizon))
        expected_svf[:, 0] = p_start_state

        if self.par:
            lock_expected_svf = Lock()

        ##Run MC rollouts in parallel to estimate state visitation frequencies (100 threads at a time)
        rollouts = []
        for j in range(0, self.mc_rollouts, 100):
            if self.par:
                par_rollouts = 100 if self.mc_rollouts - j > 100 else self.mc_rollouts - j
                rollouts = []
                for i in range(par_rollouts):
                    rollout_i = Thread(name='rollout' + str(i),
                                       target=self.MC_rollout_par,
                                       args=(p_start_state, policy,
                                             expected_svf, lock_expected_svf))
                    rollouts.append(rollout_i)

                ##Start all rollout threads
                for rollout_thread in rollouts:
                    rollout_thread.start()

                ##Wait for all rollout threads to finish
                for rollout_thread in rollouts:
                    rollout_thread.join()
            else:  ##Serial option (no threading)
                rollouts.append(self.MC_rollout(p_start_state, policy))

        if not self.par:
            for state_visitation_counts in rollouts:
                np.add(expected_svf, state_visitation_counts,
                       expected_svf)  ##In-place accumulation: the third argument is the output array

        expected_svf[:, 1:] = expected_svf[:, 1:] / self.mc_rollouts
        return expected_svf.sum(axis=1), policy
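MC_rollout and MC_rollout_par are not part of this snippet; the sketch below shows what a single rollout could plausibly compute (per-timestep visitation counts on a toy MDP), matching the (n_states, rollout_horizon) shape of expected_svf above. Everything here is an assumption, not the class's actual helper:

import numpy as np

def mc_rollout_sketch(p_start_state, policy, T, horizon, rng):
    """One Monte Carlo rollout: per-timestep state visitation counts, shape (N, horizon)."""
    n_states = p_start_state.shape[0]
    counts = np.zeros((n_states, horizon))
    s = rng.choice(n_states, p=p_start_state)
    for t in range(1, horizon):
        a = rng.choice(policy.shape[1], p=policy[s])  # sample an action from the stochastic policy
        s = rng.choice(n_states, p=T[s, a])           # sample the next state from the dynamics
        counts[s, t] += 1
    return counts

rng = np.random.default_rng(0)
N, A, H = 4, 2, 5
T = rng.dirichlet(np.ones(N), size=(N, A))
policy = rng.dirichlet(np.ones(A), size=N)
print(mc_rollout_sketch(np.full(N, 0.25), policy, T, H, rng).sum(axis=1))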
Example #7
def maxent_irl(sample_paths, feature_matrix, transition_probability, discount,
               iterations, learning_rate):
    """
    Find the reward function from the list of games (sample_paths)

    sample_paths: A list of paths. One path = one game
    feature_matrix: NxD matrix (N = number of states, D = dimensionality of the state features)
    transition_probability: NxNxA (N = number of states, A = Number of actions), each element contains P(next state | current state, action a)
    discount: Discount factor for the MDP
    iterations: Number of gradient descent steps
    learning_rate: Gradient descent rate

    -> Learned weight vector theta of size D; the reward vector is feature_matrix.dot(theta).
    """
    N_STATES, _, N_ACTIONS = np.shape(transition_probability)

    # Initialize the reward weights randomly; gradient descent adjusts them below
    theta = rn.uniform(size=feature_matrix.shape[1])

    # Calculate feature expectations
    feature_expectations = np.zeros(feature_matrix.shape[1])
    for path in sample_paths:
        for state, _, _ in path:
            feature_expectations += feature_matrix[state]
    feature_expectations /= sample_paths.shape[
        0]  # Divide each element by the total number of paths

    for _ in range(iterations):
        # 1. Solve for optimal policy w.r.t. rewards with value iteration
        rewards = feature_matrix.dot(theta)  # Vector of reward values

        policy = value_iteration.find_policy(N_STATES, N_ACTIONS,
                                             transition_probability, rewards,
                                             discount)

        # 2. Solve for state visitation frequencies P(s | theta, T)
        svf = compute_svf(sample_paths, transition_probability, discount,
                          policy)

        # 3. Compute gradient
        gradient = feature_expectations - feature_matrix.T.dot(svf)

        # 4. Update theta with one gradient step
        theta += learning_rate * gradient
    # return feature_matrix.dot(theta).reshape((N_STATES, ))
    return theta
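Steps 2-4 implement the MaxEnt IRL gradient: expert feature expectations minus the feature expectations of the current policy (feature_matrix.T.dot(svf)). A self-contained sketch of just that gradient step with toy arrays (the paths and svf values are placeholders, not the snippet's game data):

import numpy as np

rng = np.random.default_rng(0)
N, D = 6, 3
feature_matrix = rng.random((N, D))

# Expert feature expectations: average summed features over demonstrated paths.
paths = [[0, 2, 3], [1, 2, 5]]                 # toy state sequences
feature_expectations = np.mean(
    [feature_matrix[path].sum(axis=0) for path in paths], axis=0)

# Expected state visitation frequencies under the current policy (placeholder values).
svf = rng.dirichlet(np.ones(N)) * len(paths[0])

theta = rng.uniform(size=D)
learning_rate = 0.01
gradient = feature_expectations - feature_matrix.T.dot(svf)
theta += learning_rate * gradient
print(theta)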
Example #8
def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy to how good a
    recovered reward function is.
    """

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)

    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
Example #9
def find_expected_svf(n_states, r, n_actions, discount, transition_probability,
                      trajectories):
    """
    Find the expected state visitation frequencies using algorithm 1 from
    Ziebart et al. 2008.

    n_states: Number of states N. int.
    r: Reward vector. NumPy array with shape (N,).
    n_actions: Number of actions A. int.
    discount: Discount factor of the MDP. float.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    trajectories: 3D array of state/action pairs. States are ints, actions
        are ints. NumPy array with shape (T, L, 2) where T is the number of
        trajectories and L is the trajectory length.
    -> Expected state visitation frequencies vector with shape (N,).
    """

    n_trajectories = trajectories.shape[0]
    trajectory_length = trajectories.shape[1]

    # policy = find_policy(n_states, r, n_actions, discount,
    #                                 transition_probability)
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)

    start_state_count = np.zeros(n_states)
    for trajectory in trajectories:
        start_state_count[trajectory[0, 0]] += 1
    p_start_state = start_state_count / n_trajectories

    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T

    for s in range(n_states):
        for t in range(1, trajectory_length):
            expected_svf[s, t] = sum([
                expected_svf[pre_s, t - 1] *
                transition_probability[pre_s, int(policy[pre_s]), s]
                for pre_s in range(n_states)
            ])

    return expected_svf.sum(axis=1)
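Because the policy here is deterministic (one action per state), the per-state sums collapse into a single matrix-vector product with the chosen-action transition matrix. A sketch with toy arrays, assuming policy is an integer vector of actions as in the loop above:

import numpy as np

rng = np.random.default_rng(0)
N, A = 5, 3
T = rng.dirichlet(np.ones(N), size=(N, A))   # T[i, a, k] = P(k | i, a)
policy = rng.integers(0, A, size=N)          # deterministic action per state
P_pi = T[np.arange(N), policy, :]            # (N, N) matrix: P(k | i, policy[i])

d_prev = np.full(N, 1.0 / N)                 # visitation distribution at t - 1
d_next = P_pi.T @ d_prev                     # same result as the per-state sums above
print(d_next)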
Example #10
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s])

    feature_matrix = ow.feature_matrix(discrete=False)
    print(feature_matrix)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
        ow.transition_probability, trajectories, epochs, learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #11
    def find_expected_svf(self, r, p_start_state):

        ##Calculates probability for each action and state (lines 1-3 alg 1)
        policy = find_policy(self.n_states,
                             self.n_actions,
                             self.transition_probability,
                             r,
                             self.discount,
                             valid_actions=self.valid_actions,
                             consider_valid_only=True)

        expected_svf = np.tile(p_start_state, (self.rollout_horizon, 1)).T
        for t in range(1, self.rollout_horizon):
            expected_svf[:, t] = 0
            for i, j, k in product(range(self.n_states), range(self.n_actions),
                                   range(self.n_states)):  #line 5 alg 1
                expected_svf[k, t] += (
                    expected_svf[i, t - 1] *
                    policy[i, j] *  ##Stochastic policy
                    self.transition_probability[i, j, k])

        return expected_svf.sum(axis=1), policy
Example #12
def repeat_find_policy(N):
    """Benchmark helper: call find_policy N times on the module-level MDP globals."""

    for _ in range(N):
        find_policy(N_STATES, rewards, N_ACTIONS, 0.99, tprob)