Example 1
def main(grid_size, discount_factor, epochs, learning_rate):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Reads the trajectory data from the provided csv files and returns the trajectories
    # as a numpy array with one entry per trajectory
    trajectories = robot_mdp.generate_trajectories()
    # Finds the length of the trajectories data
    n_trajectories = len(trajectories)
    # Initialize the IRL class object with the number of trajectories (currently 3) and the grid size
    irl = MaxEntIRL(n_trajectories, grid_size)
    # For storing results
    filename = "/home/vignesh/PycharmProjects/dvrk_automated_suturing/data/trial4_grid11_parallel/weights_grid11.txt"
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(trajectories, discount_factor,
                                      n_trajectories, epochs, learning_rate,
                                      filename)

    print "r is ", reward.reshape((grid_size, grid_size), order='F')
    # print "r shape ", reward.shape
    print "weights is ", weights.reshape((grid_size, grid_size), order='F')
    # print "policy is ", policy[0][0]
    file_open = open(filename, 'a')
    file_open.write("\n \n \n \n Final result \n")
    savetxt(filename, weights, delimiter=',', fmt="%10.5f", newline=", ")
    file_open.close()
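
These scripts are normally run as standalone entry points. A minimal, hypothetical invocation of this `main` might look like the following; the hyperparameter values are illustrative only (the grid size of 11 is suggested by the output filename but is not confirmed by the snippet):

if __name__ == '__main__':
    # Illustrative values only; the original project's hyperparameters may differ.
    main(grid_size=11, discount_factor=0.9, epochs=200, learning_rate=0.01)
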
Example 2
def main(discount, epochs, learning_rate, weights, trajectory_length,
         n_policy_iter):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Initialize the IRL class object with the trajectory length as input (currently 1)
    irl = MaxEntIRL(trajectory_length)

    # trajectory_features_array, trajectory_rewards_array = robot_mdp.trajectories_features_rewards_array(weights)
    # Finds the summed features of the expert trajectory and the list of all its per-step features
    sum_trajectory_features, complete_features_array = robot_mdp.generate_trajectories()
    # print "traj array ", trajectory_features_array.shape, trajectory_features_array
    # print "complete array ", complete_features_array.shape, complete_features_array
    # print trajectory_rewards_array
    # print model_rot_par_r
    # Returns the state and action sequences of the expert trajectory
    state_trajectory_array, action_trajectory_array = robot_mdp.trajectories_data()
    # Finds the length of the trajectories data
    n_trajectories = len(state_trajectory_array)
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, alpha = irl.max_ent_irl(sum_trajectory_features,
                                    complete_features_array, discount,
                                    n_trajectories, epochs, learning_rate,
                                    n_policy_iter, weights)

    print "r is ", reward
    print "alpha is ", alpha
Example 3
def main(grid_size, discount_factor, epochs, learning_rate):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Reads the trajectory data from the provided csv files and returns the trajectories
    # as a numpy array with one entry per trajectory
    trajectories = robot_mdp.generate_trajectories()
    # Finds the length of the trajectories data
    n_trajectories = len(trajectories)
    # Initialize the IRL class object with the number of trajectories (currently 3) and the grid size
    irl = MaxEntIRL(n_trajectories, grid_size)
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(trajectories, discount_factor, n_trajectories, epochs, learning_rate)

    print "r is ", reward.reshape((grid_size, grid_size), order='F')
    # print "r shape ", reward.shape
    print "weights is ", weights.reshape((grid_size, grid_size), order='F')
Example 4
def main(discount, epochs, learning_rate):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Finds the summed features of the expert trajectory and the list of all its per-step features
    sum_trajectory_features, feature_array_all_trajectories = robot_mdp.generate_trajectories()
    # Finds the length of the trajectories data
    n_trajectories = len(feature_array_all_trajectories)
    # Initialize the IRL class object with the number of trajectories (currently 1)
    irl = MaxEntIRL(n_trajectories)
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(sum_trajectory_features, feature_array_all_trajectories, discount,
                                      n_trajectories, epochs, learning_rate)

    print "r is ", reward
    # print "r shape ", reward.shape
    print "weights is ", weights
Example 5
    def __init__(self,
                 window_size=800,
                 grid_size=5,
                 cell_size=1,
                 gamma=0.9,
                 world='obj',
                 stochastic=False):
        MaxEntIRL.__init__(self,
                           window_size,
                           grid_size,
                           cell_size,
                           gamma=gamma,
                           world=world,
                           stochastic=stochastic)
        self.object_world = self.mdp_gw
        self.sigma_sq = 0.5e-2
Example 6
    def main(self):
        np.random.seed(0)
        
        # phi
        phi = [self._feature_map(s) for s in range(self.observation_space().n)]
        phi = np.array(phi)

        print('Collect dataset ...')
        in_dataset = self.collect_demo()
        plot_dataset_distribution(in_dataset, (self.f1_length, self.f2_length), "Dataset State Distribution")
        print('Done!')

        # IRL
        print('Building IRL object ...')
        me_irl = MaxEntIRL(
            observation_space=self.observation_space(),
            action_space=self.action_space(),
            transition=self._construct_transition_fn(),
            goal_states=self.goal_states,
            dataset=in_dataset,
            feature_map=phi,
            max_iter=20,
            anneal_rate=0.9)
        print('Done!')

        print('\nStart training ...')
        Rprime = me_irl.train()
        Rprime = Rprime.reshape((self.f1_length, self.f2_length)).T

        print('Boundary index = ', self.get_boundary(Rprime))
        print('Radius = ', self.f1_idx[self.get_boundary(Rprime)])
        print(self.f1_idx)

        # plot results
        plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
        plt.show()
        return
Example 7
def test_feature_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 15

    init_pos = np.zeros((N, N), dtype=float)
    for i, j in product(range(N // 2 + 2, N), range(N // 2 + 2, N)):
        init_pos[i, j] = i**2 + j**2
    init_pos /= np.sum(init_pos)
    goal_pos = np.array([[n, 0] for n in range(N)])

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_feature_boundary_reward(
        grid,
        boundary_axis=0,
        boundary_value=N // 2,
        reward=-10,
        exp_constant=0.2,
    )

    plot_grid_map(init_pos.T,
                  "Initial Position Distribution",
                  cmap=plt.cm.Blues)
    plot_grid_map(grid.T, "Reward (Ground Truth)", cmap=plt.cm.Reds)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=init_pos,
        goal_pos=goal_pos,
        reward_grid=grid,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo,
                                    env.transition)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy,
                                   env=env,
                                   num_trajectories=20,
                                   maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # IRL feature map
    feature_map = [
        env._feature_map(s) for s in range(env.observation_space().n)
    ]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", cmap=plt.cm.Blues)
    plt.show()
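
The gradient shown earlier needs the learner's expected state visitation frequencies, which MaxEnt IRL obtains by propagating the start-state distribution through the current policy and the transition model. Below is a minimal sketch of that forward pass, assuming tabular `transition` (n_states x n_actions x n_states) and `policy` (n_states x n_actions) arrays; these shapes are assumptions, not the interfaces exposed by the `MaxEntIRL` or `GridWorld` classes used here.

import numpy as np

def expected_svf(transition, policy, init_dist, horizon):
    # transition: (n_states, n_actions, n_states) transition probabilities
    # policy:     (n_states, n_actions) stochastic policy
    # init_dist:  (n_states,) initial state distribution
    # horizon:    number of propagation steps, e.g. the maximum trajectory length
    n_states = transition.shape[0]
    mu = np.zeros((horizon, n_states))
    mu[0] = init_dist
    for t in range(1, horizon):
        for s_next in range(n_states):
            # Probability mass flowing into s_next from every (state, action) pair
            mu[t, s_next] = np.sum(mu[t - 1][:, None] * policy *
                                   transition[:, :, s_next])
    # Sum over time to get the expected visitation count for each state
    return mu.sum(axis=0)
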
Example 8
def test_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 10
    goal_pos = np.array([[N - 1, N - 1]])
    human_pos = np.array([[3, 3]])
    human_radius = 2

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    # mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    # policy = GreedyPolicy(env.action_space(), mdp_algo)
    # policy = EpsGreedyPolicy(env.action_space(), mdp_algo, epsilon=0.1)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo,
                                    env.transition)

    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    R = env.reward.reshape((N, N)).T
    plot_grid_map(R, "Reward (Ground Truth)", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy,
                                   env=env,
                                   num_trajectories=200,
                                   maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # feature map
    feature_map = [
        env._feature_map(s) for s in range(env.observation_space().n)
    ]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
    plt.show()
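
In the last two gridworld tests the feature map is built from `env._feature_map(s)`. For a plain gridworld a common choice is an indicator (one-hot) feature per state, in which case the learned weight vector is the per-state reward itself. A hypothetical equivalent construction, assuming that choice (which the snippets do not confirm):

import numpy as np

N = 10
# One indicator feature per state: feature_map[s] is 1 at index s and 0 elsewhere.
# With indicator features, reward = feature_map @ weights reduces to reward = weights.
feature_map = np.eye(N * N)
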