def main(grid_size, discount_factor, epochs, learning_rate):
    # Create an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Read the trajectory data from the provided csv files and return the
    # trajectories as a numpy array, one entry per trajectory
    trajectories = robot_mdp.generate_trajectories()
    # Find the number of trajectories in the data
    n_trajectories = len(trajectories)
    # Initialize the IRL class object; the trajectory count is an input (currently 3)
    irl = MaxEntIRL(n_trajectories, grid_size)
    # File for storing results
    filename = "/home/vignesh/PycharmProjects/dvrk_automated_suturing/data/trial4_grid11_parallel/weights_grid11.txt"
    # Calculate the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(trajectories, discount_factor, n_trajectories,
                                      epochs, learning_rate, filename)
    print("r is ", reward.reshape((grid_size, grid_size), order='F'))
    # print("r shape ", reward.shape)
    print("weights is ", weights.reshape((grid_size, grid_size), order='F'))
    # print("policy is ", policy[0][0])
    # Append the final weights through the open file handle; passing the filename
    # to savetxt would reopen the file in write mode and truncate the header line
    # that was just written
    file_open = open(filename, 'a')
    file_open.write("\n \n \n \n Final result \n")
    savetxt(file_open, weights, delimiter=',', fmt="%10.5f", newline=", ")
    file_open.close()
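# Example invocation (a sketch, not from the original source): grid_size=11
# matches the "grid11" results path above, but the discount factor, epoch
# count, and learning rate are assumed placeholder values for illustration.
if __name__ == '__main__':
    main(grid_size=11, discount_factor=0.9, epochs=200, learning_rate=0.01)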
def main(discount, epochs, learning_rate, weights, trajectory_length, n_policy_iter):
    # Create an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Initialize the IRL class object; the trajectory length is an input (currently 1)
    irl = MaxEntIRL(trajectory_length)
    # trajectory_features_array, trajectory_rewards_array = robot_mdp.trajectories_features_rewards_array(weights)
    # Find the feature sum of the expert trajectory and the list of all its features
    sum_trajectory_features, complete_features_array = robot_mdp.generate_trajectories()
    # print("traj array ", trajectory_features_array.shape, trajectory_features_array)
    # print("complete array ", complete_features_array.shape, complete_features_array)
    # print(trajectory_rewards_array)
    # print(model_rot_par_r)
    # Return the state and action spaces of the expert trajectory
    state_trajectory_array, action_trajectory_array = robot_mdp.trajectories_data()
    # Find the number of trajectories in the data
    n_trajectories = len(state_trajectory_array)
    # Calculate the reward function based on the Max Entropy IRL algorithm
    reward, alpha = irl.max_ent_irl(sum_trajectory_features, complete_features_array,
                                    discount, n_trajectories, epochs, learning_rate,
                                    n_policy_iter, weights)
    print("r is ", reward)
    print("alpha is ", alpha)
def main(grid_size, discount_factor, epochs, learning_rate):
    # Create an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Read the trajectory data from the provided csv files and return the
    # trajectories as a numpy array, one entry per trajectory
    trajectories = robot_mdp.generate_trajectories()
    # Find the number of trajectories in the data
    n_trajectories = len(trajectories)
    # Initialize the IRL class object; the trajectory count is an input (currently 3)
    irl = MaxEntIRL(n_trajectories, grid_size)
    # Calculate the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(trajectories, discount_factor, n_trajectories,
                                      epochs, learning_rate)
    print("r is ", reward.reshape((grid_size, grid_size), order='F'))
    # print("r shape ", reward.shape)
    print("weights is ", weights.reshape((grid_size, grid_size), order='F'))
def main(discount, epochs, learning_rate):
    # Create an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Find the feature sum of the expert trajectory and the list of all the
    # features of the expert trajectories
    sum_trajectory_features, feature_array_all_trajectories = robot_mdp.generate_trajectories()
    # Find the number of trajectories in the data
    n_trajectories = len(feature_array_all_trajectories)
    # Initialize the IRL class object; the trajectory count is an input (currently 1)
    irl = MaxEntIRL(n_trajectories)
    # Calculate the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(sum_trajectory_features, feature_array_all_trajectories,
                                      discount, n_trajectories, epochs, learning_rate)
    print("r is ", reward)
    # print("r shape ", reward.shape)
    print("weights is ", weights)
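# The main() variants above all delegate to MaxEntIRL.max_ent_irl(), whose body
# is not shown here. The following is a minimal, self-contained sketch of the
# core MaxEnt IRL update (Ziebart et al., 2008) for a tabular MDP with known
# transitions; every name in it is hypothetical and independent of the
# MaxEntIRL class used above.
import numpy as np

def maxent_irl_sketch(feature_matrix, transition, trajectories, discount,
                      epochs, learning_rate):
    # feature_matrix: (n_states, n_features)
    # transition: (n_states, n_actions, n_states) probabilities
    # trajectories: list of expert state-index sequences
    n_states, n_actions, _ = transition.shape
    n_features = feature_matrix.shape[1]
    # Expert feature expectations: mean total feature count per demonstration
    expert_features = np.mean(
        [feature_matrix[traj].sum(axis=0) for traj in trajectories], axis=0)
    horizon = max(len(traj) for traj in trajectories)
    weights = np.random.uniform(size=n_features)
    for _ in range(epochs):
        reward = feature_matrix.dot(weights)
        # Soft value iteration under the current reward
        value = np.zeros(n_states)
        for _ in range(horizon):
            q = reward[:, None] + discount * transition.dot(value)  # (S, A)
            q_max = q.max(axis=1, keepdims=True)
            value = (q_max + np.log(np.exp(q - q_max).sum(axis=1, keepdims=True))).ravel()
        # Stochastic policy: softmax over action values
        q = reward[:, None] + discount * transition.dot(value)
        policy = np.exp(q - value[:, None])
        policy /= policy.sum(axis=1, keepdims=True)
        # Expected state visitation frequencies under that policy,
        # starting from the empirical initial-state distribution
        svf = np.zeros(n_states)
        d = np.zeros(n_states)
        for traj in trajectories:
            d[traj[0]] += 1.0 / len(trajectories)
        for _ in range(horizon):
            svf += d
            d = np.einsum('s,sa,sat->t', d, policy, transition)
        # Gradient ascent on the demonstration log-likelihood
        weights += learning_rate * (expert_features - feature_matrix.T.dot(svf))
    return feature_matrix.dot(weights), weights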
def __init__(self, window_size=800, grid_size=5, cell_size=1, gamma=0.9,
             world='obj', stochastic=False):
    # Initialize the MaxEntIRL base class, then alias its gridworld MDP
    MaxEntIRL.__init__(self, window_size, grid_size, cell_size, gamma=gamma,
                       world=world, stochastic=stochastic)
    self.object_world = self.mdp_gw
    # Variance parameter (sigma squared) for this world
    self.sigma_sq = 0.5e-2
def main(self):
    np.random.seed(0)

    # Build the feature map phi: one feature vector per state
    phi = [self._feature_map(s) for s in range(self.observation_space().n)]
    phi = np.array(phi)

    print('Collect dataset ...')
    in_dataset = self.collect_demo()
    plot_dataset_distribution(in_dataset, (self.f1_length, self.f2_length),
                              "Dataset State Distribution")
    print('Done!')

    # IRL
    print('Building IRL object ...')
    me_irl = MaxEntIRL(observation_space=self.observation_space(),
                       action_space=self.action_space(),
                       transition=self._construct_transition_fn(),
                       goal_states=self.goal_states,
                       dataset=in_dataset,
                       feature_map=phi,
                       max_iter=20,
                       anneal_rate=0.9)
    print('Done!')

    print('\nStart training ...')
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((self.f1_length, self.f2_length)).T
    print('Boundary index = ', self.get_boundary(Rprime))
    print('Radius = ', self.f1_idx[self.get_boundary(Rprime)])
    print(self.f1_idx)

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
    plt.show()
    return
def test_feature_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 15
    init_pos = np.zeros((N, N), dtype=float)
    for i, j in product(range(N // 2 + 2, N), range(N // 2 + 2, N)):
        init_pos[i, j] = i**2 + j**2
    init_pos /= np.sum(init_pos)
    goal_pos = np.array([[n, 0] for n in range(N)])
    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_feature_boundary_reward(
        grid,
        boundary_axis=0,
        boundary_value=N // 2,
        reward=-10,
        exp_constant=0.2,
    )
    plot_grid_map(init_pos.T, "Initial Position Distribution", cmap=plt.cm.Blues)
    plot_grid_map(grid.T, "Reward (Ground Truth)", cmap=plt.cm.Reds)
    env = GridWorld(
        dimensions=(N, N),
        init_pos=init_pos,
        goal_pos=goal_pos,
        reward_grid=grid,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo, env.transition)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy, env=env,
                                   num_trajectories=20, maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # IRL feature map
    feature_map = [env._feature_map(s) for s in range(env.observation_space().n)]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", cmap=plt.cm.Blues)
    plt.show()
def test_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 10
    goal_pos = np.array([[N - 1, N - 1]])
    human_pos = np.array([[3, 3]])
    human_radius = 2
    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)
    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    # mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    # policy = GreedyPolicy(env.action_space(), mdp_algo)
    # policy = EpsGreedyPolicy(env.action_space(), mdp_algo, epsilon=0.1)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo, env.transition)
    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    R = env.reward.reshape((N, N)).T
    plot_grid_map(R, "Reward (Ground Truth)", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy, env=env,
                                   num_trajectories=200, maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # feature map
    feature_map = [env._feature_map(s) for s in range(env.observation_space().n)]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
    plt.show()
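# A possible follow-up check (a sketch, not part of the original tests):
# re-solve the MDP under the learned reward and plot the resulting value
# function for comparison with the ground-truth one above. This assumes
# value_iteration accepts any flat reward array of length N * N, as it does
# for env.reward in the tests above; compare_learned_reward is a hypothetical
# helper name.
def compare_learned_reward(env, Rprime, N):
    # Flatten the (transposed) learned reward back to the env's state ordering
    learned_reward = Rprime.T.reshape(N * N)
    mdp_irl = value_iteration(env.transition, learned_reward, gamma=0.99)
    V_irl = np.asarray(mdp_irl.V).reshape((N, N)).T
    plot_grid_map(V_irl, "Value Function (IRL Reward)", cmap=plt.cm.Blues)
    plt.show()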