def main(grid_size, discount_factor, epochs, learning_rate):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Reads the trajectory data from the csv files provided and returns the trajectories
    # The trajectories are returned as a numpy array, one entry per trajectory provided
    trajectories = robot_mdp.generate_trajectories()
    # Finds the length of the trajectories data
    n_trajectories = len(trajectories)
    # Initialize the IRL class object, provide the trajectory length as input (currently its value is 3)
    irl = MaxEntIRL(n_trajectories, grid_size)
    # File for storing results
    filename = "/home/vignesh/PycharmProjects/dvrk_automated_suturing/data/trial4_grid11_parallel/weights_grid11.txt"
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(trajectories, discount_factor, n_trajectories, epochs, learning_rate, filename)
    print "r is ", reward.reshape((grid_size, grid_size), order='F')
    # print "r shape ", reward.shape
    print "weights is ", weights.reshape((grid_size, grid_size), order='F')
    # print "policy is ", policy[0][0]
    # Append the final weights after a "Final result" marker; pass the open file handle to savetxt,
    # otherwise savetxt(filename, ...) would reopen the file in write mode and overwrite it
    file_open = open(filename, 'a')
    file_open.write("\n \n \n \n Final result \n")
    savetxt(file_open, weights, delimiter=',', fmt="%10.5f", newline=", ")
    file_open.close()
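# The function below is an illustrative sketch, not the project's MaxEntIRL.max_ent_irl: it shows
# the gradient-ascent loop a max-entropy IRL implementation typically runs, with the reward linear
# in the state features and the weights pushed along the difference between the expert feature
# expectations and the feature expectations induced by the current reward (the expected SVF).
# `feature_matrix` (n_states x n_features), `expert_svf` and `compute_expected_svf` are assumed,
# hypothetical inputs rather than this project's API.
import numpy as np

def max_ent_irl_sketch(feature_matrix, expert_svf, compute_expected_svf, epochs, learning_rate):
    n_states, n_features = feature_matrix.shape
    weights = np.random.uniform(size=n_features)
    for _ in range(epochs):
        reward = feature_matrix.dot(weights)                 # linear reward r(s) = phi(s) . w
        expected_svf = compute_expected_svf(reward)          # state visitation under the current reward
        gradient = feature_matrix.T.dot(expert_svf - expected_svf)
        weights += learning_rate * gradient                  # gradient ascent on the trajectory log-likelihood
    return feature_matrix.dot(weights), weights              # final reward per state and learned weights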
def find_expected_svf(self, weights, discount, total_states, trajectories, reward):
    # Creates object of robot markov model
    robot_mdp = RobotMarkovModel()
    # Returns the state and action values of the state space being created
    state_values_from_trajectory, _ = robot_mdp.return_trajectories_data()
    # Finds the terminal state value from the expert trajectory
    # 0 indicates that the first trajectory data is being used
    # total_states indicates that the last value of that trajectory data should be used as terminal state
    terminal_state_val_from_trajectory = state_values_from_trajectory[0][total_states - 1]
    # Pass the grid size required
    # To create the state space model, run the commands below
    env_obj = RobotStateUtils(self.grid_size, discount, terminal_state_val_from_trajectory)
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    # print "State space created is ", states
    P_a = env_obj.get_transition_mat_deterministic()
    # print "P_a is ", P_a
    # Runs value iteration on the current reward to obtain a policy over the state space
    policy = env_obj.value_iteration(reward)
    # policy = np.random.randint(27, size=1331)
    # print "policy is ", policy
    # Computes the expected state visitation frequency under that policy, seeded with the expert trajectories
    expected_svf = env_obj.compute_state_visitation_frequency(trajectories, policy)
    # Returns the policy and the expected state visitation frequency
    return policy, expected_svf
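# Illustrative sketch (not the project's RobotStateUtils method) of the forward recursion that
# compute_state_visitation_frequency is expected to perform in MaxEnt IRL: visitation mass is
# seeded from the empirical start states of the expert trajectories and propagated forward under
# the current policy. It assumes the trajectories have already been converted to state indices and
# that P_a[s, a] is a deterministic lookup giving the successor state index; both are assumptions.
import numpy as np

def expected_svf_sketch(P_a, trajectories, policy, n_states):
    T = len(trajectories[0])                        # horizon = length of one expert trajectory
    mu = np.zeros((n_states, T))
    for trajectory in trajectories:                 # empirical start-state distribution
        mu[int(trajectory[0]), 0] += 1.0
    mu[:, 0] /= len(trajectories)
    for t in range(1, T):                           # propagate visitation mass forward in time
        for s in range(n_states):
            s_next = int(P_a[s, int(policy[s])])    # deterministic successor under the policy
            mu[s_next, t] += mu[s, t - 1]
    return mu.sum(axis=1)                           # expected number of visits per state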
def main(discount, epochs, learning_rate, weights, trajectory_length, n_policy_iter):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Initialize the IRL class object, provide the trajectory length as input (currently its value is 1)
    irl = MaxEntIRL(trajectory_length)
    # trajectory_features_array, trajectory_rewards_array = robot_mdp.trajectories_features_rewards_array(weights)
    # Finds the sum of features of the expert trajectory and the list of all the features of the expert trajectory
    sum_trajectory_features, complete_features_array = robot_mdp.generate_trajectories()
    # print "traj array ", trajectory_features_array.shape, trajectory_features_array
    # print "complete array ", complete_features_array.shape, complete_features_array
    # print trajectory_rewards_array
    # print model_rot_par_r
    # Returns the state and action spaces of the expert trajectory
    state_trajectory_array, action_trajectory_array = robot_mdp.trajectories_data()
    # Finds the length of the trajectories data
    n_trajectories = len(state_trajectory_array)
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, alpha = irl.max_ent_irl(sum_trajectory_features, complete_features_array, discount, n_trajectories,
                                    epochs, learning_rate, n_policy_iter, weights)
    print "r is ", reward
    print "alpha is ", alpha
def find_expected_svf(self, weights, discount):
    # Robot Object called
    # Pass the gridsize required
    env_obj = RobotStateUtils(11, weights)
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    # print "State space created is ", states
    P_a = env_obj.get_transition_mat_deterministic()
    # print "P_a is ", P_a
    # print "shape of P_a ", P_a.shape
    # Calculates the reward and feature for the trajectories being created in the state space
    rewards = []
    state_features = []
    for i in range(len(states)):
        r, f = env_obj.reward_func(states[i][0], states[i][1], states[i][2], weights)
        rewards.append(r)
        state_features.append(f)
    # print "rewards is ", rewards
    value, policy = env_obj.value_iteration(P_a, rewards, gamma=discount)
    # policy = np.random.randint(27, size=1331)
    print "policy is ", policy
    robot_mdp = RobotMarkovModel()
    # Finds the sum of features of the expert trajectory and list of all the features of the expert trajectory
    expert_trajectory_states, _ = robot_mdp.return_trajectories_data()
    expected_svf = env_obj.compute_state_visition_frequency(P_a, expert_trajectory_states, policy)
    # Formats the features array in a way that it can be multiplied with the svf values
    state_features = np.array([state_features]).transpose().reshape((len(state_features[0]), len(state_features)))
    print "svf is ", expected_svf
    print "svf shape is ", expected_svf.shape
    # Returns the policy, state features of the trajectories considered in the state space and expected svf
    return policy, state_features, expected_svf
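# Illustrative sketch of a tabular value iteration of the kind value_iteration(P_a, rewards, gamma)
# appears to perform. It assumes a deterministic transition table where P_a[s, a] is the integer
# index of the successor state; this is not the project's RobotStateUtils implementation, just the
# standard Bellman optimality backup with a greedy policy extracted at the end.
import numpy as np

def value_iteration_sketch(P_a, rewards, gamma, tol=1e-4):
    P_a = np.asarray(P_a, dtype=int)                  # (n_states, n_actions) successor indices
    rewards = np.asarray(rewards, dtype=float)        # (n_states,) reward per state
    values = np.zeros(len(rewards))
    while True:
        q = rewards[:, None] + gamma * values[P_a]    # Q(s, a) = r(s) + gamma * V(next(s, a))
        new_values = q.max(axis=1)                    # Bellman optimality backup
        if np.max(np.abs(new_values - values)) < tol:
            break
        values = new_values
    policy = q.argmax(axis=1)                         # greedy policy w.r.t. the (near-)converged values
    return new_values, policy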
def find_expected_svf(self, discount, total_states, trajectories, reward):
    # Creates object of robot markov model
    robot_mdp = RobotMarkovModel()
    # Returns the state and action values of the state space being created
    state_values_from_trajectory, _ = robot_mdp.return_trajectories_data()
    # Finds the terminal state value from the expert trajectory
    # 0 indicates that the first trajectory data is being used
    # total_states indicates that the last value of that trajectory data should be used as terminal state
    terminal_state_val_from_trajectory = state_values_from_trajectory[0][total_states - 1]
    # Pass the grid size required
    # To create the state space model, run the commands below
    env_obj = RobotStateUtils(self.grid_size, discount, terminal_state_val_from_trajectory)
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    # print "State space created is ", states
    # V = {}
    # for state in env_obj.states:
    #     V[state] = 0
    # # Reinitialize policy
    # policy = {}
    # for state in env_obj.states:
    #     policy[state] = [key for key in env_obj.action_space]
    # values_of_states, policy = env_obj.value_iteration(V, policy, 0.01)
    # policy = np.random.randint(27, size=1331)
    # print "policy is ", policy
    # Computes the expected state visitation frequency from the expert trajectories
    expected_svf = env_obj.compute_state_visitation_frequency(trajectories)
    # Returns the expected state visitation frequency
    return expected_svf
def main(grid_size, discount_factor, epochs, learning_rate):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Reads the trajectory data from the csv files provided and returns the trajectories
    # The trajectories are returned as a numpy array, one entry per trajectory provided
    trajectories = robot_mdp.generate_trajectories()
    # Finds the length of the trajectories data
    n_trajectories = len(trajectories)
    # Initialize the IRL class object, provide the trajectory length as input (currently its value is 3)
    irl = MaxEntIRL(n_trajectories, grid_size)
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(trajectories, discount_factor, n_trajectories, epochs, learning_rate)
    print "r is ", reward.reshape((grid_size, grid_size), order='F')
    # print "r shape ", reward.shape
    print "weights is ", weights.reshape((grid_size, grid_size), order='F')
def main(discount, epochs, learning_rate):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    # Finds the sum of features of the expert trajectory and list of all the features of the expert trajectory
    sum_trajectory_features, feature_array_all_trajectories = robot_mdp.generate_trajectories()
    # Finds the length of the trajectories data
    n_trajectories = len(feature_array_all_trajectories)
    # Initialize the IRL class object, provide trajectory length as input, currently its value is 1
    irl = MaxEntIRL(n_trajectories)
    # Calculates the reward function based on the Max Entropy IRL algorithm
    reward, weights = irl.max_ent_irl(sum_trajectory_features, feature_array_all_trajectories, discount,
                                      n_trajectories, epochs, learning_rate)
    print "r is ", reward
    # print "r shape ", reward.shape
    print "weights is ", weights
def find_expected_svf(self, grid_size, weights, discount, total_states):
    # Creates object of robot markov model
    robot_mdp = RobotMarkovModel()
    # Returns the state and action values of the state space being created
    state_values_from_trajectory, _ = robot_mdp.return_trajectories_data()
    # Finds the terminal state value from the expert trajectory
    # 0 indicates that the first trajectory data is being used
    # total_states indicates that the last value of that trajectory data should be used as terminal state
    terminal_state_val_from_trajectory = state_values_from_trajectory[0][total_states - 1]
    # Pass the gridsize required
    # To create the state space model, we need to run the below commands
    env_obj = RobotStateUtils(grid_size, weights, terminal_state_val_from_trajectory)
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    # print "State space created is ", states
    P_a = env_obj.get_transition_mat_deterministic()
    # print "P_a is ", P_a
    # Calculates the reward and feature for the trajectories being created in the state space
    rewards = []
    state_features = []
    for i in range(len(states)):
        r, f = env_obj.reward_func(states[i][0], states[i][1], states[i][2], weights)
        rewards.append(r)
        state_features.append(f)
    # print "rewards is ", rewards
    value, policy = env_obj.value_iteration(P_a, rewards, gamma=discount)
    # policy = np.random.randint(27, size=1331)
    print "policy is ", policy
    # Finds the sum of features of the expert trajectory and list of all the features of the expert trajectory
    expert_trajectory_states, _ = robot_mdp.return_trajectories_data()
    expected_svf = env_obj.compute_state_visitation_frequency(P_a, expert_trajectory_states, policy)
    # Formats the features array in a way that it can be multiplied with the svf values
    state_features = np.array([state_features]).transpose().reshape((len(state_features[0]), len(state_features)))
    print "svf is ", expected_svf
    print "svf shape is ", expected_svf.shape
    # Returns the policy, state features of the trajectories considered in the state space and expected svf
    return policy, state_features, expected_svf
def calculate_optimal_policy_func(self, alpha, discount):
    # Creates an object for using the RobotMarkovModel class
    robot_mdp = RobotMarkovModel()
    self.create_state_space_model_func()
    self.create_action_set_func()
    n_actions = len(self.action_set)
    self.discount = discount
    self.model_index_pos_x, self.model_index_pos_y, self.model_index_pos_z = self.get_indices(
        self.model_end_pos_x, self.model_end_pos_y, self.model_end_pos_z)
    self.rewards, self.features, n_features = self.reward_func(
        self.model_end_pos_x, self.model_end_pos_y, self.model_end_pos_z, alpha)
    # self.features_arr = robot_mdp.features_func(self.model_end_pos_x, self.model_end_pos_y,
    #                                             self.model_end_pos_z)
    # print "rewards is ", self.rewards.shape
    # print "reward 0 ", self.rewards[0][3][8]
    # return 0
    return self.rewards, self.features, n_features
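# Purely illustrative sketch of a linear-in-features reward of the form reward_func appears to
# compute (features weighted by alpha). The choice of features below, Gaussian radial basis
# functions of the distance to the model end position, is an assumption made for illustration
# only, not the feature set used in this project.
import numpy as np

def reward_func_sketch(pos_x, pos_y, pos_z, end_x, end_y, end_z, alpha, sigmas=(0.01, 0.05, 0.1)):
    dist_sq = (pos_x - end_x) ** 2 + (pos_y - end_y) ** 2 + (pos_z - end_z) ** 2
    # One Gaussian RBF feature per length scale; alpha is expected to hold len(sigmas) weights
    features = np.array([np.exp(-dist_sq / (2.0 * s ** 2)) for s in sigmas])
    reward = float(np.dot(alpha, features))
    return reward, features, len(features)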
    print(i, 'sweeps of state space for value iteration')
    return V, policy


if __name__ == '__main__':
    # goal = np.array([0.005, 0.055, -0.125])
    goal = np.array([0.5, 0.5, 0])
    # term_state = np.random.randint(0, grid_size ** 3)]
    env_obj = RobotStateUtilsP(11, 0.9, goal)
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    print "State space created is ", states
    print "actions is ", action
    index_val = env_obj.get_state_val_index(goal)
    print "index val is ", index_val
    mdp_obj = RobotMarkovModel()
    trajectories = mdp_obj.generate_trajectories()
    index_vals = np.zeros(len(trajectories[0]))
    rewards = np.zeros(len(states))
    for i in range(len(trajectories[0])):
        # print "traj is ", trajectories[0][i]
        index_vals[i] = env_obj.get_state_val_index(trajectories[0][i])
    for _, ele in enumerate(index_vals):
        if ele == index_val:
            rewards[int(ele)] = 10
        else:
            rewards[int(ele)] = 1
    env_obj.rewards = rewards
    print "rewards is ", rewards
    # initialize V(s)
    V = {}
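# Illustrative sketch of the state lookup that get_state_val_index performs above: each coordinate
# is snapped to the nearest of `grid_size` evenly spaced grid points and the three axis indices are
# flattened into a single state index. The axis limits and the (x, y, z) flattening order below are
# placeholder assumptions, not the project's actual discretization.
import numpy as np

def get_state_val_index_sketch(point, grid_size=11, low=-1.0, high=1.0):
    point = np.asarray(point, dtype=float)
    step = (high - low) / (grid_size - 1)
    axis_idx = np.clip(np.round((point - low) / step), 0, grid_size - 1).astype(int)
    # Flatten (ix, iy, iz) into a single index in [0, grid_size ** 3)
    return int(axis_idx[0] + grid_size * axis_idx[1] + grid_size ** 2 * axis_idx[2])

# Example usage with the goal above: get_state_val_index_sketch(np.array([0.5, 0.5, 0]))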