import math
import concurrent.futures

import numpy as np

# `features`, `mdp`, `get_indices`, `initial_loop` and `main_loop` are
# assumed to be provided elsewhere in this module/project.


def get_policy(weights, n_iter, n_time):
    """Run policy iteration for the given reward weights and return the
    expected feature counts under the resulting policy plus the policy itself."""
    global model
    global r, v, index_x, index_y, index_vel_theta, index_speed
    global state_x, state_y, state_vel_theta, state_speed

    model = mdp(np.array([0, 0, 5, 0], dtype='float64'))

    # Discretised state space: x, y in [-1.5, 1.5], heading rate in [-pi, pi],
    # speed in [0, 0.1].
    x = np.linspace(-1.5, 1.5, 301, dtype='float64')
    vtheta = np.linspace(-math.pi, math.pi, 101, dtype='float64')
    s = np.linspace(0, 0.1, 11, dtype='float64')
    print 'Creating state space...'
    state_x, state_y, state_vel_theta, state_speed = np.meshgrid(x, x, vtheta, s)
    print 'State space created.'

    # Discretised action set: small increments on x, y and heading rate,
    # with the speed component held fixed.
    action_set = []
    for j1 in [-0.01, 0, 0.01]:
        for j2 in [-0.01, 0, 0.01]:
            for j3 in [-0.1, 0, 0.1]:
                action_set.append(np.array([j1, j2, j3, 0]))

    # Reward and feature values for every state on the grid.
    r, f = features.reward(state_x, state_y, state_vel_theta, state_speed, weights)
    index_x, index_y, index_vel_theta, index_speed = get_indices(
        state_x, state_y, state_vel_theta, state_speed)

    policy = []
    for iteration in range(0, n_iter):
        action_value = []
        policy = []
        print "Policy Iteration:", iteration
        # Evaluate every action in parallel; the first sweep uses the
        # initialisation loop, later sweeps use the main update loop.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            func = initial_loop if iteration == 0 else main_loop
            for q, p in executor.map(func, action_set):
                action_value.append(q)
                policy.append(p)
        print "Evaluating Policy..."
        policy = np.asarray(policy)
        action_value = np.asarray(action_value)
        policy = policy / np.sum(policy, axis=0)        # normalise over actions
        v = np.sum(policy * action_value, axis=0)       # state values
    print "Final Policy evaluated."

    print "Calculating State Visitation Frequency..."
    # Initial state distribution, centred near the start pose.
    mu = np.exp(-(state_x + 0.15)**2 / 0.25**2) * \
         np.exp(-(state_y - 0.27)**2 / 0.5**2) * \
         np.exp(0.004 * state_speed)
    mu = mu / np.sum(mu)    # normalise to a probability distribution
    mu_last = mu
    print "Initial State Frequency calculated..."
    for time in range(0, n_time):
        s = np.zeros([301, 301, 101, 11])
        for act_index, action in enumerate(action_set):
            # Propagate visitation mass through the model for this action.
            new_state_x, new_state_y, new_state_vel_theta, new_state_speed = \
                model.get_next_state(state_x, state_y, state_vel_theta, state_speed, action)
            new_index_x, new_index_y, new_index_vel_theta, new_index_speed = get_indices(
                new_state_x, new_state_y, new_state_vel_theta, new_state_speed)
            p = policy[act_index, index_x, index_y, index_vel_theta, index_speed]
            s = s + p * mu_last[new_index_x, new_index_y,
                                new_index_vel_theta, new_index_speed]
        mu_last = s
        mu = mu + mu_last
    mu = mu / n_time
    state_visitation = mu_last * f
    print "State Visitation Frequency calculated."
    return np.sum(state_visitation.reshape(2, 301 * 301 * 101 * 11), axis=1), policy
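# --- Hedged usage sketch (not part of the original file) ---
# How the two return values above could drive a MaxEnt-IRL weight update:
# the log-likelihood gradient is the expert feature expectations minus the
# feature expectations induced by the current policy (the first return value
# of get_policy). `update_weights_sketch`, `expert_feature_expectations`,
# `learning_rate` and `n_irl_iter` are assumed names, not from the original.
def update_weights_sketch(weights, expert_feature_expectations,
                          rl_iter=10, svf_iter=50,
                          learning_rate=0.01, n_irl_iter=20):
    for _ in range(n_irl_iter):
        policy_feature_expectations, _ = get_policy(weights, rl_iter, svf_iter)
        gradient = expert_feature_expectations - policy_feature_expectations
        weights = weights + learning_rate * gradient   # gradient ascent step
    return weights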
Z = np.empty([0, 1])
trajectories_probability = np.empty([len(state_trajectories), 1], dtype='float32')

for n in range(0, n_iterations):
    print "Iteration: ", n
    trajectories_reward = []
    trajectories_features = []

    # Accumulate the reward and feature counts of every demonstrated trajectory
    # under the current weights.
    for state_trajectory in state_trajectories:
        trajectory_reward = np.zeros([1, 1], dtype='float32')
        trajectory_features = np.zeros([2, 1], dtype='float32')
        for step in range(0, state_trajectory.shape[0]):
            x = np.atleast_2d(state_trajectory[step, 0])
            y = np.atleast_2d(state_trajectory[step, 1])
            vtheta = np.atleast_2d(state_trajectory[step, 2])
            speed = np.atleast_2d(state_trajectory[step, 3])
            r, f = features.reward(x, y, vtheta, speed, weights)
            trajectory_reward = trajectory_reward + r
            trajectory_features = trajectory_features + np.vstack((f[0], f[1]))
        trajectories_reward.append(trajectory_reward)
        trajectories_features.append(trajectory_features)

    # Unnormalised MaxEnt trajectory probabilities, exp(reward(tau)).
    trajectories_probability = np.exp(trajectories_reward)

    # Expected feature counts under the current policy.
    feature_state, policy = cudatrial.get_policy(weights, rl_iter, svf_iter)

    # Keep a running record of the summed trajectory reward for this iteration.
    Z = np.vstack((Z, sum(trajectories_reward)))
    # L = np.vstack((L, sum(trajectories_reward)/n_traj - np.log(Z)))
    # if L[n] < L[n-1]:
    #     break
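    # --- Hedged sketch (not part of the original loop) ---
    # The step that would normally close each IRL iteration: average the
    # demonstrated feature counts, subtract the expected feature counts under
    # the current policy (`feature_state` from get_policy above), and move the
    # weights up the gradient of the MaxEnt log-likelihood. `learning_rate`
    # and the explicit averaging are assumptions, not taken from the original.
    expert_features = np.mean(np.hstack(trajectories_features), axis=1)  # shape (2,)
    gradient = expert_features - feature_state                           # shape (2,)
    learning_rate = 0.01                                                 # assumed value
    weights = weights + learning_rate * gradient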
print "Iteration: ", n trajectories_reward = [] trajectories_features = [] trajectory_reward = np.zeros([1, 1], dtype='float32') trajectory_features = np.zeros([2, 1], dtype='float32') for iter in range(0, state_trajectories.shape[0]): rot_par_r = state_trajectories[iter, 0] rot_par_p = state_trajectories[iter, 1] rot_par_y = state_trajectories[iter, 2] end_pos_x = state_trajectories[iter, 3] end_pos_y = state_trajectories[iter, 4] end_pos_z = state_trajectories[iter, 5] r, f = features.reward( np.array([ rot_par_r, rot_par_p, rot_par_y, end_pos_x, end_pos_y, end_pos_z ]), weights) trajectory_reward = trajectory_reward + r trajectory_features = trajectory_features + np.vstack((f[0], f[1])) trajectories_reward.append(trajectory_reward) trajectories_features.append(trajectory_features) # print trajectory_features # print len(trajectories_reward) trajectories_probability = np.exp(trajectories_reward) feature_state, policy = mdp_obj.get_policy(weights, rl_iter, svf_iter) # print sum(feature_state.reshape(301*301*101*11,1)) Z = np.vstack((Z, sum(trajectories_reward))) # # trajectories_probability.reshape((len(trajectories_reward),1)) # L=np.vstack((L,sum(trajectories_reward)/n_traj - np.log(Z))) # # if L[n]<L[n-1]:
def get_policy(self, weights, n_iter, n_time):
    """Class-based variant of get_policy for the 6-D (rotation + position)
    state space; returns expected feature counts and the policy."""
    print 'Creating state space...'
    self.model_state_values = np.meshgrid(self.model_rot_r_val, self.model_rot_p_val,
                                          self.model_rot_y_val, self.model_pos_x_val,
                                          self.model_pos_y_val, self.model_pos_z_val,
                                          sparse=True)
    print 'State space created.'

    # Create the action set. Rotation increments have a resolution of 0.01;
    # position increments have a resolution of 0.001.
    for rot_r in [-0.01, 0, 0.01]:
        for rot_p in [-0.01, 0, 0.01]:
            for rot_y in [-0.01, 0, 0.01]:
                for pos_x in [-0.001, 0, 0.001]:
                    for pos_y in [-0.001, 0, 0.001]:
                        for pos_z in [-0.001, 0, 0.001]:
                            self.action_set.append(
                                np.array([rot_r, rot_p, rot_y, pos_x, pos_y, pos_z]))

    # Reward and feature values for every state on the grid.
    self.r, self.f = features.reward(self.model_state_values, weights)
    # Grid index of every model state value.
    self.model_index_values = self.get_indices(self.model_state_values)

    policy = []
    for iteration in range(0, n_iter):
        action_value = []
        policy = []
        print "Policy Iteration:", iteration
        # Evaluate every action in parallel; the first sweep uses the
        # initialisation loop, later sweeps use the main update loop.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            func = self.initial_loop if iteration == 0 else self.main_loop
            for q, p in executor.map(func, self.action_set):
                action_value.append(q)
                policy.append(p)
        print "Evaluating Policy..."
        policy = np.asarray(policy)
        action_value = np.asarray(action_value)
        policy = policy / np.sum(policy, axis=0)          # normalise over actions
        self.v = np.sum(policy * action_value, axis=0)    # state values
    print "Final Policy evaluated."

    print "Calculating State Visitation Frequency..."
    # Initial state distribution: a Gaussian bump centred at the origin of the
    # 6-D grid (the sparse meshgrid arrays broadcast to the full 11**6 grid).
    mu = np.exp(-self.model_state_values[0]**2) * np.exp(-self.model_state_values[1]**2) * \
         np.exp(-self.model_state_values[2]**2) * np.exp(-self.model_state_values[3]**2) * \
         np.exp(-self.model_state_values[4]**2) * np.exp(-self.model_state_values[5]**2)
    mu = mu / np.sum(mu)    # normalise to a probability distribution
    mu_last = mu
    print "Initial State Frequency calculated..."
    for time in range(0, n_time):
        s = np.zeros([11, 11, 11, 11, 11, 11])
        for act_index, action in enumerate(self.action_set):
            # Propagate visitation mass through the model for this action,
            # indexing as (action, state-grid indices) like the flat version.
            new_state_values = self.get_next_state(self.model_state_values, action)
            new_index_values = self.get_indices(new_state_values)
            p = policy[(act_index,) + tuple(self.model_index_values)]
            s = s + p * mu_last[tuple(new_index_values)]
        mu_last = s
        mu = mu + mu_last
    mu = mu / n_time
    state_visitation = mu_last * self.f
    print "State Visitation Frequency calculated."
    return np.sum(state_visitation.reshape(2, 11**6), axis=1), policy
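# --- Hedged sketch (the project's real get_indices is not shown here) ---
# Both versions of get_policy look up states by grid index via get_indices.
# One plausible implementation, assuming each dimension is a uniform
# np.linspace grid (e.g. self.model_pos_x_val): snap the continuous value to
# the nearest grid point and clip to the valid index range. `values` and
# `grid` are illustrative names, not taken from the original code.
def nearest_grid_index(values, grid):
    step = grid[1] - grid[0]                              # uniform spacing assumed
    idx = np.rint((np.asarray(values) - grid[0]) / step).astype(int)
    return np.clip(idx, 0, len(grid) - 1)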