def main(method):
    # NOTE: this snippet assumes feature_matrix, gw, discount, trajectories, epochs,
    # learning_rate, n_states, network_structure, ground_r and grid_size are defined
    # at module level before main() is called.
    if method == "linear":
        print("linear method")
        theta = maxent.irl(feature_matrix, gw.n_actions, discount,
                           gw.transition_probability, trajectories, epochs,
                           learning_rate)
    elif method == "deep":
        print("deep method")
        l1 = l2 = 0
        theta = deep_maxent.irl((feature_matrix.shape[1], ) + network_structure,
                                feature_matrix, gw.n_actions, discount,
                                gw.transition_probability, trajectories, epochs,
                                learning_rate, l1=l1, l2=l2)

    print(theta.shape)
    recovered_reward = feature_matrix.dot(theta).reshape((n_states, ))

    # standardise the recovered reward before plotting
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))

    plot.plot(ground_r, standardised_reward, grid_size)

def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    #trajectories = gw.generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
    trajectories = gw.my_generate_trajectories(n_trajectories, trajectory_length,
                                               gw.optimal_policy)

    feature_matrix = gw.feature_matrix()
    print(trajectories.shape)
    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # ground truth given by us, as we know which states are good vs bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)

    # standardise the recovered reward
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    #print(recovered_reward)
    #print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()

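# Example invocation (sketch): the hyperparameter values below are illustrative
# assumptions, not values prescribed by the script itself; a small 5x5 grid with a
# few hundred gradient steps is enough to see the recovered reward take shape.
if __name__ == '__main__':
    main(grid_size=5, discount=0.9, n_trajectories=20, epochs=200,
         learning_rate=0.01)
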
def execute_maxent(world, terminal, trajectories):
    """
    Maximum Entropy Inverse Reinforcement Learning
    """
    # set up features: we use one feature vector per state
    features = world.state_features()

    # choose our parameter initialization strategy:
    # initialize parameters with constant
    init = optimizer.Constant(0.1)

    # choose our optimization strategy:
    # we select exponentiated gradient descent with linear learning-rate decay
    optim = optimizer.ExpSga(lr=optimizer.linear_decay(lr0=0.2))

    # actually do some inverse reinforcement learning
    reward = maxent.irl(world.p_transition, features, terminal, trajectories,
                        optim, init)

    return reward

def maxent(world, terminal, trajectories, avoid_states=None):
    """
    Maximum Entropy Inverse Reinforcement Learning
    """
    # set up features: we use one feature vector per state
    # features = W.state_features(world)
    features = W.state_custom_features(world, avoid_states, terminal)

    # choose our parameter initialization strategy:
    # initialize parameters with constant
    init = O.Constant(1.0)

    # choose our optimization strategy:
    # we select exponentiated gradient descent with linear learning-rate decay
    optim = O.ExpSga(lr=O.linear_decay(lr0=0.2))

    # actually do some inverse reinforcement learning
    reward = M.irl(world.p_transition, features, terminal, trajectories, optim,
                   init)

    return reward

def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    print(feature_matrix)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()

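# Example invocation (sketch): the objectworld settings below are illustrative
# assumptions, not values fixed by the function; a 10x10 grid with 15 objects in
# 2 colours is a common small benchmark configuration.
main(grid_size=10, discount=0.9, n_objects=15, n_colours=2,
     n_trajectories=20, epochs=50, learning_rate=0.01)
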
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    #trajectories = gw.my_generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
    #trajectories = gw.my_generate_trajectories_some_without_goal(n_trajectories, trajectory_length, gw.optimal_policy)
    trajectories = gw.my_generate_trajectories_multiple(n_trajectories,
                                                        trajectory_length,
                                                        gw.optimal_policy)

    feature_matrix = gw.feature_matrix()
    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # ground truth given by us, as we know which states are good vs bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)

    # standardise the recovered reward
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    #print(recovered_reward)
    #print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()

def main(grid_size, discount, n_trajectories, epochs, learning_rate,
         trajectory_length, trust, expert_type, random_start):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    trajectory_length: Length of each sampled trajectory. int.
    trust: Probability that the agent's action is executed as intended
        (wind = 1 - trust). float.
    expert_type: Type of expert used to generate demonstrations.
    random_start: Whether trajectories start from random states. bool.
    """
    wind = 1 - trust
    gw = gridworld.Gridworld(grid_size, wind, discount, expert_type)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            gw.optimal_policy,
                                            random_start=random_start)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)
    print(r.reshape((grid_size, grid_size)))

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()

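# Example invocation (sketch): every value below is an illustrative assumption; in
# particular the valid expert_type values depend on the Gridworld implementation
# and are not documented here.
main(grid_size=5, discount=0.9, n_trajectories=20, epochs=200,
     learning_rate=0.01, trajectory_length=15, trust=0.7,
     expert_type="optimal", random_start=False)
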
    # tail of create_features(): stack the per-cell feature maps into one matrix
    # (the list assignment below is unused; it is immediately overwritten by the
    # concatenation)
    feature = [feature_1, feature_2, feature_3, feature_4, feature_5, feature_6]
    feature = np.concatenate((feature_1, feature_2, feature_3, feature_4,
                              feature_5, feature_6), axis=1)
    return feature


if __name__ == '__main__':
    obstacle_leftup = [[0, 0], [0, 26], [30, 20], [40, 0]]
    obstacle_hei_wid = [[10, 27], [12, 18], [20, 10], [15, 6]]
    obstacles = (obstacle_leftup, obstacle_hei_wid)
    grid_map = GridMap(50, 50, obstacles)

    traj = load_trajectories('data/trajectories')
    print(traj)

    features = create_features(grid_map)
    r = maxent.irl(features, 4, 0.99, grid_map.transition_mat, traj, 20, 0.05)

    # persist the recovered reward (binary mode is required for pickle)
    with open('data/reward/grid_world_reward', 'wb') as f:
        pickle.dump(r, f)

    plt.pcolor(r.reshape((grid_map.x, grid_map.y)))
    plt.colorbar()
    plt.title("Recovered Reward")
    plt.show()

        # build the passive-action dataframe for the current user
        df_passive_u = RT_p_u.append(rp_p_u, ignore_index=True)
        df_passive_u = df_passive_u.append(m_p_u, ignore_index=True)
        df_passive_u = df_passive_u.sort_values(by='time')
        df_passive_u.reset_index(inplace=True)
        del df_passive_u['index']

        # create total dataframe
        df_total = merge_df(df_active_u, df_passive_u)

        # IRL: compute trajectories and other information
        trajectories, state_sequence, n_states, n_actions, feature_matrix, t_dict, c_dict = tweet_traj_next_reduced(df_total)

        # compute transition probabilities
        tp = compute_tp(state_sequence, n_states, n_actions)

        # maximum entropy IRL (comment this line out if you want to use deep
        # maximum entropy IRL --> line below)
        r = maxent.irl(feature_matrix, n_actions, discount, tp, trajectories,
                       epochs, learning_rate)
        # deep maximum entropy IRL (uncomment this line and comment out the line
        # above if you want to use it)
        # r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix, n_actions, discount, tp, trajectories, epochs, learning_rate, l1=l1, l2=l2)

        # compute the weight of each feature
        w = np.linalg.lstsq(feature_matrix, r)[0]

        # save results
        row = [user_name, r, w]
        df_results.loc[count] = row
        count += 1
    else:
        print("user %s has %s actions and %s states" % (user_name, n_a, n_p))

# save rewards in a csv file (NB: change the file name to
# df_results_users_IRL.csv for generic users)
df_results.to_csv("df_results_trolls_IRL.csv", index=False)

def train(q_learning_rate, inverse_learning_rate):
    e = env.Env(N_STEP, N_STATES, N_ACTIONS, N_FEATURES)
    demonstrations = np.loadtxt('test.csv', delimiter=',', dtype=str)
    e.get_init(demonstrations)

    q_table = np.random.uniform(size=(N_STEP, N_STATES, N_ACTIONS))

    # expert feature expectations estimated from the demonstrations
    feature_expectations = np.zeros(N_FEATURES)
    maxent.find_feature_expectations(demonstrations, feature_expectations, e)

    # feature expectations accumulated by the learner during training
    irl_feature_expectations = np.zeros(N_FEATURES)

    alpha = np.random.uniform(size=(N_FEATURES, ))
    e.set_alpha(alpha)
    grad = []

    for episode in range(2500000):
        state = e.reset()
        t = 0

        if episode != 0 and episode % 50000 == 0:
            # update alpha
            # print(episode, q_table)
            # print(episode, irl_feature_expectations)
            learner = irl_feature_expectations / float(episode)
            gradient = maxent.irl(feature_expectations, learner, alpha,
                                  inverse_learning_rate)
            print(gradient)
            grad.append(np.linalg.norm(gradient))
            e.set_alpha(alpha)

        series = [state]
        irl_feature_expectations += e.feature_vector(series)

        while True:
            action = choose_action(q_table[t][int(state)])
            next_state = e.step(action)
            series.append(next_state)
            reward = e.get_reward(series)
            update_q_table(t, state, action, reward, next_state,
                           q_learning_rate, q_table)
            irl_feature_expectations += e.feature_vector(series)
            t += 1
            state = next_state
            if t == 5:
                break

    print(alpha)
    print(grad)

    plt.plot(grad, label='q_learning_rate: ' + str(q_learning_rate) +
             ' inverse_learning_rate: ' + str(inverse_learning_rate))
    # plt.ylim(0, int(max(grad)) + 1)
    plt.title('q_learning_rate: ' + str(q_learning_rate) +
              ' inverse_learning_rate: ' + str(inverse_learning_rate))
    plt.savefig('train_' + str(q_learning_rate) + '_' +
                str(inverse_learning_rate) + '.png')

    # roll out the greedy policy from each demonstration's start state
    episodes = []
    for demo in demonstrations:
        episode = demo[0]
        state = demo[0]
        t = 0
        while True:
            action = choose_action(q_table[t][int(state)], greedy=0)
            next_state = e.step(action)
            t += 1
            state = next_state
            episode += state
            if t == 5:
                break
        episodes.append(episode)

    with open(str(q_learning_rate) + '_' + str(inverse_learning_rate) +
              'out.csv', 'w') as w:
        for episode in episodes:
            w.write(episode)
            w.write('\n')

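# Example invocation (sketch): the learning rates below are illustrative
# assumptions; the script also presumes that N_STEP, N_STATES, N_ACTIONS,
# N_FEATURES, env, choose_action and update_q_table are defined elsewhere in the
# module, and that test.csv contains the expert demonstrations.
if __name__ == '__main__':
    train(q_learning_rate=0.1, inverse_learning_rate=0.01)
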