def __init__(self, args, env, sess):
    """Store the run handles, mirror the environment spaces and build the models.

    Args:
        args: Run configuration object; kept as ``self.args``.
        env: Environment exposing ``observation_space`` and ``action_space``.
        sess: Session handle; kept as ``self.sess``.
    """
    # Iteration flag and counter both start at 1.
    self.firstIter = self.count = 1

    # Handles supplied by the caller.
    self.args, self.sess, self.env = args, sess, env

    # Torque bound and maximum speed constants.
    self.torque_bound = 15.
    self.max_speed = 60.

    # Mirror the environment's spaces on the instance and report them.
    self.observation_space = env.observation_space
    self.action_space = env.action_space
    for label, space in (('Observation space', self.observation_space),
                         ('Action space', self.action_space)):
        print(label, space)

    # Sizes are the leading dimension of each space's shape.
    self.observation_size = self.env.observation_space.shape[0]
    self.action_size = self.action_space.shape[0]

    # Build order: network model, then barrier function, then GP dynamics.
    self.build_model()
    cbf.build_barrier(self)
    dynamics_gp.build_GP_model(self)
def main(args, reward_result):
    """Set up the pendulum environment and DDPG networks, then run training.

    Args:
        args: Mapping of run settings. Keys read here: 'env', 'random_seed',
            'actor_lr', 'critic_lr', 'tau', 'gamma' and 'minibatch_size';
            the mapping is also forwarded to ``train``.
        reward_result: Forwarded to ``train`` unchanged.

    Returns:
        list: ``[summary_ops, summary_vars, paths]`` as produced by ``train``.
    """
    with tf.Session() as sess:
        env = gym.make(args['env'])

        # One seed value drives numpy, TensorFlow and the environment.
        seed = int(args['random_seed'])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        env.seed(seed)

        # Override the pendulum limits and rebuild both spaces to match them.
        base_env = env.unwrapped
        base_env.max_torque = 15.
        base_env.max_speed = 60.
        base_env.action_space = spaces.Box(low=-base_env.max_torque,
                                           high=base_env.max_torque,
                                           shape=(1, ))
        obs_limit = np.array([1., 1., base_env.max_speed])
        base_env.observation_space = spaces.Box(low=-obs_limit, high=obs_limit)

        state_dim = env.observation_space.shape[0]
        act_space = env.action_space
        action_dim = act_space.shape[0]
        action_bound = act_space.high

        # Ensure action bound is symmetric
        assert (act_space.high == -act_space.low)

        tau = float(args['tau'])
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), tau,
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), tau,
                               float(args['gamma']),
                               actor.get_num_trainable_vars())
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        # The learner carries the barrier function, the GP dynamics model and
        # the barrier compensator (constructed with sizes 3 and 1).
        agent = LEARNER(env)
        cbf.build_barrier(agent)
        dynamics_gp.build_GP_model(agent)
        agent.bar_comp = BARRIER(sess, 3, 1)

        summary_ops, summary_vars, paths = train(sess, env, args, actor,
                                                 critic, actor_noise,
                                                 reward_result, agent)

        return [summary_ops, summary_vars, paths]
def __init__(self, args, env, sess):
    """Store the run handles, fix the problem sizes and build the models.

    Args:
        args: Run configuration object; kept as ``self.args``.
        env: Environment handle; kept as ``self.env``.
        sess: Session handle; kept as ``self.sess``.
    """
    # Handles supplied by the caller.
    self.args, self.sess, self.env = args, sess, env

    # Iteration flag starts at 1.
    self.firstIter = 1
    self.torque_bound = 100

    # Observation and action sizes are fixed here, not read from env.
    self.observation_size, self.action_size = 15, 1

    # Build order: network model, then barrier function, then GP model.
    self.build_model()
    cbf.build_barrier(self)
    dynamics_gp.build_GP_model(self)
def __init__(self, env, sess):
    """Set up the learner with fixed problem sizes and build its models.

    Args:
        env: Environment handle; kept as ``self.env``.
        sess: Session handle; only passed on to the ``BARRIER`` compensator.
    """
    # Single source for the sizes used both by the attributes below and by
    # the barrier compensator, so the two cannot drift apart.
    obs_dim, act_dim = 15, 1

    self.firstIter = 1
    self.count = 1
    self.env = env
    self.torque_bound = 100.

    # The spaces are intentionally NOT mirrored from env in this variant.
    # Disabled set-up kept for reference:
    #   self.observation_space = env.observation_space
    #   self.action_space = env.action_space
    #   print('Observation space', self.observation_space)
    #   print('Action space', self.action_space)

    # Observation and action sizes are fixed rather than read from env.
    self.observation_size = obs_dim
    self.action_size = act_dim

    # Build barrier function model
    cbf.build_barrier(self)

    # Build GP model of dynamics
    dynamics_gp.build_GP_model(self)

    # Barrier compensator built with the same sizes as above.
    self.bar_comp = BARRIER(sess, obs_dim, act_dim)
def rollout(self):
    """Collect a batch of fixed-length episodes with barrier-corrected actions.

    Episodes are simulated until the accumulated step count reaches
    ``self.args.timesteps_per_batch``. Each episode runs for exactly
    ``self.args.max_path_length`` steps; the environment's done flag is
    recorded but never ends an episode early.

    Side effects: resets ``self.num_epi`` and counts episodes in it, may
    snapshot ``self.GP_model`` into ``self.GP_model_prev`` and rebuild the GP
    model, updates the GP dynamics with collected paths, and clears
    ``self.firstIter`` on exit.

    Returns:
        list of dict: one path per episode with keys "Observation", "Action",
        "Action_RL_mu", "Action_RL", "Action_mu", "Action_bar", "Action_BAR",
        "Action_logstd", "Done" and "Reward".
    """
    batch_paths = []
    timesteps = 0
    self.num_epi = 0

    # From the second iteration on, keep the previous GP model for querying
    # while a fresh one is built for this iteration's data.
    if self.firstIter != 1:
        self.GP_model_prev = self.GP_model.copy()
        dynamics_gp.build_GP_model(self)

    while timesteps < self.args.timesteps_per_batch:
        self.num_epi += 1

        # Per-episode logs.
        applied_actions, rewards, done_flags = [], [], []
        final_means, logstds = [], []
        safety_terms, comp_terms = [], []
        rl_means, rl_samples = [], []

        prev_obs = self.env.reset()
        # The log starts with the initial state as a seed row; that row is
        # sliced away again when the path is assembled.
        obs = np.expand_dims(np.squeeze(prev_obs), 0)

        horizon = self.args.max_path_length
        for step in range(horizon):
            obs_row = np.expand_dims(np.squeeze(prev_obs), 0)

            # Sampled action plus the distribution parameters from the policy.
            rl_sample, rl_mean, logstd = self.act(prev_obs)

            # Compensation term from the barrier compensator.
            comp = self.bar_comp.get_action(prev_obs)
            action_RL = rl_sample + comp
            mean_RL = rl_mean + comp

            t = 0.05 * step

            # Query the current GP on the first iteration, the snapshot after.
            if self.firstIter == 1:
                gp_query = dynamics_gp.get_GP_dynamics
            else:
                gp_query = dynamics_gp.get_GP_dynamics_prev
            f, g, x, std = gp_query(self, obs_row, action_RL, t)

            # Safety correction is computed on the mean action.
            safety = cbf.control_barrier(self, np.squeeze(obs_row), mean_RL,
                                         f, g, x, std)
            final_mean = mean_RL + safety

            # The action actually applied is sampled around the corrected mean.
            applied = np.random.normal(loc=final_mean, scale=np.exp(logstd))

            obs = np.append(obs, obs_row, axis=0)
            rl_means.append(rl_mean)
            rl_samples.append(rl_sample)
            safety_terms.append(safety)
            comp_terms.append(comp)
            applied_actions.append(applied)
            final_means.append(final_mean)
            logstds.append(logstd)

            # Advance the environment (this env returns a 3-tuple).
            next_obs, reward_, done_ = self.env.step(applied)
            done_flags.append(done_)
            rewards.append(np.squeeze(reward_))
            prev_obs = next_obs

            if step == horizon - 1:
                # Drop the seed row so each remaining row pairs with an action.
                obs = obs[1:horizon + 1, :]
                path = {
                    "Observation": obs,
                    "Action": np.concatenate(applied_actions),
                    "Action_RL_mu": np.concatenate(rl_means),
                    "Action_RL": np.concatenate(rl_samples),
                    "Action_mu": np.concatenate(final_means),
                    "Action_bar": np.concatenate(safety_terms),
                    "Action_BAR": np.concatenate(comp_terms),
                    "Action_logstd": np.concatenate(logstds),
                    "Done": np.asarray(done_flags),
                    "Reward": np.asarray(rewards)
                }
                batch_paths.append(path)
                break

        # For timing purposes the GP is only updated while fewer than 500
        # steps have been collected in this batch.
        if timesteps < 500:
            dynamics_gp.update_GP_dynamics(self, path)
        timesteps += len(rewards)

    self.firstIter = 0
    return batch_paths
def main(args, reward_result, log_path):
    """Set up the environment, networks and logs, train, then evaluate 100x.

    Args:
        args: Mapping of run settings. Keys read here: 'env', 'random_seed',
            'actor_lr', 'critic_lr', 'tau', 'gamma', 'minibatch_size',
            'max_episode_len' and 'log_path'; also forwarded to ``train``.
        reward_result: Forwarded to ``train`` unchanged.
        log_path: Directory for the per-episode CSV logs. It is created,
            including missing parent directories, when it does not exist.

    Returns:
        list: ``[summary_ops, summary_vars, paths]`` as produced by ``train``.
    """
    with tf.Session() as sess:
        env = gym.make(args['env'])

        seed = int(args['random_seed'])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        env.seed(seed)

        # Create the log files. Each file is (re)created holding only the
        # header; `with` guarantees the handle is closed even on error.
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        log_save_name = log_path + '/episode_performance.csv'
        log_save_name_cbf = log_path + '/episode_cbf_performance.csv'
        for csv_name in (log_save_name, log_save_name_cbf):
            with open(csv_name, "w+") as log_file:
                log_file.write(
                    "episode number, steps in evaluation, accumulated reward, done \n"
                )

        # Set environment parameters for pendulum
        env.unwrapped.max_torque = 15.
        env.unwrapped.max_speed = 60.
        env.unwrapped.action_space = spaces.Box(
            low=-env.unwrapped.max_torque,
            high=env.unwrapped.max_torque,
            shape=(1, ))
        high = np.array([1., 1., env.unwrapped.max_speed])
        env.unwrapped.observation_space = spaces.Box(low=-high, high=high)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        agent = LEARNER(env)
        cbf.build_barrier(agent)
        dynamics_gp.build_GP_model(agent)
        agent.bar_comp = BARRIER(sess, 3, 1)

        [summary_ops, summary_vars, paths] = train(
            sess, env, args, actor, critic, actor_noise, reward_result, agent,
            log_save_name, log_save_name_cbf)

        # Evaluate the final model 100 times, without and then with the CBF,
        # to get a better idea of its performance.
        # NOTE(review): these files go to args['log_path'], not the log_path
        # parameter - confirm both always name the same directory.
        episode_length = int(args['max_episode_len'])
        _write_final_eval(
            args['log_path'] + '/final_eval.csv',
            lambda: evaluate(env, actor, episode_length))
        _write_final_eval(
            args['log_path'] + '/final_cbf_eval.csv',
            lambda: evaluate_with_cbf(env, actor, agent, episode_length))

        return [summary_ops, summary_vars, paths]


def _write_final_eval(csv_path, run_episode, num_runs=100):
    """Call ``run_episode`` ``num_runs`` times and write one CSV row per run.

    ``run_episode`` must return ``(steps, reward, done, safe)``; the columns
    are written in the order reward, steps, done, safe.
    """
    with open(csv_path, "w+") as out:
        out.write("reward, steps, done, safe\n")
        for _ in range(num_runs):
            steps, reward, done, safe = run_episode()
            out.write(', '.join(str(v) for v in (reward, steps, done, safe))
                      + '\n')
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent,
          log_name, log_cbf_name):
    """Train the DDPG actor/critic with barrier-filtered actions.

    Each outer iteration runs five training episodes, updates the GP dynamics
    model with the first four, trains the barrier compensator during the
    first five iterations, and then logs ``num_evals`` evaluation rounds.
    After the last iteration the actor weights are saved to
    ``args['log_path'] + '/final_model.mat'``.

    Args:
        sess: TensorFlow session used to initialise variables.
        env: Environment; ``env.unwrapped.state`` is read to check the start.
        args: Mapping of settings. Keys read here: 'buffer_size',
            'random_seed', 'max_episode_len', 'max_episodes', 'num_evals',
            'minibatch_size' and 'log_path'.
        actor: Actor network.
        critic: Critic network.
        actor_noise: Callable returning exploration noise for the action.
        reward_result: Indexable container; entry ``i`` receives the reward of
            the most recently finished episode of iteration ``i``.
        agent: Learner carrying the barrier, GP model and ``bar_comp``.
        log_name: CSV file appended with evaluations without the CBF.
        log_cbf_name: CSV file appended with evaluations with the CBF.

    Returns:
        list: ``[summary_ops, summary_vars, paths]``.
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # BatchNorm training mode is left disabled: it hurts the performance on
    # Pendulum but could be useful in other environments.
    # tflearn.is_training(True)

    paths = list()

    # Extract the arguments that will be used repeatedly
    episode_length = int(args['max_episode_len'])
    max_episodes = int(args['max_episodes'])
    num_evals = int(args['num_evals'])
    batch_size = int(args['minibatch_size'])

    # Evaluate initial performance (logged as episode 0).
    for _ in range(num_evals):
        _log_eval_pair(env, actor, agent, episode_length, 0, log_name,
                       log_cbf_name)

    for i in range(max_episodes):
        # Utilize GP from previous iteration while training current iteration
        if agent.firstIter != 1:
            agent.GP_model_prev = list(agent.GP_model)
            dynamics_gp.build_GP_model(agent)

        for el in range(5):
            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []
            # Reset per episode so a run that never reports `terminal` cannot
            # feed a stale (or undefined) path into the GP update below.
            path = None

            s = env.reset()
            # Ensure that starting position is in "safe" region
            while not (-0.09 <= env.unwrapped.state[0] <= 0.09
                       and -0.01 <= env.unwrapped.state[1] <= 0.01):
                s = env.reset()

            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(episode_length):
                # Policy action with added exploration noise.
                a = actor.predict(np.reshape(
                    s, (1, actor.s_dim))) + actor_noise()
                action_rl = a[0]

                first_iter = agent.firstIter == 1

                # Compensation barrier term: zero until the compensator has
                # been trained once.
                if first_iter:
                    u_BAR_ = [0]
                else:
                    u_BAR_ = agent.bar_comp.get_action(s)[0]
                action_RL = action_rl + u_BAR_

                # Safety barrier: query the current GP on the first iteration
                # and the previous iteration's snapshot afterwards.
                if first_iter:
                    f, g, x, std = dynamics_gp.get_GP_dynamics(
                        agent, s, action_RL)
                else:
                    f, g, x, std = dynamics_gp.get_GP_dynamics_prev(
                        agent, s, action_RL)
                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL,
                                             f, g, x, std)
                action_ = action_RL + u_bar_

                s2, r, terminal, _ = env.step(action_)

                # The buffer stores the raw policy action `a`, not the
                # barrier-corrected `action_` that was actually applied.
                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, )))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > batch_size:
                    (s_batch, a_batch, r_batch, t_batch,
                     s2_batch) = replay_buffer.sample_batch(batch_size)

                    # Calculate targets: no bootstrapping on terminal samples.
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))
                    y_i = [
                        r_batch[k] if t_batch[k] else
                        r_batch[k] + critic.gamma * target_q[k]
                        for k in range(batch_size)
                    ]

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (batch_size, 1)))
                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                # Note: the state logged for this step is the successor s2.
                s = s2
                ep_reward += r

                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                if terminal:
                    # j is 0 when the episode ends on its very first step;
                    # clamp the divisor so this report cannot raise
                    # ZeroDivisionError. Values for j >= 1 are unchanged.
                    print(
                        '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                            int(ep_reward), i,
                            (ep_ave_max_q / float(max(j, 1)))))
                    reward_result[i] = ep_reward
                    # NOTE(review): the (200, 3) reshape hard-codes the
                    # episode length and state size - confirm against env.
                    path = {
                        "Observation": np.concatenate(obs).reshape((200, 3)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            # Only the first four episodes of an iteration update the GP.
            if el <= 3 and path is not None:
                dynamics_gp.update_GP_dynamics(agent, path)

        # The barrier compensator is trained during the first five iterations.
        if i <= 4:
            agent.bar_comp.get_training_rollouts(paths)
            agent.bar_comp.train()
        agent.firstIter = 0

        # Evaluate performance of trained model after the episode; the label
        # is the number of training episodes completed so far.
        for _ in range(num_evals):
            _log_eval_pair(env, actor, agent, episode_length, i * 5 + 5,
                           log_name, log_cbf_name)

    # Save the final model as a matlab file
    layer_vars = [
        tflearn.variables.get_layer_variables_by_name(layer_name)
        for layer_name in ('relu1', 'relu2', 'out_layer')
    ]
    weights = [actor.model.get_weights(v[0]) for v in layer_vars]
    biases = [actor.model.get_weights(v[1]) for v in layer_vars]
    savemat(args['log_path'] + '/final_model.mat',
            mdict={
                'W': weights,
                'b': biases
            })

    return [summary_ops, summary_vars, paths]


def _log_eval_pair(env, actor, agent, episode_length, episode_label, log_name,
                   log_cbf_name):
    """Evaluate once without and once with the CBF; append a row to each log."""
    # Without the CBF
    steps, reward, done, _ = evaluate(env, actor, episode_length)
    with open(log_name, "a") as log_file:
        log_file.write(
            ', '.join(str(v) for v in (episode_label, steps, reward, done))
            + '\n')

    # With the CBF
    steps, reward, done, _ = evaluate_with_cbf(env, actor, agent,
                                               episode_length)
    with open(log_cbf_name, "a") as log_file:
        log_file.write(
            ', '.join(str(v) for v in (episode_label, steps, reward, done))
            + '\n')
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent):
    """Train the DDPG actor/critic with safety-barrier-corrected actions.

    Each outer iteration runs five episodes, updates the GP dynamics model
    with the first four of them, and then trains the barrier compensator on
    all paths collected so far. A path is recorded when step index 79 is
    reached.

    Args:
        sess: TensorFlow session used to initialise variables.
        env: Environment whose ``step`` returns ``(state, reward, terminal)``.
        args: Mapping of settings. Keys read here: 'buffer_size',
            'random_seed', 'max_episodes', 'max_episode_len' and
            'minibatch_size'.
        actor: Actor network.
        critic: Critic network.
        actor_noise: Callable returning exploration noise for the action.
        reward_result: Indexable container; entry ``i`` receives the reward of
            the most recently recorded episode of iteration ``i``.
        agent: Learner carrying the barrier, GP model and ``bar_comp``.

    Returns:
        list: ``[summary_ops, summary_vars, paths]``.
    """
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())

    # Start the target networks from the freshly initialised weights.
    actor.update_target_network()
    critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm. This hurts the performance on Pendulum but
    # could be useful in other environments.
    tflearn.is_training(True)

    paths = []

    for i in range(int(args['max_episodes'])):
        # From the second iteration on, keep the previous GP model for
        # querying while a fresh one is built for this iteration's data.
        if agent.firstIter != 1:
            agent.GP_model_prev = agent.GP_model.copy()
            dynamics_gp.build_GP_model(agent)

        for el in range(5):
            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []
            s = np.copy(env.reset())
            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(int(args['max_episode_len'])):
                # Policy action with added exploration noise.
                noisy_action = actor.predict(
                    np.reshape(s, (1, actor.s_dim))) + actor_noise()
                action_rl = noisy_action[0]

                # The compensation term is zero in this variant regardless of
                # the iteration (the compensator query is disabled).
                u_BAR_ = [0]
                action_RL = action_rl + u_BAR_

                t = 0.05 * j

                # Query the current GP on the first iteration, the snapshot
                # from the previous iteration afterwards.
                if agent.firstIter == 1:
                    gp_query = dynamics_gp.get_GP_dynamics
                else:
                    gp_query = dynamics_gp.get_GP_dynamics_prev
                f, g, x, std = gp_query(agent, s, action_RL, t)

                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL,
                                             f, g, x, std)
                action_ = action_RL + u_bar_

                s2, r, terminal = env.step(action_)

                # The buffer stores the barrier-corrected action that was
                # actually applied to the environment.
                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(action_, (actor.a_dim, )), r,
                                  terminal, np.reshape(s2, (actor.s_dim, )))

                # Learn only once the buffer holds more than one minibatch.
                batch_size = int(args['minibatch_size'])
                if replay_buffer.size() > batch_size:
                    (s_batch, a_batch, r_batch, t_batch,
                     s2_batch) = replay_buffer.sample_batch(batch_size)

                    # Targets: no bootstrapping on terminal samples.
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))
                    y_i = [
                        r_batch[k] if t_batch[k] else
                        r_batch[k] + critic.gamma * target_q[k]
                        for k in range(batch_size)
                    ]

                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (batch_size, 1)))
                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Actor step along the critic's action gradient.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    actor.update_target_network()
                    critic.update_target_network()

                # Log the state the action was taken in (before stepping s).
                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                s = np.copy(s2)
                ep_reward += r

                # The episode is cut and recorded at a fixed step index.
                if j == 80 - 1:
                    print(
                        '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                            int(ep_reward), i, (ep_ave_max_q / float(j))))
                    reward_result[i] = ep_reward
                    path = {
                        "Observation": np.concatenate(obs).reshape((80, 15)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            # Only the first four episodes of an iteration update the GP.
            if el <= 3:
                dynamics_gp.update_GP_dynamics(agent, path)

        # Train the barrier compensator on everything collected so far.
        agent.bar_comp.get_training_rollouts(paths)
        agent.bar_comp.train()
        agent.firstIter = 0

    return [summary_ops, summary_vars, paths]