# Module-level dependencies assumed by these excerpts (names as used throughout
# the original project): numpy as np, tensorflow as tf, tflearn,
# scipy.io.savemat, the project modules dynamics_gp and cbf, and the helpers
# ReplayBuffer, build_summaries, and evaluate.
import numpy as np


def evaluate_with_cbf(env, actor, agent, episode_length):
    # Reset the environment
    s = env.reset()
    # Ensure that the starting position is in the "safe" region
    while not (-0.09 <= env.unwrapped.state[0] <= 0.09
               and -0.01 <= env.unwrapped.state[1] <= 0.01):
        s = env.reset()

    ep_reward = 0
    done = 0
    safe = True
    steps = episode_length

    # Step through the episode
    for i in range(episode_length):
        a = actor.predict(np.reshape(s, (1, actor.s_dim)))
        action_rl = a[0]

        # Compensation barrier action plus safety barrier correction
        u_BAR_ = agent.bar_comp.get_action(s)[0]
        action_RL = action_rl + u_BAR_
        [f, g, x, std] = dynamics_gp.get_GP_dynamics(agent, s, action_RL)
        u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL, f, g, x, std)
        action_ = action_RL + u_bar_

        s2, r, terminal, info = env.step(action_)

        # Flag the episode as unsafe if the angle constraint is violated
        if abs(env.unwrapped.state[0]) > 0.261799:
            safe = False

        s = s2
        ep_reward += r

        if terminal:
            done = 1
            steps = i + 1
            break

    # Return the results
    return steps, ep_reward, done, safe
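# --- Hypothetical usage sketch (not part of the original code). It assumes an
# env, actor, and agent have already been constructed as in train() below, and
# only exercises evaluate_with_cbf() defined above.
def demo_evaluate_with_cbf(env, actor, agent, num_evals=10, episode_length=200):
    for k in range(num_evals):
        steps, reward, done, safe = evaluate_with_cbf(env, actor, agent, episode_length)
        print('eval {}: steps={}, reward={:.2f}, done={}, safe={}'.format(
            k, steps, float(reward), done, safe))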
def sim(self):
    observation = self.env.reset()
    while (self.env.unwrapped.state[0] > 1 or self.env.unwrapped.state[0] < -1):
        observation = self.env.reset()

    total = 0
    for t in range(200):
        # Render environment
        self.env.render()

        # Get action from NN policy
        obs_expanded = np.expand_dims(np.squeeze(observation), 0)

        # Get action distribution from policy network
        action_dist_mu, action_dist_logstd = self.sess.run(
            [self.action_dist_mu, self.action_dist_logstd],
            feed_dict={self.obs: obs_expanded})

        # Sample action from gaussian distribution
        action_rl = np.random.normal(loc=action_dist_mu,
                                     scale=np.exp(action_dist_logstd))

        # Get compensatory barrier action
        u_BAR_ = self.bar_comp.get_action(obs_expanded)
        u_RL = action_rl + u_BAR_

        # Compensate with barrier-based control
        [f, g, x, std] = dynamics_gp.get_GP_dynamics(self, obs_expanded, u_RL)
        u_bar = cbf.control_barrier(self, obs_expanded, u_RL, f, g, x, std)
        action = u_bar + u_RL

        observation, reward, done, info = self.env.step(action)
        total = total + reward

        if done:
            print("Accumulated Reward: {}".format(total))
            break
def rollout(self):
    # Initialize variables
    paths = list()
    timesteps = 0
    self.num_epi = 0

    # Utilize GP from previous iteration while training current iteration
    if (self.firstIter == 1):
        pass
    else:
        self.GP_model_prev = self.GP_model.copy()
        dynamics_gp.build_GP_model(self)

    # Iterate through the specified number of episodes
    while timesteps < self.args.timesteps_per_batch:
        self.num_epi += 1

        # Reset the environment
        obs, action, rewards, done, action_dist_mu, action_dist_logstd, \
            action_bar, action_BAR, action_RL_mu_, action_RL_ = \
            [], [], [], [], [], [], [], [], [], []
        prev_obs = self.env.reset()
        obs = np.expand_dims(np.squeeze(prev_obs), 0)

        # Simulate dynamics for specified time
        for i in range(self.args.max_path_length):
            # self.env.render()
            prev_obs_expanded = np.expand_dims(np.squeeze(prev_obs), 0)
            # prev_obs_expanded = prev_obs

            # Agent takes actions from sampled action and action distribution
            # parameters based on observation.
            # All have shape of [1, action size]
            action_rl, action_dist_mu_rl, action_dist_logstd_ = self.act(prev_obs)

            # Utilize compensation barrier function
            u_BAR_ = self.bar_comp.get_action(prev_obs)
            action_RL = action_rl + u_BAR_
            action_dist_mu_RL = action_dist_mu_rl + u_BAR_

            t = 0.05 * i

            # Get GP dynamics
            if (self.firstIter == 1):
                [f, g, x, std] = dynamics_gp.get_GP_dynamics(
                    self, prev_obs_expanded, action_RL, t)
            else:
                [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev(
                    self, prev_obs_expanded, action_RL, t)

            # Utilize safety barrier function
            u_bar_ = cbf.control_barrier(self, np.squeeze(prev_obs_expanded),
                                         action_dist_mu_RL, f, g, x, std)
            # action_ = action_RL + u_bar_
            action_dist_mu_ = action_dist_mu_RL + u_bar_

            # Stochastic action
            action_ = np.random.normal(loc=action_dist_mu_,
                                       scale=np.exp(action_dist_logstd_))

            # Store observation and action/distribution
            obs = np.append(obs, prev_obs_expanded, axis=0)
            action_RL_mu_.append(action_dist_mu_rl)
            action_RL_.append(action_rl)
            action_bar.append(u_bar_)
            action_BAR.append(u_BAR_)
            action.append(action_)
            action_dist_mu.append(action_dist_mu_)
            action_dist_logstd.append(action_dist_logstd_)

            # Simulate dynamics after action
            next_obs, reward_, done_ = self.env.step(action_)
            reward_ = np.squeeze(reward_)
            # next_obs, reward_, done_, _ = self.env.step(action_)

            # Get results
            done.append(done_)
            rewards.append(reward_)
            prev_obs = next_obs

            if i == self.args.max_path_length - 1:
                obs = obs[1:self.args.max_path_length + 1, :]
                path = {
                    "Observation": obs,
                    "Action": np.concatenate(action),
                    "Action_RL_mu": np.concatenate(action_RL_mu_),
                    "Action_RL": np.concatenate(action_RL_),
                    "Action_mu": np.concatenate(action_dist_mu),
                    "Action_bar": np.concatenate(action_bar),
                    "Action_BAR": np.concatenate(action_BAR),
                    "Action_logstd": np.concatenate(action_dist_logstd),
                    "Done": np.asarray(done),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)
                break

        # For timing purposes, only update GP dynamics for a certain number of timesteps
        if (timesteps < 500):
            dynamics_gp.update_GP_dynamics(self, path)

        timesteps += len(rewards)
        # print('%d episodes, %d steps collected for batch' % (self.num_epi, timesteps))

    self.firstIter = 0
    return paths
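# --- Hypothetical usage sketch (not part of the original code). It shows one
# way the path dictionaries returned by rollout() could be flattened into a
# single batch; `agent` is assumed to be an instance of the class defining
# rollout() above.
def demo_flatten_paths(agent):
    paths = agent.rollout()
    obs_batch = np.concatenate([p["Observation"] for p in paths], axis=0)
    act_batch = np.concatenate([p["Action"] for p in paths], axis=0)
    rew_batch = np.concatenate([p["Reward"] for p in paths], axis=0)
    print('{} transitions collected from {} episodes'.format(len(rew_batch), len(paths)))
    return obs_batch, act_batch, rew_batch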
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent,
          log_name, log_cbf_name):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    paths = list()

    # Extract the arguments that will be used repeatedly
    episode_length = int(args['max_episode_len'])
    max_episodes = int(args['max_episodes'])
    num_evals = int(args['num_evals'])

    # Evaluate initial performance
    for j in range(num_evals):
        # Without the CBF
        steps, reward, done, _ = evaluate(env, actor, episode_length)
        with open(log_name, "a") as myfile:
            myfile.write(str(0) + ', ' + str(steps) + ', ' + str(reward)
                         + ', ' + str(done) + '\n')
        # With the CBF
        steps, reward, done, _ = evaluate_with_cbf(env, actor, agent, episode_length)
        with open(log_cbf_name, "a") as myfile:
            myfile.write(str(0) + ', ' + str(steps) + ', ' + str(reward)
                         + ', ' + str(done) + '\n')

    for i in range(max_episodes):

        # Utilize GP from previous iteration while training current iteration
        if agent.firstIter == 1:
            pass
        else:
            agent.GP_model_prev = list(agent.GP_model)
            dynamics_gp.build_GP_model(agent)

        for el in range(5):

            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []

            s = env.reset()
            # Ensure that starting position is in "safe" region
            while not (-0.09 <= env.unwrapped.state[0] <= 0.09
                       and -0.01 <= env.unwrapped.state[1] <= 0.01):
                s = env.reset()

            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(episode_length):

                # env.render()

                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

                # Incorporate barrier function
                action_rl = a[0]

                # Utilize compensation barrier function
                if agent.firstIter == 1:
                    u_BAR_ = [0]
                else:
                    u_BAR_ = agent.bar_comp.get_action(s)[0]

                action_RL = action_rl + u_BAR_

                # Utilize safety barrier function
                if agent.firstIter == 1:
                    [f, g, x, std] = dynamics_gp.get_GP_dynamics(agent, s, action_RL)
                else:
                    [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev(agent, s, action_RL)
                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL, f, g, x, std)
                action_ = action_RL + u_bar_

                s2, r, terminal, info = env.step(action_)

                replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                                  np.reshape(a, (actor.a_dim,)), r, terminal,
                                  np.reshape(s2, (actor.s_dim,)))
                # replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(action_, (actor.a_dim,)), r,
                #                   terminal, np.reshape(s2, (actor.s_dim,)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        int(args['minibatch_size']))

                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(args['minibatch_size']), 1)))

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                s = s2
                ep_reward += r

                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                if terminal:
                    # writer.add_summary(summary_str, i)
                    # writer.flush()
                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                        int(ep_reward), i, (ep_ave_max_q / float(j))))
                    reward_result[i] = ep_reward
                    path = {
                        "Observation": np.concatenate(obs).reshape((200, 3)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            if el <= 3:
                dynamics_gp.update_GP_dynamics(agent, path)

        if (i <= 4):
            agent.bar_comp.get_training_rollouts(paths)
            barr_loss = agent.bar_comp.train()
        else:
            barr_loss = 0.
        agent.firstIter = 0

        # Evaluate performance of trained model after the episode
        for k in range(num_evals):
            # Without the CBF
            steps, reward, done, _ = evaluate(env, actor, episode_length)
            with open(log_name, "a") as myfile:
                myfile.write(str(i * 5 + 5) + ', ' + str(steps) + ', ' + str(reward)
                             + ', ' + str(done) + '\n')
            # With the CBF
            steps, reward, done, _ = evaluate_with_cbf(env, actor, agent, episode_length)
            with open(log_cbf_name, "a") as myfile:
                myfile.write(str(i * 5 + 5) + ', ' + str(steps) + ', ' + str(reward)
                             + ', ' + str(done) + '\n')

    # Save the final model as a matlab file
    relu1_vars = tflearn.variables.get_layer_variables_by_name('relu1')
    relu2_vars = tflearn.variables.get_layer_variables_by_name('relu2')
    out_vars = tflearn.variables.get_layer_variables_by_name('out_layer')
    weights = [
        actor.model.get_weights(relu1_vars[0]),
        actor.model.get_weights(relu2_vars[0]),
        actor.model.get_weights(out_vars[0])
    ]
    biases = [
        actor.model.get_weights(relu1_vars[1]),
        actor.model.get_weights(relu2_vars[1]),
        actor.model.get_weights(out_vars[1])
    ]
    savemat(args['log_path'] + '/final_model.mat', mdict={'W': weights, 'b': biases})

    return [summary_ops, summary_vars, paths]
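# --- Hypothetical sketch (not part of the original code): re-loading the actor
# weights that train() exports with savemat() above. The default path is an
# assumption standing in for args['log_path']; 'W' and 'b' hold the per-layer
# weights and biases of the relu1/relu2/out_layer variables, and depending on
# the SciPy/NumPy version the stored lists may come back as object (cell) arrays.
def demo_load_final_model(path='logs/final_model.mat'):
    from scipy.io import loadmat
    data = loadmat(path)
    print('variables stored in the .mat file:',
          [k for k in data if not k.startswith('__')])
    return data['W'], data['b']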
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    tflearn.is_training(True)

    paths = list()

    for i in range(int(args['max_episodes'])):

        # Utilize GP from previous iteration while training current iteration
        if (agent.firstIter == 1):
            pass
        else:
            agent.GP_model_prev = agent.GP_model.copy()
            dynamics_gp.build_GP_model(agent)

        for el in range(5):

            obs, action, rewards, action_bar, action_BAR = [], [], [], [], []

            s1 = env.reset()
            s = np.copy(s1)

            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(int(args['max_episode_len'])):

                # env.render()

                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

                # Incorporate barrier function
                action_rl = a[0]

                # Utilize compensation barrier function
                if (agent.firstIter == 1):
                    u_BAR_ = [0]
                    # u_BAR_ = agent.bar_comp.get_action(s)[0]
                else:
                    u_BAR_ = [0]
                    # u_BAR_ = agent.bar_comp.get_action(s)[0]

                action_RL = action_rl + u_BAR_

                t = 0.05 * j

                # Utilize safety barrier function
                if (agent.firstIter == 1):
                    [f, g, x, std] = dynamics_gp.get_GP_dynamics(agent, s, action_RL, t)
                else:
                    [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev(agent, s, action_RL, t)
                u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL, f, g, x, std)
                action_ = action_RL + u_bar_

                s2, r, terminal = env.step(action_)

                # replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                #                   terminal, np.reshape(s2, (actor.s_dim,)))
                replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                                  np.reshape(action_, (actor.a_dim,)), r, terminal,
                                  np.reshape(s2, (actor.s_dim,)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        int(args['minibatch_size']))

                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(args['minibatch_size']), 1)))

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                obs.append(s)
                rewards.append(r)
                action_bar.append(u_bar_)
                action_BAR.append(u_BAR_)
                action.append(action_)

                s = np.copy(s2)
                ep_reward += r

                if j == 80 - 1:
                    # writer.add_summary(summary_str, i)
                    # writer.flush()
                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                        int(ep_reward), i, (ep_ave_max_q / float(j))))
                    reward_result[i] = ep_reward
                    path = {
                        "Observation": np.concatenate(obs).reshape((80, 15)),
                        "Action": np.concatenate(action),
                        "Action_bar": np.concatenate(action_bar),
                        "Action_BAR": np.concatenate(action_BAR),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)
                    break

            if el <= 3:
                dynamics_gp.update_GP_dynamics(agent, path)

        agent.bar_comp.get_training_rollouts(paths)
        barr_loss = agent.bar_comp.train()
        agent.firstIter = 0

    return [summary_ops, summary_vars, paths]