def __init__(self):
    """Build the multi-car driving environment.

    Loads recorded car trajectories from a .mat archive, shuffles the
    episode order (keeping an identically-ordered pristine copy), and
    sets up simulation constants plus the gym action/observation spaces.
    """
    raw = loadmat('car_data_formatted_arc')
    episodes = np.copy(np.squeeze(raw['car_dat']))
    # Shuffle episode order once; both copies share the same permutation.
    order = np.arange(len(episodes))
    random.shuffle(order)
    self.data = episodes[order]
    pristine = np.copy(np.squeeze(raw['car_dat']))
    self.data_orig = pristine[order]
    # Episode bookkeeping and simulation constants.
    self.count = 0
    self.episode = -1
    self.L = 100
    self.numCars = 5
    self.dt = 0.1
    self.collision_flag = 0
    # Initial full state, and the controlled car's slice of it.
    # NOTE(review): indices 9:12 presumably select the ego car — confirm
    # against the data layout.
    self.state = np.copy(np.squeeze(self.data[0][0]))
    self.bot_state = np.copy(np.squeeze(self.data[0][0][9:12]))
    self.prior = BasePrior()
    # Scalar acceleration command in [-7, 3]; unbounded 6-dim observation.
    self.action_space = spaces.Box(low=-7.0, high=3.0, shape=(1,))
    bound = np.full(6, np.finfo(np.float32).max, dtype=np.float32)
    self.observation_space = spaces.Box(-bound, bound)
def __init__(self, args, sess):
    """Wire up a TRPO learner.

    Creates the gym environment, builds a control prior from the system's
    linearized (A, B) dynamics, and constructs the TRPO agent around them.
    """
    self.args = args
    self.sess = sess
    self.env = gym.make(self.args.env_name)
    # Cap rollout length at the environment's own timestep limit.
    self.args.max_path_length = self.env.spec.timestep_limit
    dyn_A, dyn_B = get_linear_dynamics()
    self.prior = BasePrior(dyn_A, dyn_B)
    self.agent = TRPO(self.args, self.env, self.sess, self.prior)
def __init__(self):
    """Build the multi-car driving environment (no gym spaces variant).

    Loads recorded car trajectories, shuffles the episode order (keeping an
    identically-ordered pristine copy), and sets simulation constants.
    """
    raw = loadmat('car_data_formatted_arc')
    episodes = np.copy(np.squeeze(raw['car_dat']))
    # Shuffle episode order once; both copies share the same permutation.
    order = np.arange(len(episodes))
    random.shuffle(order)
    self.data = episodes[order]
    pristine = np.copy(np.squeeze(raw['car_dat']))
    self.data_orig = pristine[order]
    # Episode bookkeeping and simulation constants.
    self.count = 0
    self.episode = -1
    self.L = 100
    self.numCars = 5
    self.dt = 0.1
    self.collision_flag = 0
    # Initial full state, and the controlled car's slice of it.
    # NOTE(review): indices 9:12 presumably select the ego car — confirm
    # against the data layout.
    self.state = np.copy(np.squeeze(self.data[0][0]))
    self.bot_state = np.copy(np.squeeze(self.data[0][0][9:12]))
    self.prior = BasePrior()
def train(self, replay_buffer, minibatch_size):
    """Run one DDPG update step from a sampled replay minibatch.

    Samples a minibatch, computes bootstrapped critic targets, updates the
    critic, updates the actor along the critic's action gradient, and soft-
    updates both target networks.

    Fixes vs. original: `ep_ave_max_q` was accumulated (`+=`) without ever
    being initialized, raising NameError on the first call; the unused
    `prior = BasePrior()` and dead commented-out lines were removed.

    Args:
        replay_buffer: buffer exposing `sample_batch(n)` returning
            (s, a, r, terminal, s2) arrays.
        minibatch_size: number of transitions to sample.

    Returns:
        The max predicted Q value over the minibatch (useful for logging;
        previous version returned None, so callers ignoring it still work).
    """
    # Sample a batch from the replay buffer.
    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
        minibatch_size)

    # Calculate targets: r for terminal transitions, r + gamma*Q' otherwise.
    target_q = self.critic.predict_target(
        s2_batch, self.actor.predict_target(s2_batch))
    y_i = []
    for k in range(minibatch_size):
        if t_batch[k]:
            y_i.append(r_batch[k])
        else:
            y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

    # Update the critic given the targets.
    predicted_q_value, _ = self.critic.train(
        s_batch, a_batch, np.reshape(y_i, (minibatch_size, 1)))
    ep_ave_max_q = np.amax(predicted_q_value)

    # Update the actor policy using the sampled (deterministic policy) gradient.
    a_outs = self.actor.predict(s_batch)
    grads = self.critic.action_gradients(s_batch, a_outs)
    self.actor.train(s_batch, grads[0])

    # Soft-update target networks.
    self.actor.update_target_network()
    self.critic.update_target_network()
    return ep_ave_max_q
def train(sess, env, args, actor, critic, actor_noise, reward_result):
    """DDPG training loop with a fixed control-prior mixture (lambda = 15).

    Per episode: first rolls out the baseline controller (env.getPrior) to get
    a reference reward, then rolls out the regularized RL policy, training the
    actor/critic from a replay buffer. Per-episode statistics are written into
    `reward_result` and trajectories are collected into the returned `paths`.

    Returns:
        [summary_ops, summary_vars, paths]
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    # Get dynamics and initialize prior controller
    prior = BasePrior()
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()
    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))
    # Needed to enable BatchNorm.
    tflearn.is_training(True)
    paths = list()
    lambda_store = np.zeros((int(args['max_episode_len']), 1))
    for i in range(int(args['max_episodes'])):
        s = env.reset_inc()
        ep_reward = 0.
        ep_ave_max_q = 0
        obs, action, act_prior, rewards, obs_ref, prior_ref, collisions = [], [], [], [], [], [], []
        # Get reward using baseline controller (control prior only)
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a = env.getPrior()
            prior_ref.append(np.array([a]))
            s0, r, stop_c, act = env.step(a)
            ep_reward_opt += r
            obs_ref.append(s0)
            if (stop_c):
                break
        # Get reward using regRL algorithm
        s = env.reset()
        for j in range(int(args['max_episode_len'])):
            # Set control prior regularization weight (fixed mixing weight)
            lambda_mix = 15.
            lambda_store[j] = lambda_mix
            # Get control prior
            a_prior = env.getPrior()
            # RL control with exploration noise
            ab = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            # Mix the actions (RL controller + control prior)
            act = ab[0] / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior
            # Take action and observe next state/reward
            s2, r, terminal, act = env.step(act)
            collisions.append(env.collision_flag)
            act = np.array(act, ndmin=1)
            # Add info from time step to the replay buffer. Note the *pre-mix*
            # RL action `ab` is stored (not the executed mixed action `act`).
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(ab, (actor.a_dim, )),
                              r,
                              terminal,
                              np.reshape(s2, (actor.s_dim, )),
                              np.reshape(a_prior, (actor.a_dim, )))
            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))
                a_batch = a_batch_0
                # Calculate targets: r for terminal steps, r + gamma*Q' otherwise
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])
                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)
                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])
                # Update target networks
                actor.update_target_network()
                critic.update_target_network()
            s = s2
            ep_reward += r
            obs.append(s)
            rewards.append(r)
            action.append(act)
            act_prior.append(np.array([a_prior]))
            # Collect results at end of episode
            if terminal:
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward-ep_reward_opt), \
                    i, (ep_ave_max_q / float(j))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                reward_result[2, i] = np.mean(lambda_store)
                reward_result[3, i] = max(collisions)
                path = {"Observation": np.concatenate(obs).reshape((-1, 6)),
                        "Observation_ref": np.concatenate(obs_ref).reshape((-1, 6)),
                        "Action": np.concatenate(action),
                        "Action_Prior": np.concatenate(act_prior),
                        "Action_Prior_Ref": np.concatenate(prior_ref),
                        "Reward": np.asarray(rewards)}
                paths.append(path)
                break
    return [summary_ops, summary_vars, paths]
def train(sess, env, args, actor, critic, actor_noise, reward_result):
    """DDPG training loop regularized toward an analytic control prior
    (lambda = 5) built from linearized dynamics.

    Per episode: rolls out the prior-only controller from the initial state to
    get a reference reward, resets the env to the same state, then runs the
    mixed RL/prior policy while training actor/critic from a replay buffer.
    Observations here are 4-dimensional.

    Returns:
        [summary_ops, summary_vars, paths]
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()
    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))
    paths = list()
    for i in range(int(args['max_episodes'])):
        s = env.reset()
        ep_reward = 0.
        ep_ave_max_q = 0
        obs, action, rewards = [], [], []
        # Get optimal reward using optimal control (prior-only rollout)
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a_prior = prior.getControl_h(s0)
            a = a_prior
            s0, r, stop_c, _ = env.step(a)
            ep_reward_opt += r
            if (stop_c):
                break
        # Get reward using regRL algorithm; restart from the same initial state
        env.reset()
        s = env.unwrapped.reset(s)
        for j in range(int(args['max_episode_len'])):
            # Set control prior regularization weight
            lambda_mix = 5.
            # Prior control
            a_prior = prior.getControl_h(s)
            # RL control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            #a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))
            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior
            # Take action and observe next state/reward
            s2, r, terminal, info = env.step(act)
            # Add info from time step to the replay buffer; the stored "prior"
            # entry is the prior's weighted contribution to the executed action.
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )),
                              r,
                              terminal,
                              np.reshape(s2, (actor.s_dim, )),
                              np.reshape((lambda_mix / (1 + lambda_mix)) * a_prior, (actor.a_dim, )))
            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))
                a_batch = a_batch_0
                # Calculate targets: r for terminal steps, r + gamma*Q' otherwise
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])
                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)
                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])
                # Update target networks
                actor.update_target_network()
                critic.update_target_network()
                # Calculate TD-Error for each state
                # NOTE(review): these two values are computed but never used
                # afterwards in this function.
                base_q = critic.predict_target(s_batch, actor.predict_target(s_batch))
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
            s = s2
            ep_reward += r
            obs.append(s)
            rewards.append(r)
            action.append(a[0])
            # Collect results at end of episode
            if terminal:
                for ii in range(len(obs)):
                    obs[ii] = obs[ii].reshape((4, 1))
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward - ep_reward_opt), \
                    i, (ep_ave_max_q / float(j))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                path = {"Observation": np.concatenate(obs).reshape((-1, 4)),
                        "Action": np.concatenate(action),
                        "Reward": np.asarray(rewards)}
                paths.append(path)
                print(ep_reward)
                break
    return [summary_ops, summary_vars, paths]
TIMESTAMP) #env = gym.make(ENVIRONMENT) env = allCars() #env = wrappers.Monitor(env, os.path.join(SUMMARY_DIR, ENVIRONMENT), video_callable=None) ppo = PPO(env, SUMMARY_DIR, gpu=True) if MODEL_RESTORE_PATH is not None: ppo.restore_model(MODEL_RESTORE_PATH) t, terminal = 0, False buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], [] rolling_r = RunningStats() # Get prior and set tuning parameters for adaptive regularization weight prior = BasePrior() lambda_store = np.zeros(BATCH + 1) lambda_all = np.zeros(EP_MAX + 1) lambda_max = 8 factor = 0.2 reward_total, reward_diff = [], [] for episode in range(EP_MAX + 1): # Baseline reward using only control prior sp = env.reset_inc() reward_prior = 0. while True: a_prior = env.getPrior() sp, reward_p, done_p, _ = env.step(a_prior)
TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S") SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "PPO", ENVIRONMENT, TIMESTAMP) env = gym.make(ENVIRONMENT) ppo = PPO(env, SUMMARY_DIR, gpu=True) if MODEL_RESTORE_PATH is not None: ppo.restore_model(MODEL_RESTORE_PATH) t, terminal = 0, False buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], [] rolling_r = RunningStats() # Initialize control prior [A,B] = get_linear_dynamics() prior = BasePrior(A,B) # Set fixed regularization weight # lambda_mix = 4. reward_total, reward_diff, reward_lqr_prior, reward_h_prior = [], [], [], [] for episode in range(EP_MAX + 1): # Baseline reward using only control prior s0 = env.reset() sp = np.copy(s0) reward_prior = 0. while True: a_prior = prior.getControl_h(sp) a_prior = np.squeeze(np.asarray(a_prior)) sp, reward_p, done_p, _ = env.step(a_prior)