def augment_dataset(self, traj_list, states, actions, rewards, sprimes):
    """Append the transitions extracted from ``traj_list`` to the dataset.

    ``format_RL_data(traj_list)`` supplies the new states, actions, rewards
    and next states.  When a dataset already exists (``states`` is not
    ``None``) the new rows are stacked below the existing ones; otherwise the
    new arrays become the dataset.

    Args:
        traj_list: trajectories handed to ``format_RL_data``.
        states: existing state array, or ``None`` if no data exists yet.
        actions: existing action array (ignored when ``states`` is ``None``
            and new data was obtained).
        rewards: existing reward array (same remark as ``actions``).
        sprimes: existing next-state array (same remark as ``actions``).

    Returns:
        Tuple ``(states, actions, rewards, sprimes, nonterminal_mask,
        new_data_obtained)``.  ``nonterminal_mask`` has shape ``(n, 1)`` and
        holds 0 for every row whose next state sums to zero and 1 elsewhere;
        it is ``None`` when there is still no data.  ``new_data_obtained`` is
        ``True`` when ``format_RL_data`` returned at least one state.
    """
    new_s, new_a, new_r, new_sprime, _, _, _ = format_RL_data(traj_list)
    new_data_obtained = len(new_s) > 0

    if new_data_obtained:
        if states is None:
            states, actions, rewards, sprimes = new_s, new_a, new_r, new_sprime
        else:
            n_new = len(new_s)
            n_dim_state = states.shape[1]
            # New (next-)states are flattened to match the stored 2-D layout.
            states = np.r_[states, new_s.reshape((n_new, n_dim_state))]
            actions = np.r_[actions, new_a]
            rewards = np.r_[rewards, new_r]
            sprimes = np.r_[sprimes,
                            new_sprime.reshape((n_new, n_dim_state))]

    if states is None:
        return states, actions, rewards, sprimes, None, new_data_obtained

    # A next state whose entries sum to zero marks a terminal transition.
    # Sum over every axis except the batch axis: the previous nested
    # ``np.sum(np.sum(..., axis=-1), axis=-1)`` collapsed 2-D input to a
    # single scalar, so the mask was not computed per row.
    non_batch_axes = tuple(range(1, sprimes.ndim))
    per_sample_sum = np.sum(sprimes, axis=non_batch_axes)
    nonterminal_mask = np.ones((sprimes.shape[0], 1))
    nonterminal_mask[per_sample_sum == 0, :] = 0

    return (states, actions, rewards, sprimes, nonterminal_mask,
            new_data_obtained)
def train(self, problem, seed, epochs=500, d_lr=1e-3, g_lr=1e-4):
    """Alternate single rollouts on ``problem`` with periodic updates.

    Each iteration restores the problem, rolls the current policy out once
    and logs the result through ``self.log_traj_performance``.  Every tenth
    iteration the trajectories gathered since the last update are turned into
    arrays by ``format_RL_data`` and passed to ``self.update_V``,
    ``self.compute_advantage_values`` and ``self.update_policy``.

    Args:
        problem: object providing ``name``, ``init_saver``, ``objects`` and
            ``rollout_the_policy``.
        seed: used only to build the performance-file name.
        epochs: upper bound of ``range(1, epochs)``, i.e. ``epochs - 1``
            iterations are run.
        d_lr: learning rate written into ``self.opt_D``.
        g_lr: learning rate written into ``self.opt_G``.

    Side effects:
        Sets ``self.pfilename``, ``self.n_feasible_trajs`` and ``self.epoch``
        and truncates the performance file.
    """
    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print(self.opt_G.get_config())

    self.pfilename = self.save_folder + '/' + str(seed) + '_performance.txt'
    # Start from an empty performance file.  The file used to be opened twice
    # here and neither handle was ever closed.
    with open(self.pfilename, 'wb'):
        pass

    self.n_feasible_trajs = 0
    traj_list = []
    n_remains = []
    n_data = 0
    lowest_possible_reward = -2
    length_of_rollout = 20 if 'convbelt' in problem.name else 10

    for i in range(1, epochs):
        self.epoch = i
        print("N simulations %d/%d" % (i, epochs))

        # One rollout per iteration, always from the saved initial state.
        problem.init_saver.Restore()
        problem.objects_currently_not_in_goal = problem.objects
        traj, n_remain = problem.rollout_the_policy(self, length_of_rollout)

        if len(traj['a']) > 0:
            traj_list.append(traj)
            n_remains.append(n_remain)
            avg_J = self.log_traj_performance([traj], n_remain, i, n_data)
            if avg_J > lowest_possible_reward:
                self.n_feasible_trajs += 1
        else:
            # NOTE(review): -2.0 and 7 are the placeholder values the
            # original code logs for an empty trajectory -- confirm meaning.
            self.log_traj_performance(-2.0, 7, i, n_data)

        is_time_to_train = i % 10 == 0
        if is_time_to_train and len(traj_list) > 0:
            (new_s, new_a, new_r, new_sprime, new_sumR, _,
             new_traj_lengths) = format_RL_data(traj_list)
            n_data += len(new_s)
            self.update_V(new_s, new_sumR)
            new_sumA = self.compute_advantage_values(
                new_s, new_a, new_sprime, new_r, new_traj_lengths)
            self.update_policy(new_s, new_a, new_sumA)
            # Only data gathered since the last update is used next time.
            traj_list = []
            n_remains = []
def train(self, states, actions, rewards, sprimes, sumR, traj_lengths,
          epochs=500, d_lr=1e-3, g_lr=1e-4):
    """Fit on the given data once, then iterate rollouts and updates.

    The supplied arrays are first used for one call each of
    ``self.update_V``, ``self.compute_A`` and ``self.update_pi``.  Afterwards
    every iteration executes the policy on five fresh ``ConveyorBelt``
    problems (20 steps each), appends the mean and standard deviation of the
    summed rewards to ``performance.txt`` and repeats the three update calls
    on the data from those rollouts only.

    Args:
        states, sprimes: arrays; size-1 axes are removed with ``squeeze``.
        actions, rewards, sumR, traj_lengths: passed through to the update
            methods for the initial fit.
        epochs: upper bound of ``range(1, epochs)``.
        d_lr: learning rate written into ``self.opt_D``.
        g_lr: learning rate written into ``self.opt_G``.

    Side effects:
        Appends to ``<save_folder>/performance.txt`` and calls
        ``self.saveWeights`` after the initial fit and whenever the mean
        summed reward exceeds the best value seen so far.
    """
    states = states.squeeze()
    sprimes = sprimes.squeeze()

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print(self.opt_G.get_config())

    print("Fitting V...")
    current_best_J = -np.inf
    stime = time.time()
    self.update_V(states, sumR)
    adv = self.compute_A(states, actions, sprimes, rewards, traj_lengths)
    self.update_pi(states, actions, adv)
    self.saveWeights(additional_name='epoch_' + str(0))
    print(time.time() - stime)

    performance_path = self.save_folder + '/performance.txt'
    for i in range(1, epochs):
        stime = time.time()
        print('Completed: %.2f%%' % (i / float(epochs) * 100))

        # Try the policy: 5 trajectories, each 20 steps long.
        traj_list = []
        for _ in range(5):
            problem = ConveyorBelt()  # different "initial" state
            traj = problem.execute_policy(self, 20,
                                          visualize=self.visualize)
            traj_list.append(traj)
            problem.env.Destroy()
            RaveDestroy()

        returns = [np.sum(traj['r']) for traj in traj_list]
        avg_J = np.mean(returns)
        std_J = np.std(returns)
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(performance_path, 'a') as pfile:
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
        print('Score of this policy %s' % (avg_J,))
        print(time.time() - stime)

        # Update using only the freshly collected rollouts.
        (new_s, new_a, new_r, new_sprime, new_sumR, _,
         new_traj_lengths) = format_RL_data(traj_list)
        new_a = self.a_scaler.transform(new_a)
        self.update_V(new_s, new_sumR)
        new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r,
                                  new_traj_lengths)
        self.update_pi(new_s, new_a, new_sumA)

        if avg_J > current_best_J:
            current_best_J = avg_J
            self.saveWeights(
                additional_name='epoch_' + str(i) + '_' + str(avg_J))
def train(self, states, actions, rewards, sprimes,
          epochs=500, d_lr=1e-3, g_lr=1e-4):
    """Train ``self.disc`` and ``self.DG`` from a growing transition buffer.

    Every iteration samples up to 100 random mini-batches from the buffer.
    For each batch ``self.disc`` is fitted on two kinds of samples: actions
    produced by ``self.a_gen`` (target ``INFEASIBLE_SCORE``) and the stored
    actions (target ``r + disc(a_gen(s'), s') * nonterminal_mask``).
    ``self.DG`` is then fitted on the same batch of states.  Afterwards
    ``self.parallel_rollout`` is run, its score is appended to
    ``performance.txt`` and the new transitions are added to the buffer.

    Args:
        states, sprimes: arrays; size-1 axes are removed with ``squeeze``.
        actions, rewards: arrays indexed as ``[indices, :]`` (so at least
            2-D).
        epochs: upper bound of ``range(1, epochs)``.
        d_lr: learning rate written into ``self.opt_D``.
        g_lr: learning rate written into ``self.opt_G``.

    Side effects:
        Truncates and then appends to ``<save_folder>/performance.txt``;
        calls ``self.saveWeights`` whenever the mean summed reward improves.
    """
    states = states.squeeze()
    sprimes = sprimes.squeeze()
    n_data = states.shape[0]

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print(self.opt_G.get_config())

    performance_path = self.save_folder + '/performance.txt'
    # Start from an empty file.  It used to be opened for writing twice and
    # the second handle was never closed.
    with open(performance_path, 'w'):
        pass

    current_best_J = -np.inf
    n_score_train = 1

    for i in range(1, epochs):
        epoch_start = time.time()

        # Recomputed every epoch because the buffer grows.
        batch_size = max(1, min(32, int(len(actions) * 0.1)))

        # NOTE(review): the nested sum yields one value per row only when
        # ``sprimes`` is 3-D after the squeeze -- confirm the input shape.
        terminal_state_idxs = np.where(
            np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
        nonterminal_mask = np.ones((sprimes.shape[0], 1))
        nonterminal_mask[terminal_state_idxs, :] = 0

        train_start = time.time()
        print('Completed: %.2f%%' % (i / float(epochs) * 100))
        n_iter = len(range(0, max(actions.shape[0], n_data), batch_size))
        n_iter = min(100, n_iter)
        print("n_iter %s" % (n_iter,))

        for _ in range(n_iter):
            for _score_train_idx in range(n_score_train):
                # Choose a batch of data - experience replay.
                indices = np.random.randint(0, actions.shape[0],
                                            size=batch_size)
                s_batch = np.array(states[indices, :])
                a_batch = np.array(actions[indices, :])
                r_batch = np.array(rewards[indices, :])
                sprime_batch = np.array(sprimes[indices, :])
                # 0 if terminal state, 1 otherwise.
                mask_batch = np.array(nonterminal_mask[indices, :])

                fake = self.a_gen.predict([sprime_batch])
                real = a_batch

                # Targets: a fixed score for generated actions, and
                # r + disc(fake, s') for stored ones (just r if s' is
                # terminal, because the mask is 0 there).
                fake_targets = np.ones((batch_size, 1)) * INFEASIBLE_SCORE
                real_targets = r_batch + np.multiply(
                    self.disc.predict([fake, sprime_batch]), mask_batch)

                batch_x = np.vstack([fake, real])
                batch_w = np.vstack([s_batch, s_batch])
                batch_targets = np.vstack([fake_targets, real_targets])
                self.disc.fit({'x': batch_x, 'w': batch_w},
                              batch_targets,
                              epochs=1,
                              verbose=False)

            # Train G; the labels are dummy values.
            y_labels = np.ones((batch_size, ))
            self.DG.fit({'w': s_batch},
                        {'disc_output': y_labels,
                         'a_gen_output': y_labels},
                        epochs=1,
                        verbose=0)
        print("Training took: %.2fs" % (time.time() - train_start))

        rollout_start = time.time()
        traj_list = self.parallel_rollout()
        print("Rollout took: %.2fs" % (time.time() - rollout_start))

        returns = [np.sum(traj['r']) for traj in traj_list]
        avg_J = np.mean(returns)
        std_J = np.std(returns)
        with open(performance_path, 'a') as pfile:
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
        print('Score of this policy %s' % (avg_J,))

        # Add new data to the buffer.
        new_s, new_a, new_r, new_sprime, _, _, _ = format_RL_data(traj_list)
        new_a = self.a_scaler.transform(new_a)
        states = np.r_[states, new_s.squeeze()]
        actions = np.r_[actions, new_a]
        rewards = np.r_[rewards, new_r]
        sprimes = np.r_[sprimes, new_sprime.squeeze()]

        if avg_J > current_best_J:
            current_best_J = avg_J
            self.saveWeights(
                additional_name='lambda_' + str(LAMBDA) + 'epoch_' +
                str(i) + '_' + str(avg_J))

        # Measured from the start of the epoch; the previous code reused the
        # rollout timer here and therefore under-reported the epoch time.
        print("Epoch took: %.2fs" % (time.time() - epoch_start))
def train(self, states, actions, epochs=500, d_lr=1e-3, g_lr=1e-4):
    """Train ``self.disc`` against policy rollouts and update the policy.

    Every iteration executes the policy on five fresh ``ConveyorBelt``
    problems (20 steps each) and appends the mean and standard deviation of
    the summed rewards to ``performance.txt``.  ``self.disc`` is then fitted
    on one mini-batch in which rollout (state, action) pairs are labelled 0
    and pairs drawn from the given ``states``/``actions`` are labelled 1.
    Rewards are recomputed with ``self.compute_r_using_D`` and fed to
    ``self.update_V``, ``self.compute_A`` and ``self.update_pi``.

    Args:
        states: array; size-1 axes are removed with ``squeeze``.
        actions: array indexed as ``[indices, :]``.
        epochs: upper bound of ``range(1, epochs)``.
        d_lr: learning rate written into ``self.opt_D``.
        g_lr: learning rate written into ``self.opt_G``.

    Side effects:
        Truncates and then appends to ``<save_folder>/performance.txt``;
        calls ``self.saveWeights`` at the end of every iteration.
    """
    states = states.squeeze()

    batch_size = max(1, min(32, int(len(actions) * 0.1)))
    print(batch_size)

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print(self.opt_G.get_config())

    performance_path = self.save_folder + '/performance.txt'
    # Start from an empty file; the write handle used to be left open.
    with open(performance_path, 'w'):
        pass

    for i in range(1, epochs):
        stime = time.time()

        # Rollouts: 5 trajectories, each 20 steps long.
        traj_list = []
        for _ in range(5):
            problem = ConveyorBelt()  # different "initial" state
            traj = problem.execute_policy(self, 20, self.v)
            traj_list.append(traj)
            problem.env.Destroy()
            RaveDestroy()

        returns = [np.sum(traj['r']) for traj in traj_list]
        avg_J = np.mean(returns)
        std_J = np.std(returns)
        with open(performance_path, 'a') as pfile:
            pfile.write(str(avg_J) + ',' + str(std_J) + '\n')
        print('Score of this policy %s' % (avg_J,))

        # New rollout dataset.  The rewards returned here are not used; they
        # are replaced by the ones from compute_r_using_D further down.
        (new_s, new_a, _, new_sprime, _, _,
         new_traj_lengths) = format_RL_data(traj_list)
        new_a = self.a_scaler.transform(new_a)

        # Choose a batch from the given data ...
        indices = np.random.randint(0, actions.shape[0], size=batch_size)
        s_batch = np.array(states[indices, :])
        a_batch = np.array(actions[indices, :])

        # ... and one from the rollouts of the current policy.
        pi_indices = np.random.randint(0, new_a.shape[0], size=batch_size)
        pi_s_batch = np.array(new_s[pi_indices, :])
        pi_a_batch = np.array(new_a[pi_indices, :])

        # Rollout samples are scored 0, given samples are scored 1.
        fake_scores = np.zeros((batch_size, 1))
        real_scores = np.ones((batch_size, 1))
        batch_x = np.vstack([pi_a_batch, a_batch])
        batch_w = np.vstack([pi_s_batch, s_batch])
        batch_scores = np.vstack([fake_scores, real_scores])

        # Update D.
        self.disc.fit({'x': batch_x, 'w': batch_w},
                      batch_scores,
                      epochs=1,
                      verbose=False)

        new_r, new_sumR = self.compute_r_using_D(traj_list)

        # Update value function.
        self.update_V(new_s, new_sumR)

        # Update policy.
        new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r,
                                  new_traj_lengths)
        self.update_pi(new_s, new_a, new_sumA)

        self.saveWeights(
            additional_name='epoch_' + str(i) + '_' + str(avg_J))
        print('Completed: %.2f%%' % (i / float(epochs) * 100))
        print("Epoch took: %.2fs" % (time.time() - stime))
def train(self, states, actions, rewards, sprimes,
          epochs=500, d_lr=1e-3, g_lr=1e-4):
    """Alternate full-buffer updates with policy rollouts.

    Every iteration builds targets ``r + disc(a_gen(s'), s') *
    nonterminal_mask`` for the whole buffer, calls ``self.update_disc`` and
    ``self.update_pi``, and then executes the policy on five fresh
    ``ConveyorBelt`` problems (20 steps each).  The rollout score is appended
    to ``performance.txt``; the rollout data joins the buffer only when the
    mean summed reward exceeds 1.0.

    Args:
        states, sprimes: arrays; size-1 axes are removed with ``squeeze``.
        actions, rewards: arrays aligned row-wise with ``states``.
        epochs: upper bound of ``range(1, epochs)``.
        d_lr: learning rate written into ``self.opt_D``.
        g_lr: learning rate written into ``self.opt_G``.

    Side effects:
        Truncates and then appends to ``<save_folder>/performance.txt``;
        calls ``self.saveWeights`` whenever the mean summed reward improves.
    """
    states = states.squeeze()
    sprimes = sprimes.squeeze()

    # Computed once from the initial data and kept for all iterations.
    batch_size = max(1, min(32, int(len(actions) * 0.1)))
    print(batch_size)

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print(self.opt_G.get_config())

    current_best_J = -np.inf
    performance_path = self.save_folder + '/performance.txt'
    # Start from an empty file; the write handle used to be left open.
    with open(performance_path, 'w'):
        pass

    # This is an episodic task, so updates happen only after whole
    # trajectories (T = 20) have been executed.
    for i in range(1, epochs):
        print('Completed: %.2f%%' % (i / float(epochs) * 100))

        # NOTE(review): the nested sum yields one value per row only when
        # ``sprimes`` is 3-D after the squeeze -- confirm the input shape.
        terminal_state_idxs = np.where(
            np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
        nonterminal_mask = np.ones((sprimes.shape[0], 1))
        nonterminal_mask[terminal_state_idxs, :] = 0

        # Make the targets.
        fake = self.a_gen.predict([sprimes])  # predicted by pi
        real = actions
        real_targets = rewards + np.multiply(
            self.disc.predict([fake, sprimes]), nonterminal_mask)

        # The reported fitting time covers the two update calls only.
        stime = time.time()
        self.update_disc(real, states, real_targets, batch_size)
        self.update_pi(states, batch_size)
        print('Fitting time %s' % (time.time() - stime,))

        stime = time.time()
        traj_list = []
        for _ in range(5):
            problem = ConveyorBelt()  # different "initial" state
            traj = problem.execute_policy(self, 20, self.v)
            traj_list.append(traj)
            problem.env.Destroy()
            RaveDestroy()

        returns = [np.sum(traj['r']) for traj in traj_list]
        avg_J = np.mean(returns)
        std_J = np.std(returns)
        with open(performance_path, 'a') as pfile:
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
        print('Score of this policy %s' % (avg_J,))

        # Add new data to the buffer - only if this was a non-zero
        # trajectory.
        if avg_J > 1.0:
            new_s, new_a, new_r, new_sprime, _, _, _ = format_RL_data(
                traj_list)
            new_a = self.a_scaler.transform(new_a)
            states = np.r_[states, new_s.squeeze()]
            actions = np.r_[actions, new_a]
            rewards = np.r_[rewards, new_r]
            sprimes = np.r_[sprimes, new_sprime.squeeze()]
        print("Rollout time %s" % (time.time() - stime,))

        if avg_J > current_best_J:
            current_best_J = avg_J
            self.saveWeights(
                additional_name='tau_' + str(self.tau) + 'epoch_' +
                str(i) + '_' + str(avg_J))