def DifferentTrainingSet(i, nTraj, TrainingSet_tot, Labels_tot, TimeBatch, seed):
    # Train the online BW HIL agent on nTraj[i] expert trajectories (replicated
    # three times) and evaluate the learned hierarchical policies.
    max_epoch = 100
    TrainingSet = np.concatenate((TrainingSet_tot[0:max_epoch*nTraj[i], :],
                                  TrainingSet_tot[0:max_epoch*nTraj[i], :],
                                  TrainingSet_tot[0:max_epoch*nTraj[i], :]), axis=0)
    Labels = np.concatenate((Labels_tot[0:max_epoch*nTraj[i]],
                             Labels_tot[0:max_epoch*nTraj[i]],
                             Labels_tot[0:max_epoch*nTraj[i]]), axis=0)
    option_space = 2

    # Stopping Time
    StoppingTime = TimeBatch[i]

    # Online BW for HIL with tabular parameterization: Training
    M_step_epoch = 30
    optimizer = keras.optimizers.Adamax(learning_rate=1e-2)
    Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet, Labels, option_space, M_step_epoch, optimizer)
    T_min = len(TrainingSet)/3 - 20
    start_online_time = time.time()
    pi_hi_online, pi_lo_online, pi_b_online, likelihood_online, time_per_iteration = \
        Agent_OnlineHIL.Online_Baum_Welch_together(T_min, StoppingTime)
    end_online_time = time.time()
    Online_time = end_online_time - start_online_time
    # Time_array_online = np.append(Time_array_online, Online_time)
    # Likelihood_online_list.append(likelihood_online)
    # time_likelihood_online_list.append(time_per_iteration)

    # Online Agent Evaluation
    nTraj_eval = 100
    OnlineSim = expert.Simulation_NN(pi_hi_online, pi_lo_online, pi_b_online)
    [trajOnline, controlOnline, OptionsOnline, TerminationOnline, psiOnline,
     rewardOnline] = OnlineSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj_eval, seed)
    AverageRewardOnline = np.sum(rewardOnline)/nTraj_eval
    STDOnline = np.std(rewardOnline)
    # RewardOnline_array = np.append(RewardOnline_array, AverageRewardOnline)
    # STDOnline_array = np.append(STDOnline_array, STDOnline)

    return Online_time, likelihood_online, time_per_iteration, AverageRewardOnline, STDOnline
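# --- Hypothetical usage sketch (not part of the original script) ---
# Shows how DifferentTrainingSet might be driven across several training-set
# sizes. The values of nTraj, TimeBatch, and seed below, and the availability of
# TrainingSet_tot / Labels_tot in scope, are assumptions made for illustration.
if __name__ == "__main__":
    nTraj = [1, 5, 10]                  # expert trajectories per setting (assumed)
    TimeBatch = [60.0, 120.0, 240.0]    # stopping time in seconds per setting (assumed)
    seed = 0
    results = []
    for i in range(len(nTraj)):
        Online_time, likelihood_online, time_per_iteration, AverageRewardOnline, STDOnline = \
            DifferentTrainingSet(i, nTraj, TrainingSet_tot, Labels_tot, TimeBatch, seed)
        results.append((nTraj[i], Online_time, AverageRewardOnline, STDOnline))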
end_batch_time = time.time()
Batch_time = end_batch_time - start_batch_time

# evaluation
max_epoch = 1000
nTraj = 3
BatchSim = World.MountainCar.Simulation(pi_hi_batch, pi_lo_batch, pi_b_batch)
[trajBatch, controlBatch, OptionsBatch, TerminationBatch,
 flagBatch] = BatchSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj)
x, u, o, b = BatchSim.HILVideoSimulation('Videos/VideosBatch/Simulation', max_epoch)
World.MountainCar.Plot(x, u, o, b, 'Figures/FiguresBatch/Batch_simulation.eps')
World.MountainCar.Animation.MakeAnimation(x, u, o, b, 'Videos/VideosBatch/animation/animation.mp4')

# %% Online BW for HIL with tabular parameterization: Training
M_step_epoch = 1
optimizer = keras.optimizers.Adamax(learning_rate=1e-2)
Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet, Labels, option_space, M_step_epoch, optimizer)
T_min = 900
start_online_time = time.time()
pi_hi_online, pi_lo_online, pi_b_online, likelihood_online = Agent_OnlineHIL.Online_Baum_Welch_together(T_min)
end_online_time = time.time()
Online_time = end_online_time - start_online_time

# evaluation
OnlineSim = World.MountainCar.Simulation(pi_hi_online, pi_lo_online, pi_b_online)
[trajOnline, controlOnline, OptionsOnline, TerminationOnline,
 flagOnline] = OnlineSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj)
x, u, o, b = OnlineSim.HILVideoSimulation('Videos/VideosOnline/Simulation', max_epoch)
World.MountainCar.Plot(x, u, o, b, 'Figures/FiguresOnline/Online_simulation.eps')
World.MountainCar.Animation.MakeAnimation(x, u, o, b, 'Videos/VideosOnline/animation/animation.mp4')
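# Optional timing summary (sketch): Batch_time and Online_time are computed in
# the script above; the print formatting itself is an illustrative addition.
print(f"Batch BW training time:  {Batch_time:.2f} s")
print(f"Online BW training time: {Online_time:.2f} s")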
ss = expert.Environment.stateSpace
Labels, TrainingSet = BatchBW_HIL.ProcessData(trajExpert, controlExpert, psiExpert, ss)
option_space = 2

# Batch BW for HIL with tabular parameterization: Training
Agent_BatchHIL = BatchBW_HIL.BatchHIL(TrainingSet, Labels, option_space)
N = 10  # number of iterations for the BW algorithm
start_batch_time = time.time()
pi_hi_batch, pi_lo_batch, pi_b_batch = Agent_BatchHIL.Baum_Welch(N)
end_batch_time = time.time()
Batch_time = end_batch_time - start_batch_time
Time_array_batch = np.append(Time_array_batch, Batch_time)

# Online BW for HIL with tabular parameterization: Training
Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet, Labels, option_space)
T_min = nTraj[i] / 2
start_online_time = time.time()
pi_hi_online, pi_lo_online, pi_b_online, chi, rho, phi = Agent_OnlineHIL.Online_Baum_Welch(T_min)
end_online_time = time.time()
Online_time = end_online_time - start_online_time
Time_array_online = np.append(Time_array_online, Online_time)

# Expert
AverageRewardExpert = np.sum(rewardExpert) / nTraj[i]
STDExpert = np.std(rewardExpert)
STDExpert_array = np.append(STDExpert_array, STDExpert)
RewardExpert_array = np.append(RewardExpert_array, AverageRewardExpert)

# Batch Agent Evaluation
# evaluation
max_epoch = 20000
nTraj = 20
BatchSim = World.LunarLander.Simulation(pi_hi_batch, pi_lo_batch, pi_b_batch, Labels)
[trajBatch, controlBatch, OptionsBatch, TerminationBatch,
 RewardBatch] = BatchSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj, seed)
x, u, o, b = BatchSim.HILVideoSimulation('Videos/VideosBatch/Simulation', max_epoch)
World.LunarLander.Plot(x, u, o, b, 'Figures/FiguresBatch/Batch_simulation.eps')

# %% Online BW for HIL with tabular parameterization: Training
M_step_epoch = 1
optimizer = keras.optimizers.Adamax(learning_rate=1e-2)
Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet[0:7000, :], Labels[0:7000], option_space, M_step_epoch, optimizer)
T_min = 100
start_online_time = time.time()
pi_hi_online, pi_lo_online, pi_b_online, likelihood_online, time_online = \
    Agent_OnlineHIL.Online_Baum_Welch_together(T_min, Batch_time)
end_online_time = time.time()
Online_time = end_online_time - start_online_time

# evaluation
OnlineSim = World.LunarLander.Simulation(pi_hi_online, pi_lo_online, pi_b_online, Labels)
[trajOnline, controlOnline, OptionsOnline, TerminationOnline,
 RewardOnline] = OnlineSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj, seed)
x, u, o, b = OnlineSim.HILVideoSimulation('Videos/VideosOnline/Simulation', max_epoch)
World.LunarLander.Plot(x, u, o, b,