Example #1
import time

import numpy as np
from tensorflow import keras  # assumed TF2-style import for keras.optimizers below

import OnlineBW_HIL  # project module: online Baum-Welch HIL agent
import expert        # project module: expert environment / simulator


def DifferentTrainingSet(i, nTraj, TrainingSet_tot, Labels_tot, TimeBatch, seed):
    max_epoch = 100
    # Use the first max_epoch*nTraj[i] demonstration samples, replicated three times
    n_samples = max_epoch * nTraj[i]
    TrainingSet = np.concatenate((TrainingSet_tot[0:n_samples, :],) * 3, axis=0)
    Labels = np.concatenate((Labels_tot[0:n_samples],) * 3, axis=0)
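    # Two high-level options for the hierarchical (option-based) policy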
    option_space = 2
        
    # Stopping time for the online run (taken from the batch BW timings for index i)
    StoppingTime = TimeBatch[i]
        
    # Online BW for HIL with tabular parameterization: Training
    M_step_epoch = 30
    optimizer = keras.optimizers.Adamax(learning_rate=1e-2)
    Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet, Labels, option_space, M_step_epoch, optimizer)
    T_min = len(TrainingSet) // 3 - 20  # one data replica's length, minus a margin
    start_online_time = time.time()
    pi_hi_online, pi_lo_online, pi_b_online, likelihood_online, time_per_iteration = Agent_OnlineHIL.Online_Baum_Welch_together(T_min, StoppingTime)
    end_online_time = time.time()
    Online_time = end_online_time-start_online_time
    # Time_array_online = np.append(Time_array_online, Online_time)  
    # Likelihood_online_list.append(likelihood_online)
    # time_likelihood_online_list.append(time_per_iteration)
    
    # Agent evaluation
    nTraj_eval = 100  # number of evaluation trajectories
    # Online agent evaluation
    OnlineSim = expert.Simulation_NN(pi_hi_online, pi_lo_online, pi_b_online)
    [trajOnline, controlOnline, OptionsOnline,
     TerminationOnline, psiOnline, rewardOnline] = OnlineSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj_eval, seed)
    AverageRewardOnline = np.sum(rewardOnline) / nTraj_eval
    STDOnline = np.std(rewardOnline)
    # RewardOnline_array = np.append(RewardOnline_array, AverageRewardOnline)
    # STDOnline_array = np.append(STDOnline_array, STDOnline)
    
    return Online_time, likelihood_online, time_per_iteration, AverageRewardOnline, STDOnline
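
# A minimal, hypothetical driver sketch (not from the original script) showing
# how DifferentTrainingSet might be called for a sweep over training-set sizes.
# The arrays below are random placeholders standing in for real expert data.
if __name__ == '__main__':
    nTraj_list = [1, 2, 3]                                  # assumed trajectory counts
    TrainingSet_tot = np.random.rand(100 * max(nTraj_list), 4)       # dummy states
    Labels_tot = np.random.randint(0, 2, size=len(TrainingSet_tot))  # dummy actions
    TimeBatch = [10.0, 20.0, 30.0]                          # dummy batch-BW timings (s)
    for i in range(len(nTraj_list)):
        out = DifferentTrainingSet(i, nTraj_list, TrainingSet_tot,
                                   Labels_tot, TimeBatch, seed=0)
        print('nTraj = %d -> average online reward %.2f' % (nTraj_list[i], out[3]))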
Example #2
# End of the batch BW training cell: start_batch_time, pi_hi_batch, pi_lo_batch
# and pi_b_batch are defined earlier in the script
end_batch_time = time.time()
Batch_time = end_batch_time - start_batch_time
# Evaluation
max_epoch = 1000
nTraj = 3
BatchSim = World.MountainCar.Simulation(pi_hi_batch, pi_lo_batch, pi_b_batch)
[trajBatch, controlBatch, OptionsBatch,
 TerminationBatch, flagBatch] = BatchSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj)
x, u, o, b = BatchSim.HILVideoSimulation('Videos/VideosBatch/Simulation', max_epoch)
World.MountainCar.Plot(x, u, o, b, 'Figures/FiguresBatch/Batch_simulation.eps')
World.MountainCar.Animation.MakeAnimation(x, u, o, b, 'Videos/VideosBatch/animation/animation.mp4')

# %% Online BW for HIL with tabular parameterization: Training
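# M_step_epoch: number of optimization epochs performed in each Baum-Welch M-step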
M_step_epoch = 1
optimizer = keras.optimizers.Adamax(learning_rate=1e-2)
Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet, Labels, option_space, M_step_epoch, optimizer) 
T_min = 900
start_online_time = time.time()
pi_hi_online, pi_lo_online, pi_b_online, likelihood_online = Agent_OnlineHIL.Online_Baum_Welch_together(T_min)
end_online_time = time.time()
Online_time = end_online_time-start_online_time
# Evaluation
OnlineSim = World.MountainCar.Simulation(pi_hi_online, pi_lo_online, pi_b_online)
[trajOnline, controlOnline, OptionsOnline,
 TerminationOnline, flagOnline] = OnlineSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj)
x, u, o, b = OnlineSim.HILVideoSimulation('Videos/VideosOnline/Simulation', max_epoch)
World.MountainCar.Plot(x, u, o, b, 'Figures/FiguresOnline/Online_simulation.eps')
World.MountainCar.Animation.MakeAnimation(x, u, o, b, 'Videos/VideosOnline/animation/animation.mp4')
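
# A small, assumed bookkeeping addition (not in the original script): compare
# the wall-clock training times measured above for the two algorithms.
print('Batch BW training time:  %.2f s' % Batch_time)
print('Online BW training time: %.2f s' % Online_time)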


Example #3
    # Inside a loop over training-set sizes (index i); trajExpert, controlExpert,
    # psiExpert and rewardExpert come from the expert rollouts above
    ss = expert.Environment.stateSpace
    Labels, TrainingSet = BatchBW_HIL.ProcessData(trajExpert, controlExpert,
                                                  psiExpert, ss)
    option_space = 2

    # Batch BW for HIL with tabular parameterization: Training
    Agent_BatchHIL = BatchBW_HIL.BatchHIL(TrainingSet, Labels, option_space)
    N = 10  # number of iterations for the BW algorithm
    start_batch_time = time.time()
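    # Returns pi_hi (policy over options), pi_lo (option-conditioned action
    # policies) and pi_b (option-termination policy)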
    pi_hi_batch, pi_lo_batch, pi_b_batch = Agent_BatchHIL.Baum_Welch(N)
    end_batch_time = time.time()
    Batch_time = end_batch_time - start_batch_time
    Time_array_batch = np.append(Time_array_batch, Batch_time)

    # Online BW for HIL with tabular parameterization: Training
    Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet, Labels, option_space)
    T_min = nTraj[i] // 2  # integer division, so T_min is a whole step count
    start_online_time = time.time()
    pi_hi_online, pi_lo_online, pi_b_online, chi, rho, phi = Agent_OnlineHIL.Online_Baum_Welch(
        T_min)
    end_online_time = time.time()
    Online_time = end_online_time - start_online_time
    Time_array_online = np.append(Time_array_online, Online_time)

    # Expert
    AverageRewardExpert = np.sum(rewardExpert) / nTraj[i]
    STDExpert = np.std(rewardExpert)
    STDExpert_array = np.append(STDExpert_array, STDExpert)
    RewardExpert_array = np.append(RewardExpert_array, AverageRewardExpert)

    # Batch Agent Evaluation
Example #4
# Continues from the batch BW training above: pi_hi_batch, pi_lo_batch,
# pi_b_batch, Labels, Batch_time, option_space and seed are defined earlier
# Evaluation
max_epoch = 20000
nTraj = 20
BatchSim = World.LunarLander.Simulation(pi_hi_batch, pi_lo_batch, pi_b_batch,
                                        Labels)
[trajBatch, controlBatch, OptionsBatch, TerminationBatch, RewardBatch
 ] = BatchSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj, seed)
x, u, o, b = BatchSim.HILVideoSimulation('Videos/VideosBatch/Simulation',
                                         max_epoch)
World.LunarLander.Plot(x, u, o, b, 'Figures/FiguresBatch/Batch_simulation.eps')

# %% Online BW for HIL with tabular parameterization: Training
M_step_epoch = 1
optimizer = keras.optimizers.Adamax(learning_rate=1e-2)
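# Only the first 7000 demonstration samples are used to train the online agent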
Agent_OnlineHIL = OnlineBW_HIL.OnlineHIL(TrainingSet[0:7000, :],
                                         Labels[0:7000], option_space,
                                         M_step_epoch, optimizer)
T_min = 100
start_online_time = time.time()
pi_hi_online, pi_lo_online, pi_b_online, likelihood_online, time_online = Agent_OnlineHIL.Online_Baum_Welch_together(
    T_min, Batch_time)
end_online_time = time.time()
Online_time = end_online_time - start_online_time
#evaluation
OnlineSim = World.LunarLander.Simulation(pi_hi_online, pi_lo_online,
                                         pi_b_online, Labels)
[trajOnline, controlOnline, OptionsOnline, TerminationOnline, RewardOnline
 ] = OnlineSim.HierarchicalStochasticSampleTrajMDP(max_epoch, nTraj, seed)
x, u, o, b = OnlineSim.HILVideoSimulation('Videos/VideosOnline/Simulation',
                                          max_epoch)
World.LunarLander.Plot(x, u, o, b,
                       'Figures/FiguresOnline/Online_simulation.eps')
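
# A small, assumed post-processing step (not part of the original script):
# summarize the evaluation rewards of the batch and online agents.
# Assumes numpy is imported as np, as elsewhere in these scripts.
print('Batch  agent: mean reward %.1f +/- %.1f'
      % (np.mean(RewardBatch), np.std(RewardBatch)))
print('Online agent: mean reward %.1f +/- %.1f'
      % (np.mean(RewardOnline), np.std(RewardOnline)))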