# ------------------------------------------------------------------------------
# Initial rollouts
# ------------------------------------------------------------------------------
X, Y = np.empty([0, 5]), np.empty([0, 5])
for j in range(NinitRolls):
    x, y = systemRollout(env, hpol, pol)
    X = np.concatenate((X, x))
    Y = np.concatenate((Y, y))

# ------------------------------------------------------------------------------
# Policy iteration
# ------------------------------------------------------------------------------
solved = False
k = 1
while not solved:
    print('--------------------------------------')
    print('Run', k)
    k += 1

    # Fit the GP forward model to all rollout data collected so far
    print('Fitting GP model...')
    gp.fit(X, Y)

    # Simulate M artificial rollouts on the learned model
    print('Simulating rollouts...')
    R, W, F = predictReward(M, hpol, pol, gp)

    # Weight the samples (eps bounds the size of the policy update)
    p = computeSampleWeighting(R, F, eps)
    hpol.update(W, F, p)

    # Collect one real rollout with the updated policy and grow the dataset
    x, y = systemRollout(env, hpol, pol)
    X = np.concatenate((X, x))
    Y = np.concatenate((Y, y))

    muR, solved = bench(env, hpol, pol, True)
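# For reference, a minimal sketch of what computeSampleWeighting could look
# like. Its body is not shown in this listing, so the dual below (standard
# episodic REPS, ignoring the context features F) is an assumption, not the
# repository's actual implementation.
import numpy as np
from scipy.optimize import minimize

def computeSampleWeighting(R, F, eps):
    """Weights p_i ~ exp(R_i / eta), with the temperature eta chosen by
    minimizing the REPS dual so that the weighted sample distribution stays
    within a KL divergence of eps from the uniform one."""
    R = np.asarray(R, dtype=float).ravel()
    R = R - R.max()  # shift rewards for numerical stability

    def dual(log_eta):
        eta = np.exp(log_eta).item()  # optimize in log-space so eta > 0
        return eta * eps + eta * np.log(np.mean(np.exp(R / eta)))

    eta = np.exp(minimize(dual, x0=[0.0], method='Nelder-Mead').x[0])
    p = np.exp(R / eta)
    return p / p.sum()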
np.random.seed(0)

# Benchmark of initial policy
print('--------------------------------------')
print('Initial policy...')
muR, solved = bench(env, hpol, pol, True)

# ------------------------------------------------------------------------------
# Policy iteration
# ------------------------------------------------------------------------------
k = 0
total_time = 0
while not solved:
    print('--------------------------------------')
    print('Run', k + 1)
    k += 1

    # Simulate M rollouts and collect rewards, parameters, and contexts
    R, W, F = predictReward(env, M, hpol, pol)

    # Time the policy update itself
    s = time.time()
    p = computeSampleWeighting(R, F, eps)
    hpol.update(W, F, p)
    t = time.time() - s
    print("Update time", t)
    total_time += t

    muR, solved = bench(env, hpol, pol, True)

if k > 0:  # guard: the initial policy may already solve the task (k stays 0)
    print("Average update time", total_time / k)
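# For orientation, a rough sketch of the role of predictReward(env, M, hpol,
# pol) in this script. The actual function is not shown here; sampleContext,
# sample, and rollout below are hypothetical method names used purely for
# illustration.
import numpy as np

def predictReward(env, M, hpol, pol):
    """Run M rollouts: for each, draw a context, sample lower-policy
    parameters from the upper policy, and record the resulting return."""
    R = np.empty(M)                        # return of each rollout
    W = np.empty((M, hpol.a.shape[1]))     # sampled lower-policy parameters
    F = np.empty((M, 2))                   # context features per rollout
    for i in range(M):
        s = env.sampleContext()            # hypothetical context sampler
        w = hpol.sample(s)                 # draw parameters from pi(w | s)
        R[i] = env.rollout(pol, w, s)      # hypothetical: run pol with w
        W[i], F[i] = w, s
    return R, W, F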
# Initial upper-policy parameters
upper_a = np.array([[-2, 100, 2, -100, 0, 0, 0, 0, 0, 0, 0, 0]])
upper_A = np.zeros((2, 12))
upper_sigma = np.eye(upper_a.shape[1]) * [20, 200, 200, 20, 0, 0, 0, 0, 0, 0, 0, 0]

# ------------------------------------------------------------------------------
# Initialization of necessary classes
# ------------------------------------------------------------------------------
offset = np.array([150, 150])
pol = LowerPolicy(-324, 324, target, offset, maxI=30, minI=-30, dt=dt)  # Lower policy
hpol = UpperPolicy(2)                                                   # Upper policy
hpol.set_parameters(upper_a, upper_A, upper_sigma)
cost = Cost(np.array([0.005, 100]), target)                             # Cost function
mod = Model(dt, pol, cost, noise=False)                                 # Rollout model

# Benchmark of initial policy
validatePolicy(100, H, dt, pol, hpol, verbose=0)

# ------------------------------------------------------------------------------
# Policy iteration
# ------------------------------------------------------------------------------
for k in range(K):
    print('--------------------------------------')
    print('Run', k + 1, 'out of', K)
    R, W, F = predictReward(mod, M, H, hpol)
    p = computeSampleWeighting(R, F, eps)
    hpol.update(W, F, p)

# Benchmark of end policy
validatePolicy(100, H, dt, pol, hpol, verbose=0)
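# Finally, a sketch of the weighted maximum-likelihood update that
# hpol.update(W, F, p) could perform for a linear-Gaussian upper policy
# pi(w | s) = N(a + s^T A, Sigma). The class internals are not shown in this
# listing, so the closed form below is an assumption based on the parameter
# shapes set up above (a: 1x12, A: 2x12, Sigma: 12x12).
import numpy as np

def update(self, W, F, p):  # sketch of the UpperPolicy method body
    """W: (M, 12) sampled parameters, F: (M, 2) contexts, p: (M,) weights."""
    S = np.hstack([np.ones((F.shape[0], 1)), F])   # design matrix [1, s]
    SP = S.T * p                                   # S^T diag(p) without forming diag(p)
    # Weighted least squares: rows of theta are [a; A]
    theta = np.linalg.solve(SP @ S, SP @ W)
    self.a, self.A = theta[:1], theta[1:]
    err = W - S @ theta                            # residuals of the fit
    self.sigma = (p[:, None] * err).T @ err / p.sum()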