コード例 #1
# Initial rollouts
# ------------------------------------------------------------------------------
# Gather the arrays returned by NinitRolls calls to systemRollout.
# Each rollout's output is buffered in a list and joined with a single
# np.concatenate after the loop, so the rows collected so far are not
# re-copied on every iteration. The empty (0, 5) seed arrays stay first in the
# buffers, which keeps X and Y defined (and 5 columns wide) when NinitRolls
# is 0.
X, Y = np.empty([0, 5]), np.empty([0, 5])
x_chunks, y_chunks = [X], [Y]
for j in range(NinitRolls):
    x, y = systemRollout(env, hpol, pol)
    x_chunks.append(x)
    y_chunks.append(y)
X = np.concatenate(x_chunks)
Y = np.concatenate(y_chunks)

# ------------------------------------------------------------------------------
# Policy iteration
# ------------------------------------------------------------------------------
# Each pass: fit gp on the accumulated (X, Y), call predictReward, turn its
# outputs into sample weights, update hpol, run one more systemRollout and
# append its data, then call bench. The loop ends once bench returns a truthy
# `solved`.
# NOTE(review): `solved` is read by the loop condition before anything in this
# section assigns it — it must already be defined by earlier code; confirm.
k = 1  # run counter used only for the progress message
while not solved:
    print('Run', k)
    k += 1

    # Refit on every sample gathered so far (X and Y grow each iteration).
    print('Fitting GP model...')
    gp.fit(X, Y)

    print('Simulating rollouts...')
    R, W, F = predictReward(M, hpol, pol, gp)
    p = computeSampleWeighting(R, F, eps)
    hpol.update(W, F, p)

    # Roll out with the freshly updated hpol and append the new data so the
    # next gp.fit call sees it.
    x, y = systemRollout(env, hpol, pol)
    X = np.concatenate((X, x))
    Y = np.concatenate((Y, y))

    # bench supplies the termination flag; the meaning of the trailing True
    # argument is not visible here — TODO confirm against bench's signature.
    muR, solved = bench(env, hpol, pol, True)
コード例 #2

# Benchmark of initial policy
# bench returns (muR, solved); `solved` decides whether the loop below runs
# at all.
print('Initial policy...')
muR, solved = bench(env, hpol, pol, True)

# ------------------------------------------------------------------------------
# Policy iteration
# ------------------------------------------------------------------------------
# Each pass: call predictReward, compute sample weights, update hpol, then
# re-run bench. Only the weighting + update step is timed.
k = 0  # number of completed update rounds
total_time = 0  # accumulated seconds spent in the timed update step
while not solved:
    print('Run', k + 1)
    k += 1

    R, W, F = predictReward(env, M, hpol, pol)

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; unlike time.time() it cannot jump when the system clock is
    # adjusted.
    s = time.perf_counter()

    p = computeSampleWeighting(R, F, eps)
    hpol.update(W, F, p)

    t = time.perf_counter() - s
    print("Update time", t)
    total_time += t

    muR, solved = bench(env, hpol, pol, True)

# k is still 0 when the initial benchmark already returned a truthy `solved`
# (the loop body never ran); guard the division so the script does not end
# with a ZeroDivisionError in that case.
if k:
    print("Average update time", total_time / k)
else:
    print("Average update time n/a (no policy updates were run)")
コード例 #3
# Initial upper-policy parameters
upper_a = np.array([[-2, 100, 2, -100, 0, 0, 0, 0, 0, 0, 0, 0]])
upper_A = np.zeros((2, 12))
# Square float matrix with one diagonal entry per column of upper_a and zeros
# everywhere else.
upper_sigma = np.diag(
    np.array([20, 200, 200, 20, 0, 0, 0, 0, 0, 0, 0, 0], dtype=float)
)

# ------------------------------------------------------------------------------
# Initialization of necessary classes
# ------------------------------------------------------------------------------
offset = np.array([150, 150])

# Lower-policy
pol = LowerPolicy(
    -324,
    324,
    target,
    offset,
    maxI=30,
    minI=-30,
    dt=dt,
)

# Upper-policy, loaded with the initial parameters
hpol = UpperPolicy(2)
hpol.set_parameters(upper_a, upper_A, upper_sigma)

# Cost function
cost = Cost(np.array([0.005, 100]), target)

# Rollout model
mod = Model(dt, pol, cost, noise=False)

# Benchmark of initial policy
validatePolicy(100, H, dt, pol, hpol, verbose=0)

# ------------------------------------------------------------------------------
# Policy iteration
# ------------------------------------------------------------------------------
# Runs exactly K update rounds: predictReward -> sample weights -> hpol.update.
for k in range(K):
    print('Run', k + 1, 'out of', K)

    R, W, F = predictReward(mod, M, H, hpol)
    p = computeSampleWeighting(R, F, eps)
    hpol.update(W, F, p)

# Benchmark of end policy
validatePolicy(100, H, dt, pol, hpol, verbose=0)