Code Example #1
File: lqr_gauss_onpolicy.py  Project: amoliu/tdlearn
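# Excerpt from the tdlearn project; it assumes numpy is imported as np, the
# tdlearn modules features and util are available, and mdp, policy and gamma
# are defined earlier in the script.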
states, _, _, _, _ = mdp.samples_cached(n_iter=15000, n_restarts=1,
                                        policy=policy, seed=8000)

def make_slice(l, u, n):
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

n_slices = [3, 5, 7, 10]
bounds = [[-0.012, 0.012], [-0.02, 0.02], [-.6, .6], [-.6, .6]]
s = [make_slice(b[0], b[1], n) for b, n in zip(bounds, n_slices)]
bounds = np.array(bounds, dtype="float")
means = np.mgrid[s[0], s[1], s[2], s[3]].reshape(4, -1).T

sigmas = np.ones_like(means) * (
    (bounds[:, 1] - bounds[:, 0]) / 2. / (np.array(n_slices) - 1)).flatten()
phi = features.gaussians(means, sigmas, constant=False)
A = util.apply_rowise(arr=states, f=phi)
a = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
phi = features.gaussians(means[a], sigmas[a], constant=True)
print(phi.dim, "features are used")

theta0 = np.zeros(phi.dim)

task = LinearContinuousValuePredictionTask(
    mdp, gamma, phi, theta0, policy=policy, normalize_phi=False, mu_next=200)

methods = []
alpha = 0.001
mu = .01
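
The make_slice helper in Example #1 builds the slice objects that np.mgrid
expands into evenly spaced grid coordinates: adding half a step to the upper
bound keeps np.mgrid from dropping the endpoint. A minimal sketch under that
reading (the toy bounds and counts below are illustrative, not taken from
tdlearn) showing that each slice reproduces np.linspace(l, u, n) and how the
flattened grid of Gaussian centers is formed:

import numpy as np

def make_slice(l, u, n):
    # n evenly spaced points from l to u; the extra half step on the stop
    # value keeps np.mgrid from dropping the upper endpoint
    return slice(l, u + float(u - l) / (n - 1) / 2., float(u - l) / (n - 1))

# each slice expands to the same points as np.linspace(l, u, n)
assert np.allclose(np.mgrid[make_slice(-1.0, 1.0, 3)], np.linspace(-1.0, 1.0, 3))

# a toy 3 x 5 grid flattened to shape (15, 2), mirroring the (3*5*7*10, 4)
# grid of RBF means built in Example #1
s = [make_slice(-1.0, 1.0, 3), make_slice(-2.0, 2.0, 5)]
means = np.mgrid[s[0], s[1]].reshape(2, -1).T
print(means.shape)  # (15, 2)
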
Code Example #2
policy = policies.MarcsPolicy(noise=np.array([.05]))

states, _, _, _, _ = mdp.samples_cached(n_iter=200,
                                        n_restarts=30,
                                        policy=policy,
                                        seed=8000)

n_slices = [3, 5, 7, 10]
bounds = [[0, 35], [-3, 4], [-12, 12], [-3, 3]]
s = [make_slice(b[0], b[1], n) for b, n in zip(bounds, n_slices)]
bounds = np.array(bounds, dtype="float")
means = np.mgrid[s[0], s[1], s[2], s[3]].reshape(4, -1).T

sigmas = np.ones_like(means) * ((bounds[:, 1] - bounds[:, 0]) / 2. /
                                (np.array(n_slices) - 1)).flatten()
phi = features.gaussians(means, sigmas, constant=False)
A = util.apply_rowise(arr=states, f=phi)
a = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
phi = features.gaussians(means[a], sigmas[a], constant=True)
print(phi.dim, "features are used")
theta0 = 0. * np.ones(phi.dim)

task = LinearContinuousValuePredictionTask(mdp,
                                           gamma,
                                           phi,
                                           theta0,
                                           policy=policy,
                                           normalize_phi=False,
                                           mu_seed=1100,
                                           mu_subsample=1,
                                           mu_iter=200,
Code Example #3
File: swingup_regtd.py  Project: xuxingc/tdlearn

states, _, _, _, _ = mdp.samples_cached(n_iter=200, n_restarts=30,
                                        policy=policy, seed=8000)

n_slices = [3, 5, 7, 10]
n_slices2 = [5, 5, 14, 20]
bounds = [[0, 35], [-3, 4], [-12, 12], [-3, 3]]
means, sigmas = features.make_grid(n_slices, bounds)
means2, sigmas2 = features.make_grid(n_slices2, bounds)
#means = np.vstack([means,means2])
#sigmas = np.vstack([sigmas, sigmas2])
#phi = features.gaussians(means, sigmas, constant=False)
#A = util.apply_rowise(arr=states, f=phi)
#a = np.nonzero(np.sum(A > 0.05, axis=0) > 5)[0]
phi = features.gaussians(means, sigmas, constant=True)
print(phi.dim, "features are used")
theta0 = 0. * np.ones(phi.dim)

task = LinearContinuousValuePredictionTask(
    mdp, gamma, phi, theta0, policy=policy,
    normalize_phi=False, mu_seed=1100,
    mu_subsample=1, mu_iter=200,
    mu_restarts=150, mu_next=300)


methods = []
lam = 0.0
alpha = 0.3
mu = .1
tdc = td.TDCLambda(alpha=alpha, mu=mu, lam=lam, phi=phi, gamma=gamma)
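
Examples #1 and #2 prune the candidate Gaussian features before building the
final feature map: every feature is evaluated on the sampled states
(util.apply_rowise), and only features whose activation exceeds 0.05 on more
than 20 states are kept, so basis functions centered in regions the policy
never visits are dropped (Example #3 skips this step and keeps the full grid).
A small sketch of that selection step in plain numpy; the random matrix A here
is only a stand-in for the real activation matrix:

import numpy as np

# stand-in for the activation matrix: one row per sampled state, one column
# per candidate Gaussian feature (the role of util.apply_rowise(arr=states,
# f=phi) in the excerpts above)
rng = np.random.default_rng(0)
A = rng.uniform(0.0, 0.2, size=(1000, 50))

# keep only features that respond above 0.05 on more than 20 states; the
# surviving indices then select the means/sigmas for the final feature map
keep = np.nonzero(np.sum(A > 0.05, axis=0) > 20)[0]
print(len(keep), "of", A.shape[1], "candidate features kept")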