Example #1
import numpy as np
import policies
from task import LinearDiscreteValuePredictionTask

# n, mdp, gamma and run_experiment are defined by the surrounding experiment script.
def lambda_errors(phi, lambdas, noises):
    mserrors = np.zeros((len(lambdas), len(noises)))
    variances = np.zeros(len(noises))
    # a: uniform action-probability table, b: deterministic cyclic shift
    a = np.ones((n, n)) / n
    b = np.zeros((n, n))
    for i in range(n - 1):
        b[i, i + 1] = 1.
    b[-1, 0] = 1.
    for i, noise in enumerate(noises):
        # behavior policy: noise-weighted mixture of the uniform and cyclic tables
        c = noise * a + (1 - noise) * b
        c /= c.sum(axis=1)[:, None]
        beh_pol = policies.Discrete(c)
        task = LinearDiscreteValuePredictionTask(mdp,
                                                 gamma,
                                                 phi,
                                                 np.zeros(phi.dim),
                                                 policy=beh_pol)
        # pass the experiment configuration (module globals plus the current
        # feature map and task) to the parallel experiment runner
        d = globals().copy()
        d["phi"] = phi
        d["task"] = task
        mean, std, raw = run_experiment(n_jobs=-1, **d)
        val = mean[:, -1, n:]
        # cap values above the first method's initial error to damp diverging runs
        val[mean[:, -1, n:] > mean[0, -1, 0]] = mean[0, -1, 0]
        val = val.mean(axis=1)
        mserrors[:, i] = val - np.mean(val)
        print(noise, lambdas[np.argmin(val)])
    #mserrors -= mserrors.min(axis=1)[:,None]
    #mserrors /= mserrors.max(axis=1)[:,None]
    return mserrors
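# Hypothetical usage sketch (not part of the original example): it assumes the
# surrounding experiment script defines phi, n, mdp, gamma and run_experiment
# as used inside lambda_errors; the grid values below are illustrative only.
lambdas = np.linspace(0., 1., 11)
noises = np.array([0., 0.1, 0.3, 0.5])
mserrors = lambda_errors(phi, lambdas, noises)
print(lambdas[np.argmin(mserrors, axis=0)])  # best lambda for each noise level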
Example #2
import td
import examples
from task import LinearDiscreteValuePredictionTask
import numpy as np
import matplotlib.pyplot as plt
import features
import policies

n = 20
n_random = 800
mdp = examples.CorruptedChain(n_states=n)
phi = features.corrupted_rbfs(n_S=n, n_rbfs=5, n_random=n_random)
gamma = .9
n_feat = phi.dim
p0 = np.zeros(n_feat)
pol = np.zeros((n, 2))
pol[:10, 0] = 1
pol[10:, 1] = 1
policy = policies.Discrete(prop_table=pol)
task = LinearDiscreteValuePredictionTask(mdp, gamma, phi, p0, policy=policy)

# define the methods to examine
methods = []  # [td0, gtd, gtd2]

lstd = td.RecursiveLSTDLambdaJP(lam=0, eps=1000, phi=phi)
lstd.name = r"LSTD({}) $\ell_2 \tau={}$".format(0, 0)
lstd.color = "b"
methods.append(lstd)
#for eps in np.power(10,np.arange(-1,4)):
lstd = td.LSTDLambdaJP(lam=0, tau=0.8, phi=phi)
lstd.name = r"LSTD({}) $\ell_2 \tau={}$".format(0, .8)
lstd.color = "b"
#methods.append(lstd)
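# Sketch (not in the original example): a further LSTD(lambda) instance with a
# nonzero lambda, reusing the constructor signature shown above; the parameter
# values are illustrative.
lstd = td.RecursiveLSTDLambdaJP(lam=0.4, eps=1000, phi=phi)
lstd.name = r"LSTD({}) $\ell_2 \tau={}$".format(0.4, 0)
lstd.color = "g"
methods.append(lstd)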
Example #3
import td
import examples
from task import LinearDiscreteValuePredictionTask
import numpy as np
import features
import policies
import regtd
n = 400
n_a = 10
n_feat = 200
mdp = examples.RandomMDP(n, n_a)
phi = features.lin_random(n_feat, n, constant=True)
gamma = .95
np.random.seed(3)
beh_pol = policies.Discrete(np.random.rand(n, n_a))
tar_pol = policies.Discrete(np.random.rand(n, n_a))
task = LinearDiscreteValuePredictionTask(mdp,
                                         gamma,
                                         phi,
                                         np.zeros(phi.dim),
                                         policy=beh_pol,
                                         target_policy=tar_pol)

methods = []
alpha = 0.007
mu = .0001
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
gtd.color = "r"
methods.append(gtd)
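# Sketch (not in the original example): sweep a second step size with the same
# GTD constructor used above; the alpha value is illustrative.
alpha = 0.003
gtd = td.GTD(alpha=alpha, beta=mu * alpha, phi=phi)
gtd.name = r"GTD $\alpha$={} $\mu$={}".format(alpha, mu)
gtd.color = "g"
methods.append(gtd)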
Example #4
"""
Experiment that shows arbitrary off-policy behavior of TD
"""
__author__ = "Christoph Dann <*****@*****.**>"
import td
import examples
import numpy as np
import features
import matplotlib.pyplot as plt
from task import LinearDiscreteValuePredictionTask
import policies
n = 7
beh_pi = np.ones((n + 1, 2))
beh_pi[:, 0] = float(n) / (n + 1)
beh_pi[:, 1] = float(1) / (n + 1)
beh_pol = policies.Discrete(prop_table=beh_pi)
target_pi = np.zeros((n + 1, 2))
target_pi[:, 0] = 0
target_pi[:, 1] = 1
target_pol = policies.Discrete(prop_table=target_pi)

mdp = examples.BairdStarExample(n)
phi = features.linear_blended(n + 1)

methods = []

gamma = 0.99
task = LinearDiscreteValuePredictionTask(mdp,
                                         gamma,
                                         phi,
                                         np.asarray(n * [1.] + [10., 1.]),
                                         policy=beh_pol,
                                         target_policy=target_pol)