Example #1
def srmax_loop(D, env, w, damping=0.001, rmax=1.0):
    """
    Sparse LSTDQ loop with Rmax-style optimism.

    Here D both supplies the samples (s, a, r, s', a') and tracks which
    state-action pairs and states are sufficiently known; transitions that
    involve unknown pairs are treated optimistically by substituting the
    maximum discounted return rmax / (1 - gamma).
    """
    k = len(w)
    # A accumulates the (damped) LSTDQ matrix, b the reward vector.
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    grmax = rmax / (1.0 - env.gamma)

    for (s, a, r, ns, na) in D:
        if D.known_pair(s, a) and D.known_state(ns):
            # Fully known transition: standard LSTDQ rank-one update.
            features = env.phi(s, a, sparse=True, format='csr')
            next_action = env.linear_policy(w, ns)
            newfeatures = env.phi(ns, next_action, sparse=True, format='csr')
            nf = features - env.gamma * newfeatures
            T = sp.kron(features, nf.T)
            A = A + T
            b = b + features * r
        elif D.known_pair(s, a):
            # Known pair but unknown next state: bootstrap optimistically.
            features = env.phi(s, a, sparse=True, format='csr')
            T = sp.kron(features, features.T)
            A = A + T
            b = b + features * (r + env.gamma * grmax)
        else:
            # Unknown pair: assume the maximum discounted return outright.
            features = env.phi(s, a, sparse=True, format='csr')
            T = sp.kron(features, features.T)
            A = A + T
            b = b + features * grmax
        for una in D.unknown(s):
            # Unsampled actions at s are also given the optimistic value.
            features = env.phi(s, una, sparse=True, format='csr')
            T = sp.kron(features, features.T)
            A = A + T
            b = b + features * grmax

    return A, b
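For reference, here is a compact restatement of what the loop above accumulates, in LaTeX notation with \phi standing for env.phi, \pi_w for env.linear_policy, \gamma for env.gamma and R_{\max} for rmax (\tilde{r} is just shorthand introduced here). For a fully known transition,

A \mathrel{+}= \phi(s,a)\,\bigl(\phi(s,a) - \gamma\,\phi(s',\pi_w(s'))\bigr)^{\top}, \qquad b \mathrel{+}= \phi(s,a)\,r,

and otherwise

A \mathrel{+}= \phi(s,a)\,\phi(s,a)^{\top}, \qquad b \mathrel{+}= \phi(s,a)\,\tilde{r}, \qquad
\tilde{r} = \begin{cases} r + \gamma R_{\max}/(1-\gamma) & (s,a)\ \text{known},\ s'\ \text{unknown} \\ R_{\max}/(1-\gamma) & (s,a)\ \text{unknown or unsampled at } s. \end{cases}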
Example #2
def srmax_loop(D, env, w, track, damping=0.001, rmax=1.0):
    """
    Sparse LSTDQ loop with Rmax-style optimism.

    Identical to the version above except that knownness is queried from a
    separate track object rather than from the sample source D itself.
    """
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    grmax = rmax / (1.0 - env.gamma)

    for (s, a, r, ns, na) in D:
        if track.known_pair(s, a) and track.known_state(ns):
            # Fully known transition: standard LSTDQ rank-one update.
            features = env.phi(s, a, sparse=True, format='csr')
            next_action = env.linear_policy(w, ns)
            newfeatures = env.phi(ns, next_action, sparse=True, format='csr')
            nf = features - env.gamma * newfeatures
            T = sp.kron(features, nf.T)
            A = A + T
            b = b + features * r
        elif track.known_pair(s, a):
            # Known pair but unknown next state: bootstrap optimistically.
            features = env.phi(s, a, sparse=True, format='csr')
            T = sp.kron(features, features.T)
            A = A + T
            b = b + features * (r + env.gamma * grmax)
        else:
            # Unknown pair: assume the maximum discounted return outright.
            features = env.phi(s, a, sparse=True, format='csr')
            T = sp.kron(features, features.T)
            A = A + T
            b = b + features * grmax
        for una in track.unknown(s):
            # Unsampled actions at s are also given the optimistic value.
            features = env.phi(s, una, sparse=True, format='csr')
            T = sp.kron(features, features.T)
            A = A + T
            b = b + features * grmax

    return A, b
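Both srmax_loop variants rely on only three queries: known_pair(s, a), known_state(s) and unknown(s). The original track object is not shown on this page; a minimal count-based sketch of that interface (the class name, the threshold m and the internal bookkeeping are all assumptions for illustration) could look like this:

from collections import defaultdict

class KnownnessTracker:
    """Illustrative count-based knownness tracker (not the original)."""

    def __init__(self, actions, m=5):
        self.actions = actions          # list of all actions in the environment
        self.m = m                      # visits required before (s, a) counts as known
        self.counts = defaultdict(int)  # (s, a) -> visit count; states must be hashable

    def update(self, s, a):
        self.counts[(s, a)] += 1

    def known_pair(self, s, a):
        return self.counts[(s, a)] >= self.m

    def known_state(self, s):
        # A state is known once every action at it has been tried often enough.
        return all(self.known_pair(s, a) for a in self.actions)

    def unknown(self, s):
        # Actions at s that are still insufficiently sampled.
        return [a for a in self.actions if not self.known_pair(s, a)]

Treating a state as known only when every action at it has been sampled at least m times is the usual Rmax-style knownness criterion; the actual repository may use a different rule.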
Example #3
def sparse_loop(D, env, w, damping=0.001):
    """
    Plain sparse LSTDQ loop. This is, somewhat surprisingly, the slowest
    of the variants.
    """
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')

    for (s, a, r, ns, na) in D:
        features = env.phi(s, a, sparse=True, format='csr')
        # Action chosen at ns by the current linear policy defined by w.
        next_action = env.linear_policy(w, ns)
        newfeatures = env.phi(ns, next_action, sparse=True, format='csr')

        # Rank-one LSTDQ update: A += phi (phi - gamma phi')^T, b += phi r.
        nf = features - env.gamma * newfeatures
        T = sp.kron(features, nf.T)
        A = A + T
        b = b + features * r

    return A, b
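Each of these loops returns the accumulated pair (A, b); the weights come from solving the sparse linear system A w = b. The repository's own solve(A, b, method="spsolve") helper is not shown on this page, so the sketch below simply calls SciPy directly and is only meant to illustrate the shape of that step:

import numpy as np
import scipy.sparse.linalg as spla

def solve_weights(A, b):
    # Solve the sparse system A w = b for the policy-evaluation weights.
    # spsolve works on CSC/CSR matrices and a dense right-hand side.
    rhs = b.toarray().ravel()
    w = spla.spsolve(A.tocsc(), rhs)
    return w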
Example #4
def sopt_loop(D, env, w, damping=0.001):
    """
    Sparse matrix version that maintains the inverse of A directly via
    Sherman-Morrison rank-one updates instead of accumulating A itself.
    """
    k = len(w)
    # B tracks A^{-1}; the damped identity inverts to I / damping.
    B = sp.identity(k, format='csr') * 1.0 / damping
    b = sp_create(k, 1, 'csr')

    for (s, a, r, ns, na) in D:
        features = env.phi(s, a, sparse=True, format='csr')
        next_action = env.linear_policy(w, ns)
        newfeatures = env.phi(ns, next_action, sparse=True, format='csr')

        nf = features - env.gamma * newfeatures
        # Sherman-Morrison: (A + u v^T)^{-1} = B - B u v^T B / (1 + v^T B u),
        # with u = features and v = nf.
        uv = sp.kron(features, nf.T)
        N = B.dot(uv).dot(B)
        d = 1 + nf.T.dot(B).dot(features)[0, 0]

        B = B - N / d
        b = b + features * r

    return B, b
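The update in sopt_loop is exactly the Sherman-Morrison identity for a rank-one correction, (M + u v^T)^{-1} = M^{-1} - M^{-1} u v^T M^{-1} / (1 + v^T M^{-1} u). A small dense sanity check of that identity, using NumPy only and none of the sparse helpers above (the vectors are arbitrary stand-ins):

import numpy as np

k = 6
M = np.eye(k) * 0.001                      # stands in for the damped initial A
B = np.linalg.inv(M)                       # its inverse, which sopt_loop maintains
u = np.arange(1.0, k + 1).reshape(k, 1)    # stands in for features = phi(s, a)
v = np.ones((k, 1))                        # stands in for nf = phi - gamma * phi'

# Sherman-Morrison rank-one update of the inverse, as in the loop above.
B_updated = B - (B @ u @ v.T @ B) / (1.0 + (v.T @ B @ u).item())

# Direct inverse of the rank-one-corrected matrix, for comparison.
B_direct = np.linalg.inv(M + u @ v.T)

assert np.allclose(B_updated, B_direct)

Maintaining B = A^{-1} this way avoids a k x k solve per policy evaluation, at the cost of the B u v^T B products, which tend to densify B as samples accumulate.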
Example #5
def ParallelLSTDQ(D, env, w, damping=0.001, ncpus=None):
    """
    D : source of samples (s, a, r, s', a')
    env : environment containing k, phi, gamma
    w : weights for the linear policy evaluation
    damping : keeps the result relatively stable
    ncpus : the number of cpus to use (defaults to all available cores)
    """

    if ncpus:
        nprocess = ncpus
    else:
        nprocess = cpu_count()

    # Split the samples into contiguous chunks and accumulate each in parallel.
    pool = Pool(nprocess)
    indx = chunk(len(D), nprocess)
    results = []
    for (i, j) in indx:
        # Damping must be zero in the workers: it is applied once to the
        # global A below, not once per chunk.
        r = pool.apply_async(dict_loop, (D[i:j], env, w, 0.0))
        results.append(r)

    # Combine the per-chunk (A, b) contributions on top of the damped identity.
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    for r in results:
        T, t = r.get()
        A = A + T
        b = b + t

    # close out the pool of workers
    pool.close()
    pool.join()

    w, info = solve(A, b, method="spsolve")
    return A, b, w, info
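ParallelLSTDQ assumes a chunk(n, nchunks) helper that splits the n sample indices into contiguous (start, stop) bounds, one per worker, and a dict_loop accumulator that each worker runs on its slice. Neither helper is shown on this page; the sketch below only illustrates the behaviour the driver relies on and is not the original implementation:

def chunk(n, nchunks):
    """Split range(n) into at most nchunks contiguous (start, stop) bounds."""
    size, extra = divmod(n, nchunks)
    bounds = []
    start = 0
    for i in range(nchunks):
        stop = start + size + (1 if i < extra else 0)
        if stop > start:                 # skip empty chunks when n < nchunks
            bounds.append((start, stop))
        start = stop
    return bounds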
Example #6
def ParallelLSTDQRmax(D, env, w, track, damping=0.001, rmax=1.0, ncpus=None):
    """
    D : source of samples (s, a, r, s', a')
    env : environment containing k, phi, gamma
    w : weights for the linear policy evaluation
    track : an object that records which state-action pairs are known
    damping : keeps the result relatively stable (solves some difficulties
        with oscillation if A is singular)
    rmax : the maximum reward
    ncpus : the number of cpus to use (defaults to all available cores)
    """
    if ncpus:
        nprocess = ncpus
    else:
        nprocess = cpu_count()

    # Split the samples into contiguous chunks and accumulate each in parallel.
    pool = Pool(nprocess)
    indx = chunk(len(D), nprocess)
    results = []
    for (i, j) in indx:
        # Damping must be zero in the workers: it is applied once to the
        # global A below, not once per chunk.
        r = pool.apply_async(drmax_loop, (D[i:j], env, w, track, 0.0, rmax))
        results.append(r)

    # Combine the per-chunk (A, b) contributions on top of the damped identity.
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    for r in results:
        T, t = r.get()
        A = A + T
        b = b + t

    # close out the pool of workers
    pool.close()
    pool.join()

    w, info = solve(A, b, method="spsolve")
    return A, b, w, info
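In practice these drivers sit inside an LSPI-style policy-iteration loop that re-runs the evaluation with the newly solved weights until they stop changing. A minimal sketch, assuming the ParallelLSTDQRmax above and that solve returns the weights as a NumPy vector (the function name, iteration cap and tolerance are illustrative choices):

import numpy as np

def lspi_rmax(D, env, track, k, rmax=1.0, damping=0.001, max_iter=20, tol=1e-4):
    # Start from zero weights and alternate policy evaluation / improvement.
    w = np.zeros(k)
    for _ in range(max_iter):
        A, b, w_new, info = ParallelLSTDQRmax(D, env, w, track,
                                              damping=damping, rmax=rmax)
        # Stop once successive weight vectors are numerically unchanged.
        if np.linalg.norm(w_new - w) < tol:
            w = w_new
            break
        w = w_new
    return w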