def srmax_loop(D, env, w, damping=0.001, rmax=1.0):
    """Accumulate the sparse LSTDQ system (A, b) with Rmax-style optimism.

    Transitions where both the (s, a) pair and the successor state are
    known contribute the standard LSTD rank-one update; anything unknown
    is treated optimistically by bootstrapping with the discounted
    maximum return rmax / (1 - gamma).

    D       : iterable of samples (s, a, r, s', a'); also acts as the
              knowledge tracker via known_pair / known_state / unknown
    env     : environment providing phi, gamma and linear_policy
    w       : weights of the linear policy being evaluated
    damping : ridge term added to A for numerical stability
    rmax    : upper bound on the one-step reward

    Returns (A, b) as sparse matrices.
    """
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    # Optimistic value assigned to any unknown transition: the best
    # possible discounted return.
    grmax = rmax / (1.0 - env.gamma)
    for (s, a, r, ns, na) in D:
        # Same feature vector is needed in every branch; compute it once.
        features = env.phi(s, a, sparse=True, format='csr')
        if D.known_pair(s, a) and D.known_state(ns):
            # Fully known transition: ordinary LSTD update.
            next_action = env.linear_policy(w, ns)  # renamed: don't shadow builtin `next`
            newfeatures = env.phi(ns, next_action, sparse=True, format='csr')
            nf = features - env.gamma * newfeatures
            A = A + sp.kron(features, nf.T)  # outer product features * nf^T
            b = b + features * r
        elif D.known_pair(s, a):
            # Known pair, unknown successor: optimistic bootstrap target.
            A = A + sp.kron(features, features.T)
            b = b + features * (r + env.gamma * grmax)
        else:
            # Completely unknown pair: assume the maximum return.
            A = A + sp.kron(features, features.T)
            b = b + features * grmax
        # Drive exploration toward every action still unknown in s.
        for una in D.unknown(s):
            features = env.phi(s, una, sparse=True, format='csr')
            A = A + sp.kron(features, features.T)
            b = b + features * grmax
    return A, b
def ParallelLSTDQ(D, env, w, damping=0.001, ncpus=None):
    """Build and solve the LSTDQ system in parallel over chunks of D.

    D : source of samples (s,a,r,s',a')
    env: environment containing k,phi,gamma
    w : weights for the linear policy evaluation
    damping : keeps the result relatively stable
    ncpus : the number of cpus to use (defaults to all available)

    Returns (A, b, w, info) where w solves A w = b.
    """
    if ncpus:
        nprocess = ncpus
    else:
        nprocess = cpu_count()
    pool = Pool(nprocess)
    try:
        indx = chunk(len(D), nprocess)
        results = []
        for (i, j) in indx:
            # Workers must run with damping=0.0: the ridge term is added
            # exactly once below, otherwise it would be counted nprocess times.
            r = pool.apply_async(dict_loop, (D[i:j], env, w, 0.0))
            results.append(r)
        k = len(w)
        A = sp.identity(k, format='csr') * damping
        b = sp_create(k, 1, 'csr')
        for r in results:
            T, t = r.get()
            A = A + T
            b = b + t
    finally:
        # Reap the worker processes even if a task raised in r.get().
        pool.close()
        pool.join()
    w, info = solve(A, b, method="spsolve")
    return A, b, w, info
def srmax_loop(D, env, w, track, damping=0.001, rmax=1.0):
    """Accumulate the sparse LSTDQ system (A, b) with Rmax-style optimism.

    Transitions where both the (s, a) pair and the successor state are
    known (per `track`) contribute the standard LSTD rank-one update;
    anything unknown is treated optimistically by bootstrapping with the
    discounted maximum return rmax / (1 - gamma).

    D       : iterable of samples (s, a, r, s', a')
    env     : environment providing phi, gamma and linear_policy
    w       : weights of the linear policy being evaluated
    track   : object recording what is known (known_pair / known_state / unknown)
    damping : ridge term added to A for numerical stability
    rmax    : upper bound on the one-step reward

    Returns (A, b) as sparse matrices.
    """
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    # Optimistic value assigned to any unknown transition: the best
    # possible discounted return.
    grmax = rmax / (1.0 - env.gamma)
    for (s, a, r, ns, na) in D:
        # Same feature vector is needed in every branch; compute it once.
        features = env.phi(s, a, sparse=True, format='csr')
        if track.known_pair(s, a) and track.known_state(ns):
            # Fully known transition: ordinary LSTD update.
            next_action = env.linear_policy(w, ns)  # renamed: don't shadow builtin `next`
            newfeatures = env.phi(ns, next_action, sparse=True, format='csr')
            nf = features - env.gamma * newfeatures
            A = A + sp.kron(features, nf.T)  # outer product features * nf^T
            b = b + features * r
        elif track.known_pair(s, a):
            # Known pair, unknown successor: optimistic bootstrap target.
            A = A + sp.kron(features, features.T)
            b = b + features * (r + env.gamma * grmax)
        else:
            # Completely unknown pair: assume the maximum return.
            A = A + sp.kron(features, features.T)
            b = b + features * grmax
        # Drive exploration toward every action still unknown in s.
        for una in track.unknown(s):
            features = env.phi(s, una, sparse=True, format='csr')
            A = A + sp.kron(features, features.T)
            b = b + features * grmax
    return A, b
def sparse_loop(D, env, w, damping=0.001):
    """Accumulate the LSTDQ system (A, b) using sparse matrices.

    This is somewhat surprisingly the slowest variant.

    D       : iterable of samples (s, a, r, s', a')
    env     : environment providing phi, gamma and linear_policy
    w       : weights of the linear policy being evaluated
    damping : ridge term added to A for numerical stability

    Returns (A, b) as sparse matrices.
    """
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    for (s, a, r, ns, na) in D:
        features = env.phi(s, a, sparse=True, format='csr')
        next_action = env.linear_policy(w, ns)  # renamed: don't shadow builtin `next`
        newfeatures = env.phi(ns, next_action, sparse=True, format='csr')
        nf = features - env.gamma * newfeatures
        A = A + sp.kron(features, nf.T)  # rank-one update features * nf^T
        b = b + features * r
    return A, b
def sparse_loop(D, env, w, damping=0.001):
    """Sparse-matrix accumulation of the LSTDQ system (A, b).

    This is somewhat surprisingly the slowest variant.

    D       : iterable of samples (s, a, r, s', a')
    env     : environment providing phi, gamma and linear_policy
    w       : weights of the linear policy being evaluated
    damping : ridge term added to A for numerical stability

    Returns (A, b) as sparse matrices.
    """
    k = len(w)
    A = sp.identity(k, format='csr') * damping
    b = sp_create(k, 1, 'csr')
    for (s, a, r, ns, na) in D:
        features = env.phi(s, a, sparse=True, format='csr')
        # Greedy action under the current weights; renamed from `next`
        # to avoid shadowing the builtin.
        greedy_action = env.linear_policy(w, ns)
        newfeatures = env.phi(ns, greedy_action, sparse=True, format='csr')
        nf = features - env.gamma * newfeatures
        A = A + sp.kron(features, nf.T)  # outer product features * nf^T
        b = b + features * r
    return A, b
def sopt_loop(D, env, w, damping=0.001):
    """Sparse LSTDQ variant that maintains the inverse B = A^{-1} iteratively.

    Instead of accumulating A and solving at the end, each sample applies a
    Sherman-Morrison rank-one update to B, starting from (damping*I)^{-1}.

    D       : iterable of samples (s, a, r, s', a')
    env     : environment providing phi, gamma and linear_policy
    w       : weights of the linear policy being evaluated
    damping : initial A = damping*I, so B starts as I/damping

    Returns (B, b) where B approximates A^{-1}.
    """
    k = len(w)
    B = sp.identity(k, format='csr') * 1.0 / damping
    b = sp_create(k, 1, 'csr')
    for (s, a, r, ns, na) in D:
        features = env.phi(s, a, sparse=True, format='csr')
        next_action = env.linear_policy(w, ns)  # renamed: don't shadow builtin `next`
        newfeatures = env.phi(ns, next_action, sparse=True, format='csr')
        nf = features - env.gamma * newfeatures
        # Sherman-Morrison: (A + u v^T)^{-1} = B - B u v^T B / (1 + v^T B u)
        # with u = features, v = nf.
        uv = sp.kron(features, nf.T)
        N = B.dot(uv).dot(B)
        d = 1 + nf.T.dot(B).dot(features)[0, 0]
        B = B - N / d
        b = b + features * r
    return B, b
def sopt_loop(D, env, w, damping=0.001):
    """Sparse matrix version that computes the inverse iteratively.

    Each sample applies a Sherman-Morrison rank-one update to
    B = A^{-1}, starting from (damping*I)^{-1}, so no final solve of A
    is required.

    D       : iterable of samples (s, a, r, s', a')
    env     : environment providing phi, gamma and linear_policy
    w       : weights of the linear policy being evaluated
    damping : initial A = damping*I, so B starts as I/damping

    Returns (B, b) where B approximates A^{-1}.
    """
    k = len(w)
    B = sp.identity(k, format='csr') * 1.0 / damping
    b = sp_create(k, 1, 'csr')
    for (s, a, r, ns, na) in D:
        features = env.phi(s, a, sparse=True, format='csr')
        # Greedy action under current weights; renamed from `next` to
        # avoid shadowing the builtin.
        greedy_action = env.linear_policy(w, ns)
        newfeatures = env.phi(ns, greedy_action, sparse=True, format='csr')
        nf = features - env.gamma * newfeatures
        # Sherman-Morrison: (A + u v^T)^{-1} = B - B u v^T B / (1 + v^T B u)
        # with u = features, v = nf.
        uv = sp.kron(features, nf.T)
        N = B.dot(uv).dot(B)
        d = 1 + nf.T.dot(B).dot(features)[0, 0]
        B = B - N / d
        b = b + features * r
    return B, b
def ParallelLSTDQ(D, env, w, damping=0.001, ncpus=None):
    """Build and solve the LSTDQ system in parallel over chunks of D.

    D : source of samples (s,a,r,s',a')
    env: environment containing k,phi,gamma
    w : weights for the linear policy evaluation
    damping : keeps the result relatively stable
    ncpus : the number of cpus to use (defaults to all available)

    Returns (A, b, w, info) where w solves A w = b.
    """
    nprocess = ncpus if ncpus else cpu_count()
    pool = Pool(nprocess)
    try:
        results = []
        for (i, j) in chunk(len(D), nprocess):
            # damping must be zero in the workers: the ridge term is
            # added once below, not once per worker.
            results.append(pool.apply_async(dict_loop, (D[i:j], env, w, 0.0)))
        k = len(w)
        A = sp.identity(k, format='csr') * damping
        b = sp_create(k, 1, 'csr')
        for r in results:
            T, t = r.get()
            A = A + T
            b = b + t
    finally:
        # Ensure the worker pool is reaped even if a task raised.
        pool.close()
        pool.join()
    w, info = solve(A, b, method="spsolve")
    return A, b, w, info
def ParallelLSTDQRmax(D, env, w, track, damping=0.001, rmax=1.0, ncpus=None):
    """Build and solve the Rmax-optimistic LSTDQ system in parallel.

    D : source of samples (s,a,r,s',a')
    env: environment containing k,phi,gamma
    w : weights for the linear policy evaluation
    track : an object that records what is known
    damping : keeps the result relatively stable (solves some
        difficulties with oscillation if A is singular)
    rmax : the maximum reward
    ncpus : the number of cpus to use (defaults to all available)

    Returns (A, b, w, info) where w solves A w = b.
    """
    if ncpus:
        nprocess = ncpus
    else:
        nprocess = cpu_count()
    pool = Pool(nprocess)
    try:
        indx = chunk(len(D), nprocess)
        results = []
        for (i, j) in indx:
            # Workers run with damping=0.0: the ridge term is added exactly
            # once below, otherwise it would be counted nprocess times.
            r = pool.apply_async(drmax_loop, (D[i:j], env, w, track, 0.0, rmax))
            results.append(r)
        k = len(w)
        A = sp.identity(k, format='csr') * damping
        b = sp_create(k, 1, 'csr')
        for r in results:
            T, t = r.get()
            A = A + T
            b = b + t
    finally:
        # Reap the worker processes even if a task raised in r.get().
        pool.close()
        pool.join()
    w, info = solve(A, b, method="spsolve")
    return A, b, w, info