def solve_HingeLoss(solver, queue, clA, Y, rng=None, E=None):
    # Solve for decoders X by minimizing a multiclass hinge loss with L-BFGS,
    # computing A.dot(X) and the gradient on the device via clBLAS (pyopencl_blas).
    import scipy.optimize
    import pyopencl_blas
    pyopencl_blas.setup()

    tstart = time.time()

    assert clA.shape[0] == Y.shape[0]
    m, n = clA.shape
    _, d = Y.shape
    Xshape = (n, d)

    # regularization
    sigma = solver.reg * cl.array.max(clA).get()
    lamb = m * sigma**2

    # --- initialization
    X0 = rng.uniform(-1. / n, 1. / n, size=Xshape)

    # --- solve with L-BFGS
    yinds = Y.argmax(axis=1)

    clX = cl.array.Array(queue, (n, d), dtype=np.float32)
    clyinds = cl.array.to_device(queue, yinds.astype(np.int32))
    clZ = cl.array.Array(queue, (m, d), dtype=np.float32)
    clc = cl.array.Array(queue, (m,), dtype=np.float32)
    clE = cl.array.Array(queue, (m, d), dtype=np.float32)
    clG = cl.array.Array(queue, (n, d), dtype=np.float32)

    hingeloss_plan = plan_hingeloss(queue, clyinds, clZ, clc, clE)

    def f_df(x):
        # objective and gradient for L-BFGS, evaluated on the device
        clX.set(x.astype(np.float32).reshape(Xshape))
        pyopencl_blas.gemm(queue, clA, clX, clZ)
        hingeloss_plan()
        cost = cl.array.sum(clc).get()
        pyopencl_blas.gemm(queue, clA, clE, clG, transA=True)
        if lamb > 0:
            cost += 0.5 * lamb * cl.array.sum(clX**2).get()
            # cost += 0.5 * lamb * sum_square(clX).get()
            clG[:] += lamb * clX
        G = clG.get().astype(np.float64)
        return cost, G.ravel()

    x0 = X0.ravel()
    x, mincost, info = scipy.optimize.fmin_l_bfgs_b(
        f_df, x0, maxfun=solver.n_epochs, iprint=solver.verbose)

    tend = time.time()

    A = clA.get()
    X = x.reshape(Xshape)
    return solver.mul_encoders(X, E), {
        'rmses': npext.rms(np.dot(A, X) - Y, axis=1),
        'time': tend - tstart}
def iddr_rid(queue, m, n, matvect, krank):
    # Randomized interpolative decomposition (ID) of rank `krank` for a matrix
    # given only through `matvect`, which applies the matrix transpose to an
    # m-vector on the device and writes the resulting n-vector.
    id_srand = util.setup_rand(queue)
    blas.setup()

    dtype = 'float64'
    clx = cl_array.zeros(queue, m, dtype)
    rnorms = cl_array.Array(queue, n, dtype)
    lst = cl_array.Array(queue, n, np.int32)
    proj = cl_array.Array(queue, (krank + 2, n), dtype)

    # apply the matrix transpose to krank + 2 random test vectors
    l = krank + 2
    for i in range(l):
        id_srand(m, clx)
        matvect(clx, proj[i, :])

    iddr_id(queue, l, n, proj, krank, lst, rnorms)

    blas.teardown()
    return lst, proj
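
# A hedged usage sketch for `iddr_rid` (not part of the original code): `matvect`
# must apply the transpose of the m x n matrix to an m-vector on the device,
# writing the result into an n-vector. For a dense matrix already on the device,
# it could be built from the gemv wrapper used elsewhere in this document; the
# names `make_matvect` and `clA_dense` are illustrative only.
def make_matvect(queue, clA_dense):
    def matvect(clx, cly):
        # cly <- A^T clx on the device
        blas.gemv(queue, clA_dense, clx, cly, transA=True)
    return matvect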
from __future__ import print_function

import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyopencl_blas as blas

# start up OpenCL
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# start up the BLAS
blas.setup()

# generate some random data on the CPU
m, n = 5, 4
dtype = 'float32'  # also supports 'float64'
A = np.zeros((m, n), dtype=dtype)
x = np.zeros(n, dtype=dtype)
y = np.zeros(m, dtype=dtype)

rng = np.random.RandomState(1)  # change the seed to see different data
A[...] = rng.uniform(-1, 1, size=A.shape)
x[...] = rng.uniform(-1, 1, size=x.shape)
y[...] = rng.uniform(-1, 1, size=y.shape)

# allocate OpenCL memory on the device
clA = Array(queue, A.shape, A.dtype)
clx = Array(queue, x.shape, x.dtype)
cly = Array(queue, y.shape, y.dtype)
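
# The example above stops after allocating device memory. A minimal sketch of
# the remaining steps, assuming the gemv interface used elsewhere in this
# document (blas.gemv(queue, A, x, y) computes y <- A.dot(x) by default):

# copy the data to the device
clA.set(A)
clx.set(x)
cly.set(y)

# compute the matrix-vector product on the device
blas.gemv(queue, clA, clx, cly)

print("Expected: %s" % np.dot(A, x))
print("Actual:   %s" % cly.get())

# release clBLAS resources when done
blas.teardown()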
def solve_Softmax(solver, queue, clA, Y, rng=None, E=None):
    # Solve for decoders X by minimizing the softmax cross-entropy (log loss)
    # with L-BFGS, computing A.dot(X) and the gradient on the device via clBLAS.
    from nengo_extras.convnet import softmax
    import scipy.optimize
    import pyopencl_blas
    pyopencl_blas.setup()

    tstart = time.time()

    assert clA.shape[0] == Y.shape[0]
    m, n = clA.shape
    _, d = Y.shape
    Xshape = (n, d)

    # regularization
    sigma = solver.reg * cl.array.max(clA).get()
    lamb = m * sigma**2

    # --- initialization
    # X0 = np.zeros(Xshape, dtype=np.float32)
    X0 = np.zeros(Xshape, dtype=np.float64)

    # --- solve with L-BFGS
    clY = cl.array.to_device(queue, Y.astype(np.float32))
    clyi = cl.array.to_device(queue, np.argmax(Y, axis=1).astype(np.int32))
    clX = cl.array.Array(queue, (n, d), dtype=np.float32)
    clE = cl.array.Array(queue, (m, d), dtype=np.float32)
    clG = cl.array.Array(queue, (n, d), dtype=np.float32)

    softmax_plan = plan_softmax(queue, clE, clE)

    # sum_square = cl.reduction.ReductionKernel(
    #     queue.context, np.float32, neutral="0",
    #     reduce_expr="a+b", map_expr="x[i]*x[i]",
    #     arguments="__global float *x")

    sum_logloss = cl.reduction.ReductionKernel(
        queue.context, np.float32, neutral="0", reduce_expr="a+b",
        map_expr="-log(max(Y[i*%(d)d + yi[i]], 1e-16f))" % dict(d=d),
        arguments="__global const int *yi, __global const float *Y")
    assert clE.elemstrides[0] == d

    def f_df(x):
        # objective and gradient for L-BFGS, evaluated on the device
        clX.set(x.astype(np.float32).reshape(Xshape))
        pyopencl_blas.gemm(queue, clA, clX, clE)
        softmax_plan()
        cost = sum_logloss(clyi, clE).get()
        clE[:] -= clY
        pyopencl_blas.gemm(queue, clA, clE, clG, transA=True)
        if lamb > 0:
            cost += 0.5 * lamb * cl.array.sum(clX**2).get()
            # cost += 0.5 * lamb * sum_square(clX).get()
            clG[:] += lamb * clX
        G = clG.get().astype(np.float64)
        return cost, G.ravel()

    x0 = X0.ravel()
    x, mincost, info = scipy.optimize.fmin_l_bfgs_b(
        f_df, x0, maxfun=solver.n_epochs, iprint=solver.verbose)

    tend = time.time()

    A = clA.get()
    X = x.reshape(Xshape)
    return solver.mul_encoders(X, E), {
        'rmses': npext.rms(softmax(np.dot(A, X), axis=1) - Y, axis=1),
        'time': tend - tstart}
def solve_lstsqclassifier(solver, queue, clA, Y, rng=None, E=None):
    # Weighted least-squares classifier: solve one ridge-regularized, weighted
    # least-squares problem per class, using clBLAS for the Gram-matrix products.
    # from nengo_ocl.builder.solvers import cho_solve
    import pyopencl_blas
    pyopencl_blas.setup()

    m, n = clA.shape
    _, d = Y.shape
    precompute_ai = solver.precompute_ai

    def XTdotX(clX):
        clXX = cl.array.Array(queue, (n, n), dtype=np.float32)
        pyopencl_blas.gemm(queue, clX, clX, clXX, transA=True)
        return clXX.get()

    def ATdotx(x):
        clx = cl.array.to_device(queue, x.astype(np.float32))
        cly = cl.array.Array(queue, (n,), dtype=np.float32)
        pyopencl_blas.gemv(queue, clA, clx, cly, transA=True)
        return cly.get()

    def AdotX(X):
        clX = cl.array.to_device(queue, X.astype(np.float32))
        clAX = cl.array.Array(queue, (m, clX.shape[1]), dtype=np.float32)
        pyopencl_blas.gemm(queue, clA, clX, clAX)
        return clAX.get()

    def getAi(i, cache={}):
        # per-class Gram matrix Ai^T Ai, optionally cached
        if i in cache:
            return cache[i]
        clAi = clAis[i]
        AAi = XTdotX(clAi)
        if precompute_ai:
            cache[i] = AAi
        return AAi

    tstart = time.time()

    sigma = solver.reg * cl.array.max(clA).get()

    # get Y indices (assumes rows are sorted by class)
    Yi = np.argmax(Y, axis=1)
    Yd = np.diff(Yi)
    assert set(np.unique(Yd)) == set((0, 1)), (
        "Y not sorted, or missing some classes")

    clAis = []
    for i in range(d):
        inds, = (Yi == i).nonzero()
        a, b = inds.min(), inds.max() + 1
        clAis.append(clA[a:b])

    if not precompute_ai:
        AA = XTdotX(clA)
    else:
        AA = np.zeros((n, n))
        for i in range(d):
            AA += getAi(i)

    X = np.zeros((n, d))
    for i in range(d):
        y = Y[:, i]

        # weight for classification
        p = y.mean()
        q = solver.weight_power
        wr = p * (1 - p)**q + (1 - p) * p**q
        w0 = p**q / wr
        w1 = (1 - p)**q / wr
        dw = w1 - w0
        w = w0 + dw * y

        # form Gram matrix G = A.T W A + m * sigma**2 * I
        G = w0 * AA + dw * getAi(i)
        np.fill_diagonal(G, G.diagonal() + m * sigma**2)
        b = ATdotx(w * y)

        # X[:, i] = cho_solve(G, b, overwrite=True)
        X[:, i] = np.linalg.solve(G, b)

    tend = time.time()

    AX = AdotX(X)
    return solver.mul_encoders(X, E), {
        'rmses': npext.rms(AX - Y, axis=1),
        'time': tend - tstart}
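
# Hedged note (assumption, not from the original code): once the ridge term
# m * sigma**2 is added to the diagonal, each per-class Gram matrix G above is
# typically symmetric positive definite, so the commented-out Cholesky path
# could be restored with scipy instead of np.linalg.solve, e.g.:
#     from scipy.linalg import cho_factor, cho_solve
#     X[:, i] = cho_solve(cho_factor(G), b)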
def test_speed(ctx, rng):
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    k = 300
    # k = 100
    # k = 32
    # k = 16
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]
    # ms = [4096 for i in range(k)]
    # ns = [4096 for i in range(k)]

    aa = [rng.uniform(-1, 1, size=(m, n)).astype('float32')
          for m, n in zip(ms, ns)]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]

    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]
    # ajs = [rng.randint(k, size=p) for i in range(k)]
    # xjs = [rng.randint(k, size=p) for i in range(k)]

    # alpha = 0.5
    # beta = 0.1
    alpha = 1.0
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def array(a):
            cla = cl.array.Array(queue, a.shape, a.dtype)
            cla.set(a)
            return cla

        clAs = [array(a) for a in aa]
        clXs = [array(x.ravel()) for x in xx]
        clYs = [array(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        queue.finish()
        with Timer() as timer:
            if 0:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    q = queues[i % len(queues)]
                    e = pyopencl_blas.gemv(q, A, X, Y)
                    events.append(e)
                for q in queues:
                    q.flush()
                cl.wait_for_events(events)

        print("clBLAS: %0.3f" % timer.duration)
import numpy as np
import pyopencl
import pyopencl.array
import pyopencl_blas

pyopencl_blas.setup()  # initialize the library

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

dtype = 'float32'  # also supports 'float64', 'complex64' and 'complex128'
x = np.array([1, 2, 3, 4], dtype=dtype)
y = np.array([4, 3, 2, 1], dtype=dtype)
clx = pyopencl.array.to_device(queue, x)
cly = pyopencl.array.to_device(queue, y)

# call a BLAS function on the arrays
pyopencl_blas.axpy(queue, clx, cly, alpha=0.8)

print("Expected: %s" % (0.8 * x + y))
print("Actual:   %s" % (cly.get()))
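
# Cleanup sketch (assumption: mirrors the setup()/teardown() pairing used in
# the other snippets above): release clBLAS resources once all BLAS calls are done.
pyopencl_blas.teardown()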