def test_speed(rng):
    """Time the ragged gather-gemv plans, then per-matrix clBLAS gemv calls.

    Builds ``k`` random float32 problems (a matrix of shape ``(m, n)`` plus
    vectors of length ``n`` and ``m``, with ``m`` and ``n`` drawn from
    ``rng.randint(100, 1000)``), runs every plan returned by
    ``plan_ragged_gather_gemv(...).choose_plans()`` and prints the elapsed
    time.  If ``pyopencl_blas`` can be imported, one ``gemv`` call is issued
    per (A, X, Y) triple, spread over ``k`` command queues, and timed too.

    Parameters
    ----------
    rng
        Random generator providing ``randint`` and ``uniform``
        (e.g. ``np.random.RandomState``).

    Notes
    -----
    ``ctx`` is not a parameter of this function; it has to be resolvable
    from the enclosing module scope when the function runs.
    """
    try:
        import pyopencl_blas
    except ImportError:
        # The clBLAS comparison is optional; skip it when the binding is
        # not installed.
        pyopencl_blas = None

    # To experiment with out-of-order queues, pass
    # properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
    # to the cl.CommandQueue constructors below.

    k = 300  # number of independent gemv problems (also tried: 100, 32, 16)

    # Problem sizes (fixed 4096 x 4096 problems were also tried).
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]

    aa = [rng.uniform(-1, 1, size=(m, n)).astype('float32')
          for m, n in zip(ms, ns)]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]

    # Index lists handed to the planner: entry i refers to matrix/vector i.
    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]

    alpha = 1.0  # (alpha = 0.5, beta = 0.1 were also tried)
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    # NOTE(review): whether plan() blocks until the device has finished is
    # not visible here -- confirm the timing covers device execution.
    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()
        try:
            def array(a):
                # Allocate a device array shaped like `a` and upload it.
                cla = cl.array.Array(queue, a.shape, a.dtype)
                cla.set(a)
                return cla

            clAs = [array(a) for a in aa]
            clXs = [array(x.ravel()) for x in xx]
            clYs = [array(y.ravel()) for y in yy]

            queues = [cl.CommandQueue(ctx) for _ in range(k)]

            # Drain the setup queue so the uploads are not part of the timing.
            queue.finish()

            # Flip to True to serialise every gemv on the single `queue`.
            use_single_queue = False

            with Timer() as timer:
                if use_single_queue:
                    for A, X, Y in zip(clAs, clXs, clYs):
                        pyopencl_blas.gemv(queue, A, X, Y)
                    queue.finish()
                else:
                    # use multiple parallel queues
                    events = []
                    for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                        q = queues[i % len(queues)]
                        e = pyopencl_blas.gemv(q, A, X, Y)
                        events.append(e)
                    for q in queues:
                        q.flush()
                    cl.wait_for_events(events)

            print("clBLAS: %0.3f" % timer.duration)
        finally:
            # Always release the BLAS library state acquired by setup(),
            # even if the benchmark above raises.
            pyopencl_blas.teardown()
# Seeded generator so every run sees the same data; pick another seed for
# different values.
rng = np.random.RandomState(1)

# Overwrite the existing host buffers in place with uniform samples in [-1, 1).
A[...] = rng.uniform(-1, 1, size=A.shape)
x[...] = rng.uniform(-1, 1, size=x.shape)
y[...] = rng.uniform(-1, 1, size=y.shape)

# Device-side arrays with the same shape and dtype as A, x and y.
clA, clx, cly = (
    Array(queue, host.shape, host.dtype) for host in (A, x, y)
)

# Upload the gemv inputs.
clA.set(A)
clx.set(x)

# Matrix-vector product on the device; the result lands in cly.
blas.gemv(queue, clA, clx, cly)

# Compare against the NumPy reference.
print("Expected: ", np.dot(A, x))
print("Actual: ", cly.get())

# Same again with the transposed matrix: y is the input, clx the output.
cly.set(y)
blas.gemv(queue, clA, cly, clx, transA=True)
print("Expected: ", np.dot(A.T, y))
print("Actual: ", clx.get())

# Release the BLAS library state.
blas.teardown()
def ATdotx(x):
    """Run a transposed gemv of ``clA`` against ``x`` and return the result.

    ``x`` is cast to float32 and uploaded through ``queue``; a float32 device
    array of shape ``(n,)`` receives the output of
    ``pyopencl_blas.gemv(..., transA=True)``, and the value returned is
    whatever that array's ``get()`` yields.  ``queue``, ``clA`` and ``n``
    are taken from the enclosing scope.
    """
    host_input = x.astype(np.float32)
    device_input = cl.array.to_device(queue, host_input)
    device_output = cl.array.Array(queue, (n, ), dtype=np.float32)
    pyopencl_blas.gemv(queue, clA, device_input, device_output, transA=True)
    return device_output.get()
def matvec(x, y):
    """Issue ``blas.gemv`` on ``queue`` with matrix ``cla`` and operands x, y.

    ``queue`` and ``cla`` come from the enclosing scope.  Nothing is
    returned; the gemv call's own return value is discarded.
    """
    operands = (queue, cla, x, y)
    blas.gemv(*operands)
def matvect(x, y):
    """Issue ``blas.gemv`` with ``transA=True`` on ``queue`` using ``cla``.

    ``queue`` and ``cla`` come from the enclosing scope.  Nothing is
    returned; the gemv call's own return value is discarded.
    """
    transpose_options = {"transA": True}
    blas.gemv(queue, cla, x, y, **transpose_options)
def test_speed(ctx, rng):
    """Time the ragged gather-gemv plans, then per-matrix clBLAS gemv calls.

    Builds ``k`` random float32 problems (a matrix of shape ``(m, n)`` plus
    vectors of length ``n`` and ``m``, with ``m`` and ``n`` drawn from
    ``rng.randint(100, 1000)``), runs every plan returned by
    ``plan_ragged_gather_gemv(...).choose_plans()`` and prints the elapsed
    time.  If ``pyopencl_blas`` can be imported, one ``gemv`` call is issued
    per (A, X, Y) triple, spread over ``k`` command queues, and timed too.

    Parameters
    ----------
    ctx
        OpenCL context used to create every command queue in the test.
    rng
        Random generator providing ``randint`` and ``uniform``
        (e.g. ``np.random.RandomState``).
    """
    try:
        import pyopencl_blas
    except ImportError:
        # The clBLAS comparison is optional; skip it when the binding is
        # not installed.
        pyopencl_blas = None

    # To experiment with out-of-order queues, pass
    # properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
    # to the cl.CommandQueue constructors below.

    k = 300  # number of independent gemv problems (also tried: 100, 32, 16)

    # Problem sizes (fixed 4096 x 4096 problems were also tried).
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]

    aa = [
        rng.uniform(-1, 1, size=(m, n)).astype('float32')
        for m, n in zip(ms, ns)
    ]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]

    # Index lists handed to the planner: entry i refers to matrix/vector i.
    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]

    alpha = 1.0  # (alpha = 0.5, beta = 0.1 were also tried)
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(queue, alpha, clA, A_js, clX, X_js, beta,
                                   clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    # NOTE(review): whether plan() blocks until the device has finished is
    # not visible here -- confirm the timing covers device execution.
    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()
        try:
            def array(a):
                # Allocate a device array shaped like `a` and upload it.
                cla = cl.array.Array(queue, a.shape, a.dtype)
                cla.set(a)
                return cla

            clAs = [array(a) for a in aa]
            clXs = [array(x.ravel()) for x in xx]
            clYs = [array(y.ravel()) for y in yy]

            queues = [cl.CommandQueue(ctx) for _ in range(k)]

            # Drain the setup queue so the uploads are not part of the timing.
            queue.finish()

            # Flip to True to serialise every gemv on the single `queue`.
            use_single_queue = False

            with Timer() as timer:
                if use_single_queue:
                    for A, X, Y in zip(clAs, clXs, clYs):
                        pyopencl_blas.gemv(queue, A, X, Y)
                    queue.finish()
                else:
                    # use multiple parallel queues
                    events = []
                    for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                        q = queues[i % len(queues)]
                        e = pyopencl_blas.gemv(q, A, X, Y)
                        events.append(e)
                    for q in queues:
                        q.flush()
                    cl.wait_for_events(events)

            print("clBLAS: %0.3f" % timer.duration)
        finally:
            # Always release the BLAS library state acquired by setup(),
            # even if the benchmark above raises.
            pyopencl_blas.teardown()