def blas(self):
    """Return the cuBLAS handle bound to the calling thread.

    A separate handle is lazily created per thread (keyed by thread
    ident) because a handle must not be shared across threads; created
    handles are cached in ``self._blas_`` for reuse.
    """
    thread_id = current_thread().ident
    try:
        return self._blas_[thread_id]
    except KeyError:
        handle = cublas.CUBLAS(self.context)
        self._blas_[thread_id] = handle
        return handle
def setUp(self):
    """Prepare a CUDA context and cuBLAS handle for each test.

    Ensures ``CUDA_DEVICE`` is set (defaulting to device 0), creates a
    context and a CUBLAS instance bound to it, and records the directory
    of this test module in ``self.path``.
    """
    logging.basicConfig(level=logging.DEBUG)
    # Remember the prior value so a tearDown elsewhere can restore it;
    # None means the variable was not set at all.
    self.old_env = os.environ.get("CUDA_DEVICE")
    if self.old_env is None:
        os.environ["CUDA_DEVICE"] = "0"
    self.ctx = cu.Devices().create_some_context()
    self.blas = blas.CUBLAS(self.ctx)
    # os.path.dirname(__file__) is "" when run from the module's own
    # directory; fall back to "." so path concatenation still works.
    # (Idiomatic emptiness test instead of `if not len(...)`.)
    self.path = os.path.dirname(__file__) or "."
def _test_good(self):
    """Allocate several CUDA resources tied to one context and check it stays alive."""
    ctx = cu.Devices().create_some_context()
    first = Container()
    first.ctx = ctx
    second = Container()
    second.mem = cu.MemAlloc(ctx, 4096)
    second.module = cu.Module(ctx, source="""
        __global__ void test(float *a) {
          a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f;
        }""")
    second.blas = blas.CUBLAS(ctx)
    logging.debug("Remaining context count: %d", cu.Context.context_count)
    # self.assertEqual(cu.Context.context_count, 1)
    # Keeps a live reference so ctx is not collected before this point.
    self.assertIsNotNone(ctx)
def test_kernel(self):
    """Benchmark sgemm through three call paths and validate against numpy.

    Timed paths: (1) a device-side kernel using an externally created
    cuBLAS handle, (2) a device-side kernel managing its own handle
    ("test_full"), and (3) host-side calls through the shared-library
    cuBLAS wrapper.  After each path the device result is copied back
    and compared element-wise against a numpy reference product.
    Skips (returns early) on devices below compute capability 3.5,
    which device-side cuBLAS calls require.
    """
    path = os.path.dirname(__file__)
    if not len(path):
        path = "."
    with cu.Devices().create_some_context() as ctx:
        cap = ctx.device.compute_capability
        if cap < (3, 5):
            return
        logging.info("Compiling...")
        m = ctx.create_module(
            source_file=(path + "/cublas_perf.cu"),
            nvcc_options2=cu.Module.OPTIONS_CUBLAS,
            compute_capability=(cap[0], 0) if cap >= (6, 0) else cap)
        # minor version of compute has to be set to 0
        # to work on Pascal with CUDA 8.0
        create_cublas = m.create_function("create_cublas")
        destroy_cublas = m.create_function("destroy_cublas")
        test = m.create_function("test")
        test_full = m.create_function("test_full")
        dummy = m.create_function("dummy")
        logging.info("Done")
        # Create a cuBLAS handle on the device; the handle value is
        # written into device memory and copied back into `blas`.
        blas = numpy.zeros(1, dtype=numpy.uint64)
        blas_ = ctx.mem_alloc(blas)
        create_cublas.set_args(blas_)
        create_cublas((1, 1, 1))
        blas_.to_host(blas)
        n = 256
        # Device-resident scalars for cuBLAS alpha / beta.
        one_ = ctx.mem_alloc(numpy.ones(1, dtype=numpy.float32))
        zero_ = ctx.mem_alloc(numpy.zeros(1, dtype=numpy.float32))
        a = numpy.random.rand(n, n).astype(numpy.float32)
        b = numpy.random.rand(n, n).astype(numpy.float32)
        c = numpy.zeros_like(a)
        # Row-major view of the column-major product cuBLAS computes:
        # (a.T @ b.T).T equals the expected device result read back
        # into a row-major numpy array.
        c_gold = numpy.dot(a.transpose(), b.transpose()).transpose()
        a_ = ctx.mem_alloc(a)
        b_ = ctx.mem_alloc(b)
        c_ = ctx.mem_alloc(c)
        N = 10000  # iterations per timed section
        # Path 1: device kernel reusing the externally created handle.
        test.set_args(numpy.array([N], dtype=numpy.int64), blas,
                      numpy.array([n], dtype=numpy.int64),
                      one_, a_, b_, zero_, c_)
        ctx.synchronize()
        t0 = time.time()
        test((1, 1, 1))
        ctx.synchronize()  # ensure the kernel finished before stopping the clock
        dt = time.time() - t0
        logging.info("With external blas handle completed in %.6f sec", dt)
        destroy_cublas.set_args(blas)
        destroy_cublas((1, 1, 1))
        c_.to_host(c)
        max_diff = numpy.fabs(c - c_gold).max()
        logging.info("max_diff = %.6f", max_diff)
        self.assertLess(max_diff, 1.0e-3)
        # Path 2: device kernel that creates/destroys its own handle.
        test_full.set_args(numpy.array([N], dtype=numpy.int64),
                           numpy.array([n], dtype=numpy.int64),
                           one_, a_, b_, zero_, c_)
        ctx.synchronize()
        t0 = time.time()
        test_full((1, 1, 1))
        ctx.synchronize()
        dt = time.time() - t0
        logging.info("With local blas handle completed in %.6f sec", dt)
        c_.to_host(c)
        max_diff = numpy.fabs(c - c_gold).max()
        logging.info("max_diff = %.6f", max_diff)
        self.assertLess(max_diff, 1.0e-3)
        # Path 3: host-side sgemm calls via the shared-library wrapper.
        # Note: rebinds the name `blas` from the handle array to a
        # CUBLAS instance.
        blas = cublas.CUBLAS(ctx)
        ctx.synchronize()
        one = numpy.ones(1, dtype=numpy.float32)
        zero = numpy.zeros(1, dtype=numpy.float32)
        dummy.set_args(c_, c_)
        t0 = time.time()
        for i in range(N):
            # Operand order alternates each iteration (presumably to
            # defeat caching in the benchmark); the final iteration
            # (i odd) computes the order that matches c_gold, and c_
            # is fully overwritten each call.
            blas.sgemm(
                cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
                n, n, n, one, a_ if i & 1 else b_, b_ if i & 1 else a_,
                zero, c_, n, n, n)
            dummy((1, 1, 1))  # interleave with some dummy kernel
        ctx.synchronize()
        dt = time.time() - t0
        logging.info("With shared library blas completed in %.6f sec", dt)
        c_.to_host(c)
        max_diff = numpy.fabs(c - c_gold).max()
        logging.info("max_diff = %.6f", max_diff)
        self.assertLess(max_diff, 1.0e-3)
        logging.info("Succeeded")