Exemple #1
0
 def blas(self):
     tid = current_thread().ident
     blas = self._blas_.get(tid)
     if blas is None:
         blas = cublas.CUBLAS(self.context)
         self._blas_[tid] = blas
     return blas
Exemple #2
0
 def setUp(self):
     logging.basicConfig(level=logging.DEBUG)
     self.old_env = os.environ.get("CUDA_DEVICE")
     if self.old_env is None:
         os.environ["CUDA_DEVICE"] = "0"
     self.ctx = cu.Devices().create_some_context()
     self.blas = blas.CUBLAS(self.ctx)
     self.path = os.path.dirname(__file__)
     if not len(self.path):
         self.path = "."
Exemple #3
0
    def _test_good(self):
        ctx = cu.Devices().create_some_context()
        a = Container()
        a.ctx = ctx
        b = Container()
        b.mem = cu.MemAlloc(ctx, 4096)
        b.module = cu.Module(ctx, source="""
            __global__ void test(float *a) {
                a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f;
            }""")
        b.blas = blas.CUBLAS(ctx)

        logging.debug("Remaining context count: %d", cu.Context.context_count)
        # self.assertEqual(cu.Context.context_count, 1)
        self.assertIsNotNone(ctx)  # to hold ctx up to this point
Exemple #4
0
    def test_kernel(self):
        path = os.path.dirname(__file__)
        if not len(path):
            path = "."
        with cu.Devices().create_some_context() as ctx:
            cap = ctx.device.compute_capability
            if cap < (3, 5):
                return

            logging.info("Compiling...")
            m = ctx.create_module(source_file=(path + "/cublas_perf.cu"),
                                  nvcc_options2=cu.Module.OPTIONS_CUBLAS,
                                  compute_capability=(cap[0], 0) if cap >=
                                  (6, 0) else cap)
            # minor version of compute has to be set to 0
            # to work on Pascal with CUDA 8.0

            create_cublas = m.create_function("create_cublas")
            destroy_cublas = m.create_function("destroy_cublas")
            test = m.create_function("test")
            test_full = m.create_function("test_full")
            dummy = m.create_function("dummy")
            logging.info("Done")

            blas = numpy.zeros(1, dtype=numpy.uint64)
            blas_ = ctx.mem_alloc(blas)
            create_cublas.set_args(blas_)
            create_cublas((1, 1, 1))
            blas_.to_host(blas)

            n = 256
            one_ = ctx.mem_alloc(numpy.ones(1, dtype=numpy.float32))
            zero_ = ctx.mem_alloc(numpy.zeros(1, dtype=numpy.float32))

            a = numpy.random.rand(n, n).astype(numpy.float32)
            b = numpy.random.rand(n, n).astype(numpy.float32)
            c = numpy.zeros_like(a)

            c_gold = numpy.dot(a.transpose(), b.transpose()).transpose()

            a_ = ctx.mem_alloc(a)
            b_ = ctx.mem_alloc(b)
            c_ = ctx.mem_alloc(c)

            N = 10000

            test.set_args(numpy.array([N], dtype=numpy.int64), blas,
                          numpy.array([n], dtype=numpy.int64), one_, a_, b_,
                          zero_, c_)
            ctx.synchronize()
            t0 = time.time()
            test((1, 1, 1))
            ctx.synchronize()
            dt = time.time() - t0
            logging.info("With external blas handle completed in %.6f sec", dt)

            destroy_cublas.set_args(blas)
            destroy_cublas((1, 1, 1))

            c_.to_host(c)
            max_diff = numpy.fabs(c - c_gold).max()
            logging.info("max_diff = %.6f", max_diff)
            self.assertLess(max_diff, 1.0e-3)

            test_full.set_args(numpy.array([N], dtype=numpy.int64),
                               numpy.array([n], dtype=numpy.int64), one_, a_,
                               b_, zero_, c_)
            ctx.synchronize()
            t0 = time.time()
            test_full((1, 1, 1))
            ctx.synchronize()
            dt = time.time() - t0
            logging.info("With local blas handle completed in %.6f sec", dt)

            c_.to_host(c)
            max_diff = numpy.fabs(c - c_gold).max()
            logging.info("max_diff = %.6f", max_diff)
            self.assertLess(max_diff, 1.0e-3)

            blas = cublas.CUBLAS(ctx)
            ctx.synchronize()
            one = numpy.ones(1, dtype=numpy.float32)
            zero = numpy.zeros(1, dtype=numpy.float32)
            dummy.set_args(c_, c_)
            t0 = time.time()
            for i in range(N):
                blas.sgemm(cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, n, n,
                           one, a_ if i & 1 else b_, b_ if i & 1 else a_, zero,
                           c_, n, n, n)
                dummy((1, 1, 1))  # interleave with some dummy kernel
            ctx.synchronize()
            dt = time.time() - t0
            logging.info("With shared library blas completed in %.6f sec", dt)

            c_.to_host(c)
            max_diff = numpy.fabs(c - c_gold).max()
            logging.info("max_diff = %.6f", max_diff)
            self.assertLess(max_diff, 1.0e-3)

        logging.info("Succeeded")