Example 1
    def test_native_cast(self):
        float32_ptx, _ = cuda.compile_ptx(native_cast, (float32, ),
                                          device=True)
        self.assertIn("st.f32", float32_ptx)

        float16_ptx, _ = cuda.compile_ptx(native_cast, (float16, ),
                                          device=True)
        self.assertIn("st.u16", float16_ptx)
Example 2
def test_compile_arith_masked_vs_constant(op, ty, constant):
    def func(x):
        return op(x, constant)

    cc = (7, 5)
    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)

    assert isinstance(resty, MaskedType)

    # Check that the masked result's value type matches the unmasked
    # result type
    um_ptx, um_resty = compile_ptx(func, (ty,), cc=cc, device=True)
    assert resty.value_type == um_resty
Example 3
    def test_kernel_with_debug(self):
        # Inspired by (but not originally affected by) Issue #6719
        def f():
            pass

        ptx, resty = compile_ptx(f, [], debug=True)
        self.check_debug_info(ptx)
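check_debug_info is a helper on the test class and is not part of this excerpt. A plausible sketch of such a check, assuming debug builds embed DWARF sections in the PTX (the assertion below is an assumption, not the actual helper):

    def check_debug_info(self, ptx):
        # Debug-compiled PTX carries DWARF debug sections; looking for the
        # .debug_info section header is one way to detect them (assumed
        # check, see above).
        self.assertRegex(ptx, r'\.section\s+\.debug_info')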
Example 4
    def test_float16_to_float_ptx(self):
        pyfuncs = (to_float32, to_float64)
        postfixes = ("f32", "f64")

        for pyfunc, postfix in zip(pyfuncs, postfixes):
            ptx, _ = compile_ptx(pyfunc, [f2], device=True)
            self.assertIn(f"cvt.{postfix}.f16", ptx)
Example 5
def test_generic_ptx(dtype):

    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column

    rhs_arr = np.random.random(size).astype(dtype)
    rhs_col = Series(rhs_arr)._column

    def generic_function(a, b):
        return a ** 3 + b

    nb_type = numpy_support.from_dtype(cudf.dtype(dtype))
    type_signature = (nb_type, nb_type)

    ptx_code, output_type = compile_ptx(
        generic_function, type_signature, device=True
    )

    dtype = numpy_support.as_dtype(output_type).type

    out_col = libcudf.binaryop.binaryop_udf(lhs_col, rhs_col, ptx_code, dtype)

    result = lhs_arr ** 3 + rhs_arr

    np.testing.assert_almost_equal(result, out_col.to_array())
Example 6
    def test_float16_to_uint_ptx(self):
        pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64)
        sizes = (8, 16, 32, 64)

        for pyfunc, size in zip(pyfuncs, sizes):
            ptx, _ = compile_ptx(pyfunc, [f2], device=True)
            self.assertIn(f"cvt.rni.u{size}.f16", ptx)
Example 7
    def test_uint_to_float16_ptx(self):
        fromtys = (u1, u2, u4, u8)
        sizes = (8, 16, 32, 64)

        for ty, size in zip(fromtys, sizes):
            ptx, _ = compile_ptx(to_float16, [ty], device=True)
            self.assertIn(f"cvt.rn.f16.u{size}", ptx)
Example 8
    def test_mixed_fp16_comparison_promotion_ptx(self):
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)

        types_promote = (np.int16, np.int32, np.int64,
                         np.float32, np.float64)
        opstring = {operator.gt: 'setp.gt.',
                    operator.ge: 'setp.ge.',
                    operator.lt: 'setp.lt.',
                    operator.le: 'setp.le.',
                    operator.eq: 'setp.eq.',
                    operator.ne: 'setp.neu.'}
        opsuffix = {np.dtype('int32'): 'f64',
                    np.dtype('int64'): 'f64',
                    np.dtype('float32'): 'f32',
                    np.dtype('float64'): 'f64'}

        for (fn, op), ty in itertools.product(zip(functions, ops),
                                              types_promote):
            with self.subTest(op=op, ty=ty):
                arg2_ty = np.result_type(np.float16, ty)
                args = (b1[:], f2, from_dtype(arg2_ty))
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))

                expected = opstring[op] + opsuffix[arg2_ty]
                self.assertIn(expected, ptx)
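The opsuffix table mirrors NumPy's promotion rules for float16: pairing it with int16 gives float32, while int32 and int64 promote all the way to float64, so the comparison is emitted on f64 operands for the wider integer types. A quick standalone check of that premise:

import numpy as np

# NumPy promotion of float16 against the other operand types used above.
print(np.result_type(np.float16, np.int16))    # float32
print(np.result_type(np.float16, np.int32))    # float64
print(np.result_type(np.float16, np.int64))    # float64
print(np.result_type(np.float16, np.float32))  # float32
print(np.result_type(np.float16, np.float64))  # float64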
Example 9
    def test_habs_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3))
        if cuda.runtime.get_version() < (10, 2):
            self.assertRegex(ptx, r'and\.b16.*0x7FFF;')
        else:
            self.assertIn('abs.f16', ptx)
Example 10
    def test_device_function_with_debug(self):
        # See Issue #6719
        def f():
            pass

        ptx, resty = compile_ptx(f, [], device=True, debug=True)
        self.check_debug_info(ptx)
Example 11
def test_compile_arith_constant_vs_masked(op, ty, constant):
    def func(x):
        return op(constant, x)

    cc = (7, 5)
    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)

    assert isinstance(resty, MaskedType)
Example 12
def test_compile_arith_masked_vs_na(op, ty):
    def func(x):
        return op(x, NA)

    cc = (7, 5)
    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)

    assert isinstance(resty, MaskedType)
Example 13
    def test_fastmath(self):
        def f(x, y, z, d):
            return sqrt((x * y + z) / d)

        args = (float32, float32, float32, float32)
        ptx, resty = compile_ptx(f, args, device=True)

        # Without fastmath, fma contraction is enabled by default, but ftz
        # and approximate div / sqrt are not.
        self.assertIn('fma.rn.f32', ptx)
        self.assertIn('div.rn.f32', ptx)
        self.assertIn('sqrt.rn.f32', ptx)

        ptx, resty = compile_ptx(f, args, device=True, fastmath=True)

        # With fastmath, ftz and approximate div / sqrt are enabled
        self.assertIn('fma.rn.ftz.f32', ptx)
        self.assertIn('div.approx.ftz.f32', ptx)
        self.assertIn('sqrt.approx.ftz.f32', ptx)
Example 14
def test_compile_arith_masked_ops(op, ty1, ty2, masked):
    def func(x, y):
        return op(x, y)

    cc = (7, 5)

    if masked[0]:
        ty1 = MaskedType(ty1)
    if masked[1]:
        ty2 = MaskedType(ty2)

    ptx, resty = compile_ptx(func, (ty1, ty2), cc=cc, device=True)
Example 15
    def test_device_function_with_debug(self):
        # See Issue #6719 - this ensures that compilation with debug succeeds
        # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
        # IR version metadata was not added when compiling device functions,
        # and NVVM assumed DBG version 1.0 if not specified, which is
        # incompatible with the 3.0 IR we use. This was specified only for
        # kernels.
        def f():
            pass

        ptx, resty = compile_ptx(f, [], device=True, debug=True)
        self.check_debug_info(ptx)
Example 16
    def test_fp16_comparison_ptx(self):
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)
        opstring = ('setp.gt.f16', 'setp.ge.f16',
                    'setp.lt.f16', 'setp.le.f16',
                    'setp.eq.f16', 'setp.ne.f16')
        args = (b1[:], f2, f2)

        for fn, op, s in zip(functions, ops, opstring):
            with self.subTest(op=op):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(s, ptx)
Example 17
def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked):
    def func(x, y):
        return op(x, y)

    cc = (7, 5)

    ty1 = from_dtype(np.dtype(left_dtype))
    ty2 = from_dtype(np.dtype(right_dtype))

    if masked[0]:
        ty1 = MaskedType(ty1)
    if masked[1]:
        ty2 = MaskedType(ty2)

    ptx, resty = compile_ptx(func, (ty1, ty2), cc=cc, device=True)
Example 18
    def test_device_function(self):
        def add(x, y):
            return x + y

        args = (float32, float32)
        ptx, resty = compile_ptx(add, args, device=True)

        # Device functions take a func_retval parameter used to store the
        # return value by reference
        self.assertIn('func_retval', ptx)
        # .visible .func is used to denote a device function
        self.assertIn('.visible .func', ptx)
        # .visible .entry would denote the presence of a global function
        self.assertNotIn('.visible .entry', ptx)
        # Inferred return type as expected?
        self.assertEqual(resty, float32)
Example 19
    def test_global_kernel(self):
        def f(r, x, y):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = x[i] + y[i]

        args = (float32[:], float32[:], float32[:])
        ptx, resty = compile_ptx(f, args)

        # Kernels should not have a func_retval parameter
        self.assertNotIn('func_retval', ptx)
        # .visible .func would denote a device function, so it should be
        # absent
        self.assertNotIn('.visible .func', ptx)
        # .visible .entry denotes a global (kernel) function
        self.assertIn('.visible .entry', ptx)
        # Return type for kernels should always be void
        self.assertEqual(resty, void)
Example 20
    def test_nanosleep(self):
        def use_nanosleep(x):
            # Sleep for a constant time
            cuda.nanosleep(32)
            # Sleep for a variable time
            cuda.nanosleep(x)

        ptx, resty = compile_ptx(use_nanosleep, (uint32, ), cc=(7, 0))

        nanosleep_count = 0
        for line in ptx.split('\n'):
            if 'nanosleep.u32' in line:
                nanosleep_count += 1

        expected = 2
        self.assertEqual(expected, nanosleep_count,
                         (f'Got {nanosleep_count} nanosleep instructions, '
                          f'expected {expected}'))
Example 21
    def test_fp16_int8_comparison_ptx(self):
        # Test that int8 can be safely converted to fp16
        # in a comparison
        functions = (simple_fp16_gt, simple_fp16_ge,
                     simple_fp16_lt, simple_fp16_le,
                     simple_fp16_eq, simple_fp16_ne)
        ops = (operator.gt, operator.ge, operator.lt, operator.le,
               operator.eq, operator.ne)

        opstring = {operator.gt: 'setp.gt.f16',
                    operator.ge: 'setp.ge.f16',
                    operator.lt: 'setp.lt.f16',
                    operator.le: 'setp.le.f16',
                    operator.eq: 'setp.eq.f16',
                    operator.ne: 'setp.ne.f16'}
        for fn, op in zip(functions, ops):
            with self.subTest(op=op):
                args = (b1[:], f2, from_dtype(np.int8))
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(opstring[op], ptx)
Example 22
def test_compile_arith_na_vs_masked(op, ty):
    def func(x):
        return op(NA, x)

    cc = (7, 5)
    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
Example 23
def test_compile_masked_unary(op, ty):
    def func(x):
        return op(x)

    cc = (7, 5)
    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
Example 24
    def _test_call_functions(self):
        # Note: libname, functions, function_template and create_signature
        # come from the enclosing module and are not part of this excerpt.
        # Strip off '__nv_' from libdevice name to get Python name
        apiname = libname[5:]
        apifunc = getattr(libdevice, apiname)
        retty, args = functions[libname]
        sig = create_signature(retty, args)

        # Construct arguments to the libdevice function. These are all
        # non-pointer arguments to the underlying bitcode function.
        funcargs = ", ".join(['a%d' % i for i, arg in enumerate(args) if not
                              arg.is_ptr])

        # Arguments to the Python function (`pyfunc` in the template above) are
        # the arguments to the libdevice function, plus as many extra arguments
        # as there are in the return type of the libdevice function - one for
        # scalar-valued returns, or the length of the tuple for tuple-valued
        # returns.
        if isinstance(sig.return_type, (types.Tuple, types.UniTuple)):
            # Start with the parameters for the return values
            pyargs = ", ".join(['r%d' % i for i in
                                range(len(sig.return_type))])
            # Add the parameters for the argument values
            pyargs += ", " + funcargs
            # Generate the unpacking of the return value from the libdevice
            # function into the Python function return values (`r0`, `r1`,
            # etc.).
            retvars = ", ".join(['r%d[0]' % i for i in
                                 range(len(sig.return_type))])
        else:
            # Scalar return is a more straightforward case
            pyargs = "r0, " + funcargs
            retvars = "r0[0]"

        # Create the string containing the function to compile
        d = {'func': apiname,
             'pyargs': pyargs,
             'funcargs': funcargs,
             'retvars': retvars}
        code = function_template % d

        # Convert the string to a Python function
        scope = {}  # avoid shadowing the locals() builtin
        exec(code, globals(), scope)
        pyfunc = scope['pyfunc']

        # Compute the signature for compilation. This mirrors the creation of
        # arguments to the Python function above.
        pyargs = [arg.ty for arg in args if not arg.is_ptr]
        if isinstance(sig.return_type, (types.Tuple, types.UniTuple)):
            pyreturns = [ret[::1] for ret in sig.return_type]
            pyargs = pyreturns + pyargs
        else:
            pyargs.insert(0, sig.return_type[::1])

        ptx, resty = compile_ptx(pyfunc, pyargs)

        # If the function body was discarded by optimization (therefore making
        # the test a bit weak), there won't be any loading of parameters -
        # ensure that a load from parameters occurs somewhere in the PTX
        self.assertIn('ld.param', ptx)

        # Returning the result (through a passed-in array) should also require
        # a store to global memory, so check for at least one of those too.
        self.assertIn('st.global', ptx)
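function_template itself is defined outside this excerpt. Judging from the pyargs, funcargs and retvars pieces assembled above, and the % substitution applied to it, a plausible reconstruction (an assumption, not the original template) is:

# Hypothetical reconstruction of function_template (see above): the
# generated pyfunc calls the libdevice binding and writes the results
# into the passed-in return arrays.
function_template = """\
def pyfunc(%(pyargs)s):
    ret = libdevice.%(func)s(%(funcargs)s)
    %(retvars)s = ret
"""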
Example 25
    def test_device_function_with_line_info(self):
        def f():
            pass

        ptx, resty = compile_ptx(f, [], device=True, lineinfo=True)
        self.check_line_info(ptx)
Example 26
    def test_kernel_with_line_info(self):
        def f():
            pass

        ptx, resty = compile_ptx(f, [], lineinfo=True)
        self.check_line_info(ptx)
Example 27
    def test_hfma_ptx(self):
        args = (f2[:], f2, f2, f2)
        ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3))
        self.assertIn('fma.rn.f16', ptx)
Example 28
    def test_hneg_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3))
        self.assertIn('neg.f16', ptx)