def test_native_cast(self):
    """Compile ``native_cast`` as a device function for two float widths.

    The PTX generated for a ``float32`` argument must contain ``st.f32``
    and the PTX generated for a ``float16`` argument must contain
    ``st.u16``.
    """
    expectations = ((float32, "st.f32"), (float16, "st.u16"))
    for arg_type, store_instruction in expectations:
        generated, _ = cuda.compile_ptx(native_cast, (arg_type, ),
                                        device=True)
        self.assertIn(store_instruction, generated)
def test_compile_arith_masked_vs_constant(op, ty, constant):
    """Compile ``op(masked, constant)`` and compare its typing to the plain case.

    ``op``, ``ty`` and ``constant`` are presumably supplied by test
    parametrization that is not visible from this function.
    """
    def func(x):
        return op(x, constant)

    compile_options = {"cc": (7, 5), "device": True}

    _, masked_result = compile_ptx(func, (MaskedType(ty),), **compile_options)
    assert isinstance(masked_result, MaskedType)

    # Check that the masked typing matches that of the unmasked typing
    _, plain_result = compile_ptx(func, (ty,), **compile_options)
    assert masked_result.value_type == plain_result
def test_kernel_with_debug(self):
    """Compile an empty kernel with ``debug=True`` and check its debug info."""
    # Inspired by (but not originally affected by) Issue #6719
    def f():
        pass

    generated, _ = compile_ptx(f, [], debug=True)
    self.check_debug_info(generated)
def test_float16_to_float_ptx(self):
    """Check the conversion instruction emitted for float16 -> float32/64.

    Each conversion function is compiled as a device function taking a
    single ``f2`` argument, and the PTX must contain
    ``cvt.<postfix>.f16`` for the matching target postfix.
    """
    cases = ((to_float32, "f32"), (to_float64, "f64"))

    for pyfunc, postfix in cases:
        # Run each target type as its own sub-test so that one failing
        # conversion is reported by name and does not hide the other.
        with self.subTest(postfix=postfix):
            ptx, _ = compile_ptx(pyfunc, [f2], device=True)
            self.assertIn(f"cvt.{postfix}.f16", ptx)
def test_generic_ptx(dtype):
    """Compile ``a ** 3 + b`` to PTX and run it as a binary-op UDF.

    Two random arrays of ``dtype`` are wrapped in columns, combined on the
    device through ``libcudf.binaryop.binaryop_udf`` using the compiled
    PTX, and compared against the same expression evaluated with NumPy.
    ``dtype`` is presumably supplied by test parametrization that is not
    visible from this function.
    """
    n_elements = 500

    left_values = np.random.random(n_elements).astype(dtype)
    left_column = Series(left_values)._column

    right_values = np.random.random(n_elements).astype(dtype)
    right_column = Series(right_values)._column

    def generic_function(a, b):
        return a ** 3 + b

    numba_type = numpy_support.from_dtype(cudf.dtype(dtype))
    ptx_code, output_type = compile_ptx(
        generic_function, (numba_type, numba_type), device=True
    )

    # The output dtype comes from the return type inferred by the compiler,
    # not from the input dtype.
    result_dtype = numpy_support.as_dtype(output_type).type

    device_result = libcudf.binaryop.binaryop_udf(
        left_column, right_column, ptx_code, result_dtype
    )

    host_result = left_values ** 3 + right_values

    np.testing.assert_almost_equal(host_result, device_result.to_array())
def test_float16_to_uint_ptx(self):
    """Check the conversion instruction emitted for float16 -> unsigned int.

    Each conversion function is compiled as a device function taking a
    single ``f2`` argument, and the PTX must contain
    ``cvt.rni.u<size>.f16`` for the matching integer width.
    """
    cases = ((to_uint8, 8), (to_uint16, 16), (to_uint32, 32), (to_uint64, 64))

    for pyfunc, size in cases:
        # Run each width as its own sub-test so that one failing width is
        # reported by name and the remaining widths are still exercised.
        with self.subTest(size=size):
            ptx, _ = compile_ptx(pyfunc, [f2], device=True)
            self.assertIn(f"cvt.rni.u{size}.f16", ptx)
def test_uint_to_float16_ptx(self):
    """Check the conversion instruction emitted for unsigned int -> float16.

    ``to_float16`` is compiled as a device function once per source type,
    and the PTX must contain ``cvt.rn.f16.u<size>`` for the matching
    integer width.
    """
    cases = ((u1, 8), (u2, 16), (u4, 32), (u8, 64))

    for ty, size in cases:
        # Run each width as its own sub-test so that one failing width is
        # reported by name and the remaining widths are still exercised.
        with self.subTest(size=size):
            ptx, _ = compile_ptx(to_float16, [ty], device=True)
            self.assertIn(f"cvt.rn.f16.u{size}", ptx)
def test_mixed_fp16_comparison_promotion_ptx(self):
    """Check the compare instruction when float16 is mixed with another type.

    For every comparison function and every type in ``types_promote``,
    the second operand type is ``np.result_type(np.float16, ty)``. The
    kernel is compiled for compute capability 5.3 and the PTX must contain
    the ``setp`` prefix for the operator followed by the suffix for that
    promoted type.
    """
    functions = (simple_fp16_gt, simple_fp16_ge, simple_fp16_lt,
                 simple_fp16_le, simple_fp16_eq, simple_fp16_ne)
    ops = (operator.gt, operator.ge, operator.lt, operator.le,
           operator.eq, operator.ne)

    types_promote = (np.int16, np.int32, np.int64,
                     np.float32, np.float64)
    opstring = {operator.gt: 'setp.gt.',
                operator.ge: 'setp.ge.',
                operator.lt: 'setp.lt.',
                operator.le: 'setp.le.',
                operator.eq: 'setp.eq.',
                operator.ne: 'setp.neu.'}
    opsuffix = {np.dtype('int32'): 'f64',
                np.dtype('int64'): 'f64',
                np.dtype('float32'): 'f32',
                np.dtype('float64'): 'f64'}

    for (fn, op), ty in itertools.product(zip(functions, ops),
                                          types_promote):
        with self.subTest(op=op, ty=ty):
            arg2_ty = np.result_type(np.float16, ty)
            args = (b1[:], f2, from_dtype(arg2_ty))
            ptx, _ = compile_ptx(fn, args, cc=(5, 3))

            # Kept under its own name: rebinding `ops` here would shadow
            # the operator tuple that feeds the iteration above.
            expected_instruction = opstring[op] + opsuffix[arg2_ty]
            self.assertIn(expected_instruction, ptx)
def test_habs_ptx(self):
    """Check the PTX emitted for ``simple_habs_scalar`` at cc 5.3.

    The expected pattern depends on the CUDA runtime version: from 10.2
    onwards the PTX must contain ``abs.f16``; for earlier versions it must
    instead match an ``and.b16`` with the ``0x7FFF`` constant.
    """
    signature = (f2[:], f2)
    ptx, _ = compile_ptx(simple_habs_scalar, signature, cc=(5, 3))

    if cuda.runtime.get_version() >= (10, 2):
        self.assertIn('abs.f16', ptx)
    else:
        self.assertRegex(ptx, r'and\.b16.*0x7FFF;')
def test_device_function_with_debug(self):
    """Compile an empty device function with ``debug=True`` and check it."""
    # See Issue #6719
    def f():
        pass

    generated, _ = compile_ptx(f, [], device=True, debug=True)
    self.check_debug_info(generated)
def test_compile_arith_constant_vs_masked(op, ty, constant):
    """Compile ``op(constant, masked)`` and check the result is masked.

    ``op``, ``ty`` and ``constant`` are presumably supplied by test
    parametrization that is not visible from this function.
    """
    def func(x):
        return op(constant, x)

    signature = (MaskedType(ty),)
    _, result_type = compile_ptx(func, signature, cc=(7, 5), device=True)

    assert isinstance(result_type, MaskedType)
def test_compile_arith_masked_vs_na(op, ty):
    """Compile ``op(masked, NA)`` and check the result is masked.

    ``op`` and ``ty`` are presumably supplied by test parametrization that
    is not visible from this function.
    """
    def func(x):
        return op(x, NA)

    signature = (MaskedType(ty),)
    _, result_type = compile_ptx(func, signature, cc=(7, 5), device=True)

    assert isinstance(result_type, MaskedType)
def test_fastmath(self):
    """Compare the float32 instructions emitted with and without fastmath.

    The same device function is compiled twice; each compilation's PTX is
    checked for the fma, div and sqrt variants expected in that mode.
    """
    def f(x, y, z, d):
        return sqrt((x * y + z) / d)

    args = (float32,) * 4

    # Without fastmath, fma contraction is enabled by default, but ftz and
    # approximate div / sqrt is not.
    precise_ptx, _ = compile_ptx(f, args, device=True)
    for instruction in ('fma.rn.f32', 'div.rn.f32', 'sqrt.rn.f32'):
        self.assertIn(instruction, precise_ptx)

    # With fastmath, ftz and approximate div / sqrt are enabled
    fast_ptx, _ = compile_ptx(f, args, device=True, fastmath=True)
    for instruction in ('fma.rn.ftz.f32', 'div.approx.ftz.f32',
                        'sqrt.approx.ftz.f32'):
        self.assertIn(instruction, fast_ptx)
def test_compile_arith_masked_ops(op, ty1, ty2, masked):
    """Compile ``op(x, y)`` with either operand optionally masked.

    ``masked[0]`` and ``masked[1]`` select whether the first and second
    operand types are wrapped in ``MaskedType``. There is no assertion on
    the output; the test passes when compilation does not raise.
    """
    def func(x, y):
        return op(x, y)

    left_type = MaskedType(ty1) if masked[0] else ty1
    right_type = MaskedType(ty2) if masked[1] else ty2

    compile_ptx(func, (left_type, right_type), cc=(7, 5), device=True)
def test_device_function_with_debug(self):
    """Compile an empty device function with ``debug=True`` and check it."""
    # See Issue #6719 - this ensures that compilation with debug succeeds
    # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
    # IR version metadata was not added when compiling device functions,
    # and NVVM assumed DBG version 1.0 if not specified, which is
    # incompatible with the 3.0 IR we use. This was specified only for
    # kernels.
    def f():
        pass

    generated, _ = compile_ptx(f, [], device=True, debug=True)
    self.check_debug_info(generated)
def test_fp16_comparison_ptx(self):
    """Check the compare instruction for float16-vs-float16 comparisons.

    Each comparison function is compiled for compute capability 5.3 with
    two ``f2`` operands, and the PTX must contain the matching
    ``setp.<op>.f16`` instruction.
    """
    cases = (
        (simple_fp16_gt, operator.gt, 'setp.gt.f16'),
        (simple_fp16_ge, operator.ge, 'setp.ge.f16'),
        (simple_fp16_lt, operator.lt, 'setp.lt.f16'),
        (simple_fp16_le, operator.le, 'setp.le.f16'),
        (simple_fp16_eq, operator.eq, 'setp.eq.f16'),
        (simple_fp16_ne, operator.ne, 'setp.ne.f16'),
    )
    args = (b1[:], f2, f2)

    for fn, op, instruction in cases:
        with self.subTest(op=op):
            ptx, _ = compile_ptx(fn, args, cc=(5, 3))
            self.assertIn(instruction, ptx)
def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked):
    """Compile ``op(x, y)`` with either operand optionally masked.

    The operand types are derived from ``left_dtype`` and ``right_dtype``
    via ``from_dtype``; ``masked[0]`` and ``masked[1]`` select whether
    each one is wrapped in ``MaskedType``. There is no assertion on the
    output; the test passes when compilation does not raise.
    """
    def func(x, y):
        return op(x, y)

    left_type = from_dtype(np.dtype(left_dtype))
    right_type = from_dtype(np.dtype(right_dtype))

    if masked[0]:
        left_type = MaskedType(left_type)
    if masked[1]:
        right_type = MaskedType(right_type)

    compile_ptx(func, (left_type, right_type), cc=(7, 5), device=True)
def test_device_function(self):
    """Compile a float32 add as a device function and inspect the PTX.

    The PTX must look like a device function (``func_retval`` and
    ``.visible .func`` present, ``.visible .entry`` absent) and the
    inferred return type must be ``float32``.
    """
    def add(x, y):
        return x + y

    signature = (float32, float32)
    generated, return_type = compile_ptx(add, signature, device=True)

    # Device functions take a func_retval parameter for storing the
    # returned value in by reference
    self.assertIn('func_retval', generated)
    # .visible .func is used to denote a device function
    self.assertIn('.visible .func', generated)
    # .visible .entry would denote the presence of a global function
    self.assertNotIn('.visible .entry', generated)
    # Inferred return type as expected?
    self.assertEqual(return_type, float32)
def test_global_kernel(self):
    """Compile an element-wise add as a kernel and inspect the PTX.

    The PTX must look like a kernel (``.visible .entry`` present,
    ``func_retval`` and ``.visible .func`` absent) and the reported return
    type must be ``void``.
    """
    def f(r, x, y):
        i = cuda.grid(1)
        if i < len(r):
            r[i] = x[i] + y[i]

    signature = (float32[:], float32[:], float32[:])
    generated, return_type = compile_ptx(f, signature)

    # Kernels should not have a func_retval parameter
    self.assertNotIn('func_retval', generated)
    # .visible .func is used to denote a device function
    self.assertNotIn('.visible .func', generated)
    # .visible .entry would denote the presence of a global function
    self.assertIn('.visible .entry', generated)
    # Return type for kernels should always be void
    self.assertEqual(return_type, void)
def test_nanosleep(self):
    """Check that both ``cuda.nanosleep`` calls survive into the PTX.

    The kernel sleeps once for a constant and once for a variable time;
    exactly two PTX lines are expected to mention ``nanosleep.u32``.
    """
    def use_nanosleep(x):
        # Sleep for a constant time
        cuda.nanosleep(32)
        # Sleep for a variable time
        cuda.nanosleep(x)

    ptx, _ = compile_ptx(use_nanosleep, (uint32, ), cc=(7, 0))

    # Count lines, not occurrences: a line mentioning the instruction more
    # than once still contributes one.
    nanosleep_count = sum(
        1 for line in ptx.split('\n') if 'nanosleep.u32' in line
    )
    expected = 2

    self.assertEqual(expected, nanosleep_count,
                     (f'Got {nanosleep_count} nanosleep instructions, '
                      f'expected {expected}'))
def test_fp16_int8_comparison_ptx(self):
    """Check the compare instruction for float16-vs-int8 comparisons.

    Each comparison function is compiled for compute capability 5.3 with
    an ``f2`` operand and an int8 operand, and the PTX must contain the
    matching ``setp.<op>.f16`` instruction.
    """
    # Test that int8 can be safely converted to fp16
    # in a comparison
    cases = (
        (simple_fp16_gt, operator.gt, 'setp.gt.f16'),
        (simple_fp16_ge, operator.ge, 'setp.ge.f16'),
        (simple_fp16_lt, operator.lt, 'setp.lt.f16'),
        (simple_fp16_le, operator.le, 'setp.le.f16'),
        (simple_fp16_eq, operator.eq, 'setp.eq.f16'),
        (simple_fp16_ne, operator.ne, 'setp.ne.f16'),
    )

    for fn, op, instruction in cases:
        with self.subTest(op=op):
            signature = (b1[:], f2, from_dtype(np.int8))
            ptx, _ = compile_ptx(fn, signature, cc=(5, 3))
            self.assertIn(instruction, ptx)
def test_compile_arith_na_vs_masked(op, ty):
    """Compile ``op(NA, masked)`` as a device function.

    There is no assertion on the output; the test passes when compilation
    does not raise.
    """
    def func(x):
        return op(NA, x)

    signature = (MaskedType(ty),)
    compile_ptx(func, signature, cc=(7, 5), device=True)
def test_compile_masked_unary(op, ty):
    """Compile the unary ``op(masked)`` as a device function.

    There is no assertion on the output; the test passes when compilation
    does not raise.
    """
    def func(x):
        return op(x)

    signature = (MaskedType(ty),)
    compile_ptx(func, signature, cc=(7, 5), device=True)
def _test_call_functions(self):
    """Generate, compile and sanity-check a wrapper for one libdevice function.

    ``libname`` and ``functions`` are not defined here and are resolved
    from an enclosing scope. Python source is produced from
    ``function_template``, executed to obtain ``pyfunc``, compiled with
    ``compile_ptx``, and the PTX is checked for a parameter load and a
    global store.
    """
    # Strip off '__nv_' from libdevice name to get Python name
    apiname = libname[5:]
    # The looked-up object is not used afterwards; the lookup itself raises
    # AttributeError when `libdevice` has no attribute of that name.
    apifunc = getattr(libdevice, apiname)
    retty, args = functions[libname]
    sig = create_signature(retty, args)

    # Construct arguments to the libdevice function. These are all
    # non-pointer arguments to the underlying bitcode function. The index
    # in each name is the position in the full `args` sequence.
    funcargs = ", ".join(
        [f"a{i}" for i, arg in enumerate(args) if not arg.is_ptr]
    )

    returns_tuple = isinstance(sig.return_type,
                               (types.Tuple, types.UniTuple))

    # Arguments to the Python function (`pyfunc` in the template) are the
    # arguments to the libdevice function, plus as many extra arguments as
    # there are in the return type of the libdevice function - one for
    # scalar-valued returns, or the length of the tuple for tuple-valued
    # returns.
    if returns_tuple:
        return_indices = range(len(sig.return_type))
        # Parameters for the return values first, then the argument values
        param_source = ", ".join([f"r{i}" for i in return_indices])
        param_source += ", " + funcargs
        # Unpacking of the libdevice return value into the Python function
        # return values (`r0`, `r1`, etc.).
        retvars = ", ".join([f"r{i}[0]" for i in return_indices])
    else:
        # Scalar return is a more straightforward case
        param_source = "r0, " + funcargs
        retvars = "r0[0]"

    # Create the string containing the function to compile
    substitutions = {
        'func': apiname,
        'pyargs': param_source,
        'funcargs': funcargs,
        'retvars': retvars,
    }
    code = function_template % substitutions

    # Convert the string to a Python function
    namespace = {}
    exec(code, globals(), namespace)
    pyfunc = namespace['pyfunc']

    # Compute the signature for compilation. This mirrors the creation of
    # arguments to the Python function above: return slots first, then the
    # non-pointer arguments.
    argument_types = [arg.ty for arg in args if not arg.is_ptr]
    if returns_tuple:
        # NOTE(review): `[::1]` presumably turns each return type into an
        # array type so the result can be written to `rN[0]` - confirm.
        return_slots = [ret[::1] for ret in sig.return_type]
    else:
        return_slots = [sig.return_type[::1]]
    signature = return_slots + argument_types

    ptx, _ = compile_ptx(pyfunc, signature)

    # If the function body was discarded by optimization (therefore making
    # the test a bit weak), there won't be any loading of parameters -
    # ensure that a load from parameters occurs somewhere in the PTX
    self.assertIn('ld.param', ptx)

    # Returning the result (through a passed-in array) should also require
    # a store to global memory, so check for at least one of those too.
    self.assertIn('st.global', ptx)
def test_device_function_with_line_info(self):
    """Compile an empty device function with ``lineinfo=True`` and check it."""
    def f():
        pass

    generated, _ = compile_ptx(f, [], device=True, lineinfo=True)
    self.check_line_info(generated)
def test_kernel_with_line_info(self):
    """Compile an empty kernel with ``lineinfo=True`` and check its line info."""
    def f():
        pass

    generated, _ = compile_ptx(f, [], lineinfo=True)
    self.check_line_info(generated)
def test_hfma_ptx(self):
    """Check that ``simple_hfma_scalar`` compiles to ``fma.rn.f16`` at cc 5.3."""
    signature = (f2[:], f2, f2, f2)
    generated, _ = compile_ptx(simple_hfma_scalar, signature, cc=(5, 3))
    self.assertIn('fma.rn.f16', generated)
def test_hneg_ptx(self):
    """Check that ``simple_hneg_scalar`` compiles to ``neg.f16`` at cc 5.3."""
    signature = (f2[:], f2)
    generated, _ = compile_ptx(simple_hneg_scalar, signature, cc=(5, 3))
    self.assertIn('neg.f16', generated)