def ptx_nanosleep(context, builder, sig, args):
    nanosleep = ir.InlineAsm(ir.FunctionType(ir.VoidType(), [ir.IntType(32)]),
                             "nanosleep.u32 $0;",
                             'r', side_effect=True)
    ns = args[0]
    builder.call(nanosleep, [ns])

def integer_to_float16_cast(context, builder, fromty, toty, val):
    bitwidth = fromty.bitwidth
    constraint = float16_int_constraint(bitwidth)
    signedness = 's' if fromty.signed else 'u'

    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
    asm = ir.InlineAsm(fnty,
                       f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;",
                       f"=h,{constraint}")
    return builder.call(asm, [val])

def float_to_float16_cast(context, builder, fromty, toty, val):
    if fromty.bitwidth == toty.bitwidth:
        return val
    ty, constraint = float16_float_ty_constraint(fromty.bitwidth)

    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
    asm = ir.InlineAsm(fnty,
                       f"cvt.rn.f16.{ty} $0, $1;",
                       f"=h,{constraint}")
    return builder.call(asm, [val])

def float16_to_integer_cast(context, builder, fromty, toty, val):
    bitwidth = toty.bitwidth
    constraint = float16_int_constraint(bitwidth)
    signedness = 's' if toty.signed else 'u'

    fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
    asm = ir.InlineAsm(fnty,
                       f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;",
                       f"={constraint},h")
    return builder.call(asm, [val])

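# For illustration only (not part of the original file): the instruction and
# constraint strings the cast helpers above build for an int32 -> float16
# conversion. The bitwidth-to-constraint mapping used here is an assumption
# about what float16_int_constraint returns.
bitwidth, signed = 32, True
constraint = {8: 'c', 16: 'h', 32: 'r', 64: 'l'}[bitwidth]  # assumed mapping
signedness = 's' if signed else 'u'
print(f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", f"=h,{constraint}")
# cvt.rn.f16.s32 $0, $1; =h,r
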
def ptx_fp16_comparison(context, builder, sig, args):
    # `op` and `_fp16_cmp` are expected to be defined in the enclosing scope:
    # `op` names the PTX comparison (e.g. "eq", "lt") and `_fp16_cmp` is the
    # inline-asm template it is formatted into.
    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
    asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), '=h,h,h')
    result = builder.call(asm, args)

    zero = context.get_constant(types.int16, 0)
    int_result = builder.bitcast(result, ir.IntType(16))
    return builder.icmp_unsigned("!=", int_result, zero)

def test_inline_assembly(self):
    mod = self.module()
    foo = ir.Function(mod, ir.FunctionType(ir.VoidType(), []), 'foo')
    builder = ir.IRBuilder(foo.append_basic_block(''))
    asmty = ir.FunctionType(ir.IntType(32), [ir.IntType(32)])
    asm = ir.InlineAsm(asmty, "mov $1, $2", "=r,r", side_effect=True)
    builder.call(asm, [ir.Constant(ir.IntType(32), 123)])
    builder.ret_void()
    pat = 'call i32 asm sideeffect "mov $1, $2", "=r,r" ( i32 123 )'
    self.assertInText(pat, str(mod))
    self.assert_valid_ir(mod)

def mark_location(self, builder, line):
    # Avoid duplication
    if self._last_lineno == line:
        return
    self._last_lineno = line
    # Add call to an inline asm to mark line location
    asmty = ir.FunctionType(ir.VoidType(), [])
    asm = ir.InlineAsm(asmty, "// dbg {}".format(line), "",
                       side_effect=True)
    call = builder.call(asm, [])
    md = self._di_location(line)
    call.set_metadata('numba.dbg', md)

def ptx_fp16_habs(context, builder, sig, args):
    if cuda.runtime.get_version() < (10, 2):
        # CUDA < 10.2 does not support abs.f16. For these versions, we mask
        # off the sign bit to compute abs instead. We determine whether or
        # not to do this based on the runtime version so that our behaviour
        # is consistent with the version of NVVM we're using to go from
        # NVVM IR -> PTX.
        inst = 'and.b16 $0, $1, 0x7FFF;'
    else:
        inst = 'abs.f16 $0, $1;'

    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
    asm = ir.InlineAsm(fnty, inst, '=h,h')
    return builder.call(asm, args)

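# A minimal standalone check of the sign-bit-masking trick used above (not
# part of the original file): in IEEE 754 binary16 the sign lives in bit 15,
# so ANDing the raw bits with 0x7FFF yields the absolute value. Requires only
# NumPy.
import numpy as np

bits = np.float16(-5.0).view(np.uint16)   # 0xC500: sign bit set
abs_bits = bits & np.uint16(0x7FFF)       # 0x4500: sign bit cleared
assert abs_bits.view(np.float16) == np.float16(5.0)
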
def test_inline_rsqrt(self):
    mod = ir.Module(__name__)
    fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
    fn = ir.Function(mod, fnty, 'cu_rsqrt')
    bldr = ir.IRBuilder(fn.append_basic_block('entry'))

    rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
    inlineasm = ir.InlineAsm(rsqrt_approx_fnty,
                             'rsqrt.approx.f32 $0, $1;',
                             '=f,f', side_effect=True)
    val = bldr.load(fn.args[0])
    res = bldr.call(inlineasm, [val])
    bldr.store(res, fn.args[0])
    bldr.ret_void()

    # generate ptx
    nvvm.fix_data_layout(mod)
    nvvm.set_cuda_kernel(fn)
    nvvmir = str(mod)
    ptx = nvvm.llvm_to_ptx(nvvmir)
    self.assertTrue('rsqrt.approx.f32' in str(ptx))

def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
                   can_dynsized=False):
    elemcount = reduce(operator.mul, shape, 1)

    # Check for valid shape for this type of allocation.
    # Only 1d arrays can be dynamic.
    dynamic_smem = elemcount <= 0 and can_dynsized and len(shape) == 1
    if elemcount <= 0 and not dynamic_smem:
        raise ValueError("array length <= 0")

    # Check that we support the requested dtype
    data_model = context.data_model_manager[dtype]
    other_supported_type = (isinstance(dtype, (types.Record, types.Boolean))
                            or isinstance(data_model, models.StructModel))
    if dtype not in types.number_domain and not other_supported_type:
        raise TypeError("unsupported type: %s" % dtype)

    lldtype = context.get_data_type(dtype)
    laryty = ir.ArrayType(lldtype, elemcount)

    if addrspace == nvvm.ADDRSPACE_LOCAL:
        # Special case local address space allocation to use alloca
        # NVVM is smart enough to only use local memory if no register is
        # available
        dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
    else:
        lmod = builder.module

        # Create global variable in the requested address space
        gvmem = cgutils.add_global_variable(lmod, laryty, symbol_name,
                                            addrspace)
        # Specify alignment to avoid misalignment bug
        align = context.get_abi_sizeof(lldtype)
        # Alignment is required to be a power of 2 for shared memory. If it is
        # not a power of 2 (e.g. for a Record array) then round up accordingly.
        gvmem.align = 1 << (align - 1).bit_length()

        if dynamic_smem:
            gvmem.linkage = 'external'
        else:
            ## Comment out the following line to work around an NVVM bug
            ## which generates an invalid symbol name when the linkage
            ## is internal and in some situation.
            ## See _get_unique_smem_id()
            # gvmem.linkage = lc.LINKAGE_INTERNAL

            gvmem.initializer = ir.Constant(laryty, ir.Undefined)

        # Convert to generic address-space
        conv = nvvmutils.insert_addrspace_conv(lmod, ir.IntType(8), addrspace)
        addrspaceptr = gvmem.bitcast(ir.PointerType(ir.IntType(8), addrspace))
        dataptr = builder.call(conv, [addrspaceptr])

    targetdata = _get_target_data(context)
    lldtype = context.get_data_type(dtype)
    itemsize = lldtype.get_abi_size(targetdata)

    # Compute strides
    laststride = itemsize
    rstrides = []
    for i, lastsize in enumerate(reversed(shape)):
        rstrides.append(laststride)
        laststride *= lastsize
    strides = [s for s in reversed(rstrides)]
    kstrides = [context.get_constant(types.intp, s) for s in strides]

    # Compute shape
    if dynamic_smem:
        # Compute the shape based on the dynamic shared memory configuration.
        # Unfortunately NVVM does not provide an intrinsic for the
        # %dynamic_smem_size register, so we must read it using inline
        # assembly.
        get_dynshared_size = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                                          "mov.u32 $0, %dynamic_smem_size;",
                                          '=r', side_effect=True)
        dynsmem_size = builder.zext(builder.call(get_dynshared_size, []),
                                    ir.IntType(64))
        # Only 1-D dynamic shared memory is supported so the following is a
        # sufficient construction of the shape
        kitemsize = context.get_constant(types.intp, itemsize)
        kshape = [builder.udiv(dynsmem_size, kitemsize)]
    else:
        kshape = [context.get_constant(types.intp, s) for s in shape]

    # Create array object
    ndim = len(shape)
    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
    ary = context.make_array(aryty)(context, builder)

    context.populate_array(ary,
                           data=builder.bitcast(dataptr, ary.data.type),
                           shape=kshape,
                           strides=kstrides,
                           itemsize=context.get_constant(types.intp, itemsize),
                           meminfo=None)
    return ary._getvalue()

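# A minimal standalone sketch (not part of the file above) of the stride
# computation used in _generic_array: for a C-contiguous layout the strides
# are a reverse cumulative product of the shape, scaled by the item size.
# For shape (3, 4, 5) and a 4-byte element this gives (80, 20, 4).
shape = (3, 4, 5)
itemsize = 4

laststride = itemsize
rstrides = []
for lastsize in reversed(shape):
    rstrides.append(laststride)
    laststride *= lastsize
strides = list(reversed(rstrides))

assert strides == [80, 20, 4]
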
def ptx_hfma(context, builder, sig, args):
    argtys = [ir.IntType(16), ir.IntType(16), ir.IntType(16)]
    fnty = ir.FunctionType(ir.IntType(16), argtys)
    asm = ir.InlineAsm(fnty, "fma.rn.f16 $0,$1,$2,$3;", "=h,h,h,h")
    return builder.call(asm, args)

def ptx_fp16_hneg(context, builder, sig, args):
    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
    asm = ir.InlineAsm(fnty, 'neg.f16 $0, $1;', '=h,h')
    return builder.call(asm, args)

def ptx_fp16_binary(context, builder, sig, args):
    # `op` is expected to be supplied by the enclosing scope and names the
    # PTX float16 operation to emit (e.g. "add", "sub", "mul").
    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
    asm = ir.InlineAsm(fnty, f'{op}.f16 $0,$1,$2;', '=h,h,h')
    return builder.call(asm, args)

def ptx_lanemask_lt(context, builder, sig, args):
    lanemask_lt = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                               "mov.u32 $0, %lanemask_lt;",
                               '=r', side_effect=True)
    return builder.call(lanemask_lt, [])

def ptx_activemask(context, builder, sig, args):
    activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                              "activemask.b32 $0;",
                              '=r', side_effect=True)
    return builder.call(activemask, [])

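# A minimal, self-contained sketch (not taken from the code above) showing how
# an ir.InlineAsm value like the ones used here is embedded in a module and
# printed. Only llvmlite is required; the module and function names are
# placeholders.
from llvmlite import ir

mod = ir.Module(name="inline_asm_demo")
fn = ir.Function(mod, ir.FunctionType(ir.IntType(32), []),
                 name="read_activemask")
builder = ir.IRBuilder(fn.append_basic_block("entry"))

asm = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                   "activemask.b32 $0;", '=r', side_effect=True)
result = builder.call(asm, [])
builder.ret(result)

# The textual IR contains a "call i32 asm sideeffect ..." instruction for the
# inline assembly.
print(mod)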