q = op.args[2] s = op.args[3] # Don't use this intrinsic if we don't have a int32x4 vector # or if we are not multiplying q31 numbers if x.dtype != "int32x4" or q.value != 31: return op # Case 1, shift is negative sqrdmulh = tvm.tir.call_llvm_intrin(op.dtype, "llvm.aarch64.neon.sqrdmulh", tvm.tir.const(2, "uint32"), x, y) fixup = (sqrdmulh & (-s)) >> 31 fixed_up_x = sqrdmulh + fixup out_1 = tvm.tir.call_llvm_intrin(op.dtype, "llvm.aarch64.neon.srshl", tvm.tir.const(2, "uint32"), sqrdmulh, s) # Case 2, shift is positive x = x * (1 << (s)) out_2 = tvm.tir.call_llvm_intrin(op.dtype, "llvm.aarch64.neon.sqrdmulh", tvm.tir.const(2, "uint32"), x, y) # Select depending on the shift return tvm.tir.Select(s < 0, out_1, out_2) register_intrin_lowering("tir.q_multiply_shift", target="llvm.aarch64", f=_q_multiply_shift_arm, level=99)
assert name.startswith("tir.") dispatch_name = name[4:] if op.dtype == "float32": # call float function return tvm.tir.call_pure_extern("float32", "%sf" % dispatch_name, op.args[0]) elif op.dtype == "float64": # call double function return tvm.tir.call_pure_extern("float32", dispatch_name, op.args[0]) else: # cannot do translation, return self. return op register_intrin_lowering("tir.exp", target="cuda", f=my_cuda_math_rule, level=99) ###################################################################### # Register the rule to TVM with override option to override existing rule. # Notice the difference between the printed code from previous one: # our new rule uses math function :code:`expf` instead of # fast math version :code:`__expf`. # fcuda = tvm.build(s, [A, B], "cuda", name="myexp") print(fcuda.imported_modules[0].get_source()) ###################################################################### # Add Your Own Intrinsic # ---------------------- # If there is an intrinsic that is not provided by TVM. # User can easily add new intrinsic by using the intrinsic rule system.
op.args[1]) if op.dtype == "int32": return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) raise RuntimeError("only support int32, float32 and float64") def opencl_atomic_add_rule(op): if op.dtype == "int32": return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) raise RuntimeError("only support int32") register_intrin_lowering("tir.atomic_add", target="cuda", f=cuda_atomic_add_rule, level=99) register_intrin_lowering("tir.atomic_add", target="opencl", f=opencl_atomic_add_rule, level=99) def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index): """Low level IR to identify bounding boxes given a score threshold.
Parameters ---------- op : PrimExpr The call expression of original intrinsic. Returns ------- ret : PrimExpr The translated intrinsic rule. Return same op if no translation is possible. See Also -------- register_intrin_lowering : The registration function for intrinsic lowering rule. """ if str(op.dtype).startswith("float"): return call_pure_extern(op.dtype, op.op.name[4:], *op.args) return None # opencl pattern for exp register_intrin_lowering("tir.exp", target="opencl", f=_rule_float_direct, level=99) # default pattern for exp register_intrin_lowering("tir.exp", target="default", f=_rule_float_suffix, level=99)