def partition_for_ethosn78(mod, params=None, **opts):
    """Partition the graph greedily offloading supported operators to Arm Ethos-N NPU.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    ret : annotated and partitioned module.
    """
    # The only supported variant is Ethos-N78; reject anything else up front.
    variant = opts.get("variant", "") if opts else ""
    if variant.lower() != "ethos-n78":
        raise ValueError("When targeting Ethos(TM)-N78, -variant=Ethos-N78 should be set.")

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    partition_passes = [
        transform.InferType(),
        transform.MergeComposite(pattern_table()),
        transform.AnnotateTarget("ethos-n"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ]
    return tvm.transform.Sequential(partition_passes)(mod)
def annotate(func, compiler):
    """An annotator for Core ML."""
    # Bind free variables that have known constant values.
    # NOTE(review): `params` is resolved from the enclosing scope — confirm it is
    # defined wherever this helper is used.
    constant_bindings = {
        param: relay.const(params[param.name_hint])
        for param in func.params
        if param.name_hint in params
    }
    func = relay.bind(func, constant_bindings)

    # Annotate the entire graph for Core ML.
    mod = tvm.IRModule()
    mod["main"] = func
    annotation_passes = tvm.transform.Sequential([
        transform.SimplifyInference(),
        transform.FoldConstant(),
        transform.FoldScaleAxis(),
        transform.AnnotateTarget(compiler),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    with relay.build_config(opt_level=3):
        mod = annotation_passes(mod)
    return mod
def partition_for_dnnl(mod, params=None):
    """Partition the graph greedily offloading supported operators to DNNL.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    mod : Module
        Annotated and partitioned module.
    """
    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    # Simplification passes run first so the DNNL patterns can match cleanly.
    simplify_passes = [
        transform.CanonicalizeOps(),
        transform.InferType(),
        transform.SimplifyInference(),
        transform.FoldConstant(),
        transform.FoldScaleAxis(),
        # fold consecutive add ops to simplify pattern `conv2d-bias_add-bn-relu`
        transform.SimplifyExpr(),
        transform.FoldConstant(),
    ]
    partition_passes = [
        transform.MergeComposite(pattern_table()),
        transform.AnnotateTarget("dnnl"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ]
    seq = tvm.transform.Sequential(simplify_passes + partition_passes)
    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)
    return mod
def partition_for_vitis_ai(mod, params=None, dpu=None, **opts):
    """Partition the Relay expression for offloading operators to Vitis AI DPU

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.
    dpu : str
        The DPU identifier (e.g. DPUCZDX8G-zcu104, DPUCADX8G)

    Returns
    -------
    ret : Module
    """
    # A DPU identifier is mandatory — the annotation pass needs it.
    if dpu is None:
        raise ValueError("Please pass Vitis AI DPU identifier to the partitioning function")

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    return tvm.transform.Sequential([
        transform.InferType(),
        VitisAIAnnotationPass("vitis_ai", dpu, params),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])(mod)
def test_byoc_microtvm_multiple_subgraphs(merge_compiler_regions):
    """This is a test case to check BYOC capabilities of AOT with multiple sub graphs"""
    use_unpacked_api = False
    interface_api = "packed"
    test_runner = AOT_DEFAULT_RUNNER

    x = relay.var("x", shape=(10, 10))
    weights = [relay.var("w{}".format(i), shape=(10, 10)) for i in range(8)]

    # C compiler
    z0 = relay.add(x, weights[0])
    p0 = relay.subtract(z0, weights[1])
    q0 = relay.multiply(p0, weights[2])

    z1 = relay.add(x, weights[3])
    p1 = relay.subtract(z1, weights[4])
    q1 = relay.multiply(p1, weights[5])

    # Other parts on TVM
    z2 = relay.add(x, weights[6])
    q2 = relay.subtract(z2, weights[7])

    r = relay.concatenate((q0, q1, q2), axis=0)
    f = relay.Function([x] + weights, r)
    mod = tvm.IRModule()
    ann = byoc.CcompilerAnnotator()
    mod["main"] = ann.visit(f)

    if merge_compiler_regions:
        mod = transform.MergeCompilerRegions()(mod)

    mod = tvm.relay.transform.PartitionGraph("mod_name")(mod)
    mod = tvm.relay.transform.InferType()(mod)

    x_data = np.random.rand(10, 10).astype("float32")
    w_data = [np.random.rand(10, 10).astype("float32") for _ in range(8)]
    map_inputs = OrderedDict(
        [("x", x_data)] + [("w{}".format(i), w_data[i]) for i in range(8)]
    )

    output_list = generate_ref_data(mod, map_inputs)
    input_list = [map_inputs["x"]]
    input_list.extend([map_inputs["w{}".format(i)] for i in range(8)])
    compile_and_run(
        AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list),
        test_runner,
        interface_api,
        use_unpacked_api,
    )
def test_constant_tuples():
    """Check that constant tuple arguments survive partitioning for a tuple-taking op."""

    @reg.register("qnn.concatenate", "target.const_tuples")
    def add(attrs, args):  # pylint: disable=unused-variable
        return True

    def create_graph():
        # Two uint8 inputs concatenated along axis 1 with constant scale/zero-point tuples.
        a = relay.var("a", shape=(10, 10), dtype="uint8")
        b = relay.var("b", shape=(10, 10), dtype="uint8")
        a1 = relay.abs(a)

        zeroi = relay.const(1, "int32")
        zerof = relay.const(0, "float32")
        con = relay.qnn.op.concatenate(
            (a1, b),
            input_scales=(zerof, zerof),
            input_zero_points=(zeroi, zeroi),
            output_scale=zerof,
            output_zero_point=zeroi,
            axis=1,
        )
        f = relay.Function([a, b], con)
        return tvm.IRModule.from_expr(f)

    seq = tvm.transform.Sequential([
        transform.AnnotateTarget("const_tuples"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])

    partitioned = seq(create_graph())
    concat = partitioned["const_tuples_0"].body
    # Fixed: use isinstance() instead of exact `type(...) ==` comparisons,
    # the idiomatic (and subclass-tolerant) way to check node kinds.
    assert isinstance(concat.args[1], relay.Tuple)
    assert isinstance(concat.args[2], relay.Tuple)
    assert isinstance(concat.args[3], relay.Constant)
    assert isinstance(concat.args[4], relay.Constant)
def partition_for_cmsisnn(mod, params=None, **opts):
    """Partition the graph greedily offloading supported operators on Cortex-M using CMSIS-NN

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    ret : Module
        annotated and partitioned module.
    """
    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    cmsisnn_passes = [
        transform.InferType(),
        transform.MergeComposite(pattern_table()),
        transform.AnnotateTarget("cmsisnn"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ]
    return tvm.transform.Sequential(cmsisnn_passes)(mod)
def partition_for_clml(mod, params=None):
    """Partition the graph greedily offloading supported operators to CLML Library.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    ret : annotated and partitioned module.
    """
    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    clml_passes = [
        transform.InferType(),
        transform.FoldConstant(),
        transform.MergeComposite(clml_pattern_table()),
        transform.AnnotateTarget("clml", False),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ]
    return tvm.transform.Sequential(clml_passes)(mod)
def test_extern_dnnl_mobilenet():
    """Compare a DNNL-partitioned mobilenet against the plain TVM reference."""
    if not tvm.get_global_func("relay.ext.dnnl", True):
        print("skip because DNNL codegen is not available")
        return

    dtype = "float32"
    ishape = (1, 3, 224, 224)

    # Partition mobilenet for DNNL offload.
    mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
    mod["main"] = bind_params_by_name(mod["main"], params)
    for dnnl_pass in (
        transform.AnnotateTarget(["dnnl"]),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ):
        mod = dnnl_pass(mod)

    i_data = np.random.uniform(0, 1, ishape).astype(dtype)

    # Reference result from an unpartitioned copy of the same workload.
    ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
    ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0))
    ref_res = ref_ex.evaluate()(i_data, **params)

    check_result(mod, {"data": i_data}, (1, 1000), ref_res.asnumpy(), tol=1e-5, params=params)
def test_duplicate_outputs():
    """Partitioning should handle a node whose output feeds multiple consumers."""
    target = "test_duplicate_outputs"

    @tvm.ir.register_op_attr("abs", "target." + target)
    def abs(attrs, args):  # pylint: disable=unused-variable
        return True

    def create_graph():
        data = relay.var("data", shape=(10, 10))
        x = relay.abs(data)
        out_1 = relay.nn.relu(x)
        out_2 = relay.tanh(x)
        out_3 = relay.log(x)
        out = relay.Tuple([out_1, out_2, out_3])
        return relay.Function([data], out)

    def expected():
        mod = tvm.IRModule()

        # function 0
        f0_i0 = relay.var(target + "_0_i0", shape=(10, 10))
        func0 = relay.Function([f0_i0], relay.abs(f0_i0))
        for attr_name, attr_value in (
            ("Primitive", tvm.tir.IntImm("int32", 1)),
            ("Inline", tvm.tir.IntImm("int32", 1)),
            ("Compiler", target),
            ("global_symbol", target + "_0"),
        ):
            func0 = func0.with_attr(attr_name, attr_value)
        gv0 = relay.GlobalVar(target + "_0")
        mod[gv0] = func0
        mod = transform.InferType()(mod)

        # body
        data = relay.var("data", shape=(10, 10))
        function_out = gv0(data)
        out_1 = relay.nn.relu(function_out)
        out_2 = relay.tanh(function_out)
        out_3 = relay.log(function_out)
        out = relay.Tuple([out_1, out_2, out_3])
        mod["main"] = relay.Function([data], out)
        return transform.InferType()(mod)

    mod = tvm.IRModule()
    mod["main"] = create_graph()

    seq = tvm.transform.Sequential([
        transform.AnnotateTarget(target),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    partitioned = seq(mod)
    assert tvm.ir.structural_equal(partitioned, expected(), map_free_vars=True)
def test_different_output_region():
    """Partition subtract/log to ccompiler and compare with the expected module."""
    annotated = WhiteListAnnotator(["subtract", "log"], "ccompiler")(get_mod())
    merged = transform.MergeCompilerRegions()(annotated)
    partitioned = transform.PartitionGraph()(merged)
    assert tvm.ir.structural_equal(
        partitioned, expected_different_output_region(), map_free_vars=True
    )
def get_pass_order(use_patterns):
    """Get the pass ordering based on using predicates or patterns.

    Parameters
    ----------
    use_patterns: Bool
        True if pass needs to work with op patterns

    Returns
    ----------
    ret : Sequential
        Pass object
    """
    # The two orderings share a common prefix; only the partitioning tail differs.
    common_passes = [
        transform.InferType(),
        RemoveDropoutPass(),
        transform.RemoveUnusedFunctions(),
        transform.ConvertLayout({
            "nn.conv1d": ["NCW", "default"],
            "nn.conv2d": ["NCHW", "default"],
            "nn.conv3d": ["NCDHW", "default"],
            "nn.conv2d_transpose": ["NCHW", "default"],
        }),
        transform.FoldConstant(),
    ]
    if use_patterns:
        tail = [
            transform.MergeComposite(pattern_table()),
            transform.AnnotateTarget("tensorrt"),
            transform.MergeCompilerRegions(),
            transform.PartitionGraph(),
            transform.InlineComposites("tensorrt"),
            transform.InferType(),
        ]
    else:
        tail = [
            transform.AnnotateTarget("tensorrt"),
            transform.MergeCompilerRegions(),
            transform.PartitionGraph(),
            transform.InferType(),
        ]
    return tvm.transform.Sequential(common_passes + tail)
def test_byoc_microtvm(merge_compiler_regions):
    """This is a simple test to check BYOC capabilities of AOT
    - with and without merging compiler regions to test for https://github.com/apache/tvm/issues/9036"""
    use_unpacked_api = False
    interface_api = "packed"
    test_runner = AOTTestRunner(pass_config={"tir.usmp.enable": True})

    x = relay.var("x", shape=(10, 10))
    w0 = relay.var("w0", shape=(10, 10))
    w1 = relay.var("w1", shape=(10, 10))

    # z0 = x + w0
    begin_x = compiler_begin(x, "ccompiler")
    begin_w0 = compiler_begin(w0, "ccompiler")
    sum_x_w0 = relay.add(begin_x, begin_w0)
    z0 = compiler_end(sum_x_w0, "ccompiler")

    # z1 = z0 + w1
    begin_z0 = compiler_begin(z0, "ccompiler")
    begin_w1 = compiler_begin(w1, "ccompiler")
    sum_z0_w1 = relay.add(begin_z0, begin_w1)
    z1 = compiler_end(sum_z0_w1, "ccompiler")

    # z2 = z0 + z1
    z2 = relay.add(z0, z1)

    mod = tvm.IRModule()
    mod["main"] = relay.Function([x, w0, w1], z2)

    if merge_compiler_regions:
        mod = transform.MergeCompilerRegions()(mod)

    mod = transform.PartitionGraph("mod_name")(mod)
    mod = transform.InferType()(mod)

    map_inputs = OrderedDict(
        [("x", np.random.rand(10, 10).astype("float32"))]
        + [("w{}".format(i), np.random.rand(10, 10).astype("float32")) for i in range(2)]
    )
    output_list = generate_ref_data(mod, map_inputs)

    compiled_test_mods = compile_models(
        AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list),
        interface_api=interface_api,
        use_unpacked_api=use_unpacked_api,
        pass_config=test_runner.pass_config,
    )

    for compiled_model in compiled_test_mods:
        check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib)

    run_and_check(
        models=compiled_test_mods,
        runner=test_runner,
        interface_api=interface_api,
    )
def partition_for_tensorrt(
    mod: tvm.IRModule,
    params: Optional[Dict[str, tvm.nd.NDArray]] = None,
    # CAUTION: Can't use default Target("tensorrt") here since the target kind is only available
    # if is_tensorrt_compiler_enabled() == True.
    target: Optional[tvm.target.Target] = None,
) -> tvm.IRModule:
    """Partition all functions in mod to greedily offload supported operators to TensorRT.

    Parameters
    ----------
    mod : tvm.IRModule
        The module to partition.
    params : Optional[Dict[str, tvm.nd.NDArray]]
        Constant input parameters.
    target : tvm.target.Target
        A target of kind "tensorrt" describing additional partitioning and compilation options.

    Returns
    -------
    partitioned_mod : tvm.IRModule
        The partitioned module.
    """
    assert is_tensorrt_compiler_enabled(), "Can only partition for TensorRT if it is enabled"

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)
    if target is None:
        # Use a default target. The get_tensorrt_target() function will similarly create an
        # equivalent default target when compilation continues after partitioning.
        target = tvm.target.Target("tensorrt")

    partition_passes = tvm.transform.Sequential([
        transform.InferType(),
        RemoveDropoutPass(),
        transform.RemoveUnusedFunctions(),
        transform.ConvertLayout({
            "nn.conv1d": ["NCW", "default"],
            "nn.conv2d": ["NCHW", "default"],
            "nn.conv3d": ["NCDHW", "default"],
            "nn.conv2d_transpose": ["NCHW", "default"],
        }),
        transform.FoldConstant(),
        transform.MergeComposite(pattern_table()),
        transform.AnnotateTarget("tensorrt"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
        transform.InferType(),
    ])
    with target:
        partitioned = partition_passes(mod)
    return prune_tensorrt_subgraphs(partitioned)
def test_partial_constant():
    """Test the subgraph with (const, var, const, var) arguments."""
    if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True):
        print("skip because DNNL codegen is not available")
        return

    dtype = "float32"
    ishape = (10, 10)

    in_1 = relay.var("in_1", shape=ishape, dtype=dtype)
    in_2 = relay.var("in_2", shape=ishape, dtype=dtype)
    in_3 = relay.var("in_3", shape=ishape, dtype=dtype)
    in_4 = relay.var("in_4", shape=ishape, dtype=dtype)

    # NOTE(review): in_4 is a parameter and receives data below, but every add
    # below uses in_3 — confirm the repeated in_3 operands are intentional.
    add1 = relay.add(in_1, in_2)
    add2 = relay.add(add1, in_3)
    add3 = relay.add(add2, in_3)
    add4 = relay.add(add3, in_3)

    func = relay.Function([in_1, in_2, in_3, in_4], add4)
    ref_mod = relay.transform.InferType()(tvm.IRModule.from_expr(func))

    # in_1 and in_3 are bound as constants; in_2 and in_4 stay variable.
    data1 = np.random.uniform(0, 1, ishape).astype(dtype)
    data3 = np.random.uniform(0, 1, ishape).astype(dtype)
    params = {
        "in_1": tvm.nd.array(data1, device=tvm.cpu(0)),
        "in_3": tvm.nd.array(data3, device=tvm.cpu(0)),
    }
    ref_mod["main"] = bind_params_by_name(ref_mod["main"], params)

    opt_pass = tvm.transform.Sequential([
        transform.InferType(),
        transform.SimplifyInference(),
        transform.FoldConstant(),
        transform.FoldScaleAxis(),
        transform.AnnotateTarget("dnnl"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
        mod = opt_pass(ref_mod)

    data2 = np.random.uniform(0, 1, ishape).astype(dtype)
    data4 = np.random.uniform(0, 1, ishape).astype(dtype)
    check_result(mod, ref_mod, {"in_2": data2, "in_4": data4}, (10, 10), tol=1e-5)
def test_byoc_microtvm(merge_compiler_regions):
    """
    This is a simple test to check BYOC capabilities of AOT
    with and without merging compiler regions to test for
    https://github.com/apache/tvm/issues/9036
    """
    use_unpacked_api = False
    interface_api = "packed"
    test_runner = AOT_DEFAULT_RUNNER

    input_x = relay.var("x", shape=(10, 10))
    input_w0 = relay.var("w0", shape=(10, 10))
    input_w1 = relay.var("w1", shape=(10, 10))

    # z0 = x + w0
    x_begin = compiler_begin(input_x, "ccompiler")
    w0_begin = compiler_begin(input_w0, "ccompiler")
    inner_add = relay.add(x_begin, w0_begin)
    end_inner_add = compiler_end(inner_add, "ccompiler")

    # z1 = z0 + w1
    inner_begin = compiler_begin(end_inner_add, "ccompiler")
    w1_begin = compiler_begin(input_w1, "ccompiler")
    outer_add = relay.add(inner_begin, w1_begin)
    end_outer_add = compiler_end(outer_add, "ccompiler")

    # z2 = z0 + z1
    final_add = relay.add(end_inner_add, end_outer_add)

    mod = tvm.IRModule()
    mod["main"] = relay.Function([input_x, input_w0, input_w1], final_add)

    if merge_compiler_regions:
        mod = transform.MergeCompilerRegions()(mod)

    mod = transform.PartitionGraph("mod_name")(mod)
    mod = transform.InferType()(mod)

    map_inputs = OrderedDict(
        [("x", np.random.rand(10, 10).astype("float32"))]
        + [("w{}".format(i), np.random.rand(10, 10).astype("float32")) for i in range(2)]
    )
    output_list = generate_ref_data(mod, map_inputs)

    compile_and_run(
        AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list),
        test_runner,
        interface_api,
        use_unpacked_api,
    )
def test_byoc_microtvm(merge_compiler_regions):
    """This is a simple test to check BYOC capabilities of AOT
    - with and without merging compiler regions to test for https://github.com/apache/tvm/issues/9036"""
    use_unpacked_api = False
    interface_api = "packed"
    test_runner = AOT_DEFAULT_RUNNER

    x = relay.var("x", shape=(10, 10))
    w0 = relay.var("w0", shape=(10, 10))
    w1 = relay.var("w1", shape=(10, 10))

    # z0 = x + w0
    cb_x = compiler_begin(x, "ccompiler")
    cb_w0 = compiler_begin(w0, "ccompiler")
    add_x_w0 = relay.add(cb_x, cb_w0)
    z0 = compiler_end(add_x_w0, "ccompiler")

    # z1 = z0 + w1
    cb_z0 = compiler_begin(z0, "ccompiler")
    cb_w1 = compiler_begin(w1, "ccompiler")
    add_z0_w1 = relay.add(cb_z0, cb_w1)
    z1 = compiler_end(add_z0_w1, "ccompiler")

    # z2 = z0 + z1
    z2 = relay.add(z0, z1)

    mod = tvm.IRModule()
    mod["main"] = relay.Function([x, w0, w1], z2)

    if merge_compiler_regions:
        mod = transform.MergeCompilerRegions()(mod)

    mod = transform.PartitionGraph("mod_name")(mod)
    mod = transform.InferType()(mod)

    map_inputs = OrderedDict(
        [("x", np.random.rand(10, 10).astype("float32"))]
        + [("w{}".format(i), np.random.rand(10, 10).astype("float32")) for i in range(2)]
    )
    output_list = generate_ref_data(mod, map_inputs)

    compile_and_run(
        AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list),
        test_runner,
        interface_api,
        use_unpacked_api,
    )
def partition_for_vitis_ai(mod, params=None, dpu=None, **opts):
    """Partition the Relay expression for offloading operators to Vitis AI DPU

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.
    dpu : str
        The DPU identifier (e.g. DPUCZDX8G-zcu104, DPUCADF8H)

    Returns
    -------
    ret : Module
    """
    if dpu is None:
        raise ValueError("Please pass Vitis AI DPU identifier to the partitioning function")

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    # NHWC inside the offloaded partition; NCHW for what remains in main.
    partition_layouts = {
        "nn.conv2d": ["NHWC", "default"],
        "nn.upsampling": ["NHWC"],
        "image.resize2d": ["NHWC"],
    }
    main_layouts = {
        "nn.conv2d": ["NCHW", "default"],
        "nn.upsampling": ["NCHW"],
        "image.resize2d": ["NCHW"],
    }

    seq = tvm.transform.Sequential([
        transform.RemoveUnusedFunctions(),
        transform.ConvertLayout(partition_layouts),
        transform.FoldConstant(),
        transform.InferType(),
        VitisAIAnnotationPass("vitis_ai", dpu, params),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
        transform.RemoveUnusedFunctions(),
        transform.ConvertLayout(main_layouts),
        transform.FoldConstant(),
    ])
    with tvm.transform.PassContext(opt_level=3):
        return seq(mod)
def test_mobilenet_dnnl():
    """Annotate, merge and partition mobilenet for the DNNL backend."""
    # Fixed: the DNNL-availability skip guard had been commented out, which
    # left dead commented code and made the test run without its precondition.
    if not tvm.get_global_func("relay.ext.dnnl", True):
        print("skip because DNNL codegen is not available")
        return

    dtype = "float32"
    ishape = (1, 3, 224, 224)
    mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")
    mod = transform.AnnotateTarget(["dnnl"])(mod)
    mod = transform.MergeCompilerRegions()(mod)
    mod = transform.PartitionGraph()(mod)
    # NOTE(review): i_data (and params) are unused beyond this point — the rest
    # of this test may have been truncated; confirm against the original file.
    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
def get_partitoned_mod(mod):
    """Simplify the module (fold batch norm) and partition it for the DNNL backend."""
    # NOTE(review): the function name is misspelled ("partitoned") but is kept
    # unchanged so existing callers keep working.
    remove_bn_pass = tvm.transform.Sequential([
        transform.InferType(),
        transform.SimplifyInference(),
        transform.FoldConstant(),
        transform.FoldScaleAxis(),
    ])
    byoc_pass = tvm.transform.Sequential([
        remove_bn_pass,
        transform.AnnotateTarget("dnnl"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
        return byoc_pass(mod)
def build_module(
    mod,
    target,
    dpu_target="DPUCADX8G",
    params=None,
    enable_vitis_ai=True,
    tvm_ops=0,
    vitis_ai_partitions=1,
):
    """Build module for Vitis-AI codegen.

    Parameters
    ----------
    mod : Module or relay.expr.Call
        Relay module (a bare call expression is wrapped into a module).
    target : str or Target
        Build target for the non-offloaded part of the graph.
    dpu_target : str
        Vitis-AI DPU identifier used for annotation and codegen options.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.
    enable_vitis_ai : bool
        When True, annotate/merge/partition the graph for Vitis-AI first.
    tvm_ops : int
        Expected number of operators left to TVM after partitioning.
    vitis_ai_partitions : int
        Expected number of Vitis-AI partitions.
    """
    if isinstance(mod, tvm.relay.expr.Call):
        mod = tvm.IRModule.from_expr(mod)
    if params is None:
        params = {}
    mod = relay.transform.InferType()(mod)
    temp = utils.tempdir()
    # Fixed: removed leftover debug `print(temp)` and the commented-out
    # `#enable_vitis_ai=False` parameter line.
    export_rt_mod_file = temp.relpath("vitis_ai.rtmod")
    with tvm.transform.PassContext(
        opt_level=3,
        config={
            "relay.ext.vitis_ai.options.target": dpu_target,
            "relay.ext.vitis_ai.options.export_runtime_module": export_rt_mod_file,
        },
    ):
        if enable_vitis_ai:
            mod["main"] = bind_params_by_name(mod["main"], params)
            mod = annotation(mod, params, dpu_target)
            mod = transform.MergeCompilerRegions()(mod)
            mod = transform.PartitionGraph()(mod)
            tvm_op_count = get_cpu_op_count(mod)
            assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
                tvm_op_count, tvm_ops
            )
            partition_count = 0
            for global_var in mod.get_global_vars():
                if "vitis_ai" in global_var.name_hint:
                    partition_count += 1
            assert (
                vitis_ai_partitions == partition_count
            ), "Got {} Vitis-AI partitions, expected {}".format(
                partition_count, vitis_ai_partitions
            )
        relay.backend.compile_engine.get().clear()
        return relay.build(mod, target, params=params)
def partition_for_ethosn(mod, params=None, **opts):
    """Partition the graph greedily offloading supported operators to Arm Ethos-N NPU.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    ret : annotated and partitioned module.
    """
    opts = opts or {}
    if "variant" not in opts:
        raise ValueError("Please specify a variant in the target string, e.g. -variant=n78.")

    variant = opts["variant"]
    # -variant=ethos-n78 deprecated in favour of -variant=n78
    if variant.lower() == "ethos-n78":
        warnings.warn(
            "Please use '-variant=n78' instead of the deprecated "
            "'-variant=ethos-n78', which will be removed in TVM v0.9.",
            DeprecationWarning,
        )
    elif variant != "n78":
        raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.")

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    partition_passes = [
        transform.InferType(),
        transform.MergeComposite(pattern_table()),
        transform.AnnotateTarget("ethos-n"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ]
    return tvm.transform.Sequential(partition_passes)(mod)
def test_mobilenet_dnnl():
    """Partition mobilenet for DNNL and verify calibration data collection."""
    if not tvm.get_global_func("relay.ext.dnnl", True):
        print("skip because DNNL codegen is not available")
        return

    dtype = "float32"
    ishape = (1, 3, 224, 224)
    mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32")

    for byoc_pass in (
        transform.AnnotateTarget(["dnnl"]),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ):
        mod = byoc_pass(mod)

    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
    data = get_calibration_data(mod, {"data": i_data, **params})

    # Check the number and orders
    check_data_size(mod, data)
def partition(dpu_target):
    """Build a small conv2d + batch_norm graph, annotate it for `dpu_target` and partition it."""
    data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
    weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32"))
    bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32"))
    bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32"))
    bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32"))
    bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32"))

    conv = relay.nn.conv2d(
        data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1)
    )
    bn_output = relay.nn.batch_norm(conv, bn_gamma, bn_beta, bn_mmean, bn_mvar)

    func = relay.Function(
        [data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output.astuple()
    )
    mod = tvm.IRModule()
    mod["main"] = func

    params = {
        "weight": np.random.rand(16, 3, 3, 3).astype("float32"),
        "bn_gamma": np.random.rand(16).astype("float32"),
        "bn_beta": np.random.rand(16).astype("float32"),
        "bn_mean": np.random.rand(16).astype("float32"),
        "bn_var": np.random.rand(16).astype("float32"),
    }
    mod = annotation(mod, params, dpu_target)

    opt_pass = tvm.transform.Sequential([
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    with tvm.transform.PassContext(opt_level=3):
        mod = opt_pass(mod)
    return mod
def partition_for_ethosn77(mod, params=None, **opts):
    """Partition the graph greedily offloading supported operators to Arm Ethos-N NPU.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    ret : annotated and partitioned module.
    """
    # These tuning knobs are N78-only; reject them for the N77 target.
    if opts and any(opts.get(key) for key in ("tops", "ple_ratio", "sram_size")):
        raise ValueError(
            "Setting tops, ple_ratio or sram_size has no effect when targeting Ethos(TM)-N77"
        )

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    partition_passes = [
        transform.InferType(),
        transform.MergeComposite(pattern_table()),
        transform.AnnotateTarget("ethos-n"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ]
    return tvm.transform.Sequential(partition_passes)(mod)
def test_flatten_tuple_output():
    """Tuple outputs of a partitioned function should be flattened correctly."""
    target = "test_flatten_tuple_output"

    @reg.register("split", "target." + target)
    def split(attrs, args):  # pylint: disable=unused-variable
        return True

    @reg.register("abs", "target." + target)
    def abs(attrs, args):  # pylint: disable=unused-variable
        return True

    def create_graph():
        a = relay.var("a", shape=(10, 10), dtype="uint8")
        a_split = relay.split(a, 2)
        a_split_0 = relay.TupleGetItem(a_split.astuple(), 0)
        a_split_0_abs = relay.abs(a_split_0)
        a_con = relay.concatenate(a_split, 0)
        a_split_0_relu = relay.nn.relu(a_split_0_abs)
        out = relay.Tuple((a_con, a_split_0_relu))
        f = relay.Function([a], out)
        return tvm.IRModule.from_expr(f)

    def expected():
        mod = tvm.IRModule()

        # function 0
        f0_i0 = relay.var(target + "_0_i0", shape=(10, 10), dtype="uint8")
        a_split = relay.split(f0_i0, 2)
        a_split_0 = relay.TupleGetItem(a_split.astuple(), 0)
        a_split_1 = relay.TupleGetItem(a_split.astuple(), 1)
        a_split_abs_in = relay.TupleGetItem(a_split.astuple(), 0)
        abs_out = relay.abs(a_split_abs_in)
        tuple_out = relay.Tuple((a_split_0, a_split_1, abs_out))
        func0 = relay.Function([f0_i0], tuple_out)
        for attr_name, attr_value in (
            ("Primitive", tvm.tir.IntImm("int32", 1)),
            ("Inline", tvm.tir.IntImm("int32", 1)),
            ("Compiler", target),
            ("global_symbol", target + "_0"),
        ):
            func0 = func0.with_attr(attr_name, attr_value)
        gv0 = relay.GlobalVar(target + "_0")
        mod[gv0] = func0

        # body
        data = relay.var("a", shape=(10, 10), dtype="uint8")
        f_out = gv0(data)
        f_out_0 = relay.TupleGetItem(f_out, 0)
        f_out_1 = relay.TupleGetItem(f_out, 1)
        concat = relay.concatenate(relay.Tuple((f_out_0, f_out_1)), 0)
        f_out_2 = relay.TupleGetItem(f_out, 2)
        relu = relay.nn.relu(f_out_2)
        ret_tuple = relay.Tuple((concat, relu))
        mod["main"] = relay.Function([data], ret_tuple)
        return mod

    seq = tvm.transform.Sequential([
        transform.AnnotateTarget(target),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    partitioned = seq(create_graph())
    assert tvm.ir.structural_equal(partitioned, expected(), map_free_vars=True)
def test_duplicate_merge_and_tuplegetitem():
    """Merging regions around batch_norm/relu must keep TupleGetItem wiring intact."""
    target = "test_duplicate_merge_and_tuplegetitem"

    @reg.register("nn.batch_norm", "target." + target)
    def batch_norm(attrs, args):  # pylint: disable=unused-variable
        return True

    @reg.register("nn.relu", "target." + target)
    def relu(attrs, args):  # pylint: disable=unused-variable
        return True

    def create_graph():
        data = relay.var("data", shape=(10, 10))
        bn_gamma = relay.var("bn_gamma")
        bn_beta = relay.var("bn_beta")
        bn_mmean = relay.var("bn_mean")
        bn_mvar = relay.var("bn_var")
        x = relay.nn.batch_norm(data, bn_gamma, bn_beta, bn_mmean, bn_mvar)
        out_1 = relay.nn.relu(x[0])
        bn_out_1 = x[1]
        out_2 = relay.tanh(bn_out_1)
        out_3 = relay.log(bn_out_1)
        out = relay.Tuple([out_1, out_2, out_3])
        return relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], out)

    def expected():
        mod = tvm.IRModule()

        # function 0
        f0_i0 = relay.var(target + "_0_i0", shape=(10, 10))
        f0_i1 = relay.var(target + "_0_i1")
        f0_i2 = relay.var(target + "_0_i2")
        f0_i3 = relay.var(target + "_0_i3")
        f0_i4 = relay.var(target + "_0_i4")
        f0_n0 = relay.nn.batch_norm(f0_i0, f0_i1, f0_i2, f0_i3, f0_i4)
        f0_n1 = f0_n0[1]
        f0_n2 = relay.nn.relu(f0_n0[0])
        f0_o0 = relay.Tuple([f0_n2, f0_n1])
        func0 = relay.Function([f0_i0, f0_i1, f0_i2, f0_i3, f0_i4], f0_o0)
        for attr_name, attr_value in (
            ("Primitive", tvm.tir.IntImm("int32", 1)),
            ("Inline", tvm.tir.IntImm("int32", 1)),
            ("Compiler", target),
            ("global_symbol", target + "_0"),
        ):
            func0 = func0.with_attr(attr_name, attr_value)
        gv0 = relay.GlobalVar(target + "_0")
        mod[gv0] = func0

        # body
        data = relay.var("data", shape=(10, 10))
        bn_gamma = relay.var("bn_gamma")
        bn_beta = relay.var("bn_beta")
        bn_mmean = relay.var("bn_mean")
        bn_mvar = relay.var("bn_var")
        function_out = gv0(data, bn_gamma, bn_beta, bn_mmean, bn_mvar)
        get_out0 = relay.TupleGetItem(function_out, 0)
        get_out1 = relay.TupleGetItem(function_out, 1)
        out_2 = relay.tanh(get_out1)
        out_3 = relay.log(get_out1)
        out = relay.Tuple([get_out0, out_2, out_3])
        mod["main"] = relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], out)
        return mod

    mod = tvm.IRModule()
    mod["main"] = create_graph()

    seq = tvm.transform.Sequential([
        transform.AnnotateTarget(target),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    partitioned = seq(mod)
    assert tvm.ir.structural_equal(partitioned, expected(), map_free_vars=True)
def partition_for_dnnl(mod, params=None, alter_layout=True):
    """Partition the graph greedily offloading supported operators to DNNL.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.

    Returns
    -------
    mod : Module
        Annotated and partitioned module.
    """
    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    simplify_passes = tvm.transform.Sequential([
        transform.CanonicalizeOps(),
        transform.InferType(),
        transform.SimplifyInference(),
        transform.FoldConstant(),
        transform.FoldScaleAxis(),
        # fold consecutive add ops to simplify pattern `conv2d-bias_add-bn-relu`
        transform.SimplifyExpr(),
        transform.FoldConstant(),
        # alter group conv /conv_transpose layout to `GOIHW` / `GIOHW`
        transform.Legalize(),
        transform.FoldConstant(),
    ])
    # The DNNL legalize hooks must be installed while the passes above run.
    with TempOpAttr("nn.conv2d", "FTVMLegalize", dnnl.legalize_group_conv):
        with TempOpAttr("nn.conv2d_transpose", "FTVMLegalize", dnnl.legalize_group_conv):
            with tvm.transform.PassContext(opt_level=3):
                mod = simplify_passes(mod)

    if alter_layout:
        alter_passes = tvm.transform.Sequential([
            transform.AlterOpLayout(),
            transform.FoldConstant(),
        ])
        with TempOpAttr("nn.conv1d", "FTVMAlterOpLayout", dnnl.alter_conv):
            with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", dnnl.alter_conv):
                with TempOpAttr("nn.conv3d", "FTVMAlterOpLayout", dnnl.alter_conv):
                    with TempOpAttr(
                        "nn.conv2d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose
                    ):
                        with TempOpAttr(
                            "nn.conv3d_transpose", "FTVMAlterOpLayout", dnnl.alter_conv_transpose
                        ):
                            with tvm.transform.PassContext(opt_level=3):
                                mod = alter_passes(mod)

    byoc_passes = tvm.transform.Sequential([
        transform.MergeComposite(dnnl.pattern_table()),
        transform.AnnotateTarget("dnnl"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
    ])
    with tvm.transform.PassContext(opt_level=3):
        mod = byoc_passes(mod)
    return mod
def partition_for_tensorrt(
    mod,
    params=None,
    version=None,
    use_implicit_batch=True,
    remove_no_mac_subgraphs=False,
    max_workspace_size=1 << 30,
):
    """Partition the graph greedily offloading supported operators to TensorRT.

    Parameters
    ----------
    mod : Module
        The module to run passes on.
    params : Optional[Dict[str, NDArray]]
        Constant input parameters.
    version : Optional[Tuple[int, int, int]]
        TensorRT version to target as tuple of (major, minor, patch). If TVM is
        compiled with USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will
        be used instead.
    use_implicit_batch : Optional[bool]
        Use TensorRT implicit batch mode (default true). Setting to false will
        enable explicit batch mode which will widen supported operators to
        include those which modify the batch dimension, but may reduce
        performance for some models.
    remove_no_mac_subgraphs : Optional[bool]
        Removes subgraphs which have been partitioned for TensorRT if they do
        not have any multiply-accumulate operations. The removed subgraphs will
        go through TVM's standard compilation instead. Can improve performance.
    max_workspace_size : Optional[int]
        How many bytes of workspace size to allow each subgraph to use for
        TensorRT engine creation. See TensorRT documentation for more info.

    Returns
    -------
    mod_and_config : Tuple[Module, Dict[str, Any]]
        A tuple of 1) annotated and partitioned module and 2)
        "relay.ext.tensorrt.options" configuration which should be given to
        PassContext when building.
    """
    config = {
        "use_implicit_batch": use_implicit_batch,
        "max_workspace_size": max_workspace_size,
        "remove_no_mac_subgraphs": remove_no_mac_subgraphs,
    }
    if version:
        assert isinstance(version, tuple) and len(version) == 3
        config["tensorrt_version"] = version
    else:
        # No explicit version requested: fall back to the TensorRT version TVM
        # was linked against, or a default when TVM was built without TensorRT.
        linked_version = tuple(tvm.get_global_func("relay.op.get_tensorrt_version")())
        if not linked_version:
            logger.warning(
                "TVM was not built against TensorRT and no version was provided to "
                "partition_for_tensorrt. Defaulting to 6.0.1"
            )
            linked_version = (6, 0, 1)
        config["tensorrt_version"] = linked_version

    if params:
        mod["main"] = bind_params_by_name(mod["main"], params)

    partition_passes = [
        transform.InferType(),
        RemoveDropoutPass(),
        transform.RemoveUnusedFunctions(),
        # Normalize layouts to what the TensorRT codegen supports.
        transform.ConvertLayout(
            {
                "nn.conv2d": ["NCHW", "default"],
                "nn.conv3d": ["NCDHW", "default"],
                "nn.conv2d_transpose": ["NCHW", "default"],
            }
        ),
        transform.FoldConstant(),
        transform.AnnotateTarget("tensorrt"),
        transform.MergeCompilerRegions(),
        transform.PartitionGraph(),
        transform.InferType(),
    ]
    # The TensorRT options must be visible while partitioning and pruning so
    # the annotator / pruner can consult them.
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.ext.tensorrt.options": config}
    ):
        mod = tvm.transform.Sequential(partition_passes)(mod)
        mod = prune_tensorrt_subgraphs(mod)
    return mod, config
def test_byoc_microtvm_multiple_subgraphs(merge_compiler_regions):
    """This is a test case to check BYOC capabilities of AOT with multiple sub graphs.

    Two add/subtract/multiply chains are annotated for the external C compiler
    while a third add/subtract chain stays on TVM; the partitioned module must
    still reproduce the reference outputs through the AOT executor.
    """
    use_unpacked_api = False
    interface_api = "packed"
    test_runner = AOT_DEFAULT_RUNNER

    input_x = relay.var("x", shape=(10, 10))
    input_w0 = relay.var("w0", shape=(10, 10))
    input_w1 = relay.var("w1", shape=(10, 10))
    input_w2 = relay.var("w2", shape=(10, 10))
    input_w3 = relay.var("w3", shape=(10, 10))
    input_w4 = relay.var("w4", shape=(10, 10))
    input_w5 = relay.var("w5", shape=(10, 10))
    input_w6 = relay.var("w6", shape=(10, 10))
    input_w7 = relay.var("w7", shape=(10, 10))

    # Two subgraphs destined for the external C compiler.
    ccompiler_add_1 = relay.add(input_x, input_w0)
    ccompiler_sub_1 = relay.subtract(ccompiler_add_1, input_w1)
    ccompiler_mul_1 = relay.multiply(ccompiler_sub_1, input_w2)

    ccompiler_add_2 = relay.add(input_x, input_w3)
    ccompiler_sub_2 = relay.subtract(ccompiler_add_2, input_w4)
    ccompiler_mul_2 = relay.multiply(ccompiler_sub_2, input_w5)

    # Other parts on TVM.
    tvm_add = relay.add(input_x, input_w6)
    tvm_sub = relay.subtract(tvm_add, input_w7)

    concat_outputs = relay.concatenate((ccompiler_mul_1, ccompiler_mul_2, tvm_sub), axis=0)
    relay_func = relay.Function(
        [
            input_x,
            input_w0,
            input_w1,
            input_w2,
            input_w3,
            input_w4,
            input_w5,
            input_w6,
            input_w7,
        ],
        concat_outputs,
    )

    mod = tvm.IRModule()
    ann = byoc.CcompilerAnnotator()
    mod["main"] = ann.visit(relay_func)

    if merge_compiler_regions:
        mod = transform.MergeCompilerRegions()(mod)

    mod = tvm.relay.transform.PartitionGraph("mod_name")(mod)
    mod = tvm.relay.transform.InferType()(mod)

    x_data = np.random.rand(10, 10).astype("float32")
    w_data = [np.random.rand(10, 10).astype("float32") for _ in range(8)]

    map_inputs = OrderedDict(
        [("x", x_data)] + [("w{}".format(i), w) for i, w in enumerate(w_data)]
    )
    output_list = generate_ref_data(mod, map_inputs)
    # NOTE: the original built an `input_list` from map_inputs that was never
    # used anywhere; it has been removed as dead code.
    compile_and_run(
        AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list),
        test_runner,
        interface_api,
        use_unpacked_api,
    )