def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]),
                    name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
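# A plausible driver for the __dp4a check above (hypothetical call site,
# not shown in the source): __dp4a consumes int8x4 lanes, so lanes must be 4.
check_cuda("int8", 64, 4)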
def check_device(target):
    num_step = n_num_step
    flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
    ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    scan_h_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    scan_c_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    Xi2h_np = np.random.normal(
        size=(num_step, batch_size, 4, num_hidden)).astype("float32")
    Wh2h_np = np.random.normal(
        size=(4, num_hidden, num_hidden)).astype("float32")
    scan_h_a = tvm.nd.array(scan_h_np, ctx)
    scan_c_a = tvm.nd.array(scan_c_np, ctx)
    Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
    Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    ctx.sync()
    # measure time cost of second step.
    tstart = time.time()
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    ctx.sync()
    tgap = time.time() - tstart
    print("Time cost=%g" % tgap)
def run(args):
    onnx_model = onnx.load_model(os.path.join(args.test_dir, 'model.onnx'))
    symbol, params = nnvm.frontend.from_onnx(onnx_model)
    input_names = symbol.list_input_names()
    output_names = symbol.list_output_names()

    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = load_test_data(test_data_dir, input_names, output_names)
    inputs = dict(inputs)
    # assert len(input_names) == len(inputs) + len(params)
    # assert len(output_names) == len(outputs)

    graph, lib, params = compile(
        symbol, args.target, input_names, inputs, params,
        args.opt_level, args.autotvm_log)

    if args.dump_nnvm:
        print(graph.ir())
        print(graph.json())

    ctx = tvm.gpu()

    # Prepare inputs.
    tvm_inputs = {}
    for name, value in inputs.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)
    for name, value in params.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)

    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)

    graph_module.set_input(**tvm_inputs)
    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        # Report the failing output by name via err_msg.
        np.testing.assert_allclose(expected, actual,
                                   rtol=1e-3, atol=1e-4, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    if args.iterations > 1:
        num_iterations = args.iterations - 1
        start = time.time()
        for t in range(num_iterations):
            graph_module.run()
        cupy.cuda.device.Device().synchronize()
        elapsed = time.time() - start
        print('Elapsed: %.3f msec' % (elapsed * 1000 / num_iterations))
def check_cuda(n, value):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    lanes = 4
    dtype = 'int8'
    ctx = tvm.gpu(0)
    A = tvm.compute((n, lanes), lambda i, j: tvm.const(value, dtype=dtype))
    s = tvm.create_schedule(A.op)
    y, x = s[A].op.axis
    s[A].vectorize(x)
    s[A].bind(y, tvm.thread_axis("blockIdx.x"))
    fun = tvm.build(s, [A], "cuda", name="make_int8x4")
    np_a = np.full((n, lanes), value, dtype=dtype)
    a = tvm.nd.empty(np_a.shape, dtype, ctx)
    fun(a)
    np.testing.assert_equal(a.asnumpy(), np_a)
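# Plausible invocations of the int8x4 constant check (hypothetical; the
# original call sites are not shown). Values must fit in int8.
check_cuda(64, 1)
check_cuda(64, -3)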
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    ctx = tvm.gpu(0)
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n,), lambda i: A[i], name='B')
    s = tvm.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B], "cuda", name="vector_load")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx)
    fun(a, b)
    tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy())
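# Plausible invocations of the vector-load check (hypothetical; lanes should
# be a width the CUDA backend can map to a native vector type):
check_cuda("int8", 64, 8)
check_cuda("int8", 64, 16)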
def ctx_list():
    """Get context list for testcases"""
    device_list = os.environ.get("NNVM_TEST_TARGETS", "")
    device_list = (device_list.split(",") if device_list
                   else ["llvm", "cuda"])
    device_list = set(device_list)
    res = [("llvm", tvm.cpu(0)), ("cuda", tvm.gpu(0))]
    return [x for x in res if x[1].exist and x[0] in device_list]
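# A minimal sketch of how ctx_list() is typically consumed in a test,
# assuming a hypothetical verify(target, ctx) helper (not part of the source):
for target, ctx in ctx_list():
    verify(target, ctx)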
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = ("bcast_binary_" + typ + "_lhs" +
            "_".join([str(ele) for ele in lhs_shape]) +
            "rhs" + "_".join([str(ele) for ele in rhs_shape]))
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda", name="broadcast_binary" + "_" + typ)
    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)
    lhs_nd = tvm.nd.array(lhs_npy, tvm.gpu())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
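# Plausible invocations exercising broadcasting on both operands
# (hypothetical shapes; the original call sites are not shown):
test_broadcast_binary_op((5, 2, 3), (2, 1), typ="add")
test_broadcast_binary_op((2, 3, 100), (3, 100), typ="div")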
def test_ctx():
    def test_ctx_func(ctx):
        assert tvm.gpu(7) == ctx
        return tvm.cpu(0)
    x = test_ctx_func(tvm.gpu(7))
    assert x == tvm.cpu(0)
    x = tvm.opencl(10)
    x = tvm._api_internal._context_test(x, x.device_type, x.device_id)
    assert x == tvm.opencl(10)
def test_broadcast_to(in_shape, out_shape):
    global TASK
    TASK = ("bcast_to_i" + "_".join([str(ele) for ele in in_shape]) +
            "o" + "_".join([str(ele) for ele in out_shape]))
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)
    s = topi.cuda.schedule_broadcast(B)
    fcuda = tvm.build(s, [A, B], "cuda", name="broadcast_to")
    data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
    out_npy = np.broadcast_to(data_npy, out_shape)
    data_nd = tvm.nd.array(data_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(data_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
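# Plausible invocations (hypothetical shapes; out_shape must be
# broadcast-compatible with in_shape):
test_broadcast_to((1,), (10,))
test_broadcast_to((1, 128), (31, 128))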
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
    f = tvm.build(s, [A, B], device, name="clip")
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify():
    ctx = tvm.gpu(0)
    f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
    x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx)
    w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx)
    y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
    f(x, w, y)
def enabled_ctx_list():
    ctx_list = [('cpu', tvm.cpu(0)),
                ('gpu', tvm.gpu(0)),
                ('cl', tvm.opencl(0)),
                ('metal', tvm.metal(0)),
                ('rocm', tvm.rocm(0)),
                ('vulkan', tvm.vulkan(0)),
                ('vpi', tvm.vpi(0))]
    for k, v in ctx_list:
        assert tvm.context(k, 0) == v
    ctx_list = [x[1] for x in ctx_list if x[1].exist]
    return ctx_list
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    target = topi.cpp.TEST_create_target(device)
    s = topi.cpp.generic.default_schedule(target, [B], False)
    ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
    f = tvm.build(s, [A, B], device, name="clip")
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("skip because gpu does not support fp16")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B], "cuda")
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), B.dtype, ctx)
    fun(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
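# Plausible invocations of the vectorized-add check (hypothetical; the
# original call sites are not shown):
check_cuda("float32", 64, 2)
check_cuda("float16", 64, 2)
check_cuda("int8", 64, 4)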
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    global TASK
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        TASK = "sum_map_id%d" % test_id
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        TASK = "max_map_id%d" % test_id
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        TASK = "min_map_id%d" % test_id
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)
    with tvm.build_config(auto_unroll_max_step=16,
                          auto_unroll_min_depth=0):
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")
    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    if type == "sum":
        out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy.min(axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())
    for _ in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy,
                                rtol=4e-4, atol=4e-4)
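# Plausible invocations covering each reduction kind (hypothetical shapes
# and axes; the original call sites are not shown):
test_reduce_map((128, 24, 128), axis=(0, 2), keepdims=True, type="sum", test_id=0)
test_reduce_map((128, 24, 128), axis=(1,), keepdims=False, type="max", test_id=1)
test_reduce_map((32, 128), axis=(1,), keepdims=True, type="min", test_id=2)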
def verify(target="cuda"):
    if not tvm.module.enabled(target):
        print("skip because %s is not enabled..." % target)
        return
    if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
        print("skip because extern function is not available")
        return
    ctx = tvm.gpu(0)
    f = tvm.build(s, [A, B, C], target)
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
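# The free variables n, l, m, A, B, C and s above come from the enclosing
# test. A minimal sketch of that setup, assuming tvm.contrib.cublas
# (hypothetical sizes, not shown in the source):
from tvm.contrib import cublas
n, l, m = 1024, 128, 256
A = tvm.placeholder((n, l), name='A')
B = tvm.placeholder((l, m), name='B')
C = cublas.matmul(A, B)
s = tvm.create_schedule(C.op)
verify()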
def test_allocate():
    @tvm.hybrid.script
    def blur2d(a):
        b = output_tensor((30, 30), 'float32')
        for i in range(30):
            ha = allocate((3, 30), 'float32')
            for j in range(3):
                for k in range(30):
                    ha[j, k] = a[i+j, k] + a[i+j, k+1] + a[i+j, k+2]
            for j in range(30):
                b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
        return b

    a = tvm.placeholder((32, 32), 'float32', 'a')
    b = blur2d(a)
    sch = tvm.create_schedule(b.op)
    func, ins, outs = run_and_check(blur2d, [a])
    run_and_check(func, ins, outs=outs)

    if tvm.gpu().exist:
        @tvm.hybrid.script
        def share_vec_add(a, b):
            c = output_tensor((256, ), 'float32')
            shared = allocate((256, ), 'float32', 'shared')
            for i in bind("threadIdx.x", 256):
                shared[i] = a[i]
            local = allocate((256, ), 'float32', 'local')
            for i in bind("threadIdx.x", 256):
                local[i] = b[i]
            for i in bind("threadIdx.x", 256):
                c[i] = shared[i] + local[i]
            return c

        a = tvm.placeholder((256, ), dtype='float32', name='a')
        b = tvm.placeholder((256, ), dtype='float32', name='b')
        c = share_vec_add(a, b)
        func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda')
        run_and_check(func, ins, outs=outs, target='cuda')
    else:
        print('[Warning] No GPU found! Skip shared mem test!')
def check_device(target):
    with tvm.build_config(
            detect_global_barrier=detect_global_barrier,
            auto_unroll_max_step=128,
            unroll_explicit=False):
        f = tvm.build(s, [s_scan, Whh], target)
    ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    res_np = np.zeros(
        (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
    Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
    Whh_np[:] = 2.0 / n_num_hidden
    Whh_np[:, n_num_hidden//2:] = 0
    res_a = tvm.nd.array(res_np, ctx)
    Whh_a = tvm.nd.array(Whh_np, ctx)
    # Skip first pass as it is compilation
    f(res_a, Whh_a)
    ctx.sync()
    # measure time cost of second step.
    tstart = time.time()
    f(res_a, Whh_a)
    ctx.sync()
    tgap = time.time() - tstart
    print("Time cost=%g" % tgap)
    # correctness
    if not SKIP_CHECK:
        res_gpu = res_a.asnumpy()
        res_cmp = np.ones_like(res_np).astype("float64")
        Whh_np = Whh_np.astype("float64")
        for t in range(1, n_num_step):
            res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
        for i in range(n_num_step):
            for j in range(n_num_hidden):
                if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5:
                    print("%d, %d: %g vs %g" %
                          (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j]))
        tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
def check_device(target):
    num_step = n_num_step
    flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
    ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
    # launch the kernel.
    scan_h_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    scan_c_np = np.zeros(
        (num_step, batch_size, num_hidden)).astype("float32")
    Xi2h_np = np.random.normal(
        size=(num_step, batch_size, 4, num_hidden)).astype("float32")
    Wh2h_np = np.random.normal(
        size=(4, num_hidden, num_hidden)).astype("float32")
    scan_h_a = tvm.nd.array(scan_h_np, ctx)
    scan_c_a = tvm.nd.array(scan_c_np, ctx)
    Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
    Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
    flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    ctx.sync()
    # measure time cost of second step.
    evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
    eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
    print("Time cost=%g" % eval_result.mean)
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
    # Build the kernel
    f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
    f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
    f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
    # Prepare data
    input_tvm = tvm.nd.array(input_np, ctx)
    filter_tvm = tvm.nd.array(filter_np, ctx)
    scale_tvm = tvm.nd.array(scale_np, ctx)
    shift_tvm = tvm.nd.array(shift_np, ctx)
    depthwise_conv2d_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                 dtype=DepthwiseConv2d.dtype), ctx)
    scale_shift_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(ScaleShift.shape),
                 dtype=ScaleShift.dtype), ctx)
    relu_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
    # Measure time cost of kernel 1 (depthwise_conv2d)
    timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
    tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
    # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift)
    timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000)
    tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                      scale_shift_tvm).mean
    # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu)
    timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000)
    tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                      relu_tvm).mean
    print("Input shape = " + str(get_const_tuple(Input.shape)))
    print("Filter shape = " + str(get_const_tuple(Filter.shape)))
    print("Stride = (%d, %d)" % (stride_h, stride_w))
    print("padding = %s\n" % padding)
    print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
    print("average time cost of 1000 runs (depthwise_conv2d) = %g us" %
          (tcost_1 * 1e6))
    print("average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" %
          (tcost_2 * 1e6))
    print("average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" %
          (tcost_3 * 1e6))
    # correctness
    depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
        input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
    scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape))
    for c in range(in_channel * channel_multiplier):
        scale_shift_scipy[:, c, :, :] = (
            depthwise_conv2d_scipy[:, c, :, :] * scale_np[c] + shift_np[c])
    relu_scipy = np.maximum(scale_shift_scipy, 0)
    np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                               depthwise_conv2d_scipy, rtol=1e-5)
    np.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                               scale_shift_scipy, rtol=1e-5)
    np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
    print("success")
def test_ctx_func(ctx):
    assert tvm.gpu(7) == ctx
    return tvm.cpu(0)
def convert(self, lst, *, target='cpu', dev_id=0):
    """Converts the list of nodes to a runnable form.

    All the nodes in the list must represent linear flow (no calls,
    branches, ...)

    Returns:
        (fn, inputs, outputs):

        - fn: A callable function
        - inputs: the list of input nodes whose values should be
          provided to the function
        - outputs: the list of output nodes corresponding to the
          outputs of the function

    Notes:
        This implementation converts the nodes to NNVM and compiles it.
    """
    self.c = count()
    self.eqv = {}
    self.inputs = []
    self.input_names = []
    self.constants = {}
    self.constant_vars = {}
    self.shapes = {}
    self.types = {}

    for n in lst:
        assert n.is_apply()
        assert n.inputs[0].is_constant(Primitive)
        fn = n.inputs[0].value
        conv = self.mapping.get(fn, None)
        if conv is not None:
            self.eqv[n] = conv(self, *n.inputs[1:])
        else:
            raise NotImplementedError(fn)

    outputs = get_outputs(lst, lst[0].graph.manager.uses,
                          set(self.eqv.keys()))

    inmap = dict((self.eqv[i], i) for i in self.inputs)

    # Check for empty functions
    if all(self.eqv[o] in inmap for o in outputs):
        return None, [inmap[self.eqv[o]] for o in outputs], outputs

    if target == 'cpu':
        target = 'llvm'

    g = nnvm.graph.create(sym.Group(list(self.eqv[o] for o in outputs)))
    dg, lib, params = nnvm.compiler.build(
        g, target=target, shape=self.shapes, dtype=self.types,
        params=self.constants)

    shape = dg.json_attr('shape')
    types = dg.json_attr('dtype')
    index = dg.index

    def spec(entry_id):
        return (shape[entry_id],
                graph_attr.TCODE_TO_DTYPE[types[entry_id]])

    output_specs = [spec(index.entry_id(x)) for x in index.output_entries]
    assert len(output_specs) == len(outputs)

    if target == 'llvm':
        context = tvm.cpu(dev_id)
    elif target == 'cuda':  # pragma: no cover
        context = tvm.gpu(dev_id)
    else:  # pragma: no cover
        raise Exception(f"Unsupported target: {target}")

    module = graph_runtime.create(dg, lib, context)

    for n, p in params.items():
        module.set_input(n, p)

    input_types = [self.types[i] for i in self.input_names]
    return (NNVMRunner(module, self.input_names, input_types,
                       output_specs, context),
            self.inputs, outputs)
# into the lower intrinsic IR of the specified target backend, which is CUDA
# in this example. Then the machine code will be generated as the module library.

opt_level = 3
target = tvm.target.cuda()
with nnvm.compiler.build_config(opt_level=opt_level):
    graph, lib, params = nnvm.compiler.build(
        net, target, shape={"data": data_shape}, params=params)

#####################################################################
# Run the generated library
# -------------------------
# Now we can create a graph runtime and run the module on the NVIDIA GPU.

# create random input
ctx = tvm.gpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# create module
module = graph_runtime.create(graph, lib, ctx)
# set input and parameters
module.set_input("data", data)
module.set_input(**params)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape))

# Print first 10 elements of output
print(out.asnumpy().flatten()[0:10])
######################################################################
# Compile the Graph
# -----------------
# Now we would like to port the Gluon model to a portable computational graph.
# It's as easy as several lines.
# We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon.
input_shape = (1, 3, 224, 224)
dtype = 'float32'
net, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
# we want a probability so add a softmax operator
net = relay.Function(net.params, relay.nn.softmax(net.body), None,
                     net.type_params, net.attrs)

######################################################################
# now compile the graph
target = 'cuda'
shape_dict = {'data': x.shape}
with relay.build_config(opt_level=3):
    intrp = relay.build_module.create_executor('graph', net, tvm.gpu(0), target)

######################################################################
# Execute the portable graph on TVM
# ---------------------------------
# Now we would like to reproduce the same forward computation using TVM.
tvm_output = intrp.evaluate(net)(tvm.nd.array(x.astype(dtype)), **params)
top1 = np.argmax(tvm_output.asnumpy()[0])
print('TVM prediction top-1:', top1, synset[top1])
def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
    target = tvm.target.Target.current(allow_none=False)
    dispatch_ctx = autotvm.task.DispatchContext.current

    _, outs = relay.backend.compile_engine.select_implementation(
        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target)
    workload = autotvm.task.get_workload(outs)
    if workload is None:
        # The best implementation is not an AutoTVM template,
        # we then assume it's not necessary to alter this op.
        return None

    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:
        # if is fallback, clear query cache and return None
        autotvm.task.clear_fallback_cache(target, workload)
        return None

    topi_tmpl = workload[0]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    data_layout = attrs["data_layout"]
    kernel_layout = attrs["kernel_layout"]
    data, kernel = tinfos
    out_dtype = out_type.dtype

    if topi_tmpl == "conv2d_NCHWc_int8.cuda":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        new_layout = "NCHW4c"
        new_attrs["channels"] = CO
        new_attrs["data_layout"] = new_layout
        new_attrs["out_layout"] = new_layout
        new_attrs["kernel_layout"] = "OIHW4o4i"
        ic_block_factor = oc_block_factor = 4

        # Store the same config for the altered operator (workload)
        new_data = te.placeholder(
            (N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype)
        new_kernel = te.placeholder(
            (
                CO // oc_block_factor,
                CI // ic_block_factor,
                KH,
                KW,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
            "conv2d_NCHWc_int8.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_nchw_winograd.cuda":
        if dilation != (1, 1):
            logger.warning("Does not support weight pre-transform for dilated convolution.")
            return None

        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
            inputs[1], tile_size=tile_size)
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, out_dtype],
            "conv2d_nchw_winograd_without_weight_transform.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs)

    if topi_tmpl in ("conv2d_nhwc_winograd_direct.cuda",
                     "conv2d_nhwc_winograd_tensorcore.cuda"):
        if dilation != (1, 1):
            logger.warning("Does not support weight pre-transform for dilated convolution.")
            return None

        assert data_layout == "NHWC" and kernel_layout == "HWIO"
        N, H, W, CI = get_const_tuple(data.shape)
        KH, KW, _, CO = get_const_tuple(kernel.shape)

        # Pre-compute weight transformation in winograd
        if H % 8 == 0:
            tile_size = 4
        else:
            tile_size = 2
        kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
            kernel_transform, tile_size=tile_size)
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype)
        if topi_tmpl == "conv2d_nhwc_winograd_direct.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        elif topi_tmpl == "conv2d_nhwc_winograd_tensorcore.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs)

    if topi_tmpl == "group_conv2d_NCHWc_int8.cuda":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        new_layout = "NCHW4c"
        new_attrs["channels"] = CO
        new_attrs["data_layout"] = new_layout
        new_attrs["out_layout"] = new_layout
        new_attrs["kernel_layout"] = "OIHW4o4i"
        ic_block_factor = oc_block_factor = 4

        # Store the same config for the altered operator (workload)
        new_data = te.placeholder(
            (N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype)
        new_kernel = te.placeholder(
            (
                CO // oc_block_factor,
                CI // ic_block_factor // groups,
                KH,
                KW,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
            "group_conv2d_NCHWc_int8.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_HWNCnc_tensorcore.cuda":
        assert data_layout == "HWNC" and kernel_layout == "HWOI"
        assert float(tvm.gpu(0).compute_version) >= 7.5
        H, W, N, CI = get_const_tuple(data.shape)
        KH, KW, CO, _ = get_const_tuple(kernel.shape)

        if (kernel.dtype in ["int4", "uint4"] and (CI % 32 != 0 or CO % 8 != 0)
                or kernel.dtype in ["int8", "uint8"] and (CI % 16 != 0 or CO % 32 != 0)):
            return relay.nn.conv2d(*inputs, **new_attrs)

        new_attrs["channels"] = CO
        if kernel.dtype in ["int4", "uint4"]:
            new_attrs["kernel_layout"] = "HWOI8o32i"
            ic_block_factor = 32
            oc_block_factor = 8
        else:
            new_attrs["kernel_layout"] = "HWOI32o16i"
            ic_block_factor = 16
            oc_block_factor = 32

        new_kernel = te.placeholder(
            (
                KH,
                KW,
                CO // oc_block_factor,
                CI // ic_block_factor,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [data, new_kernel, strides, padding, dilation, out_dtype],
            "conv2d_HWNCnc_tensorcore.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    return None
def tracer(module, info, is_before):
    pass
    # global timing
    # if bool(is_before):
    #     timing = time.time()
    # else:
    #     print('Executes: ', info.name, (time.time() - timing) * 1000)


passes = [(1, tensorizer.rewrite)]
with tvm.transform.PassContext(opt_level=4, trace=tracer,
                               config={'tir.add_lower_pass': passes}):
    # with tvm.transform.PassContext(opt_level=4, trace=tracer):
    # graph, lib, params = tvm.relay.build(module, target='cuda -libs=cublas,cudnn')
    graph, lib, params = tvm.relay.build(module, target='nvptx')
    module = runtime.create(graph, lib, tvm.gpu())

x_ = (np.random.randn(n, c, h, w) * 128).astype('float32')
module.set_input('x', x_)
timer = module.module.time_evaluator('run', ctx=tvm.gpu(), number=1, repeat=1)
timed = timer()

print((n * oc * (h - kh + 1) * (w - kw + 1)) * (kh * kw * ic) / timed.mean / 1e9)
print('%d us' % int(timed.mean * 1e6))
def _register(self):
    self._target = 'cuda'
    self._ctx = tvm.gpu(0)
    self._compute_func = vanilla_sddmm
    self._schedule_func = schedule_vanilla_sddmm_cuda_tree_reduce
import tvm.contrib.graph_runtime as graph_runtime

data_shape = (1, 3, 224, 224)

# load the module back.
loaded_lib = tvm.module.load('deploy_lib.tar')
# dev_lib = tvm.module.load("deploy_cuda.ptx")
# loaded_lib.import_module(dev_lib)
loaded_graph = open("deploy_graph.json").read()
loaded_params = bytearray(open("deploy_param.params", "rb").read())

cuda = True
ctx = tvm.gpu(0) if cuda else tvm.cpu(0)

print("=> [TVM on tune_run.py] creating TVM runtime module")
fcreate = tvm.get_global_func("tvm.graph_runtime.create")
gmodule = fcreate(loaded_graph, loaded_lib, ctx.device_type, ctx.device_id)
set_input, get_output, run = (gmodule["set_input"], gmodule["get_output"],
                              gmodule["run"])

print("=> [TVM] feeding inputs and params into TVM module")
x = np.ones([1, 3, 224, 224])
set_input('0', tvm.nd.array(x.astype('float32')))
gmodule["load_params"](loaded_params)

print("=> [TVM] running TVM module, saving output")
print("Tuning...")
mod.tune_tvm(log_file=log_file, n_trial=20)

print("Building...")
tvm_mod = mod.build_tvm(export_dir)
pytorch_mod = mod.build_pytorch_module(num_inputs=2, num_outputs=1)

## Or you can load from a prebuilt tvm module
# mod = PyTorchTVMModule()
# tvm_mod = mod.load_tvm(export_dir)
# pytorch_mod = mod.build_pytorch_module(num_inputs=2, num_outputs=1, input_infos=input_shapes)

print("Run TVM...")
tvm_x = tvm.nd.array(x.cpu().numpy().astype(dtype), device=tvm.gpu(0))
tvm_y = tvm.nd.array(y.cpu().numpy().astype(dtype), device=tvm.gpu(0))
for i in range(20):
    t = time.time()
    tvm_mod.run(x=tvm_x, y=tvm_y)
    print(1000 * (time.time() - t))
tvm_output = tvm_mod.get_output(0)
print(tvm_output.shape)

print("Run PyTorch...")
for i in range(20):
    t = time.time()
    outputs = pytorch_mod.forward([x, y])
    torch.cuda.synchronize()
    print(1000 * (time.time() - t))
def run(name, N, H, W, CO, CI, KH, KW, stride, pad):
    strides, padding = (stride, stride), (pad, pad)
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides, padding),
                               target='cuda')
    print(task.config_space)
    logfile = "conv2d_" + name + ".log"

    # Use the local GPU; measure each config 3 times with at least 100 ms of
    # runs to reduce variance. Running a config times out after 4 seconds.
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))

    # Begin tuning, log records to the logfile.
    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    # tuner.tune(n_trial=1000,
    #            measure_option=measure_option,
    #            callbacks=[autotvm.callback.log_to_file(logfile)])

    #########################################################################
    # Finally we can inspect the best config from the log file, check
    # correctness, and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW,
                                             strides, padding)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty((N, CO,
                          (H + 2 * pad - KH) // stride + 1,
                          (W + 2 * pad - KW) // stride + 1), ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)
    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. A larger `number` reduces the noise and the
    # overhead of kernel launch. You can also use nvprof to validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    print('Time cost of this operator: %f' % cost)
    with open("autotvm_conv_nchw.txt", "a") as f:
        # Record the workload name alongside its cost.
        f.write("{}, {}\n".format(name, cost))
def test_tensor_core_batch_conv():
    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (
        batch_size // block_size,
        height,
        width,
        in_channels // block_size,
        block_size,
        block_size,
    )
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (
        kernel_h,
        kernel_w,
        in_channels // block_size,
        out_channels // block_size,
        block_size,
        block_size,
    )
    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (
        batch_size // block_size,
        height,
        width,
        out_channels // block_size,
        block_size,
        block_size,
    )

    assert batch_size % block_size == 0
    assert in_channels % block_size == 0
    assert out_channels % block_size == 0

    kh = te.reduce_axis((0, kernel_h), name="kh")
    kw = te.reduce_axis((0, kernel_w), name="kw")
    ic = te.reduce_axis((0, in_channels // block_size), name="ic")
    ii = te.reduce_axis((0, block_size), name="ii")

    # Algorithm
    A = te.placeholder(data_shape, name="A", dtype="float16")
    W = te.placeholder(kernel_shape, name="W", dtype="float16")
    Apad = te.compute(
        (
            batch_size // block_size,
            height + 2 * pad_h,
            width + 2 * pad_w,
            in_channels // block_size,
            block_size,
            block_size,
        ),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height,
                        w >= pad_w, w - pad_w < width),
            A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0.0, "float16"),
        ),
        name="Apad",
    )
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(
            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
            * W[kh, kw, ic, o, ii, oo].astype("float32"),
            axis=[ic, kh, kw, ii],
        ),
        name="Conv",
    )

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, "shared", [Conv])
    WS = s.cache_read(W, "shared", [Conv])
    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
    ConvF = s.cache_write(Conv, "wmma.accumulator")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a"))
    s[WF].tensorize(WF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b"))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], "cuda")

    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(
            batch_size, height, width, in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(
            kernel_h, kernel_w, in_channels, out_channels)
        c_np = (
            c.asnumpy()
            .transpose((0, 4, 1, 2, 3, 5))
            .reshape(batch_size, height, width, out_channels)
        )
        c_std = conv2d_nhwc_python(
            a_np.astype(Conv.dtype), w_np.astype(Conv.dtype),
            (stride_h, stride_w), (pad_h, pad_w)
        ).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
def setUp(self):
    self.ctx = tvm.gpu()
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N, H, W, KH, KW, CI, CO,
                padding, stride_h, stride_w,
                dilation_h, dilation_w,
                data.dtype, kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
                         or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
                         or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            use_auto_scheduler = PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False)
            if use_auto_scheduler and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWNC":
            assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            # The tensor core path requires both a supported (N, CI, CO)
            # shape and a tensor-core-capable dtype, so the dtype check
            # must guard every shape alternative.
            if (((N % 16 == 0 and in_channels % 16 == 0 and out_channels % 16 == 0)
                 or (N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0)
                 or (N % 32 == 0 and in_channels % 16 == 0 and out_channels % 8 == 0))
                    and data.dtype in tensorcore_dtypes
                    and kernel.dtype in tensorcore_dtypes):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC. "
                                   "Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if (layout in ["NCHW", "NHWC"]
                    and padding[0] == padding[2] and padding[1] == padding[3]):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if (layout in ["NCHW", "NHWC"]
                    and padding[0] == padding[2] and padding[1] == padding[3]):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
def convert(topology, backend, device, extra_config={}):
    """
    This function is used to convert a `onnxconverter_common.topology.Topology`
    object into a *backend* model.

    Args:
        topology: The `onnxconverter_common.topology.Topology` object that will
            be converted into a backend model
        backend: Which backend the model should be run on
        device: Which device the translated model will be run on
        extra_config: Extra configurations to be used by individual operator converters

    Returns:
        A model implemented in the selected backend
    """
    assert topology is not None, "Cannot convert a Topology object of type None."
    assert backend is not None, "Cannot convert a Topology object into backend None."
    assert device is not None, "Cannot convert a Topology object into device None."

    tvm_backend = None
    operator_map = {}

    if tvm_installed():
        import tvm
        from tvm import relay
        from tvm.contrib import graph_runtime
        tvm_backend = tvm.__name__

    for operator in topology.topological_operator_iterator():
        try:
            converter = get_converter(operator.type)

            if backend == onnx.__name__:
                # vers = LooseVersion(torch.__version__)
                # allowed_min = LooseVersion("1.6.0")
                # Pytorch <= 1.6.0 has a bug with exporting GEMM into ONNX.
                # For the moment only tree_trav is enabled for pytorch <= 1.6.0
                # if vers < allowed_min:
                extra_config[constants.TREE_IMPLEMENTATION] = "tree_trav"

            operator_map[operator.full_name] = converter(operator, device, extra_config)
        except ValueError:
            raise MissingConverter(
                "Unable to find converter for {} type {} with extra config: {}.".format(
                    operator.type, type(getattr(operator, "raw_model", None)), extra_config
                )
            )
        except Exception as e:
            raise e

    # Set the parameters for the model / container
    n_threads = None if constants.N_THREADS not in extra_config else extra_config[
        constants.N_THREADS]
    batch_size = None if constants.BATCH_SIZE not in extra_config else extra_config[
        constants.BATCH_SIZE]

    # We set the number of threads for torch here to avoid errors in case we JIT.
    # We set intra op concurrency while we force operators to run sequentially.
    # We can revise this later, but in general we don't have graphs requiring
    # inter-op parallelism.
    if n_threads is not None:
        if torch.get_num_interop_threads() != 1:
            torch.set_num_interop_threads(1)
        torch.set_num_threads(n_threads)

    operators = list(topology.topological_operator_iterator())
    torch_model = _PyTorchBackendModel(
        topology.raw_model.input_names, topology.raw_model.output_names,
        operator_map, operators, extra_config
    ).eval()

    if backend == onnx.__name__:
        onnx_model_name = output_model_name = None
        target_opset = 11

        # Set optional configuration options for ONNX if any.
        if constants.ONNX_OUTPUT_MODEL_NAME in extra_config:
            onnx_model_name = extra_config[constants.ONNX_OUTPUT_MODEL_NAME]
            output_model_name = onnx_model_name + ".onnx"
        if constants.ONNX_TARGET_OPSET in extra_config:
            target_opset = extra_config[constants.ONNX_TARGET_OPSET]
        if output_model_name is None:
            output_model_name = str(uuid4().hex) + ".onnx"

        # Put the tracing test input into the right format.
        batch_trace_input, _ = _get_trace_input_from_test_input(
            extra_config[constants.TEST_INPUT], batch_size)

        # Generate the ONNX models
        torch.onnx.export(
            torch_model,
            batch_trace_input,
            output_model_name,
            input_names=topology.raw_model.input_names,
            output_names=topology.raw_model.output_names,
            keep_initializers_as_inputs=False,
            opset_version=target_opset,
            do_constant_folding=True,
        )
        hb_model = onnx.load(output_model_name)
        os.remove(output_model_name)

        # Set the ONNX model name if any.
        if onnx_model_name is not None:
            hb_model.graph.name = onnx_model_name

        # Fix the model to use arbitrary batch dimensions
        def fix_dim(dim):
            updated = False
            if dim.HasField("dim_value"):
                dim.Clear()
                updated = True
                dim.dim_param = "sym"
            return updated

        def fix_value_info(value):
            num_fixed = 0
            if value.type.HasField("tensor_type"):
                shape = value.type.tensor_type.shape
                if shape:
                    dim = shape.dim[0]
                    if fix_dim(dim):
                        num_fixed += 1
            return num_fixed

        def fix_graph(graph):
            num_fixed = 0
            for input in graph.input:
                num_fixed += fix_value_info(input)
            for output in graph.output:
                num_fixed += fix_value_info(output)
            for node in graph.node:
                for attr in node.attribute:
                    if attr.HasField("g"):
                        num_fixed += fix_graph(attr.g)
            return num_fixed

        fix_graph(hb_model.graph)
    elif backend == tvm_backend:
        # First we need to generate the torchscript model.
        batch_trace_input, remainder_trace_input = _get_trace_input_from_test_input(
            extra_config[constants.TEST_INPUT], batch_size)
        ts_model = _jit_model(torch_model, batch_trace_input, "cpu", extra_config)
        if remainder_trace_input is not None:
            remainder_ts_model = _jit_model(
                torch_model, remainder_trace_input, "cpu", extra_config)

        # Generate the test input in the TVM format. In case we have a remainder
        # beyond the batch, generate a remainder test input as well.
        test_input = [
            (
                topology.raw_model.input_names[i],
                batch_trace_input[i].shape
                if type(batch_trace_input) is tuple
                else batch_trace_input.shape,
            )
            for i in range(len(topology.raw_model.input_names))
        ]
        if remainder_trace_input is not None:
            remainder_test_input = [
                (
                    topology.raw_model.input_names[i],
                    remainder_trace_input[i].shape
                    if type(remainder_trace_input) is tuple
                    else remainder_trace_input.shape,
                )
                for i in range(len(topology.raw_model.input_names))
            ]

        # Pick the proper target.
        if device == "cuda":
            target = tvm.target.cuda()
            ctx = tvm.gpu()
        elif device == "cpu":
            target = "llvm"
            ctx = tvm.cpu()
        elif "llvm" in device:
            target = device
            ctx = tvm.cpu()
        else:
            raise RuntimeError("Device {} not recognized".format(device))

        # Get configuration parameters.
        config = {}
        if constants.TVM_MAX_FUSE_DEPTH in extra_config:
            config["relay.FuseOps.max_depth"] = extra_config[constants.TVM_MAX_FUSE_DEPTH]
        else:
            # 50 is a good depth for operator fusion. More than that will probably
            # hurt performance.
            # https://github.com/microsoft/hummingbird/issues/232#issuecomment-697979508
            config["relay.FuseOps.max_depth"] = 50

        # Create the relay version of the model.
        model, params = relay.frontend.from_pytorch(ts_model, test_input)
        if remainder_trace_input is not None:
            remainder_model, remainder_params = relay.frontend.from_pytorch(
                remainder_ts_model, remainder_test_input)

        # Generate the model. We set opt_level=3 to enable all optimizations.
        with tvm.transform.PassContext(opt_level=3, config=config):
            graph, lib, params = relay.build(model, target=target, params=params)
        tvm_model = graph_runtime.create(graph, lib, ctx)
        tvm_model.set_input(**params)
        if remainder_trace_input is not None:
            with tvm.transform.PassContext(opt_level=3, config=config):
                graph, lib, params = relay.build(
                    remainder_model, target=target, params=remainder_params)
            tvm_remainder_model = graph_runtime.create(graph, lib, ctx)
            tvm_remainder_model.set_input(**params)

        # In the container we will be using the context to properly configure
        # the input tensors.
        extra_config[constants.TVM_CONTEXT] = ctx
        extra_config[constants.TVM_INPUT_NAMES] = topology.raw_model.input_names
        if remainder_trace_input is not None:
            extra_config[constants.TVM_REMAINDER_MODEL] = tvm_remainder_model

        hb_model = tvm_model
    else:
        # Set the device for the model.
        if device != "cpu":
            if backend in (torch.__name__, torch.jit.__name__):
                torch_model = torch_model.to(device)

        # If the backend is torchscript, jit the model.
        if backend == torch.jit.__name__:
            trace_input, _ = _get_trace_input_from_test_input(
                extra_config[constants.TEST_INPUT], batch_size)
            if device != "cpu":
                trace_input.to(device)
            torch_model = torch.jit.trace(torch_model, trace_input).eval()
            torch.jit.optimized_execution(torch_model)

        hb_model = torch_model

    # Return if the container is not needed.
    if constants.CONTAINER in extra_config and not extra_config[constants.CONTAINER]:
        return hb_model

    # We scan the operators backwards until we find an operator with a defined type.
    # This is necessary because ONNX models can have arbitrary operators doing
    # casting, reshaping etc.
    idx = len(operators) - 1
    while (idx >= 0
           and not operator_map[operators[idx].full_name].regression
           and not operator_map[operators[idx].full_name].classification
           and not operator_map[operators[idx].full_name].anomaly_detection
           and not operator_map[operators[idx].full_name].transformer):
        idx -= 1

    assert idx >= 0, ("Cannot detect container type. Please fill an issue at "
                      "https://github.com/microsoft/hummingbird.")

    # If is a transformer, we need to check whether there is another operator type before.
    # E.g., normalization after classification.
    tmp_idx = idx
    if operator_map[operators[idx].full_name].transformer:
        while (idx >= 0
               and not operator_map[operators[idx].full_name].regression
               and not operator_map[operators[idx].full_name].classification
               and not operator_map[operators[idx].full_name].anomaly_detection):
            idx -= 1
        if idx < 0:
            idx = tmp_idx

    # Get the proper container type.
    if operator_map[operators[idx].full_name].regression:
        # We are doing a regression task.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerRegression
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerRegression
        elif backend == tvm_backend:
            container = TVMSklearnContainerRegression
        else:
            container = PyTorchSklearnContainerRegression
    elif operator_map[operators[idx].full_name].anomaly_detection:
        # We are doing anomaly detection.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerAnomalyDetection
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerAnomalyDetection
        elif backend == tvm_backend:
            container = TVMSklearnContainerAnomalyDetection
        else:
            container = PyTorchSklearnContainerAnomalyDetection
    elif operator_map[operators[idx].full_name].transformer:
        # We are just transforming the input data.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerTransformer
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerTransformer
        elif backend == tvm_backend:
            container = TVMSklearnContainerTransformer
        else:
            container = PyTorchSklearnContainerTransformer
    else:
        # We are doing a classification task.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerClassification
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerClassification
        elif backend == tvm_backend:
            container = TVMSklearnContainerClassification
        else:
            container = PyTorchSklearnContainerClassification

    n_threads = None if constants.N_THREADS not in extra_config else extra_config[
        constants.N_THREADS]
    batch_size = None if constants.BATCH_SIZE not in extra_config else extra_config[
        constants.BATCH_SIZE]
    hb_model = container(hb_model, n_threads, batch_size, extra_config=extra_config)

    return hb_model
with tvm.target.create("cuda"): s, arg_bufs = conv2d( N, H, W, CO, CI, KH, KW, strides, padding, scaling_factor) print(tvm.lower(s, arg_bufs, simple_mode=True)) func = tvm.build(s, arg_bufs) print(func.imported_modules[0].get_source()) # check correctness a_np = np.random.randint(size=(N, CI//BI, H, W, BI), low=-128, high=127, dtype='int8') w_np = np.random.randint( size=(CO//BO, CI//BI, KH, KW, BO, BI), low=-128, high=127, dtype='int8') a_np_ = a_np.transpose((0, 1, 4, 2, 3)).ravel().reshape(N, CI, H, W) w_np_ = w_np.transpose((0, 4, 1, 5, 2, 3)).ravel().reshape(CO, CI, KH, KW) #c_np = conv2d_nchw_python(a_np_, w_np_, strides, padding).astype('int8') #c_np = c_np.reshape(N, CO//BO, BO, *c_np.shape[2:]).transpose(0, 1, 3, 4, 2) c_np = np.zeros((N, CO//BO, H, W, BO), dtype='int8') ctx = tvm.gpu() a_tvm = tvm.nd.empty(a_np.shape, dtype='int8', ctx=ctx).copyfrom(a_np) w_tvm = tvm.nd.empty(w_np.shape, dtype='int8', ctx=ctx).copyfrom(w_np) c_tvm = tvm.nd.empty(c_np.shape, dtype='int8', ctx=ctx) func(a_tvm, w_tvm, c_tvm) #np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) t = evaluator(a_tvm, w_tvm, c_tvm).mean num_flops = N*c_np.shape[-2] * c_np.shape[-3] * CO*CI*KH*KW*2 GFLOPS = num_flops / (t * 1e3) / 1e6 print('Time cost of this operator: %f, %g GFLOPS' % (t, GFLOPS))
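# The correctness check above is commented out; a hedged sketch of re-enabling
# it, assuming the older topi.testing.conv2d_nchw_python helper (moved to
# tvm.topi.testing in newer releases) and the packed layouts used above.
from topi.testing import conv2d_nchw_python

c_ref = conv2d_nchw_python(a_np_.astype('int32'), w_np_.astype('int32'),
                           strides, padding).astype('int8')
# Repack NCHW -> blocked NCHW[x]c to match the kernel's output layout.
c_ref = c_ref.reshape(N, CO // BO, BO, *c_ref.shape[2:]).transpose(0, 1, 3, 4, 2)
np.testing.assert_allclose(c_ref, c_tvm.asnumpy(), rtol=1e-2)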
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): A = te.placeholder((n, l), name='A', dtype='float16') B = te.placeholder((l, m), name='B', dtype='float16') k = te.reduce_axis((0, l), name='k') C = te.compute((n, m), lambda i, j: te.sum( A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k)) s = te.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] AA = s.cache_read(A, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BB = s.cache_read(B, "shared", [C]) BL = s.cache_read(BB, "local", [C]) CL = s.cache_write(C, "local") bx = 4 by = 32 step_k = 8 v = 4 TX = 8 TY = 1 tile_x = bx * TX tile_y = by * TY WX = min(warp_tile_m, tile_x) tile_k = 16 vthread = 1 yo, ty = s[C].split(y, tile_y * vthread) vy, ty = s[C].split(ty, tile_y) ty, yi = s[C].split(ty, TY) xo, xi = s[C].split(x, tile_x) tz, xi = s[C].split(xi, WX) tx, xi = s[C].split(xi, TX) ko, ki = s[CL].split(k, step_k * tile_k) kl, ki = s[CL].split(ki, tile_k) s[C].reorder(yo, xo, tz, ty, tx, yi, xi) s[C].bind(yo, te.thread_axis("blockIdx.y")) s[C].bind(xo, te.thread_axis("blockIdx.x")) s[C].bind(ty, te.thread_axis("threadIdx.y")) s[C].bind(tz, te.thread_axis("threadIdx.z")) s[C].bind(tx, te.thread_axis("threadIdx.x")) s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy")) s[CL].compute_at(s[C], tx) yo, xo = CL.op.axis s[CL].reorder(ko, kl, ki, yo, xo) s[AA].compute_at(s[CL], ko) xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v) tz, tx = s[AA].split(xi, factor=(WX // TX) * v) tx, vec = s[AA].split(tx, factor=v) fused = s[AA].fuse(s[AA].op.axis[0], xo) _, ty = s[AA].split(fused, factor=by) s[AA].bind(ty, te.thread_axis("threadIdx.y")) s[AA].bind(tz, te.thread_axis("threadIdx.z")) s[AA].bind(tx, te.thread_axis("threadIdx.x")) s[AA].vectorize(vec) s[BB].compute_at(s[CL], ko) xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v) tz, tx = s[BB].split(xi, factor=(WX // TX) * v) tx, vec = s[BB].split(tx, factor=v) fused = s[BB].fuse(s[BB].op.axis[0], xo) _, ty = s[BB].split(fused, factor=by) s[BB].bind(ty, te.thread_axis("threadIdx.y")) s[BB].bind(tz, te.thread_axis("threadIdx.z")) s[BB].bind(tx, te.thread_axis("threadIdx.x")) s[BB].vectorize(vec) s[AL].compute_at(s[CL], kl) s[BL].compute_at(s[CL], kl) s[CL].pragma(ko, 'tensor_core') func = tvm.build(s, [A, B, C], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(l, m)).astype(B.dtype) c_np = np.zeros((n, m), dtype=np.float32) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) func(a, b, c) evaluator = func.time_evaluator(func.entry_name, ctx, number=3) print('gemm m=%d n=%d k=%d: %f ms' % (m, n, l, evaluator(a, b, c).mean * 1e3)) c_np = np.dot(a_np, b_np) np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
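# The 'tensor_core' pragma above only lowers on GPUs with hardware WMMA support
# (compute capability 7.0 or higher). A hedged guard sketch, mirroring the
# capability checks used elsewhere in these snippets:
import tvm
from tvm.contrib import nvcc

ctx = tvm.gpu(0)
if ctx.exist and nvcc.have_tensorcore(ctx.compute_version):
    # 16, 8 and 32 are the warp tile shapes supported by CUDA's wmma API.
    for warp_tile_m in (16, 8, 32):
        tensor_core_matmul(warp_tile_m)
else:
    print("skip because gpu does not support tensor cores")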
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Dilation is not supported yet"
    assert groups == 1, "Arbitrary group number is not supported"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N, H, W, alpha, alpha, CI, CO, padding, stride_h, stride_w,
            dilation_h, dilation_w, data.dtype, kernel.dtype, pre_flag=True,
        )
        if (target.kind.name == "cuda"
                and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                and judge_winograd_tensorcore):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        if PassContext.current().config.get("relay.backend.use_auto_scheduler", False):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout))
    return strategy
s[B].bind(s[B].op.reduce_axis[0], tx)
s[B].bind(s[B].op.axis[0], bx)
s[BF].compute_at(s[B], s[B].op.axis[0])
_, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2)
BF2 = s.rfactor(BF, noi, 0)
s[BF].bind(s[BF].op.axis[0], tx)
s[BF2].compute_at(s[BF], s[BF].op.axis[1])
fcuda = tvm.build(s, [A, B], "cuda")


@unittest.skipIf(not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"),
                 "skip because cuda is not enabled..")
def test_cuda_const_float_to_half():
    # This import is required so that nvcc performs the code generation;
    # otherwise code generation silently falls back to nvrtc.
    from tvm import autotvm
    shape = (2, 3, 4)
    a = te.placeholder(shape, dtype='float16', name='a')
    b = tvm.tir.const(0.5, dtype='float16')
    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name='c')
    s = te.create_schedule(c.op)
    axes = [axis for axis in c.op.axis]
    fused = s[c].fuse(*axes)
    bx, tx = s[c].split(fused, factor=64)
    s[c].bind(bx, te.thread_axis('blockIdx.x'))
    s[c].bind(tx, te.thread_axis('threadIdx.x'))
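    # The test is truncated above; a hedged completion sketch so the kernel
    # actually builds and runs. The assertion is an assumption about the
    # intended check, not the original test body.
    func = tvm.build(s, [a, c], "cuda")
    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=shape).astype('float16')
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros(shape, dtype=c.dtype), ctx)
    func(a_nd, c_nd)
    np.testing.assert_equal(c_nd.asnumpy(), a_np > np.float16(0.5))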
def run_case(dtype, image, target): # Check image import os import json import sys STAT_REPEAT = os.environ.get('STAT_REPEAT', '') if STAT_REPEAT == '' or STAT_REPEAT == None: STAT_REPEAT = 10 STAT_REPEAT = int(STAT_REPEAT) # FGG: set model files via CK env CATEG_FILE = '../synset.txt' synset = eval(open(os.path.join(CATEG_FILE)).read()) files = [] val = {} if image != None and image != '': files = [image] else: ipath = os.environ.get('CK_ENV_DATASET_IMAGENET_VAL', '') if ipath == '': print('Error: path to ImageNet dataset is not set!') exit(1) if not os.path.isdir(ipath): print('Error: path to ImageNet dataset was not found!') exit(1) # get all files d = os.listdir(ipath) for x in d: x1 = x.lower() if x1.startswith('ilsvrc2012_val_'): files.append(os.path.join(ipath, x)) files = sorted(files) STAT_REPEAT = 1 # Get correct labels ival = os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT', '') fval = open(ival).read().split('\n') val = {} for x in fval: x = x.strip() if x != '': y = x.split(' ') val[y[0]] = int(y[1]) # FGG: set timers import time timers = {} # Get first shape (expect that will be the same for all) dt = time.time() image = Image.open(os.path.join(files[0])).resize((224, 224)) if image.mode != 'RGB': image = image.convert('RGB') timers['execution_time_load_image'] = time.time() - dt dt = time.time() img = transform_image(image) timers['execution_time_transform_image'] = time.time() - dt # load model from mxnet.gluon.model_zoo.vision import get_model from mxnet.gluon.utils import download model_path = os.environ['CK_ENV_MODEL_MXNET'] model_id = os.environ['MXNET_MODEL_ID'] block = get_model(model_id, pretrained=True, root=model_path) # We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon net, params = nnvm.frontend.from_mxnet(block) # we want a probability so add a softmax operator net = nnvm.sym.softmax(net) # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3) if dtype != 'float32': params = { k: tvm.nd.array(v.asnumpy().astype(dtype)) for k, v in params.items() } # compile if target == None or target == 'cpu': xtarget = 'llvm' elif target == 'cuda': xtarget = 'cuda' opt_level = 2 if dtype == 'float32' else 1 with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build(net, target=xtarget, shape={"data": data_shape}, params=params, dtype=dtype, target_host=None) # upload model to remote device tmp = util.tempdir() lib_fname = tmp.relpath('net.tar') lib.export_library(lib_fname) if target == None or target == 'cpu': ctx = tvm.cpu(0) elif target == 'cuda': ctx = tvm.gpu(0) rlib = lib rparams = params # create graph runtime dt = time.time() module = runtime.create(graph, rlib, ctx) module.set_input( 'data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype))) module.set_input(**rparams) timers['execution_time_create_run_time_graph'] = (time.time() - dt) total_images = 0 correct_images_top1 = 0 correct_images_top5 = 0 # Shuffle files and pre-read JSON with accuracy to continue aggregating it # otherwise if FPGA board hangs, we can continue checking random images ... 
import random random.shuffle(files) if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'): x = json.load(open('aggregate-ck-timer.json')) if 'total_images' in x: total_images = x['total_images'] if 'correct_images_top1' in x: correct_images_top1 = x['correct_images_top1'] if 'correct_images_top5' in x: correct_images_top5 = x['correct_images_top5'] dt1 = time.time() for f in files: total_images += 1 print( '===============================================================================' ) print('Image ' + str(total_images) + ' of ' + str(len(files)) + ' : ' + f) image = Image.open(os.path.join(f)).resize((224, 224)) if image.mode != 'RGB': image = image.convert('RGB') img = transform_image(image) # set inputs module.set_input('data', tvm.nd.array(img.astype(dtype))) module.set_input(**rparams) # perform some warm up runs # print("warm up..") warm_up_timer = module.module.time_evaluator("run", ctx, 1) warm_up_timer() # execute print('') print("run (" + str(STAT_REPEAT) + " statistical repetitions)") dt = time.time() timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT) tcost = timer() timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT # get outputs tvm_output = module.get_output(0, tvm.nd.empty((1000, ), dtype, ctx)) top1 = np.argmax(tvm_output.asnumpy()) top5 = [] atop5 = get_top5(tvm_output.asnumpy()) print('') print('TVM prediction Top1:', top1, synset[top1]) print('') print('TVM prediction Top5:') for q in atop5: x = q[1] y = synset[x] top5.append(x) print(x, y) print('') print("Internal T-cost: %g" % tcost.mean) # Check correctness if available if len(val) > 0: top = val[os.path.basename(f)] correct_top1 = False if top == top1: correct_top1 = True correct_images_top1 += 1 print('') if correct_top1: print('Current prediction Top1: CORRECT') else: print('Current prediction Top1: INCORRECT +(' + str(top) + ')') accuracy_top1 = float(correct_images_top1) / float(total_images) print('Current accuracy Top1: ' + ('%.5f' % accuracy_top1)) correct_top5 = False if top in top5: correct_top5 = True correct_images_top5 += 1 print('') if correct_top5: print('Current prediction Top5: CORRECT') else: print('Current prediction Top5: INCORRECT +(' + str(top) + ')') accuracy_top5 = float(correct_images_top5) / float(total_images) print('Current accuracy Top5: ' + ('%.5f' % accuracy_top5)) print('') print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) + ' sec.') timers['total_images'] = total_images timers['correct_images_top1'] = correct_images_top1 timers['accuracy_top1'] = accuracy_top1 timers['correct_images_top5'] = correct_images_top5 timers['accuracy_top5'] = accuracy_top5 timers['execution_time_classify_internal'] = tcost.mean timers['execution_time'] = tcost.mean with open('tmp-ck-timer.json', 'w') as ftimers: json.dump(timers, ftimers, indent=2) with open('aggregate-ck-timer.json', 'w') as ftimers: json.dump(timers, ftimers, indent=2) sys.stdout.flush() return
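# transform_image and get_top5 are referenced above but not defined in this
# snippet. Minimal hedged sketches inferred from the call sites; the
# normalization constants follow the standard ImageNet preprocessing used by
# the related MXNet/NNVM tutorials.
import numpy as np

def transform_image(image):
    # HWC uint8 image -> normalized NCHW float32 batch of one.
    image = np.array(image) - np.array([123., 117., 104.])
    image /= np.array([58.395, 57.12, 57.375])
    image = image.transpose((2, 0, 1))
    return image[np.newaxis, :]

def get_top5(output):
    # Five highest-scoring (score, class index) pairs; the call site above
    # reads the class index from q[1].
    return sorted(zip(output, range(len(output))), reverse=True)[:5]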
# To generate the module library, TVM will first lower the high-level IR
# into the low-level intrinsic IR of the specified target backend, which is CUDA
# in this example. Then the machine code will be generated as the module library.

opt_level = 3
target = tvm.target.cuda()
with tvm.transform.PassContext(opt_level=opt_level):
    lib = relay.build(mod, target, params=params)

#####################################################################
# Run the generated library
# -------------------------
# Now we can create a graph runtime and run the module on the Nvidia GPU.

# create random input
dev = tvm.gpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# create module
module = graph_runtime.GraphModule(lib["default"](dev))
# set input and parameters
module.set_input("data", data)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()

# Print first 10 elements of output
print(out.flatten()[0:10])

######################################################################
# Save and Load Compiled Module
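# A hedged sketch of the save/load flow announced by the header above, using
# the same graph_runtime API as the rest of this snippet (tvm.contrib's tempdir
# helper is named util here; it is utils in newer releases).
from tvm.contrib import util

temp = util.tempdir()
path_lib = temp.relpath("deploy_lib.tar")
lib.export_library(path_lib)  # bundles graph, params and compiled kernels

loaded_lib = tvm.runtime.load_module(path_lib)
loaded_module = graph_runtime.GraphModule(loaded_lib["default"](dev))
loaded_module.set_input("data", data)
loaded_module.run()
print(loaded_module.get_output(0).asnumpy().flatten()[0:10])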
timing = time.time() else: print('Executes: ', info.name, (time.time() - timing) * 1000) np_a = np.random.randn(n, c // 16, h, w, 16).astype('float16') np_b = np.random.randn(ko // 16, ic // 16, kh, kw, 16, 16).astype('float16') #np_a = (np.arange(n * (c // 16) * h * w * 16) % 7).astype('float16') #np_b = (np.arange((ko // 16) * kh * kw * ic * 16) % 7).astype('float16') #np_a.shape = (n, c // 16, h, w, 16) #np_b.shape = (ko // 16, ic // 16, kh, kw, 16, 16) np_c = np.random.randn(n, ko // 16, (h - kh) // stride_h + 1, (w - kw) // stride_w + 1, 16).astype('float32') nd_a = tvm.nd.array(np_a, tvm.gpu()) nd_b = tvm.nd.array(np_b, tvm.gpu()) nd_c = tvm.nd.array(np_c, tvm.gpu()) import tensorizer passes = [(1, tensorizer.loop_swizzle), (1, tensorizer.rewrite), (1, tensorizer.inject_sync), (1, tensorizer.sliding_window)] with tvm.transform.PassContext(opt_level=4, config={'tir.add_lower_pass': passes}): #with tvm.transform.PassContext(opt_level=4): module = tvm.build(sch, [a, b, conv], 'nvptx') fte = module.time_evaluator(module.entry_name, ctx=tvm.gpu(), number=3, repeat=10) res = fte(nd_a, nd_b, nd_c).results
print(tvm.lower(s, [A, A_ch], simple_mode=True)) """ blockdim, threaddim = 32, 32 n, c, h, w = s[A_ch].op.axis hw = s[A_ch].fuse(h, w) no, ni = s[A_ch].split(n, nparts=blockdim) co, ci = s[A_ch].split(c, nparts=blockdim) hwo, hwi = s[A_ch].split(hw, nparts=32*32) s[A_ch].reorder(no, co, hwo, ni, ci, hwi) s[A_ch].bind(no, block_y) s[A_ch].bind(co, block_x) s[A_ch].bind(hwo, thread_x) s[A_ch].vectorize(hwi) """ func = tvm.build(s, [A, A_ch], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=input_tensor).astype(A.dtype) a = tvm.nd.array(a_np, ctx) a_ch = tvm.nd.array(np.zeros(output_tensor, dtype=A_ch.dtype), ctx) func(a, a_ch) evaluator = func.time_evaluator(func.entry_name, ctx, number=1) conv_time = evaluator(a, a_ch).mean * 1e3 tot_byte = batch * in_channel * in_size * in_size * 4 / 1024 / 1024 / 1024 # GB print('Convolution: %f ms, Bandwidth: %f GB/s' % (conv_time, tot_byte / conv_time * 1000 * 2)) dev_module = func.imported_modules[0] print(dev_module) print("----GPU code----") print(dev_module.get_source())
def test_bind(): if not tvm.gpu(0).exist: print('[Warning] No GPU found! Skip bind test!') return @script def vec_add(a, b): c = output_tensor((1000, ), 'float32') for tx in bind('threadIdx.x', 1000): c[tx] = a[tx] + b[tx] return c a = tvm.placeholder((1000, ), dtype='float32', name='a') b = tvm.placeholder((1000, ), dtype='float32', name='b') func, ins, outs = run_and_check(vec_add, [a, b], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') @script def raw(a, b): c = output_tensor((1000, ), 'float32') for i in range(1000): c[i] = a[i] + b[i] return c c = raw(a, b) sch = tvm.create_schedule(c.op) x = tvm.thread_axis('threadIdx.x') sch[c].bind(c.op.axis[0], x) func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') @tvm.hybrid.script def foo(a): c = output_tensor((a.shape[0],), a.dtype) total = allocate((1,), a.dtype, 'local') len_i = a.shape[0] len_j = a.shape[1] for i in bind('threadIdx.x', len_i): total[0] = 0. for k in const_range(len_j): total[0] += a[i, k] c[i] = total[0] return c a = tvm.placeholder((8, 4), 'float32') c = foo(a) s = tvm.create_schedule(c.op) ir = tvm.lower(s, [a, c], simple_mode=True) assert not isinstance(ir, tvm.stmt.AttrStmt) func, ins, outs = run_and_check(foo, [a], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') @tvm.hybrid.script def max_threads(a): b = output_tensor(a.shape, a.dtype) n = a.shape[0] m = max_num_threads(True) for i in bind('threadIdx.x', m): for j in bind('blockIdx.x', ceil_div(n, m)): if i * m + j < n: b[i * m + j] = a[i * m + j] + a[i * m + j] return b a = tvm.placeholder((10000, ), 'float32') with tvm.target.create('cuda'): func, ins, outs = run_and_check(max_threads, [a], target='cuda') run_and_check(func, ins, outs=outs, target='cuda')
with open(temp.relpath("deploy_param.params"), "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# Deploy locally to Nvidia GPU
# ----------------------------
# Now we can load the module back.

import numpy as np
from tvm.contrib import graph_runtime

loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy_graph.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())

module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)
input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module.run(data=input_data)
out = module.get_output(0, out=tvm.nd.empty(out_shape))
# Print first 10 elements of output
print(out.asnumpy()[0][0:10])

######################################################################
# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
# --------------------------------------------------------------
# Following the steps above, we can also compile the model for a Raspberry Pi.
# TVM provides the rpc module to help with remote deployment.
#
# For demonstration, we simply start an RPC server on the same machine,
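# A hedged sketch of the local RPC round trip described above, using the
# nnvm-era API; the host and port values are illustrative.
from tvm import rpc

server = rpc.Server(host="0.0.0.0", port=9090)
remote = rpc.connect("127.0.0.1", 9090)

lib_fname = temp.relpath("deploy_lib.tar")
lib.export_library(lib_fname)
remote.upload(lib_fname)
rlib = remote.load_module("deploy_lib.tar")

# On a real Raspberry Pi this would be the board's context, e.g. remote.cpu(0).
rmodule = graph_runtime.create(loaded_json, rlib, remote.cpu(0))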
######################################################################
# Deploy and Run
# --------------
# Now that we have the compiled module, let us run it.
# We can use :any:`graph_runtime <tvm.contrib.graph_runtime.create>`
# in tvm to create a deployable :any:`GraphModule <tvm.contrib.graph_runtime.GraphModule>`.
# We can use the :any:`set_input <tvm.contrib.graph_runtime.GraphModule.set_input>`,
# :any:`run <tvm.contrib.graph_runtime.GraphModule.run>` and
# :any:`get_output <tvm.contrib.graph_runtime.GraphModule.get_output>` functions
# to set the input, execute the graph and get the output we need.
#
import tvm
import numpy as np
from tvm.contrib import graph_runtime, util

module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
x_np = np.array([1, 2, 3, 4]).astype("float32")
y_np = np.array([4, 4, 4, 4]).astype("float32")
# set input to the graph module
module.set_input(x=x_np, y=y_np)
# run forward computation
module.run()
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())

######################################################################
# Provide Model Parameters
# ------------------------
# Most deep learning models contain two types of inputs: parameters
# that remain fixed during inference and data inputs that need to
# ---------------------------------------------
# We should be familiar with the process by now.

import nnvm.compiler
target = 'cuda'

# assume the first input name is data
input_name = sym.list_input_names()[0]
shape_dict = {input_name: x.shape}
with nnvm.compiler.build_config(opt_level=3):
    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)

######################################################################
# Execute on TVM
# ---------------------------------------------
# The process is no different from the other examples.
from tvm.contrib import graph_runtime
ctx = tvm.gpu(0)
dtype = 'float32'
m = graph_runtime.create(graph, lib, ctx)
# set inputs
m.set_input(input_name, tvm.nd.array(x.astype(dtype)))
m.set_input(**params)
# execute
m.run()
# get outputs
output_shape = (1, 1, 672, 672)
tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()

######################################################################
# Display results
# ---------------------------------------------
# We put the input and output images side by side.
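# A hedged display sketch, assuming x holds the preprocessed single-channel
# input image and matplotlib is available; this super-resolution example
# produces a (1, 1, 672, 672) output.
from matplotlib import pyplot as plt

fig, (ax_in, ax_out) = plt.subplots(1, 2)
ax_in.imshow(x[0, 0], cmap='gray')
ax_in.set_title('input')
ax_out.imshow(tvm_output[0, 0], cmap='gray')
ax_out.set_title('output')
plt.show()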
def test_tensor_core_batch_matmal(): batch_size = 4 n = 512 m, l = n, n assert n % 32 == 0 assert m % 8 == 0 assert l % 16 == 0 nn, mm, ll = n // 32, m // 8, l // 16 A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16") B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16") k1 = te.reduce_axis((0, ll), name="k1") k2 = te.reduce_axis((0, 16), name="k2") C = te.compute( (batch_size, nn, mm, 32, 8), lambda b, i, j, ii, jj: te.sum( A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2] ), name="Fragment_C", ) s = te.create_schedule(C.op) warp_size = 32 kernel_size = 16 block_row_warps = 2 block_col_warps = 4 warp_row_tiles = 4 warp_col_tiles = 2 chunk = 4 block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") AS = s.cache_read(A, "shared", [C]) BS = s.cache_read(B, "shared", [C]) AF = s.cache_read(AS, "wmma.matrix_a", [C]) BF = s.cache_read(BS, "wmma.matrix_b", [C]) CF = s.cache_write(C, "wmma.accumulator") b, i, j, kernel_i, kernel_j = s[C].op.axis i, ii = s[C].split(i, factor=warp_row_tiles) block_i, i = s[C].split(i, factor=block_row_warps) j, jj = s[C].split(j, factor=warp_col_tiles) block_j, j = s[C].split(j, factor=block_col_warps) s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j) s[C].bind(b, block_z) s[C].bind(block_i, block_x) s[C].bind(block_j, block_y) s[C].bind(i, thread_y) s[C].bind(j, thread_z) s[CF].compute_at(s[C], j) b, warp_i, warp_j, _i, _j = s[CF].op.axis k, _k = CF.op.reduce_axis ko, ki = s[CF].split(k, factor=chunk) s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k) s[AF].compute_at(s[CF], ki) s[BF].compute_at(s[CF], ki) s[AS].compute_at(s[CF], ko) b, xo, yo, xi, yi = AS.op.axis tx, xo = s[AS].split(xo, nparts=block_row_warps) ty, yo = s[AS].split(yo, nparts=block_col_warps) t = s[AS].fuse(xi, yi) to, ti = s[AS].split(t, nparts=warp_size) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(to, thread_x) s[BS].compute_at(s[CF], ko) b, xo, yo, xi, yi = BS.op.axis tx, xo = s[BS].split(xo, nparts=block_row_warps) ty, yo = s[BS].split(yo, nparts=block_col_warps) t = s[BS].fuse(xi, yi) to, ti = s[BS].split(t, nparts=warp_size) s[BS].bind(tx, thread_y) s[BS].bind(ty, thread_z) s[BS].bind(to, thread_x) s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a")) s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b")) s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16))) s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16))) func = tvm.build(s, [A, B, C], "cuda") dev = tvm.gpu(0) a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype) b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev) func(a, b, c) evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3)) if VERIFY: func(a, b, c) a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) np.testing.assert_allclose( c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4 )
def conv2d_strategy_cuda(attrs, inputs, out_type, target): """conv2d cuda strategy""" strategy = _op.OpStrategy() data, kernel = inputs stride_h, stride_w = attrs.get_int_tuple("strides") dilation_h, dilation_w = attrs.get_int_tuple("dilation") padding = attrs.get_int_tuple("padding") groups = attrs.groups layout = attrs.data_layout kernel_layout = attrs.kernel_layout if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" if data.dtype in ('int8', 'uint8') and kernel.dtype in ('int8', 'uint8'): assert data.dtype == kernel.dtype strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8), name="conv2d_nchw_int8.cuda") else: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), name="conv2d_nchw.cuda") _, _, kh, kw = get_const_tuple(kernel.shape) if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule( topi.cuda.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.cuda", plevel=5) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), name="conv2d_hwcn.cuda") elif layout == "NHWC": assert kernel_layout == "HWIO" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), name="conv2d_nhwc.cuda") N, H, W, _ = get_const_tuple(data.shape) KH, KW, CI, CO = get_const_tuple(kernel.shape) # Winograd shape related judgment judge_winograd_tensorcore, judge_winograd_shape = winograd_judge( N, H, W, KH, KW, CI, CO, padding, stride_h, stride_w, dilation_h, dilation_w, pre_flag=False) if judge_winograd_shape: if target.kind.name == "cuda" and \ nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \ judge_winograd_tensorcore: strategy.add_implementation( wrap_compute_conv2d( topi.cuda.conv2d_nhwc_winograd_tensorcore), wrap_topi_schedule( topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore ), name="conv2d_nhwc_winograd_tensorcore.cuda", plevel=5) else: strategy.add_implementation( wrap_compute_conv2d( topi.cuda.conv2d_nhwc_winograd_direct), wrap_topi_schedule( topi.cuda.schedule_conv2d_nhwc_winograd_direct), name="conv2d_nhwc_winograd_direct.cuda", plevel=5) if target.kind.name == "cuda": if nvcc.have_tensorcore(tvm.gpu(0).compute_version): if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \ (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \ (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0): strategy.add_implementation( wrap_compute_conv2d( topi.cuda.conv2d_nhwc_tensorcore), wrap_topi_schedule( topi.cuda.schedule_conv2d_nhwc_tensorcore), name="conv2d_nhwc_tensorcore.cuda", plevel=20) elif layout == "HWNC": assert kernel_layout in [ "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i" ] _, _, N, in_channels = get_const_tuple(data.shape) pre_computed = len(kernel.shape) == 6 if pre_computed: _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple( kernel.shape) out_channels = oc_chunk * oc_block_factor else: _, _, out_channels, _ = get_const_tuple(kernel.shape) if topi.cuda.is_shape_tensorcore_direct_qualified( batch=N, in_channels=in_channels, num_filter=out_channels, in_dtype=data.dtype): 
strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore), wrap_topi_schedule( topi.cuda.schedule_conv2d_hwnc_tensorcore), name="conv2d_hwnc_tensorcore_direct.cuda", plevel=20) else: raise RuntimeError("Unsupported shape for conv2d HWNC.\ Need to satisfy tensor core schedule.") elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.cuda") else: raise RuntimeError( "Unsupported conv2d layout {} for CUDA".format(layout)) # add cudnn implementation if target.kind.name == "cuda" and "cudnn" in target.libs: if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \ padding[1] == padding[3]: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", plevel=25) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda") elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda") else: raise RuntimeError( "Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d # add cudnn implementation, if any cudnn_impl = False if target.kind.name == "cuda" and "cudnn" in target.libs: if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \ padding[1] == padding[3]: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", plevel=25) cudnn_impl = True if layout == 'NCHW': # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), name="group_conv2d_nchw.cuda") elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), name="group_conv2d_NCHWc_int8.cuda") elif not cudnn_impl: raise RuntimeError( "Unsupported group_conv2d layout {}".format(layout)) return strategy
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Dilation is not supported yet"
    assert groups == 1, "Arbitrary group number is not supported"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda")
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _ = winograd_judge(
            N, H, W, alpha, alpha, CI, CO, padding, stride_h, stride_w,
            dilation_h, dilation_w, pre_flag=True)
        if target.kind.name == "cuda" and \
                nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
                judge_winograd_tensorcore:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout))
    return strategy
def create_ctx(device, did=0):
    if device == "x86":
        ctx = tvm.cpu(did)
    elif device == "gpu":
        ctx = tvm.gpu(did)
    else:
        # Fail loudly instead of returning an unbound ctx for unknown devices.
        raise ValueError("Unsupported device: {}".format(device))
    return ctx
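# Example usage: pick a device context and allocate an array on it.
import numpy as np
ctx = create_ctx("gpu", 0)
arr = tvm.nd.array(np.zeros((2, 2), dtype="float32"), ctx)
print(arr.ctx)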
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")
    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ('int8', 'uint8') and kernel.dtype in ('int8', 'uint8'):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda")
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \
                    dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5)
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda")
            N, _, _, _ = get_const_tuple(data.shape)
            _, _, CI, CO = get_const_tuple(kernel.shape)
            if target.target_name == "cuda":
                if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                    if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
                            (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
                            (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
                        strategy.add_implementation(
                            wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                            wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                            name="conv2d_nhwc_tensorcore.cuda",
                            plevel=20)
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.target_name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=15)
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda")
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        if layout == 'NCHW':
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda")
        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy