def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') bias = tvm.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), (dilation, dilation), layout='NCHW', out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) for device in get_all_backend(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters check_device(device)
def compute_contrib_conv2d_NCHWc(attrs, inputs, _): """Compute definition of conv2d NCHWc""" padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") out_channel = attrs.get_int("channels") groups = attrs.get_int("groups") layout = attrs.get_str("layout") out_layout = attrs.get_str("out_layout") out_dtype = attrs.get_str("out_dtype") out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype if layout == "NCHW": _, in_channel, _, _ = get_const_tuple(inputs[0].shape) else: _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape) in_channel = in_channel_chunk * in_channel_block assert dilation == (1, 1), "not support dilate now" if groups == 1: # pylint: disable=assignment-from-no-return out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation, layout, out_layout, out_dtype) # pylint: enable=assignment-from-no-return elif groups == in_channel and groups == out_channel: # pylint: disable=assignment-from-no-return out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation, layout, out_layout, out_dtype) # pylint: enable=assignment-from-no-return else: raise ValueError("not support arbitrary group number > 1 for now") if attrs.get_bool("use_bias"): bias = inputs[2] bias = topi.expand_dims(bias, axis=1, num_newaxis=2) out = topi.add(out, bias) return out
def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding): in_height = in_width = in_size with tvm.target.rasp(): A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = topi.nn.conv2d(A, W, stride, padding) s = topi.generic.schedule_conv2d_nchw([B]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d.verify_conv2d") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() ctx = tvm.cpu(0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], "llvm") func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar): input_dtype = 'uint32' out_dtype = 'int16' with tvm.target.create('llvm'): A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits, out_dtype=out_dtype, unipolar=unipolar) s = topi.generic.schedule_bitserial_dense([C]) a_shape = get_const_tuple(A.shape) b_shape = get_const_tuple(B.shape) @memoize("topi.tests.test_topi_bitseral_dense") def get_ref_data(): a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype) b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype) if unipolar: b_ = np.copy(b_np).astype(out_dtype) for x in np.nditer(b_, op_flags=['readwrite']): x[...] = 1 if x == 1 else -1 c_np = np.dot(a_np, b_.T) else: c_np = np.dot(a_np, b_np.T) return a_np, b_np, c_np a_np, b_np, c_np = get_ref_data() ctx = tvm.cpu(0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) func = tvm.build(s, [A, B, C], "llvm") func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def test_bilayout_index(): bilayout = tvm.bijective_layout("NCHW", "NCHW16c") dst_index = bilayout.forward_index([0, 18, 6, 6]) assert get_const_tuple(dst_index) == (0, 1, 6, 6, 2) src_index = bilayout.backward_index([0, 1, 6, 6, 2]) assert get_const_tuple(src_index) == (0, 18, 6, 6)
def test_bilayout_shape(): bilayout = tvm.bijective_layout("NCHW", "NCHW16c") assert isinstance(bilayout, tvm.tensor.BijectiveLayout) dst_shape = bilayout.forward_shape((1, 32, 7, 7)) assert get_const_tuple(dst_shape) == (1, 2, 7, 7, 16) src_shape = bilayout.backward_shape(dst_shape) assert get_const_tuple(src_shape) == (1, 32, 7, 7)
def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode): iw = ih kw = kh sw = sh ph, pw = padding A = tvm.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode) B = topi.nn.relu(B) dtype = A.dtype bshape = get_const_tuple(B.shape) ashape = get_const_tuple(A.shape) if ceil_mode: assert bshape[2] == int(math.ceil(float(ashape[2] - kh + ph * 2) / sh) + 1) assert bshape[3] == int(math.ceil(float(ashape[3] - kw + pw * 2) / sw) + 1) else: assert bshape[2] == int(math.floor(float(ashape[2] - kh + ph * 2) / sh) + 1) assert bshape[3] == int(math.floor(float(ashape[3] - kw + pw * 2) / sw) + 1) a_np = np.random.uniform(size=(n, ic, ih, iw)).astype(dtype) pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype) no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw))) pad_np[np.ix_(*no_zero)] = a_np _, oc, oh, ow = get_const_tuple(B.shape) b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype) if pool_type == 'avg': for i in range(oh): for j in range(ow): b_np[:,:,i,j] = np.mean(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) elif pool_type =='max': for i in range(oh): for j in range(ow): b_np[:,:,i,j] = np.max(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], axis=(2,3)) b_np = np.maximum(b_np, 0.0) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_pool(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) print(tvm.lower(s, [A, B], simple_mode=True)) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['opengl']: check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): # declare DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, (stride_h, stride_w), padding_args, (dilation, dilation), in_layout, out_layout, dtype) # TODO: add scale_shift implement for NCHWc and add test here Relu = topi.nn.relu(DepthwiseConv2d) # schedule s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Relu], device) # Prepare pod type for test data closure input_shape = (batch, in_channel, in_height, in_width) filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( input_np, filter_np, stride, padding) relu_scipy = np.maximum(depthwise_conv2d_scipy, 0) return (_transform_data(input_np, ic_block), _transform_kernel(filter_np, oc_block), _transform_data(depthwise_conv2d_scipy, oc_block), _transform_data(relu_scipy, oc_block)) # Get the test data (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data() input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) f2(input_tvm, filter_tvm, relu_tvm) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
def get_ref_data(): a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype) b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype) if unipolar: b_ = np.copy(b_np).astype(out_dtype) for x in np.nditer(b_, op_flags=['readwrite']): x[...] = 1 if x == 1 else -1 c_np = np.dot(a_np, b_.T) else: c_np = np.dot(a_np, b_np.T) return a_np, b_np, c_np
def get_ref_data(): a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) if unipolar: w_ = np.copy(w_np).astype(out_dtype) for x in np.nditer(w_, op_flags=['readwrite']): x[...] = 1 if x == 1 else -1 b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) else: b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) return a_np, w_np, b_np
def compute_deformable_conv2d(attrs, inputs, out_dtype, target): """Compute definition of deformable_conv2d""" padding = get_const_tuple(attrs.padding) strides = get_const_tuple(attrs.strides) dilation = get_const_tuple(attrs.dilation) deformable_groups = attrs.deformable_groups groups = attrs.groups out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype with target: out = topi.nn.deformable_conv2d_nchw(inputs[0], inputs[1], inputs[2], strides, padding, dilation, deformable_groups, groups, out_dtype) return [out]
def verify_leaky_relu(m, alpha): A = tvm.placeholder((m,), name='A') B = topi.nn.leaky_relu(A, alpha) s = tvm.create_schedule([B.op]) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha ctx = tvm.cpu(0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], "llvm", name="leaky_relu") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify_deformable_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, deformable_groups=1, groups=1): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, deformable_groups, groups)) A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1 Offset = tvm.placeholder((batch, deformable_groups * kernel * kernel * 2, out_size, out_size), name='offset') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') bias = tvm.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) offset_shape = get_const_tuple(Offset.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) offset_np = np.random.randn(*offset_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np = topi.testing.deformable_conv2d_nchw_python(a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups) return a_np, offset_np, w_np, c_np a_np, offset_np, w_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.deformable_conv2d_nchw(A, Offset, W, stride, padding, dilation, deformable_groups, groups, out_dtype=dtype) s = topi.generic.schedule_deformable_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) offset = tvm.nd.array(offset_np, ctx) w = tvm.nd.array(w_np, ctx) c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx) func = tvm.build(s, [A, Offset, W, C], device) func(a, offset, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) for device in ['llvm', 'cuda']: check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) # build the kernel f = tvm.build(schedule, [Filter, Out_grad, In_grad], device) # prepare pod type for test data closure dtype = Out_grad.dtype out_grad_shape = get_const_tuple(Out_grad.shape) filter_shape = get_const_tuple(Filter.shape) # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_depthwise_conv2d_backward_input.nhwc") def get_ref_data(): out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) dilated_out_grad_np = topi.testing.dilate_python(out_grad_np, [1, stride_h, stride_w, 1]) # padding params in forward propagation fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple([padding_h, padding_w], (filter_h, filter_w)) # padding params in backward propagation bpad_top = filter_h - 1 - fpad_top bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1) bpad_left = filter_w - 1 - fpad_left bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1) padded_out_grad = np.zeros((batch, dilated_out_grad_np.shape[1]+bpad_top+bpad_bottom, dilated_out_grad_np.shape[2]+bpad_left+bpad_right, out_channel)) padded_out_grad[:, bpad_top:dilated_out_grad_np.shape[1]+bpad_top, bpad_left:dilated_out_grad_np.shape[2]+bpad_left, :] = dilated_out_grad_np in_grad_np = np.zeros((batch, in_h, in_w, in_channel)) for b in range(batch): for c in range(in_channel): for m in range(channel_multiplier): in_grad_np[b, :, :, c] += signal.convolve2d(padded_out_grad[b, :, :, c*channel_multiplier+m], \ filter_np[:, :, c, m], mode='valid')[0:in_h, 0:in_w] return (out_grad_np, filter_np, in_grad_np) (out_grad_np, filter_np, in_grad_np) = get_ref_data() out_grad_tvm = tvm.nd.array(out_grad_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), ctx) # launch the kernel timer = f.time_evaluator(f.entry_name, ctx, number=1) tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean tvm.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5)
def verify_leaky_relu(m, alpha): A = tvm.placeholder((m,), name='A') B = topi.cpp.nn.leaky_relu(A, alpha) device = "llvm" target = topi.cpp.TEST_create_target(device) s = topi.cpp.generic.schedule_injective(target, [B]) a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha ctx = tvm.cpu(0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="leaky_relu") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) k = 10.0 dilation = (1, 1) with tvm.target.create(device): A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = topi.nn.conv2d(A, W, stride, padding, dilation) if typ == "add": C = B + k elif typ == "sub": C = B - k elif typ == "mul": C = B * k elif typ == "div": C = B / k else: raise NotImplementedError() s = topi.generic.schedule_conv2d_nchw([C]) foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ) a_npy = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) w_npy = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) b_npy = topi.testing.conv2d_nchw_python(a_npy, w_npy, stride, padding) c_npy = np.random.uniform(size=get_const_tuple(B.shape)).astype(B.dtype) if typ == "add": c_npy = b_npy + k elif typ == "sub": c_npy = b_npy - k elif typ == "mul": c_npy = b_npy * k elif typ == "div": c_npy = b_npy / k else: raise NotImplementedError() a_nd = tvm.nd.array(a_npy, ctx) w_nd = tvm.nd.array(w_npy, ctx) b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx) foo(a_nd, w_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) func1 = tvm.build(s1, [A, W, B], device) func2 = tvm.build(s2, [A, W, C], device) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def verify_softmax(m, n, dtype="float32"): A = tvm.placeholder((m, n), dtype=dtype, name='A') B = topi.nn.softmax(A) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_softmax(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="softmax") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: check_device(device)
def verify_global_pool(n, c, h, w, pool_type): A = tvm.placeholder((n, c, h, w), name='A') B = topi.cpp.nn.global_pool(A, pool_code[pool_type]) B = topi.cpp.nn.relu(B) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) if pool_type == 'avg': b_np = np.mean(a_np, axis=(2,3), keepdims=True) elif pool_type =='max': b_np = np.max(a_np, axis=(2,3), keepdims=True) b_np = np.maximum(b_np, 0.0) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.default_schedule(target, [B], False) else: s = topi.cpp.cuda.schedule_global_pool(target, [B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm']: check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_group_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def verify_relu(m, n, dtype): A = tvm.placeholder((m, n), name='A', dtype=dtype) B = topi.cpp.nn.relu(A) assert B.dtype == dtype a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.schedule_injective(target, [B]) else: s = topi.cpp.cuda.schedule_injective(target, [B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="relu") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm']: check_device(device)
def verify_log_softmax(m, n): A = tvm.placeholder((m, n), name='A') B = topi.nn.log_softmax(A) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.log_softmax_python(a_np) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_softmax(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="log_softmax") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ["opengl"]: check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), (dilation, dilation), layout='NCHW%dc'%ic_block, out_layout="NCHW%dc"%oc_block, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
def verify_softmax(m, n): A = tvm.placeholder((m, n), name='A') B = topi.cpp.nn.softmax(A, 1) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.default_schedule(target, [B], False) else: s = topi.cpp.cuda.schedule_softmax(target, [B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="softmax") foo(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm']: check_device(device)
def verify_global_pool(n, c, h, w, pool_type): A = tvm.placeholder((n, c, h, w), name='A') B = topi.nn.global_pool(A, pool_type=pool_type) B = topi.nn.relu(B) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) if pool_type == 'avg': b_np = np.mean(a_np, axis=(2,3), keepdims=True) elif pool_type =='max': b_np = np.max(a_np, axis=(2,3), keepdims=True) b_np = np.maximum(b_np, 0.0) def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_adaptive_pool(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in get_all_backend(): check_device(device)
def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax): '''Verify region operator by comparing outputs from tvm and numpy implementation''' in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') B = topi.vision.yolo2.region(A, n, classes, coords, background, l_softmax) a_shape = get_const_tuple(A.shape) dtype = A.dtype def get_ref_data_region(): a_np = np.random.uniform(size=a_shape).astype(dtype) b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax) return a_np, b_np a_np, b_np = get_ref_data_region() def check_device(device): '''Cheching devices is enabled or not''' ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.vision.schedule_region([B]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, B], device) func(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['llvm', 'cuda']: check_device(device)
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) with tvm.build_config(auto_unroll_max_step=128, unroll_explicit=device == 'rocm'): func1 = tvm.build(s1, [A, W, B], device) func1(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) func2 = tvm.build(s2, [A, W, C], device) func2(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = topi.nn.conv2d_nchw(A, W, stride, padding) C = topi.nn.relu(B) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d.verify_con2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) c_np = np.maximum(b_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s1 = topi.generic.schedule_conv2d_nchw([B]) s2 = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) with tvm.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) func1(a, w, b) func2(a, w, c) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) for device in ['opengl']: check_device(device)
def compute_conv2d_transpose(attrs, inputs, out_dtype, target): """Compute definition of conv2d_transpose""" padding = get_const_tuple(attrs.padding) strides = get_const_tuple(attrs.strides) dilation = get_const_tuple(attrs.dilation) groups = attrs.groups layout = attrs.data_layout out_dtype = attrs.out_dtype out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "") else out_dtype) assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype) output_padding = get_const_tuple(attrs.output_padding) out = topi.nn.pad(out, [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]]) return [out]
def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') B = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation) C = topi.nn.relu(B) s1 = topi.cuda.schedule_conv2d_hwcn([B]) s2 = topi.cuda.schedule_conv2d_hwcn([C]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_hwcn.verify_hwcn") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) dw_np = topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) b_np = topi.testing.conv2d_hwcn_python(a_np, dw_np, stride, padding) c_np = np.maximum(b_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) func1 = tvm.build(s1, [A, W, B], device) func2 = tvm.build(s2, [A, W, C], device) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if dtype == "float16" and device == "cuda" and not have_fp16( tvm.gpu(0).compute_version): print("Skip because %s does not have fp16 support" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_elemwise(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="relu") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ndhwc_implement) with tvm.target.create(device): B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): b = topi.vision.rcnn.roi_align_nchw(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio) s = topi.generic.schedule_roi_align(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx) f = tvm.build(s, [a, rois, b], device) f(tvm_a, tvm_rois, tvm_b) tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-3)
def split_shape_func(attrs, inputs, _): """ Shape function for split op. """ if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)): indices_or_sections = get_const_int(attrs.indices_or_sections) else: indices_or_sections = get_const_tuple(attrs.indices_or_sections) axis = get_const_int(attrs.axis) num_out = indices_or_sections if isinstance(indices_or_sections, int) \ else len(indices_or_sections) + 1 if isinstance(indices_or_sections, int): indices_or_sections = [indices_or_sections] return [_split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis)) for i in range(num_out)]
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s_func = topi.testing.dispatch(device, _pool_grad_schedule) s = s_func(PoolGrad) a = tvm.nd.array(a_np, ctx) out_grad = tvm.nd.array(out_grad_np, ctx) pool_grad = tvm.nd.array( np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), ctx) f = tvm.build(s, [A, OutGrad, PoolGrad], device) f(a, out_grad, pool_grad) tvm.testing.assert_allclose(pool_grad.asnumpy(), pool_grad_np, rtol=1e-5)
def check_device(device): if not tvm.module.enabled(device.split(' ')[0]): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) ctx = tvm.context(device.split(' ')[0], 0) with tvm.target.create(device): B = topi.cuda.conv2d_cuda(A, W, stride, 0, layout='NCHW') # Return NCHW s = topi.cuda.schedule_conv2d_nchw([B]) # Borrow NCHW schedule a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], target="cuda", target_host="llvm") func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.context(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # Measure time cost of kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) print("Stride = (%d, %d)" % (stride_h, stride_w)) print("padding = %s\n" % padding) print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape))) print("average time cost of 1000 runs (depthwise_conv2d) = %g us" % (tcost_1*1e6)) print("average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" % (tcost_2*1e6)) print("average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" % (tcost_3*1e6)) # correctness depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(input_np, filter_np, stride=[stride_h, stride_w], padding=padding) scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape)) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] relu_scipy = np.maximum(scale_shift_scipy, 0) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print("success")
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) for fcompute, fschedule in topi.testing.dispatch( device, _dense_implement): with tvm.target.create(device): D = fcompute(A, B, C if use_bias else None) D = topi.nn.relu(D) s = fschedule([D]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(c_np, ctx) d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): B = topi.nn.conv2d(A, W, (stride, stride), padding, (dilation, dilation), layout='NHWC', out_dtype=dtype) s = topi.generic.schedule_conv2d_nhwc([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return with tvm.target.create(device): B = topi.nn.conv1d(A, W, stride, padding, dilation, layout, 'float32') if layout == 'NCW': s = topi.generic.schedule_conv1d_ncw([B]) else: s = topi.generic.schedule_conv1d_nwc([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.schedule_dense(target, [D]) elif device == "rocm": s = topi.cpp.rocm.schedule_dense(target, [D]) else: s = topi.cpp.cuda.schedule_dense(target, [D]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(c_np, ctx) d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if not nvcc.have_tensorcore(ctx.compute_version): print("skip because gpu does not support Tensor Cores") return print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = topi.testing.dispatch( device, _conv3d_ndhwc_tensorcore_implement) C = fcompute(A, W, stride, padding, dilation, 'float32') if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
def run_tvm(nwarmup: int, nloops: int, args: dict, out, verbose: bool = False, debug: bool = False, scheduling=None) -> Result: """ Take dict[TVM_Tensor, np_array] as args, convert them to TVM tensors and call `lam`. Result of lambda is converted back to numpy array and returned. """ ctx = tvm.cpu(0) pls = [] # placeholders vals_nd = [] # initial values for pl, val in args.items(): pls.append(pl) vals_nd.append(tvm.nd.array(val, ctx=ctx)) sout = tvm.create_schedule(out.op) scheduling(sout) if scheduling is not None else None ir = tvm.lower(sout, pls + [out], simple_mode=True) # print(type(ir), ir.__str__) print(ir) if debug else None mout = tvm.build(sout, pls + [out]) out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx=ctx) perfs: List[float] = [] for i in range(nwarmup + nloops): tb = perf_counter() mout(*(vals_nd + [out_nd])) te = perf_counter() if i >= nwarmup: perfs.append(te - tb) if verbose: print("TVM", te - tb) return Result.fromPasses(out_nd.asnumpy(), perfs)
def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax): '''Verify region operator by comparing outputs from tvm and numpy implementation''' in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') B = topi.cpp.yolo.region(A, n, classes, coords, background, l_softmax) a_shape = get_const_tuple(A.shape) dtype = A.dtype def get_ref_data_region(): '''Randomly initialize the data variables and get refernce output for the region operation''' a_np = np.random.uniform(size=a_shape).astype(dtype) b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax) return a_np, b_np a_np, b_np = get_ref_data_region() def check_device(device): '''Check the device is available and if so, build and run the program''' if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.default_schedule(target, [B], False) else: s = topi.cpp.rocm.schedule_region(target, [B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, B], device, name="region") func(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']: check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): if bgemm == "direct": fcompute, fschedule = topi.testing.dispatch( device, _conv2d_nhwc_winograd_direct) elif bgemm == "tensorcore": fcompute, fschedule = topi.testing.dispatch( device, _conv2d_nhwc_winograd_tensorcore) C = fcompute(A, W, stride, padding, dilation, 'float32') if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=2e-3)
def conv1d_strategy(attrs, inputs, out_type, target): """conv1d generic strategy""" logger.warning("conv1d is not optimized for this platform.") layout = attrs.data_layout dilation = get_const_tuple(attrs.dilation) if dilation[0] < 1: raise ValueError("dilation should be a positive value") strategy = _op.OpStrategy() if layout == "NCW": strategy.add_implementation( wrap_compute_conv1d(topi.nn.conv1d_ncw), wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), name="conv1d_ncw.generic") elif layout == "NWC": strategy.add_implementation( wrap_compute_conv1d(topi.nn.conv1d_nwc), wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), name="conv1d_nwc.generic") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy
def squeeze_shape_func(attrs, inputs, _): """ Shape function for squeeze op. """ axis = attrs.axis if attrs.axis is None else get_const_tuple(attrs.axis) keep_axes = [] if axis is not None: for i in range(inputs[0].shape[0].value): if i not in axis: keep_axes.append(i) # Due to current relay type system, it is possible even # a static kernel function needs shape function. To handle # this case, we allow axis to be None in squeeze shape func # for now. # TODO(kevinthesun): Enhance relay type system to avoid this. if keep_axes: out = _squeeze_shape_func(inputs[0], convert(keep_axes)) else: out = te.compute((), lambda *indices: 0) return [out]
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.create(device): D = topi.nn.dense(A, B, C if use_bias else None, out_dtype=out_dtype) D = topi.nn.relu(D) s = topi.generic.schedule_dense([D]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(c_np, ctx) d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), ctx) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax): '''Verify region operator by comparing outputs from tvm and numpy implementation''' in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') B = topi.vision.yolo2.region(A, n, classes, coords, background, l_softmax) a_shape = get_const_tuple(A.shape) dtype = A.dtype def get_ref_data_region(): a_np = np.random.uniform(size=a_shape).astype(dtype) b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax) return a_np, b_np a_np, b_np = get_ref_data_region() def check_device(device): '''Cheching devices is enabled or not''' ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': s = topi.generic.vision.schedule_region([B]) else: s = topi.cuda.vision.schedule_region([B]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, B], device) func(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['llvm', 'cuda']: check_device(device)
def verify_shortcut(batch, in_size, in_channel): '''Verify shortcut operator by comparing outputs from tvm and numpy implementation''' in_height = in_width = in_size A1 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A1') A2 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A2') B = topi.vision.shortcut(A1, A2) a_shape = get_const_tuple(A1.shape) dtype = A1.dtype def get_ref_data_shortcut(): a_np1 = np.random.uniform(size=a_shape).astype(dtype) a_np2 = np.random.uniform(size=a_shape).astype(dtype) b_np = topi.testing.shortcut_python(a_np1, a_np2) return a_np1, a_np2, b_np a_np1, a_np2, b_np = get_ref_data_shortcut() def check_device(device): '''Cheching devices is enabled or not''' ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_injective([B]) a1 = tvm.nd.array(a_np1, ctx) a2 = tvm.nd.array(a_np2, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A1, A2, B], device) func(a1, a2, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['llvm', 'cuda']: check_device(device)
def weight_prepack_conv2d(attrs, inputs, tinfos): import ast data = tinfos[0] kernel = tinfos[1] padding = ast.literal_eval(attrs['padding']) stride = ast.literal_eval(attrs['strides']) wkl = _get_workload(data, kernel, stride, padding, 'float32') sch = _get_schedule_conv(wkl) is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd) ic_bn, oc_bn = sch.ic_bn, sch.oc_bn new_attrs = {k: attrs[k] for k in attrs.keys()} new_attrs.pop('layout', None) kernel_sym = inputs[1] oc, ic, h, w = get_const_tuple(tinfos[1].shape) OC = oc // oc_bn IC = ic // ic_bn trans_kernel = sym.transpose(kernel_sym, axes=(1, 2, 3, 0)) trans_kernel = sym.reshape(trans_kernel, shape=(ic, h, w, OC, oc_bn)) trans_kernel = sym.transpose(trans_kernel, axes=(1, 2, 3, 4, 0)) trans_kernel = sym.reshape(trans_kernel, shape=(h, w, OC, oc_bn, IC, ic_bn)) if is_kernel_1x1: # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w) trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 5, 3, 0, 1)) else: # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 0, 1, 5, 3)) if attrs.get_bool('use_bias'): bias = inputs[2] bias = sym.reshape(bias, shape=(OC, oc_bn)) return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel, bias, **new_attrs) else: return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel, **new_attrs)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8( ctx.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_group_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \ (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def verify_relu(m, n): A = tvm.placeholder((m, n), name='A') B = topi.nn.relu(A) s = topi.cuda.schedule_elemwise(B) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="relu") foo(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal']: check_device(device)
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) ctx = tvm.context(device, 0) input = tvm.nd.array(input_np, ctx) filter_d = tvm.nd.array(filter_np_d, ctx) filter_1 = tvm.nd.array(filter_np_1, ctx) output = tvm.nd.array( np.zeros(get_const_tuple(Output.shape), dtype=Output.dtype), ctx) with tvm.target.create(device): if layout == "NCHW": s = schedule_depth_1by1_fused_nchw([Output]) else: s = schedule_depth_1by1_fused_nhwc([Output]) print( tvm.lower(s, [Input, Filter_d, Filter_1, Output], simple_mode=True)) func = tvm.build(s, [Input, Filter_d, Filter_1, Output], device, name=("Depthwise1by1Fused_%d_%d" % (Input.shape[1], Input.shape[2]))) print(func.imported_modules[0].get_source()) # func(a, w, b) timer_1 = func.time_evaluator(func.entry_name, ctx, number=10) tcost_1 = timer_1(input, filter_d, filter_1, output).mean # np.testing.assert_allclose(output.asnumpy(), output_np, rtol=1e-5) d = ~np.isclose(output.asnumpy(), output_np, rtol=1e-5) print(output.asnumpy()[d]) print(output_np[d]) print(np.where(d)) print( "Depthwise & 1by1 Fused ({}): average running time is {:.2f} us.". format(layout, tcost_1 * 1e6))
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), (dilation, dilation), layout='NCHW%dc'%ic_block, out_layout="NCHW%dc"%oc_block, out_dtype=dtype) s = topi.generic.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) # print(tvm.lower(s, [A, W, C], simple_mode=True)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): B = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NHWC', out_dtype="int32") s = topi.x86.schedule_conv2d_nhwc_pack_int8([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def verify_global_pool(dshape, pool_type, layout='NCHW'): """verify function of global_pool""" assert layout in ["NCHW", "NHWC"] A = te.placeholder(shape=dshape, name='A') B = topi.nn.global_pool(A, pool_type=pool_type, layout=layout) B = topi.nn.relu(B) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) axis = (layout.find('H'), layout.find('W')) if pool_type == 'avg': b_np = np.mean(a_np, axis=axis, keepdims=True) elif pool_type == 'max': b_np = np.max(a_np, axis=axis, keepdims=True) b_np = np.maximum(b_np, 0.0) def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s_func = topi.testing.dispatch(device, _adaptive_pool_schedule) if device == "cuda": s = s_func(B, layout) else: s = s_func(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in get_all_backend(): check_device(device)
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ncdhw_implement) with tvm.target.create(device): C = fcompute(A, W, (stride, stride, stride), padding, (dilation, dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, space_kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, space_kernel, stride, padding_sum, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)
def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return if default_schedule: device += " -libs=cudnn" print("Running on target: %s" % device) with tvm.target.create(device): B = topi.cuda.conv2d.conv2d_cuda( autotvm.get_config(), A, W, stride, 0, dilation=1, algo=cudnn_algo) if default_schedule else topi.nn.conv2d_nhwc( A, W, stride, padding, dilation=1) s = topi.cuda.schedule_conv2d_nchw_cuda( None, [B]) if default_schedule else schedule_conv2d_nhwc([B], A) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, W, B], device, name=("Conv2d_%d_%d" % (in_height, in_width))) # func(a, w, b) timer_1 = func.time_evaluator(func.entry_name, ctx, number=10) tcost_1 = timer_1(a, w, b).mean np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) print("1x1 convolution: average running time is {:.2f} us.".format( tcost_1 * 1e6))