Example #1
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1_writecachev1(C):
    bn = 32
    packedB = tvm.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')
    C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    # Allocate write cache
    CC = s.cache_write(C, 'global')
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    # Write cache is computed at yo
    s[CC].compute_at(s[C], yo)
    ko, ki = s[CC].split(s[CC].op.reduce_axis[0], factor=32, nparts=None)

    # New inner axes
    xc, yc = s[CC].op.axis
    s[CC].reorder(ko, xc, ki, yc)
    s[CC].vectorize(yc)
    # s[CC].parallel(ko) cannot be applied here: ko is a reduction axis, and
    # parallelizing it directly would race on CC (see the rfactor sketch below)
    s[CC].unroll(ki)

    s[C].parallel(xo)
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    s[packedB].unroll(y)
    return s
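# The commented-out s[CC].parallel(ko) above fails because ko is a reduction
# axis: parallel iterations would race on the same element of CC. A
# self-contained sketch of the standard rfactor workaround (illustrative size,
# same TVM 0.x API as the rest of these examples; not part of the original
# snippet):
import tvm

n = 1024
X = tvm.placeholder((n,), name='X')
r = tvm.reduce_axis((0, n), name='r')
Y = tvm.compute((1,), lambda i: tvm.sum(X[r], axis=r), name='Y')
sr = tvm.create_schedule(Y.op)
ro, ri = sr[Y].split(Y.op.reduce_axis[0], factor=32)
YF = sr.rfactor(Y, ro)              # partial sums over ro become a data axis
sr[YF].parallel(sr[YF].op.axis[0])  # now legal: partial sums are independent
print(tvm.lower(sr, [X, Y], simple_mode=True))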
Example #2
def Gemm_v4(C):
    # Tiling (blocking), reordering, Vectorization, Loop Permutation, Array Packing
    s = tvm.create_schedule(C.op)
    bn = 32
    k, = s[C].op.reduce_axis
    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)

    ko, ki = s[C].split(k, factor=4)
    s[C].reorder(xo, yo, ko, xi, ki, yi)
    s[C].vectorize(yi)
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    print("Tiling、reordering、Vectorization、Loop Permutation、Array Packing")
    return s
Example #3
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1(C):
    bn = 32
    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    # The tile primitive only supports 2-D tiling
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    # Split the third dimension, the reduction axis K, this way
    ko, ki = s[C].split(s[C].op.reduce_axis[0], factor=32, nparts=None)
    # Note that bn and factor both give the extent of the corresponding inner axis
    # Here we try changing the order of the outer axes to see what happens
    s[C].reorder(xo, yo, ko, xi, ki, yi)
    # Experiments show vectorize should target the innermost axis (for both validity and performance)
    s[C].vectorize(yi)
    # Experiments show parallel should be applied to the outermost loop axis
    s[C].parallel(xo)
    # Experiments show unroll works best on the second-innermost axis
    s[C].unroll(ki)

    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    s[packedB].unroll(y)
    return s
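# To see what the heuristics in the comments above do to the loop nest, print
# the lowered IR. A self-contained sketch (assumed sizes; simplified to an
# unpacked GEMM so it runs on its own, using the same schedule primitives):
import tvm

M = N = K = 1024
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
k = tvm.reduce_axis((0, K), name='k')
C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), name='C')
s = tvm.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], 32, 32)
ko, ki = s[C].split(s[C].op.reduce_axis[0], factor=32)
s[C].reorder(xo, yo, ko, xi, ki, yi)
s[C].vectorize(yi)
s[C].parallel(xo)
s[C].unroll(ki)
print(tvm.lower(s, [A, B, C], simple_mode=True))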
Example #4
def Gemm_v6(C):
    # Tiling, reordering, Vectorization, Loop Permutation, Array Packing, Write cache for blocks, Parallel
    bn = 32
    s = tvm.create_schedule(C.op)
    k, = s[C].op.reduce_axis
    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    # Furthermore, we can also utilize multi-core processors to do the thread-level parallelization.
    CC = s.cache_write(C, 'global')
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    s[CC].compute_at(s[C], yo)
    xc, yc = s[CC].op.axis
    k, = s[CC].op.reduce_axis
    ko, ki = s[CC].split(k, factor=4)
    s[CC].reorder(ko, xc, ki, yc)
    s[CC].unroll(ki)
    s[CC].vectorize(yc)
    # parallel
    s[C].parallel(xo)
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    print(
        "Tiling, reordering, Vectorization, Loop Permutation, Array Packing, Write cache for blocks, Parallel"
    )
    return s
Example #5
def Gemm_v5(C):
    # Tiling, reordering, Vectorization, Loop Permutation, Array Packing, Write cache for blocks
    bn = 32
    s = tvm.create_schedule(C.op)
    k, = s[C].op.reduce_axis
    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    # Allocate write cache
    CC = s.cache_write(C, 'global')
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    # Write cache is computed at yo
    s[CC].compute_at(s[C], yo)
    # New inner axes
    xc, yc = s[CC].op.axis
    k, = s[CC].op.reduce_axis
    ko, ki = s[CC].split(k, factor=4)
    s[CC].reorder(ko, xc, ki, yc)
    s[CC].unroll(ki)
    s[CC].vectorize(yc)
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    print(
        "Tiling, reordering, Vectorization, Loop Permutation, Array Packing, Write cache for blocks"
    )
    return s
Example #6
def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 1D convolution ncw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        3-D with shape [batch, in_channel, inp_width]
    Filter : tvm.Tensor
        3-D with shape [in_channel, num_filter, kernel_size]
    stride : tuple of one int
        The spatial stride along width
    padding : int, tuple, or string
        int: padding size
        tuple of 2 ints: (pad_left, pad_right) for left and right padding
        string: ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        3-D with shape [batch, out_channel, out_width]
    """
    if isinstance(stride, (tuple, list)):
        stride = stride[0]
    cfg.stride = stride
    batch, inp_channels, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_size = get_const_tuple(kernel.shape)
    pad_left, pad_right = nn.get_pad_tuple1d(padding, kernel_size)
    out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right
    pad_left = kernel_size - 1 - pad_left
    pad_right = kernel_size - 1 - pad_right
    dilated_width = stride * (inp_width - 1) + 1
    data = tvm.compute(
        (batch, inp_channels, pad_left + dilated_width + pad_right),
        lambda n, c, x: tvm.if_then_else(
            tvm.all(x >= pad_left, x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride).equal(0)), data[
                        n, c, tvm.indexdiv(x - pad_left, stride)],
            tvm.const(0., "float32")),
        name='data_pad')

    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dw = tvm.reduce_axis((0, kernel_size), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: tvm.sum(data[b, dc, w + dw].astype(out_dtype) * kernel[
            dc, c, kernel_size - 1 - dw].astype(out_dtype),
                                axis=[dc, dw]),
        tag="conv1d_transpose_ncw")

    return data_out
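# A quick numeric check (hypothetical sizes, not from the source) of the shape
# arithmetic above: the transposed convolution is rewritten as a normal
# convolution over a dilated, re-padded input, and the two formulations must
# agree on out_width.
inp_width, stride, kernel_size = 8, 2, 3
pad_left, pad_right = 1, 1                                                 # get_pad_tuple1d(1, 3)
out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right  # 15
pad_left = kernel_size - 1 - pad_left                                      # flipped padding: 1
pad_right = kernel_size - 1 - pad_right                                    # flipped padding: 1
dilated_width = stride * (inp_width - 1) + 1                               # 15
padded_width = pad_left + dilated_width + pad_right                        # 17
assert out_width == padded_width - kernel_size + 1                         # 15 == 17 - 3 + 1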
Example #7
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1_writecachev1_config_define(
        N, K, M, dtype):
    A = tvm.placeholder((N, K), name='A', dtype=dtype)
    B = tvm.placeholder((K, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, K), name='k')

    bn = 32
    packedB = tvm.compute((N / bn, K, bn),
                          lambda x, y, z: B[y, x * bn + z],
                          name='packedB')
    C = tvm.compute(
        (M, N),
        lambda x, y: tvm.sum(
            A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    k = s[C].op.reduce_axis[0]
    C = s.cache_write(C, 'global')
    y, x = s[C].op.axis

    cfg = autotvm.get_config()
    #cfg.define_knob("bn",candidate=[1,2,4,8,16,32,64])
    cfg.define_split("tile_x", x, num_outputs=2)
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_k", k, num_outputs=2)
    # cfg.define_split('tile_x', x, policy='factors', filter=lambda x: x.size[-1] <= 64)
    # cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 64)
    # cfg.define_split('tile_k', k, policy='factors', filter=lambda x: x.size[-1] <= 64)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    ko, ki = cfg["tile_k"].apply(s, C, k)
    # cfg.define_knob("tile_x", [1, 4, 8, 16, 32, 64])
    # cfg.define_knob("tile_y", [1, 4, 8, 16, 32, 64])
    # cfg.define_knob("tile_k", [1, 4, 8, 16, 32, 64])
    # xo, xi = s[C].split(x, cfg['tile_x'].val)
    # yo, yi = s[C].split(y, cfg['tile_y'].val)
    # ko, ki = s[C].split(k, cfg['tile_k'].val)

    s[C].reorder(xo, yo, ko, xi, ki, yi)
    s[C].vectorize(yi)
    s[C].parallel(xo)
    s[C].unroll(ki)
    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)
    s[packedB].unroll(y)
    return s, [A, B, C]
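# A minimal tuning driver sketch for the template above. This assumes the
# function is registered with @autotvm.template and that an llvm target is
# available; sizes and the log file name are placeholders, and the driver
# calls are the standard autotvm API from the TVM tutorials.
task = autotvm.task.create(
    Gemm_tv2_reorder2_3_vec1_para1_unrollv1_packv1_writecachev1_config_define,
    args=(1024, 1024, 1024, 'float32'), target='llvm')
measure_option = autotvm.measure_option(
    builder='local', runner=autotvm.LocalRunner(number=5))
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('gemm_tune.log')])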
Example #8
def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]
    Filter : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]
    strides : tuple of two ints
        The spatial stride along height and width
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_height, kernel_width = get_const_tuple(kernel.shape)
    stride_height, stride_width = stride
    cfg.stride = stride
    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(
        padding, (kernel_height, kernel_width))

    out_width = (inp_width - 1) * stride_width + \
                kernel_width - pad_left - pad_right
    pad_left = kernel_width - 1 - pad_left
    pad_right = kernel_width - 1 - pad_right
    dilated_width = stride_width * (inp_width - 1) + 1

    out_height = (inp_height - 1) * stride_height + \
                 kernel_height - pad_top - pad_bottom
    pad_top = kernel_height - 1 - pad_top
    pad_bottom = kernel_height - 1 - pad_bottom
    dilated_height = stride_height * (inp_height - 1) + 1

    # compute pad
    data = tvm.compute(
        (batch, inp_channels,
         pad_top + dilated_height + pad_bottom,
         pad_left + dilated_width + pad_right),
        lambda n, c, y, x: tvm.if_then_else(
            tvm.all(x >= pad_left,
                    x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride_width).equal(0),
                    y >= pad_top,
                    y < pad_top + dilated_height,
                    tvm.indexmod(y - pad_top, stride_height).equal(0)),
            data[n, c,
                 tvm.indexdiv(y - pad_top, stride_height),
                 tvm.indexdiv(x - pad_left, stride_width)],
            tvm.const(0., "float32")),
        name='data_pad')

    # compute transposed conv
    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dh = tvm.reduce_axis((0, kernel_height), name='dh')
    dw = tvm.reduce_axis((0, kernel_width), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_height, out_width),
        lambda b, c, h, w: tvm.sum(
            data[b, dc, h + dh, w + dw].astype(out_dtype) *
            kernel[dc,
                   c,
                   kernel_height - 1 - dh,
                   kernel_width - 1 - dw].astype(out_dtype),
            axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")

    return data_out
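# A minimal call sketch for the operator above (hypothetical shapes; assumes
# the same tvm/autotvm/topi imports as the example, and that a fallback
# autotvm config outside of tuning is acceptable):
data = tvm.placeholder((1, 16, 7, 7), name='data')       # [batch, in_channel, H, W]
kernel = tvm.placeholder((16, 32, 4, 4), name='kernel')  # [in_channel, num_filter, kH, kW]
cfg = autotvm.get_config()
out = conv2d_transpose_nchw_cuda(cfg, data, kernel,
                                 stride=(2, 2), padding=1,
                                 out_dtype='float32')
# Expected output shape: (1, 32, 14, 14), since (7 - 1) * 2 + 4 - 1 - 1 = 14.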
Example #9
###################################################################################################
# Just as it is shown in the figure above, after blocking the computations, we can observe the array
# access pattern of B (after flattening), which is regular but discontinuous. We expect that after
# some transformation we can get a continuous access pattern. We can reorder a [16][16] array to
# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabbing
# the corresponding value from the packed array.
#
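# As a quick NumPy sanity check of this layout change (hypothetical 16x16 B
# with bn = 4; not part of the original tutorial), the packed array is just a
# reshape plus a transpose, and indexes exactly like packedB below:
import numpy as np
B_np = np.arange(16 * 16).reshape(16, 16)            # [K][N]
packed = B_np.reshape(16, 4, 4).transpose(1, 0, 2)   # [N/bn][K][bn]
assert packed[2, 5, 3] == B_np[5, 2 * 4 + 3]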

# We have to re-write the algorithm slightly.
packedB = tvm.compute((N / bn, K, bn),
                      lambda x, y, z: B[y, x * bn + z],
                      name='packedB')
C = tvm.compute(
    (M, N),
    lambda x, y: tvm.sum(
        A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
    name='C')

s = tvm.create_schedule(C.op)

xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
k, = s[C].op.reduce_axis
ko, ki = s[C].split(k, factor=4)

s[C].reorder(xo, yo, ko, xi, ki, yi)
s[C].vectorize(yi)

x, y, z = s[packedB].op.axis
s[packedB].vectorize(z)
s[packedB].parallel(x)
Example #10
s = tvm.create_schedule(CMatrix.op)
f = tvm.build(s, [MatrixA, MatrixB, CMatrix], target=target, name='mmult')
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
f(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
evaluator = f.time_evaluator(f.entry_name, ctx, number=1)
print('Without optimizations: %f' % evaluator(a, b, c).mean)

bn = 32
packedMatrixB = tvm.compute((N / bn, K, bn),
                            lambda x, y, z: MatrixB[y, x * bn + z],
                            name='packedMatrixB')
CMatrix = tvm.compute(
    (M, N),
    lambda x, y: tvm.sum(MatrixA[x, k] * packedMatrixB[y // bn, k,
                                                       tvm.indexmod(y, bn)],
                         axis=k),
    name='CMatrix')
s = tvm.create_schedule(CMatrix.op)
newCMatrix = s.cache_write(CMatrix, 'global')
xo, yo, xi, yi = s[CMatrix].tile(CMatrix.op.axis[0], CMatrix.op.axis[1], bn,
                                 bn)
s[newCMatrix].compute_at(s[CMatrix], yo)
xc, yc = s[newCMatrix].op.axis
k, = s[newCMatrix].op.reduce_axis
ko, ki = s[newCMatrix].split(k, factor=4)
s[newCMatrix].reorder(ko, xc, ki, yc)
s[newCMatrix].unroll(ki)
s[newCMatrix].vectorize(yc)
#s[CMatrix].parallel(xo)
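# The snippet stops before rebuilding; a sketch of the matching
# build-and-measure step, mirroring the baseline run above (same a, b, ctx,
# target, and answer assumed to still be in scope):
f = tvm.build(s, [MatrixA, MatrixB, CMatrix], target=target, name='mmult')
c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
f(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
evaluator = f.time_evaluator(f.entry_name, ctx, number=10)
print('With packing and write cache: %f' % evaluator(a, b, c).mean)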
Example #11
def fused_convs(input_data, filters, resnet_block=False):

	out_dtype = input_data.dtype

	Input = None
	nodes = [input_data]
	params = [input_data]

	for f in filters:
		Input = nodes[-1]
		Filter = f.placeholder
		layout = f.layout
		depthwise = f.depthwise
		kernel = f.kernel
		stride = f.stride
		padding = f.padding
		dilation = f.dilation

		assert not (depthwise and kernel == 1) # Don't consider 1x1 depthwise

		padded_count = 0
		conv_count = 0
		depthwise_count = 0

		if isinstance(stride, int):
			stride_h = stride_w = stride
		else:
			stride_h, stride_w = stride

		if isinstance(dilation, int):
			dilation_h = dilation_w = dilation
		else:
			dilation_h, dilation_w = dilation

		batch, in_height, in_width, in_channel = Input.shape
		if f.NHWC_transpose: # HWOI
			kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
		else: # HWIO
			kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
		if depthwise:
			channel_multiplier = tmp
		else:
			num_filter = tmp

		# compute the output shape
		dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
		dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
		pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
			padding, (dilated_kernel_h, dilated_kernel_w))

		out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
		out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
		out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

		if f.kernel > 1:
			print("Padding is needed!")

			pad_before = [0, pad_top, pad_left, 0]
			pad_after = [0, pad_down, pad_right, 0]

			PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput_{}".format(padded_count))
			padded_count += 1
			nodes.append(PaddedInput)

			# Update Input
			Input = PaddedInput
			batch, in_height, in_width, in_channel = Input.shape

		if not depthwise:
			rc = tvm.reduce_axis((0, in_channel), name='rc')
		if kernel > 1:
			ry = tvm.reduce_axis((0, kernel_h), name='ry')
			rx = tvm.reduce_axis((0, kernel_w), name='rx')

		if not depthwise: # Normal convolution
			if kernel > 1:
				Output = tvm.compute(
				(batch, out_height, out_width, out_channel),
				lambda nn, yy, xx, ff: tvm.sum(
					Input[nn, yy * stride_h + ry * dilation_h,
								xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
					(Filter[ry, rx, ff, rc] if f.NHWC_transpose else Filter[ry, rx, rc, ff]).astype(out_dtype), axis=[ry, rx, rc]),
					name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
			else: # Only reduce rc axis
				Output = tvm.compute(
				(batch, out_height, out_width, out_channel),
				lambda nn, yy, xx, ff: tvm.sum(
					Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
					(Filter[0, 0, ff, rc] if f.NHWC_transpose else Filter[0, 0, rc, ff]).astype(out_dtype), axis=[rc]),
					name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
			conv_count += 1
		else: # Depthwise convolution (kernel > 1)
			Output = tvm.compute(
			(batch, out_height, out_width, out_channel),
			lambda b, i, j, c: tvm.sum(
				(Input[b, i*stride_h + ry*dilation_h, j*stride_w + rx*dilation_w,
							 tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
				(Filter[ry, rx, tvm.indexmod(c, channel_multiplier), tvm.indexdiv(c, channel_multiplier)] if f.NHWC_transpose else Filter[ry, rx, tvm.indexdiv(c, channel_multiplier), tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
				axis=[ry, rx]),
			name='DepthwiseConv2dOutput_{}'.format(depthwise_count), tag="depthwise_nhwc")
			depthwise_count += 1

		nodes.append(Output)
		params.append(Filter)

	if resnet_block:
		First = nodes[0]
		Last = nodes[-1]
		assert (First.shape == Last.shape)
		Output = tvm.compute(
			(batch, out_height, out_width, out_channel),
			lambda b, i, j, c: (First[b, i, j, c].astype(out_dtype) +
								Last[b, i, j, c].astype(out_dtype)),
			name='ElementwiseAddOutput_{}'.format(depthwise_count), tag="elem_nhwc")
		nodes.append(Output)

	params.append(nodes[-1]) # Final output
	return nodes, params
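# fused_convs expects each element of `filters` to expose the attributes read
# in the loop above. A hypothetical descriptor matching those accesses (the
# original container type is not shown in the source):
from collections import namedtuple

FilterParams = namedtuple(
    'FilterParams',
    ['placeholder', 'layout', 'depthwise', 'kernel',
     'stride', 'padding', 'dilation', 'NHWC_transpose'])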
Example #12
np_repeat = 100
np_running_time = timeit.timeit(setup='import numpy\n'
                                      'M = ' + str(M) + '\n'
                                      'K = ' + str(K) + '\n'
                                      'N = ' + str(N) + '\n'
                                      'dtype = "float32"\n'
                                      'a = numpy.random.rand(M, K).astype(dtype)\n'
                                      'b = numpy.random.rand(K, N).astype(dtype)\n',
                                stmt='answer = numpy.dot(a, b)',
                                number=np_repeat)
print("Numpy running time: %f" % (np_running_time / np_repeat))

answer = numpy.dot(a.asnumpy(), b.asnumpy())

# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')

bn = 32

packedB = tvm.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k),
                name = 'C')

s = tvm.create_schedule(C.op)

# Allocate write cache
CC = s.cache_write(C, 'global')

xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)

# Write cache is computed at yo
s[CC].compute_at(s[C], yo)

# New inner axes
xc, yc = s[CC].op.axis