def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding,
                                       dilation, out_dtype=None, args={}):
    """Depthwise convolution operator in NHWC layout (HWOI filter)."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, channels = Input.shape
    kernel_h, kernel_w, _, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    channel_block = 4
    channel_chunk = channels // channel_block
    num_filter_chunk = 1

    # compute:
    Input = te.compute(
        [batch, in_height, in_width, channel_chunk, channel_block],
        lambda nn, yy, xx, icc, icb: Input[nn, yy, xx, icc * 4 + icb],
        name="input_pack",
        tag="input_pack",
    )
    Filter = te.compute(
        [kernel_h, kernel_w, channel_chunk, num_filter_chunk, channel_block],
        lambda kh, kw, ifc, nfc, cb: Filter[kh, kw, ifc * 4 + cb, nfc],
        name="filter_pack",
        tag="filter_pack",
    )

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]
    # calculation of the real used input size
    # (in NHWC the height pad sits at axis 1 and the width pad at axis 2):
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, channel_chunk, channel_block),
        lambda nn, yy, xx, ffc, ffb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, ffc, ffb] *
             Filter[ry, rx, ffc, 0, ffb]).astype(args["accumulator"]),
            axis=[ry, rx],
        ),
        tag="depthwise_conv2d_nhwc",
    )
    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, channel_chunk, channel_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_height_orig, out_width_orig, channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
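# A small standalone sketch (plain Python, illustrative only; the helper name
# is hypothetical) of the output expansion and pad trimming used above: round
# the output up to a multiple of 4 for the later split, then shrink the
# trailing pad so the padded input exactly covers the *original* output
# extent.
def _example_expand_and_trim(in_size=14, kernel=3, stride=2, pads=(1, 1),
                             dilation=1):
    dilated_kernel = (kernel - 1) * dilation + 1
    out_orig = (in_size - dilated_kernel + pads[0] + pads[1]) // stride + 1
    out = out_orig
    if out % 2 != 0:
        out += 1
    if out % 4 != 0:
        out += 2
    # last padded element actually read when producing out_orig outputs
    latest = (out_orig - 1) * stride + dilated_kernel
    pad_after = pads[1]
    if latest < in_size + pads[0] + pad_after:
        pad_after -= in_size + pads[0] + pad_after - latest
    # with the defaults: out_orig == 7 is expanded to out == 8, while
    # pad_after shrinks from 1 to 0 because the last real output only reads
    # up to padded index 14
    return out_orig, out, pad_after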
def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation,
                               out_dtype=None, args={}):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    # calculation of the real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # When tuning, insert a cache_read("texture") stage to properly test
    # performance of kernels that utilize texture inputs. The cache_read
    # is not needed when using the graph_runtime, which supports passing
    # in external texture buffers. This can be removed once AutoTVM tuning
    # supports capturing this runtime information during task extraction
    # or once texture lowering in tir.TextureFlatten supports cache_read
    # cancellation when padding is utilized.
    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x KCRSk
        # texture: NCH|W|c
        # texture: K|CRS|k
        Filter_tx = te.compute(
            (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
            lambda ffc, crs, ffb: Filter[
                ffc, crs // (kernel_h * kernel_w),
                (crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
            name="packed_filter")
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, rcb] *
                 Filter_tx[ffc,
                           ((rcc * in_channel_block + rcb) * kernel_h + ry)
                           * kernel_w + rx, ffb]).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )
    else:
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, rcb] *
                 Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb]
                 ).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )

    return te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig,
         num_filter_block),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
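# A quick self-check (plain Python, illustrative only; the helper name is
# hypothetical) that the flattened CRS index used to read Filter_tx above
# inverts the unflattening performed in packed_filter:
# crs = (c * KH + r) * KW + s decomposes back to (c, r, s).
def _check_crs_roundtrip(channels=8, kernel_h=3, kernel_w=3):
    for c in range(channels):
        for r in range(kernel_h):
            for s in range(kernel_w):
                crs = (c * kernel_h + r) * kernel_w + s
                assert crs // (kernel_h * kernel_w) == c
                assert (crs // kernel_w) % kernel_h == r
                assert crs % kernel_w == s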
def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding,
                                         dilation, out_dtype=None, args={}):
    """Depthwise convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, channel_chunk, in_height, in_width, channel_block = Input.shape
    _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel_chunk = simplify(channel_chunk * channel_multiplier)
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x CMRSc = [N, (C//4)M, OH, OW, 4c]
        # NCHWc x CMRS
        # texture: NCH|W|c
        # texture: C|MRS|c
        Filter_tx = te.compute(
            (channel_chunk, channel_multiplier * kernel_h * kernel_w,
             channel_block),
            lambda ffc, mrs, ffb: Filter[
                ffc, mrs // (kernel_h * kernel_w),
                (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb],
            name="packed_filter")
        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier,
                      yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, ffb] *
                 Filter_tx[ffc // channel_multiplier,
                           ((ffc % channel_multiplier) * kernel_h + ry)
                           * kernel_w + rx, ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk_texture",
        )
    else:
        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier,
                      yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, ffb] *
                 Filter[ffc // channel_multiplier, ffc % channel_multiplier,
                        ry, rx, ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk",
        )
    return te.compute(
        conv.shape,
        lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
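# An illustrative check (plain Python; the helper name is hypothetical) of
# the depthwise index mapping above: output chunk ffc reads input chunk
# ffc // channel_multiplier with filter multiplier ffc % channel_multiplier,
# so the out_channel_chunk = channel_chunk * channel_multiplier output
# chunks cover each (input chunk, multiplier) pair exactly once.
def _check_depthwise_mapping(channel_chunk=4, channel_multiplier=2):
    pairs = set()
    for ffc in range(channel_chunk * channel_multiplier):
        pairs.add((ffc // channel_multiplier, ffc % channel_multiplier))
    assert len(pairs) == channel_chunk * channel_multiplier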
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation,
                               out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    rc = te.reduce_axis((0, in_channels), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[nn, yy * stride_h + ry * dilation_h,
                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype)
            * kernel[ry, rx, ff, rc].astype(out_dtype),
            axis=[ry, rx, rc],
        ),
        name="conv2d",
        tag="conv2d_nhwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_height.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kh, kw, ci = (
        cfg.reduce_axis(kernel_h.value),
        cfg.reduce_axis(kernel_w.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
            [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
            [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
            [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
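# A minimal sketch (assumed shapes and dtypes; the helper name is
# hypothetical) of instantiating the compute above outside a tuning job.
# autotvm.get_config() yields a fallback config when no tuning context is
# active, which exercises the fallback_split branch at the end of the
# function.
def _example_direct_simd():
    data = te.placeholder((1, 16, 16, 8), name="data", dtype="int8")
    kernel = te.placeholder((3, 3, 16, 8), name="kernel", dtype="int8")
    cfg = autotvm.get_config()
    return conv2d_direct_simd_compute(cfg, data, kernel, strides=1,
                                      padding=1, dilation=1,
                                      out_dtype="int32")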
def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation,
                             out_dtype=None, args={}):
    """Convolution operator in NHWC layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, _, out_channels = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    in_channel_block = 4
    in_channel_tail = in_channel % in_channel_block
    in_channel_chunk = in_channel // in_channel_block
    num_filter_block = 4
    num_filter_tail = out_channels % num_filter_block
    num_filter_chunk = out_channels // num_filter_block

    pad_value = tvm.tir.const(0, Input.dtype)

    # compute:
    if in_channel_tail == 0:
        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            lambda nn, yy, xx, icc, icb: Input[
                nn, yy, xx, icc * in_channel_block + icb],
            name="input_pack",
            tag="input_pack",
        )
    else:
        in_channel_chunk += 1

        def _reorder_data(*indices):
            # zero-fill the lanes past the channel tail in the last chunk
            condition = []
            condition.append(indices[3] == in_channel_chunk - 1)
            condition.append(indices[4] >= in_channel_tail)
            condition = tvm.tir.all(*condition)
            return tvm.tir.if_then_else(
                condition, pad_value,
                Input[indices[0], indices[1], indices[2],
                      indices[3] * in_channel_block + indices[4]])

        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            _reorder_data,
            name="input_pack",
            tag="input_pack_expanded",
        )

    if num_filter_tail == 0 and in_channel_tail == 0:
        Filter = te.compute(
            [kernel_h, kernel_w, in_channel, num_filter_chunk,
             num_filter_block],
            lambda kh, kw, ic, nfc, nfb: Filter[
                kh, kw, ic, nfc * num_filter_block + nfb],
            name="filter_pack",
            tag="filter_pack",
        )
    else:
        num_filter_chunk += 1

        # HWIO
        def _reorder_weights(*indices):
            # zero-fill the lanes past the filter tail in the last chunk and
            # the input channels past the original extent
            conditionA = []
            conditionA.append(indices[3] == num_filter_chunk - 1)
            conditionA.append(indices[4] >= num_filter_tail)
            conditionAT = tvm.tir.all(*conditionA)

            conditionO = []
            conditionO.append(conditionAT)
            conditionO.append(indices[2] >= in_channel)
            conditionOT = tvm.tir.any(*conditionO)
            return tvm.tir.if_then_else(
                conditionOT, pad_value,
                Filter[indices[0], indices[1], indices[2],
                       indices[3] * num_filter_block + indices[4]])

        Filter = te.compute(
            [kernel_h, kernel_w, in_channel_chunk * in_channel_block,
             num_filter_chunk, num_filter_block],
            _reorder_weights,
            name="filter_pack",
            tag="filter_pack_expanded",
        )

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]
    # calculation of the real used input size
    # (in NHWC the height pad sits at axis 1 and the width pad at axis 2):
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, num_filter_chunk, num_filter_block),
        lambda nn, yy, xx, fc, fb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcc, rcb] *
             Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb]
             ).astype(args["accumulator"]),
            axis=[ry, rx, rcc, rcb],
        ),
        tag="conv2d_nhwc",
    )
    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, num_filter_chunk,
         num_filter_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_height_orig, out_width_orig, out_channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
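# An illustrative sketch (plain Python; the helper name is hypothetical) of
# the channel-tail handling in _reorder_data above: with in_channel = 6 and
# a block of 4, the data is packed into 2 chunks of 4 lanes, and the last
# two lanes of the last chunk are filled with pad_value (zero).
def _example_channel_tail(in_channel=6, block=4):
    tail = in_channel % block
    chunk = in_channel // block + (1 if tail else 0)
    lanes = []
    for cc in range(chunk):
        for cb in range(block):
            is_pad = cc == chunk - 1 and tail != 0 and cb >= tail
            lanes.append("pad" if is_pad else cc * block + cb)
    return lanes  # [0, 1, 2, 3, 4, 5, 'pad', 'pad']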
def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation,
                           out_dtype):
    """Compute function for v7e-m DSP instructions of conv1d in NWC layout."""
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch_size, data_width, in_channels = data.shape
    kernel_size, out_channels, _ = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
    out_channels = simplify(out_channels)
    out_width = simplify(
        (data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)

    # Apply padding
    pad_before = [0, pad_left, 0]
    pad_after = [0, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    # Compute graph
    rc = te.reduce_axis((0, in_channels), name="rc")
    rw = te.reduce_axis((0, kernel_size), name="rw")

    conv = te.compute(
        (batch_size, out_width, out_channels),
        lambda b, w, c: te.sum(
            padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype)
            * kernel[rw, c, rc].astype(out_dtype),
            axis=[rw, rc],
        ),
        name="conv1d",
        tag="conv1d_nwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kw, ci = (
        cfg.reduce_axis(kernel_size.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, owo, owi, coo, coi, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, kw, owo, coo, cio, owi, coi, cii],
            [n, kw, coo, owo, cio, owi, coi, cii],
            [n, kw, owo, coo, cio, owi, coi, cii],
            [n, kw, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
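# A worked example (plain Python; the helper name is hypothetical) of the
# output width formula above: with data_width = 10, kernel_size = 3,
# stride = 2, dilation = 2 and pads = (2, 2), the dilated kernel spans
# (3 - 1) * 2 + 1 = 5 taps, so out_width = (10 - 5 + 2 + 2) // 2 + 1 = 5.
def _example_conv1d_out_width(data_width=10, kernel_size=3, stride=2,
                              dilation=2, pads=(2, 2)):
    dilated = (kernel_size - 1) * dilation + 1
    return (data_width - dilated + pads[0] + pads[1]) // stride + 1  # == 5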
def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological dilation operator in NCHW layout.

    Parameters
    ----------
    input : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    filter : tvm.te.Tensor
        3-D with shape [in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size

    dilations : int or a list/tuple of two ints
        Dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, in_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_channel, in_height, in_width = input.shape
    channel, kernel_h, kernel_w = filter.shape
    assert (in_channel.value == channel.value
            ), "For Dilation2D, input and filter channels should be the same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    return te.compute(
        (batch, in_channel, out_height, out_width),
        lambda nn, ff, yy, xx: te.max(
            temp[nn, ff, yy * stride_h + ry * dilation_h,
                 xx * stride_w + rx * dilation_w].astype(out_dtype)
            + filter[ff, ry, rx].astype(out_dtype),
            axis=[ry, rx],
        ),
        tag="dilation2d_nchw",
    )
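# A tiny NumPy reference (illustrative only; assumes stride=1, dilation=1,
# no padding, float inputs) of the max-plus semantics computed above: each
# output element is the maximum over the window of input + filter.
def _dilation2d_nchw_reference(x, f):
    import numpy as np
    n, c, h, w = x.shape
    _, kh, kw = f.shape
    oh, ow = h - kh + 1, w - kw + 1
    out = None
    for r in range(kh):
        for s in range(kw):
            cand = x[:, :, r:r + oh, s:s + ow] + f[:, r, s].reshape(1, -1, 1, 1)
            out = cand if out is None else np.maximum(out, cand)
    return out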
def expand_spatial_dimensions(in_height, in_width, kernel_h, kernel_w,
                              dilation_h, dilation_w, padding, stride_h,
                              stride_w):
    """
    Expands spatial dimensions to be divisible by a factor of 4. This allows
    considerably better parallel computation on the GPU. The drawback of this
    solution is a number of useless computations. In practice the speed-up
    from the added parallelism significantly outweighs the slowdown of the
    extra compute, so this is a useful approach, at least for GPUs.

    Parameters
    ----------
    in_height: int
        Height of the feature map

    in_width: int
        Width of the feature map

    kernel_h: int
        Height of the conv2d kernel

    kernel_w: int
        Width of the conv2d kernel

    dilation_h: int
        Vertical dilation of the conv2d kernel

    dilation_w: int
        Horizontal dilation of the conv2d kernel

    padding: tuple or list
        Conv2d paddings

    stride_h: int
        Vertical stride of the conv2d kernel

    stride_w: int
        Horizontal stride of the conv2d kernel

    Returns
    -------
    out_height_orig, out_height, out_width_orig, out_width: int
        The original and the expanded output height and width.
    """
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2
    return out_height_orig, out_height, out_width_orig, out_width
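# An illustrative replica (plain Python; the helper name is hypothetical) of
# the rounding above: 7 -> 8, 10 -> 12, 8 -> 8. After the first step the
# extent is even, so the second step can only add 2, and every result is a
# multiple of 4.
def _example_round_up4(extent):
    if extent % 2 != 0:
        extent += 1
    if extent % 4 != 0:
        extent += 2
    return extent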
def compute_conv2d_NCHWc_tpack(Input, Filter, stride, padding, dilation,
                               out_dtype=None, args={}):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channels, in_height, in_width = Input.shape
    out_channels, _, kernel_h, kernel_w = Filter.shape

    in_channel_tail = in_channels % 4
    in_channel_chunk = in_channels // 4
    if in_channel_tail == 0:
        in_channel_tail = 4
    else:
        in_channel_chunk += 1

    num_filter_block = out_channels % 4
    num_filter_chunk = out_channels // 4
    if num_filter_block == 0:
        num_filter_block = 4
    else:
        num_filter_chunk += 1

    pad_value = tvm.tir.const(0, Input.dtype)

    def _reorder_data(*indices):
        # zero-fill the lanes past the channel tail in the last chunk
        condition = []
        condition.append(indices[1] == in_channel_chunk - 1)
        condition.append(indices[4] >= in_channel_tail)
        condition = tvm.tir.all(*condition)
        return tvm.tir.if_then_else(
            condition, pad_value,
            Input[indices[0], indices[1] * 4 + indices[4], indices[2],
                  indices[3]])

    # compute:
    reordered_data = te.compute(
        [batch, in_channel_chunk, in_height, in_width, 4],
        _reorder_data,
        name="input_pack",
        tag="input_pack",
    )

    def _reorder_weights(*indices):
        # zero-fill the lanes past the filter tail in the last chunk and the
        # input channels past the original extent
        conditionA = []
        conditionA.append(indices[0] == num_filter_chunk - 1)
        conditionA.append(indices[4] >= num_filter_block)
        conditionAT = tvm.tir.all(*conditionA)

        conditionO = []
        conditionO.append(conditionAT)
        conditionO.append(indices[1] >= in_channels)
        conditionOT = tvm.tir.any(*conditionO)
        return tvm.tir.if_then_else(
            conditionOT, pad_value,
            Filter[indices[0] * 4 + indices[4], indices[1], indices[2],
                   indices[3]])

    reordered_filter = te.compute(
        [num_filter_chunk, in_channel_chunk * 4, kernel_h, kernel_w, 4],
        _reorder_weights,
        name="filter_pack",
        tag="filter_pack",
    )

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4? If not, extend it to
    # help the later split. Theoretically the input should receive additional
    # padding as well, but that is optimized away by cache_read InferBound.
    # The padding here must produce exactly the tensor required to compute
    # the original output size, and no more. Otherwise the intermediate
    # tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # With textures it is not a problem to provide a texture of smaller size:
    # 1. It does not matter which values are used for the extra computations -
    #    they exist only to better fit the GPU working groups.
    # 2. When a pixel is requested out of bounds, the texture handles it
    #    correctly. As mentioned above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    # calculation of the real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h

    temp = nn.pad(reordered_data, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, 4), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, 4),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcb] *
             reordered_filter[ffc, rcc * 4 + rcb, ry, rx, ffb]
             ).astype(args["accumulator"]),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_tpack",
    )

    # conv = s.cache_write(conv, "local") does not work properly: it does not
    # create the intermediate buffer, keeps reading/writing the global tensor
    # as the accumulator, and leads to a crash at runtime. For this reason we
    # have to use this dummy cast plus compute_at to create the intermediate
    # accumulator with local scope.
    dummy_cast = te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig, 4),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_channels, out_height_orig, out_width_orig),
                      lambda n, c, y, x: dummy_cast[n, c // 4, y, x, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
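# A minimal sketch (hypothetical schedule, illustrative only) of the
# dummy_cast workaround described above: give the conv stage local scope and
# compute it at the cast stage, producing the local accumulator that
# s.cache_write could not create.
def _example_tpack_schedule(output):
    s = te.create_schedule(output.op)
    dummy = output.op.input_tensors[0]
    conv = dummy.op.input_tensors[0]
    s[conv].set_scope("local")
    s[conv].compute_at(s[dummy], dummy.op.axis[3])
    return s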
def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation,
                                     out_dtype=None):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # NCHWc x KCRSk
    # texture: NCH|W|c
    # texture: K|CRS|k
    # c = crs // (R*S)
    # rs = crs % (R*S)
    # r = rs // S == (crs // S) % R
    # s = rs % S == crs % S
    Filter = te.compute(
        (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
        lambda ffc, crs, ffb: Filter[
            ffc, crs // (kernel_h * kernel_w),
            (crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
        name="packed_filter",
    )
    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, num_filter_block),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcb] *
             Filter[ffc,
                    ((rcc * in_channel_block + rcb) * kernel_h + ry)
                    * kernel_w + rx, ffb]).astype(out_dtype),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_kcrsk_texture",
    )
    output = te.compute(
        conv.shape,
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32"))
    return output
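# A minimal sketch (assumed shapes/dtypes and a default schedule; the helper
# name is hypothetical) of wiring the compute above into tvm.lower. A real
# GPU build would need a proper schedule and target; this only checks that
# the compute graph lowers.
def _example_lower_acc32():
    data = te.placeholder((1, 8, 14, 14, 4), name="data", dtype="float16")
    filt = te.placeholder((8, 32, 3, 3, 4), name="filt", dtype="float16")
    out = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, 1, 1, 1, "float32")
    s = te.create_schedule(out.op)
    return tvm.lower(s, [data, filt, out])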