コード例 #1
0
ファイル: bitserial_dense.py プロジェクト: bddppq/tvm
    def _schedule(cfg, s, data_vec, weight_vec, output):
        s[data_vec].parallel(s[data_vec].op.axis[0])
        s[weight_vec].parallel(s[weight_vec].op.axis[0])

        y, x = s[output].op.axis
        wb, db, k = s[output].op.reduce_axis

        yo, yi = cfg["tile_y"].apply(s, output, y)
        xo, xi = cfg["tile_x"].apply(s, output, x)
        ko, ki = cfg["tile_k"].apply(s, output, k)


        cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi])
        cfg["ann_reduce"].apply(s, output, [db, wb],
                                axis_lens=[get_const_int(db.dom.extent),
                                           get_const_int(wb.dom.extent)],
                                max_unroll=8,
                                cfg=cfg)
        cfg["ann_spatial"].apply(s, output, [yi, xi],
                                 axis_lens=[cfg['tile_y'].size[-1],
                                            cfg['tile_x'].size[-1]],
                                 max_unroll=8,
                                 cfg=cfg)
        s[output].vectorize(xi)
        s[output].parallel(yo)
        return s
コード例 #2
0
ファイル: bitserial_conv2d.py プロジェクト: bddppq/tvm
def _schedule_bitserial_conv2d_nhwc(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec,
                                    conv_out, output, last):
    # no stride and padding info here
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule data padding and packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    s[data_vec].parallel(oh)

    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    s[kernel_vec].parallel(oco)

    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out, [n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2])
    cfg["ann_reduce"].apply(s, conv_out, [b1, b2, dh, dw],
                            axis_lens=[get_const_int(b1.dom.extent),
                                       get_const_int(b2.dom.extent),
                                       get_const_int(dh.dom.extent),
                                       get_const_int(dw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oho, iho = cfg["tile_oh"].apply(s, last, oh)  # reuse parameter
    s[last].parallel(oho)

    return s
コード例 #3
0
ファイル: _algorithm.py プロジェクト: bddppq/tvm
def compute_argsort(attrs, inputs, _, target):
    """Compute definition of argsort"""
    axis = get_const_int(attrs.axis)
    is_ascend = bool(get_const_int(attrs.is_ascend))
    dtype = str(attrs.dtype)
    return [
        topi.argsort(inputs[0], None, axis=axis, is_ascend=is_ascend, \
                            dtype=dtype, flag=False)
    ]
コード例 #4
0
ファイル: _vision.py プロジェクト: bddppq/tvm
def compute_multibox_transform_loc(attrs, inputs, _, target):
    """Compute definition of multibox_detection"""
    clip = bool(get_const_int(attrs.clip))
    threshold = get_const_float(attrs.threshold)
    variances = get_float_tuple(attrs.variances)
    return topi.vision.ssd.multibox_transform_loc(
        inputs[0], inputs[1], inputs[2], clip, threshold, variances)
コード例 #5
0
ファイル: generic.py プロジェクト: deepakbabel/tvm
 def _compute_multibox_transform_loc(attrs, inputs, _):
     """Compute definition of multibox_detection"""
     clip = bool(get_const_int(attrs.clip))
     threshold = get_const_float(attrs.threshold)
     variances = get_float_tuple(attrs.variances)
     return topi_compute(inputs[0], inputs[1], inputs[2], clip, threshold,
                         variances)
コード例 #6
0
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    assert layout == "NCHW" or layout == "NHWC"
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")
    elif dilation == (1, 1):
        kernel = inputs[1]
    elif layout == "NCHW":
        kernel = topi.nn.dilate(inputs[1], [1, 1, dilation_h, dilation_w])
    else:  #layout == NHWC
        kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])

    if groups == 1:
        out = topi.nn.conv2d(inputs[0], kernel, strides, padding, layout)
    elif groups == get_const_int(inputs[0].shape[1]) and groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], kernel, strides,
                                            padding)
    else:
        raise ValueError("not support arbitrary group number for now")
    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        expand_axis = 1 if layout == "NCHW" else 0
        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
        out = topi.broadcast_add(out, bias)
    return out
コード例 #7
0
    def _callback(op):
        if op.tag == "sparse_dense_bsrmm":
            y_bsrmm = op.input_tensors[0]
            w_indptr = y_bsrmm.op.input_tensors[0]
            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
            y_reshape = op
            (m, num_blocks, b_r) = s[y_bsrmm].op.axis
            bs_r = get_const_int(b_r.dom.extent)
            (elem_idx, c) = s[y_bsrmm].op.reduce_axis
            
            (m_o, n_o) = s[y_reshape].op.axis
            s[y_reshape].bind(m_o, te.thread_axis("blockIdx.x"))
            s[y_reshape].bind(n_o, te.thread_axis("blockIdx.y"))
            s[y_bsrmm].compute_at(s[y_reshape], n_o)

            thread_x = te.thread_axis("threadIdx.x")

            cfg.define_split("tile_c", c, num_outputs=2)
            co, ci = cfg['tile_c'].apply(s, y_bsrmm, c)

            y_bsrmm_factored = s.rfactor(y_bsrmm, ci)
            tx = s[y_bsrmm].op.reduce_axis[0]
            s[y_bsrmm].bind(tx, thread_x)
            s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx)
            s[y_bsrmm].set_store_predicate(thread_x.var.equal(0))
            s[y_reshape].set_store_predicate(thread_x.var.equal(0))
コード例 #8
0
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    kernel_layout = attrs["kernel_layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
        # pylint: disable=assignment-from-no-return
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype)
        # pylint: enable=assignment-from-no-return
    elif groups == 1:
        out = topi.nn.conv2d(
            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)
    elif layout == "NCHW" and \
         groups == get_const_int(inputs[0].shape[1]) and \
         groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(
            inputs[0], inputs[1], strides, padding, dilation, out_dtype)
    elif layout in ["NCHW", "NCHW4c"]:
        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
                                        out_dtype)
    elif layout == "NHWC" and \
         kernel_layout == "HWOI" and \
         groups == get_const_int(inputs[0].shape[3]) and \
         groups == channels:
        out = topi.nn.depthwise_conv2d_nhwc(
            inputs[0], inputs[1], strides, padding, dilation, out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")

    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
        out = topi.add(out, bias)
    return out
コード例 #9
0
def compute_conv2d(attrs, inputs, out_type, target):
    """Compute definition of conv2d"""
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    dilation = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    out_dtype = attrs.out_dtype
    out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype)

    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        out = topi.nn.conv2d(inputs[0],
                             inputs[1],
                             strides,
                             padding,
                             dilation,
                             layout,
                             out_dtype=out_dtype)
    elif layout == "NCHW" and \
         kernel_layout == "OIHW" and \
         get_const_int(inputs[1].shape[0]) == groups and \
         get_const_int(inputs[1].shape[1]) == 1:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0],
                                            inputs[1],
                                            strides,
                                            padding,
                                            dilation,
                                            out_dtype=out_dtype)
    elif layout == "NHWC" and \
         kernel_layout == "HWOI" and\
         get_const_int(inputs[1].shape[2]) == groups and \
         get_const_int(inputs[1].shape[3]) == 1:
        out = topi.nn.depthwise_conv2d_nhwc(inputs[0],
                                            inputs[1],
                                            strides,
                                            padding,
                                            dilation,
                                            out_dtype=out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")
    return [out]
コード例 #10
0
ファイル: nn.py プロジェクト: bddppq/tvm
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    kernel_layout = attrs["kernel_layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
        # pylint: disable=assignment-from-no-return
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype=out_dtype)
        # pylint: enable=assignment-from-no-return
    elif groups == 1:
        out = topi.nn.conv2d(
            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype)
    elif layout == "NCHW" and \
         groups == get_const_int(inputs[0].shape[1]) and \
         groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(
            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
    elif layout in ["NCHW", "NCHW4c"]:
        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
                                        out_dtype=out_dtype)
    elif layout == "NHWC" and \
         kernel_layout == "HWOI" and \
         groups == get_const_int(inputs[0].shape[3]) and \
         groups == channels:
        out = topi.nn.depthwise_conv2d_nhwc(
            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")

    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
        out = topi.add(out, bias)
    return out
コード例 #11
0
ファイル: generic.py プロジェクト: zachzzc/incubator-tvm
 def _compute_multibox_prior(attrs, inputs, _):
     """Compute definition of multibox_prior"""
     sizes = get_float_tuple(attrs.sizes)
     ratios = get_float_tuple(attrs.ratios)
     steps = get_float_tuple(attrs.steps)
     offsets = get_float_tuple(attrs.offsets)
     clip = bool(get_const_int(attrs.clip))
     return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)]
コード例 #12
0
def compute_decode_BBox(attrs, inputs, _, target):
    """Compute definition of proposal"""
    bbox_mean = get_float_tuple(attrs.bbox_mean)
    bbox_std = get_float_tuple(attrs.bbox_std)
    class_agnostic = bool(get_const_int(attrs.class_agnostic))
    with target:
        return [
            topi.vision.rcnn.decode_BBox(inputs[0], inputs[1], inputs[2],
                                         bbox_mean, bbox_std, class_agnostic)
        ]
コード例 #13
0
ファイル: _transform.py プロジェクト: kalyan416/incubator-tvm
def split_shape_func(attrs, inputs, _):
    """
    Shape function for split op.
    """
    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
        indices_or_sections = get_const_int(attrs.indices_or_sections)
    else:
        indices_or_sections = get_const_tuple(attrs.indices_or_sections)

    axis = get_const_int(attrs.axis)

    num_out = indices_or_sections if isinstance(indices_or_sections, int) \
        else len(indices_or_sections) + 1
    if isinstance(indices_or_sections, int):
        indices_or_sections = [indices_or_sections]
    return [
        _split_shape_func(inputs[0], convert(i), convert(indices_or_sections),
                          convert(axis)) for i in range(num_out)
    ]
コード例 #14
0
def decl_winograd(data, U, stride, padding, out_dtype):
    """declare winograd fast convolution F(2x2, 3x3) for conv2d"""
    N, C, H, W = [util.get_const_int(x) for x in data.shape]
    _, _, C, K = [util.get_const_int(x) for x in U.shape]
    HPAD, WPAD = 1, 1
    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    m = 2
    r = 3
    alpha = m + r - 1
    K = K
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # pack input tile
    input_tile = tvm.compute(
        (C, P, alpha, alpha),
        lambda c, b, eps, nu: tvm.select(
            b < P, data_pad[b // (nH * nW)][c][b // nW % nH * m + eps][
                b % nW * m + nu], tvm.const(0, data_pad.dtype)),
        name='d')

    V = decl_V_minimal(input_tile, alpha, C, P)

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute(
        (alpha, alpha, K, P),
        lambda eps, nu, k, b: tvm.sum(U[eps][nu][c][k] * V[eps][nu][c][b],
                                      axis=c),
        name='M')

    # inverse transform and unpack
    output = decl_output_minimal(M, N, K, H, W, P, m, nH, nW)

    return output
コード例 #15
0
ファイル: _vision.py プロジェクト: bddppq/tvm
def compute_multibox_prior(attrs, inputs, _, target):
    """Compute definition of multibox_prior"""
    sizes = get_float_tuple(attrs.sizes)
    ratios = get_float_tuple(attrs.ratios)
    steps = get_float_tuple(attrs.steps)
    offsets = get_float_tuple(attrs.offsets)
    clip = bool(get_const_int(attrs.clip))
    return [
        topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps,
                                       offsets, clip)
    ]
コード例 #16
0
ファイル: generic.py プロジェクト: zachzzc/incubator-tvm
 def _compute_nms(attrs, inputs, out_type):
     return_indices = bool(get_const_int(attrs.return_indices))
     max_output_size = get_const_int(attrs.max_output_size)
     iou_threshold = get_const_float(attrs.iou_threshold)
     force_suppress = bool(get_const_int(attrs.force_suppress))
     top_k = get_const_int(attrs.top_k)
     coord_start = get_const_int(attrs.coord_start)
     score_index = get_const_int(attrs.score_index)
     id_index = get_const_int(attrs.id_index)
     invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
     return [topi_compute(inputs[0], inputs[1], max_output_size, iou_threshold,
                          force_suppress, top_k, coord_start, score_index,
                          id_index, return_indices, invalid_to_bottom)]
コード例 #17
0
 def _callback(op):
     if op.tag == "sparse_dense_bsrmm":
         y_bsrmm = op.input_tensors[0]
         assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
         y_reshape = op
         (m, num_blocks, b_r) = s[y_bsrmm].op.axis
         bs_r = get_const_int(b_r.dom.extent)
         (elem_idx, c) = s[y_bsrmm].op.reduce_axis
         
         s[y_reshape].bind(s[y_reshape].op.axis[0], te.thread_axis("blockIdx.x"))
         s[y_reshape].bind(s[y_reshape].op.axis[1], te.thread_axis("threadIdx.x"))
         s[y_bsrmm].compute_at(s[y_reshape], s[y_reshape].op.axis[1])
コード例 #18
0
ファイル: generic.py プロジェクト: zachzzc/incubator-tvm
 def _compute_proposal(attrs, inputs, out_type):
     scales = get_float_tuple(attrs.scales)
     ratios = get_float_tuple(attrs.ratios)
     feature_stride = attrs.feature_stride
     threshold = attrs.threshold
     rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
     rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
     rpn_min_size = attrs.rpn_min_size
     iou_loss = bool(get_const_int(attrs.iou_loss))
     return [topi_compute(inputs[0], inputs[1], inputs[2], scales, ratios,
                          feature_stride, threshold, rpn_pre_nms_top_n,
                          rpn_post_nms_top_n, rpn_min_size, iou_loss)]
コード例 #19
0
ファイル: _transform.py プロジェクト: kalyan416/incubator-tvm
def take_shape_func(attrs, inputs, out_ndims):
    """
    Shape function for take op.
    """
    if attrs.axis is None:
        return [_take_no_axis_shape_func(inputs[1], out_ndims[0])]
    axis = get_const_int(attrs.axis)
    data_ndim = int(inputs[0].shape[0])
    if axis < 0:
        axis += data_ndim
    assert 0 <= axis < data_ndim
    return [_take_with_axis_shape_func(*inputs, convert(axis), out_ndims[0])]
コード例 #20
0
ファイル: _nn.py プロジェクト: bddppq/tvm
def compute_conv2d(attrs, inputs, out_type, target):
    """Compute definition of conv2d"""
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    dilation = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    out_dtype = attrs.out_dtype
    out_dtype = (inputs[0].dtype if out_dtype in ("same", "")
                 else out_dtype)

    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        out = topi.nn.conv2d(
            inputs[0], inputs[1], strides, padding,
            dilation, layout, out_dtype=out_dtype)
    elif layout == "NCHW" and \
            get_const_int(inputs[1].shape[0]) == groups and \
            get_const_int(inputs[1].shape[1]) == 1:
        out = topi.nn.depthwise_conv2d_nchw(
            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
    elif layout == "NHWC" and \
            kernel_layout == "HWOI" and\
            get_const_int(inputs[1].shape[2]) == groups and \
            get_const_int(inputs[1].shape[3]) == 1:
        out = topi.nn.depthwise_conv2d_nhwc(
            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
    elif layout in ['NCHW', 'NCHW4c']:
        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
                                        out_dtype=out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")
    return [out]
コード例 #21
0
def compute_nms(attrs, inputs, _, target):
    """Compute definition of nms"""
    return_indices = bool(get_const_int(attrs.return_indices))
    max_output_size = get_const_int(attrs.max_output_size)
    iou_threshold = get_const_float(attrs.iou_threshold)
    force_suppress = bool(get_const_int(attrs.force_suppress))
    top_k = get_const_int(attrs.top_k)
    coord_start = get_const_int(attrs.coord_start)
    score_index = get_const_int(attrs.score_index)
    id_index = get_const_int(attrs.id_index)
    invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
    return [
        topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size,
                                        iou_threshold, force_suppress, top_k,
                                        coord_start, score_index, id_index,
                                        return_indices, invalid_to_bottom)
    ]
コード例 #22
0
def bitpack(data, bits, pack_type="int8", name="bitpack"):
    """Packs lowest dimension into format needed by VTA

    Parameters
    ----------
    pack_axis : int
        index of the axis to pack in data
    bit_axis : int
        index of axis to place bit axis in resulting packed data

    Returns
    -------
    packed : Tensor
        The packed tensor.
    """
    shape_vec = list(data.shape)
    if pack_type == 'int8':
        data_width = 8
    elif pack_type == 'int16':
        data_width = 16
    elif pack_type == 'int32':
        data_width = 32
    else:
        raise RuntimeError("Unknown pack type %s" % pack_type)
    assert data_width % bits == 0
    lanes = data_width // bits

    # Data must be in multiples of the data_width
    assert util.get_const_int(
        shape_vec[-1]) % lanes == 0, "Not a multiple of word size"
    shape_vec[-1] = shape_vec[-1] // lanes
    oshape = tuple(shape_vec)

    def _bitpack(*indices):
        ret = None
        mask = tvm.const((1 << bits) - 1, pack_type)
        for k in range(lanes):
            idx = list(indices)
            idx[-1] = idx[-1] * lanes + k
            elem = data(*idx).astype(pack_type)
            if k == 0:
                ret = elem & mask
            else:
                val = (elem & mask) << tvm.const(k * bits, pack_type)
                ret = ret | val
        return ret

    return tvm.compute(oshape, _bitpack, name=name, tag='bitpack')
コード例 #23
0
ファイル: _rcnn.py プロジェクト: bddppq/tvm
def compute_proposal(attrs, inputs, _, target):
    """Compute definition of proposal"""
    scales = get_float_tuple(attrs.scales)
    ratios = get_float_tuple(attrs.ratios)
    feature_stride = attrs.feature_stride
    threshold = attrs.threshold
    rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
    rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
    rpn_min_size = attrs.rpn_min_size
    iou_loss = bool(get_const_int(attrs.iou_loss))
    with target:
        return [
            topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2], scales, ratios,
                                      feature_stride, threshold, rpn_pre_nms_top_n,
                                      rpn_post_nms_top_n, rpn_min_size, iou_loss)
        ]
コード例 #24
0
def compute_proposal(attrs, inputs, _, target):
    """Compute definition of proposal"""
    scales = get_float_tuple(attrs.scales)
    ratios = get_float_tuple(attrs.ratios)
    feature_stride = attrs.feature_stride
    threshold = attrs.threshold
    rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
    rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
    rpn_min_size = attrs.rpn_min_size
    iou_loss = bool(get_const_int(attrs.iou_loss))
    with target:
        return [
            topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2], scales,
                                      ratios, feature_stride, threshold,
                                      rpn_pre_nms_top_n, rpn_post_nms_top_n,
                                      rpn_min_size, iou_loss)
        ]
コード例 #25
0
ファイル: nn.py プロジェクト: fatu/nnvm
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    assert layout == "NCHW", "only support nchw for now"
    assert dilation == (1, 1), "not support dilate now"
    if groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding)
    elif groups == get_const_int(inputs[0].shape[1]) and groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides, padding)
    else:
        raise ValueError("not support arbitrary group number for now")
    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
        out = topi.broadcast_add(out, bias)
    return out
コード例 #26
0
ファイル: nn.py プロジェクト: masa-ito-fj/nnvm
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    assert layout == "NCHW", "only support nchw for now"
    assert dilation == (1, 1), "not support dilate now"
    if groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding)
    elif groups == get_const_int(inputs[0].shape[1]) and groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides, padding)
    else:
        raise ValueError("not support arbitrary group number for now")
    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
        out = topi.broadcast_add(out, bias)
    return out
コード例 #27
0
ファイル: _transform.py プロジェクト: kalyan416/incubator-tvm
def concatenate_shape_func(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += inputs[0].shape[0]
    return [_concatenate_shape_func(inputs, convert(axis))]
コード例 #28
0
ファイル: test_topi_basic.py プロジェクト: LANHUIYING/tvm
def test_util():
    x = tvm.const(100)
    assert util.get_const_int(x) == 100
    assert util.get_const_tuple((x, x)) == (100, 100)
コード例 #29
0
def decl_winograd(data, U, stride, padding, out_dtype):
    """declare winograd fast convolution F(2x2, 3x3) for conv2d"""
    N, C, H, W = [util.get_const_int(x) for x in data.shape]
    _, _, C, K = [util.get_const_int(x) for x in U.shape]
    HPAD, WPAD = 1, 1
    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    B_data = np.array(
        [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]], out_dtype)

    A_data = np.array([
        [1, 0],
        [1, 1],
        [1, -1],
        [0, -1],
    ], out_dtype)

    m = 2
    r = 3
    alpha = m + r - 1
    K = K

    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # pack input tile
    input_tile = tvm.compute(
        (C, P, alpha, alpha),
        lambda c, b, eps, nu: tvm.select(
            b < P, data_pad[b // (nH * nW)][c][b // nW % nH * m + eps][
                b % nW * m + nu], tvm.const(0, data_pad.dtype)),
        name='d')

    # transform image
    B = const_array(B_data, 'B')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha, alpha, C, P),
                    lambda eps, nu, c, b: tvm.sum(input_tile[c][b][r_eps][
                        r_nu] * B[r_eps][eps] * B[r_nu][nu],
                                                  axis=[r_eps, r_nu]),
                    name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute(
        (alpha, alpha, K, P),
        lambda eps, nu, k, b: tvm.sum(U[eps][nu][c][k] * V[eps][nu][c][b],
                                      axis=c),
        name='M')

    # inverse transform and unpack
    A = const_array(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    output = tvm.compute(
        (N, K, H, W),
        lambda n, k, h, w: tvm.sum(M[r_eps][r_nu][k][n * nH * nW + (
            h // m) * nW + w // m] * A[r_eps][h % m] * A[r_nu][w % m],
                                   axis=[r_eps, r_nu]),
        name='output')

    return output
コード例 #30
0
ファイル: bitserial_conv2d.py プロジェクト: LANHUIYING/tvm
def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec,
                                  kernel, kernel_q, kernel_vec,
                                  conv_out, output, last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW")
    sch = _get_schedule(wkl, "NCHW")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    ba = sch.ba
    bc = sch.bc

    CC = s.cache_write(conv_out, "global")
    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    s[conv_out].vectorize(vc)

    s[CC].compute_at(s[conv_out], ow)
    n, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw, b1, b2 = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc)
    s[CC].unroll(b1)
    s[CC].unroll(b2)
    s[CC].vectorize(vc)

    ##### Schedule A
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, vw = s[data_vec].op.axis
    s[data_vec].vectorize(vw)
    if ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[data_vec].split(h, ba)
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule B
    co, _, _, _, _, vc = s[kernel_vec].op.axis
    s[kernel_vec].vectorize(vc)
    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[kernel_vec].split(co, bc)
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule C
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[last].split(co, bc)
        oaxis = oco
        paxis = ico

    s[last].parallel(paxis)
    s[last].pragma(oaxis, "parallel_launch_point")
    s[last].pragma(paxis, "parallel_stride_pattern")
    s[last].pragma(oaxis, "parallel_barrier_when_finish")

    return s
コード例 #31
0
def _schedule_bitserial_conv2d_nhwc(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec, conv_out, output,
                                    last):
    # no stride and padding info here
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule data padding and packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    s[data_vec].parallel(oh)

    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    s[kernel_vec].parallel(oco)

    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out,
                           [n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2])
    cfg["ann_reduce"].apply(s,
                            conv_out, [b1, b2, dh, dw],
                            axis_lens=[
                                get_const_int(b1.dom.extent),
                                get_const_int(b2.dom.extent),
                                get_const_int(dh.dom.extent),
                                get_const_int(dw.dom.extent)
                            ],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oho, iho = cfg["tile_oh"].apply(s, last, oh)  # reuse parameter
    s[last].parallel(oho)

    return s
コード例 #32
0
ファイル: generic.py プロジェクト: zachzzc/incubator-tvm
 def _compute_get_valid_counts(attrs, inputs, out_type):
     score_threshold = get_const_float(attrs.score_threshold)
     id_index = get_const_int(attrs.id_index)
     score_index = get_const_int(attrs.score_index)
     return topi_compute(inputs[0], score_threshold, id_index, score_index)
コード例 #33
0
def test_util():
    x = tvm.const(100, "int32")
    assert util.get_const_int(x) == 100
    assert util.get_const_tuple((x, x)) == (100, 100)
コード例 #34
0
def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, kernel,
                                  kernel_q, kernel_vec, conv_out, output,
                                  last):
    # no stride and padding info here
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape
    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, TH, TW, _, _ = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    ba = sch.ba
    bc = sch.bc

    ##### Schedule data packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    if ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[data_vec].split(h, ba)
        oaxis = oh
        paxis = ih
    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[kernel_vec].split(co, bc)
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis

    s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)

    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    if bc == 1:
        oaxis = oh
        paxis = oh
    else:
        oho, iho = s[last].split(oh, bc)
        oaxis = oho
        paxis = iho

    s[last].parallel(paxis)
    s[last].pragma(oaxis, "parallel_launch_point")
    s[last].pragma(paxis, "parallel_stride_pattern")
    s[last].pragma(oaxis, "parallel_barrier_when_finish")

    return s
コード例 #35
0
def decl_output_transform(cfg, X, M, VK, VP):
    N = get_const_int(X.shape[0])
    IH = get_const_int(X.shape[2])
    IW = get_const_int(X.shape[3])
    alpha = get_const_int(M.shape[0])

    K = get_const_int(M.shape[0]) * get_const_int(M.shape[4])
    P = get_const_int(M.shape[1]) * get_const_int(M.shape[5])

    # inverse transform
    A = const_matrix(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K // VK, P // VP, m, m, VK, VP),
                    lambda k, b, vh, vw, kk, bb: tvm.sum(M[k][b][r_eps][r_nu][
                        kk][bb] * A[r_eps][vh] * A[r_nu][vw],
                                                         axis=[r_eps, r_nu]),
                    name='Y')
    OH = get_const_int((IH + 2 * HPAD - 3) // HSTR + 1)
    OW = get_const_int((IW + 2 * WPAD - 3) // WSTR + 1)
    nH, nW = get_const_int((OH + m - 1) // m), get_const_int((OW + m - 1) // m)

    # unpack output
    def _output(n, k, h, w):
        k_elem = k % VK
        k_tile = k // VK
        b = n * nH * nW + h // m * nW + w // m
        b_elem = b % VP
        b_tile = b // VP
        return Y[k_tile][b_tile][h % m][w % m][k_elem][b_elem]

    output = tvm.compute((N, K, OH, OW),
                         _output,
                         name='output',
                         tag='winograd_conv_output')

    return output
コード例 #36
0
ファイル: bitserial_conv2d.py プロジェクト: bddppq/tvm
def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec,
                                    conv_out, output, last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

     ##### Schedule Data padding, and bitpacking
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, _, h, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    if cfg["tile_ah"].size[1] == 1:
        oaxis = oh
        paxis = oh
    else:
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule Kenerl bitpacking
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    if cfg["tile_bco"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")


   ##### Schedule Convolution
    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    ci, dh, dw, ib, kb = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out, [n, co, oh, ow, vc, vh, vw, dh, dw, kb, ib, ci])
    cfg["ann_reduce"].apply(s, conv_out, [kb, ib, dh, dw],
                            axis_lens=[get_const_int(kb.dom.extent),
                                       get_const_int(ib.dom.extent),
                                       get_const_int(dh.dom.extent),
                                       get_const_int(dw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].vectorize(vc)

    # # Schedule output
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oco, ico = cfg["tile_oh"].apply(s, last, co)
    if cfg["tile_oh"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        oco, ico = s[last].split(co, bc)
        oaxis = oco
        paxis = ico

    s[last].parallel(oco)
    return s
コード例 #37
0
def decl_output_transform_minimal(cfg, X, M, VK, VP):
    def compute_A_T_dot_M(k, b, eps, nu, kk, bb):
        temp_expr = {}

        for j in range(alpha):
            m1_add_m2 = M[k][b][1][j][kk][bb] + M[k][b][2][j][kk][bb]
            m1_sub_m2 = M[k][b][1][j][kk][bb] - M[k][b][2][j][kk][bb]
            m3_add_m4 = M[k][b][3][j][kk][bb] + M[k][b][4][j][kk][bb]
            m3_sub_m4 = M[k][b][3][j][kk][bb] - M[k][b][4][j][kk][bb]
            m5_add_m6 = M[k][b][5][j][kk][bb] + M[k][b][6][j][kk][bb]
            m5_sub_m6 = M[k][b][5][j][kk][bb] - M[k][b][6][j][kk][bb]
            s0 = M[k][b][0][j][kk][bb] + m1_add_m2
            s5 = M[k][b][7][j][kk][bb] + m1_sub_m2
            s1 = m1_sub_m2 + m5_sub_m6 * 16
            s4 = m1_add_m2 + m3_add_m4 * 16
            s2 = m1_add_m2 + 8 * m5_add_m6
            s3 = m1_sub_m2 + 8 * m3_sub_m4
            s0 = s0 + m5_add_m6 * 32
            s5 = s5 + m3_sub_m4 * 32
            s1 = s1 + m3_sub_m4 * 2
            s4 = s4 + m5_add_m6 * 2
            s0 = s0 + m3_add_m4
            s5 = s5 + m5_sub_m6
            s2 = s2 + m3_add_m4 * 4
            s3 = s3 + m5_sub_m6 * 4
            temp_expr[(0, j)] = s0
            temp_expr[(1, j)] = s1
            temp_expr[(2, j)] = s2
            temp_expr[(3, j)] = s3
            temp_expr[(4, j)] = s4
            temp_expr[(5, j)] = s5
        now = tvm.const(0.0, "float32")
        for ii in range(m):
            for jj in range(alpha):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)], now)
        return now

    N = get_const_int(X.shape[0])
    IH = get_const_int(X.shape[2])
    IW = get_const_int(X.shape[3])
    alpha = get_const_int(M.shape[0])

    K = get_const_int(M.shape[0]) * get_const_int(M.shape[4])
    P = get_const_int(M.shape[1]) * get_const_int(M.shape[5])

    A_T_dot_M = tvm.compute((K // VK, P // VP, m, alpha, VK, VP),
                            compute_A_T_dot_M,
                            name="A_T_dot_M")

    def compute_X_dot_A(k, b, eps, nu, kk, bb):
        temp_expr = {}

        for i in range(m):
            m1_add_m2 = A_T_dot_M[k][b][i][1][kk][bb] + A_T_dot_M[k][b][i][2][
                kk][bb]
            m1_sub_m2 = A_T_dot_M[k][b][i][1][kk][bb] - A_T_dot_M[k][b][i][2][
                kk][bb]
            m3_add_m4 = A_T_dot_M[k][b][i][3][kk][bb] + A_T_dot_M[k][b][i][4][
                kk][bb]
            m3_sub_m4 = A_T_dot_M[k][b][i][3][kk][bb] - A_T_dot_M[k][b][i][4][
                kk][bb]
            m5_add_m6 = A_T_dot_M[k][b][i][5][kk][bb] + A_T_dot_M[k][b][i][6][
                kk][bb]
            m5_sub_m6 = A_T_dot_M[k][b][i][5][kk][bb] - A_T_dot_M[k][b][i][6][
                kk][bb]
            s0 = A_T_dot_M[k][b][i][0][kk][bb] + m1_add_m2
            s5 = A_T_dot_M[k][b][i][7][kk][bb] + m1_sub_m2
            s1 = m1_sub_m2 + m5_sub_m6 * 16
            s4 = m1_add_m2 + m3_add_m4 * 16
            s2 = m1_add_m2 + 8 * m5_add_m6
            s3 = m1_sub_m2 + 8 * m3_sub_m4
            s0 = s0 + m5_add_m6 * 32
            s5 = s5 + m3_sub_m4 * 32
            s1 = s1 + m3_sub_m4 * 2
            s4 = s4 + m5_add_m6 * 2
            s0 = s0 + m3_add_m4
            s5 = s5 + m5_sub_m6
            s2 = s2 + m3_add_m4 * 4
            s3 = s3 + m5_sub_m6 * 4
            temp_expr[(i, 0)] = s0
            temp_expr[(i, 1)] = s1
            temp_expr[(i, 2)] = s2
            temp_expr[(i, 3)] = s3
            temp_expr[(i, 4)] = s4
            temp_expr[(i, 5)] = s5
        now = tvm.const(0.0, "float32")
        for ii in range(m):
            for jj in range(m):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)], now)
        return now

    Y = tvm.compute((K // VK, P // VP, m, m, VK, VP),
                    compute_X_dot_A,
                    name="Y")
    OH = get_const_int((IH + 2 * HPAD - 3) // HSTR + 1)
    OW = get_const_int((IW + 2 * WPAD - 3) // WSTR + 1)
    nH, nW = get_const_int((OH + m - 1) // m), get_const_int((OW + m - 1) // m)

    # unpack output
    def _output(n, k, h, w):
        k_elem = k % VK
        k_tile = k // VK
        b = n * nH * nW + h // m * nW + w // m
        b_elem = b % VP
        b_tile = b // VP
        return Y[k_tile][b_tile][h % m][w % m][k_elem][b_elem]

    output = tvm.compute((N, K, OH, OW),
                         _output,
                         name='output',
                         tag='winograd_conv_output')
    return output
コード例 #38
0
def schedule_winograd(cfg, output, VK=6, VP=8):
    s = tvm.create_schedule(output.op)
    if not cfg:
        return s
    if output.name == "Y":
        Y = output
    else:
        Y = output.op.input_tensors[0]
    A_T_dot_M = Y.op.input_tensors[0]
    M = A_T_dot_M.op.input_tensors[0]
    U, V = M.op.input_tensors
    B_T_dot_X = V.op.input_tensors[0]
    #input_tile = B_T_dot_X.op.input_tensors[0]
    #data_pad = input_tile.op.input_tensors[0]
    # padding

    UNROLL = cfg['unroll'].val
    VECTORIZE = cfg['vectorize'].val
    TENSORIZE = cfg['tensorize'].val

    #if cfg['data_pad_inline'].val:
    #    s[data_pad].compute_inline()

    ## pack input tiles
    #(b, c, eps, nu, bb) = input_tile.op.axis
    #if cfg['input_tile_REORDER_C'].val:
    #    s[input_tile].reorder(b, eps, nu, c, bb)
    #if UNROLL:
    #    [s[input_tile].unroll(ax) for ax in [eps, nu]]
    #if VECTORIZE:
    #    s[input_tile].vectorize(bb)
    #if autotvm.GLOBAL_SCOPE.in_tuning:
    #    s[input_tile].pragma(b, 'debug_skip_region')
    #    s[data_pad].pragma(data_pad.op.axis[0], 'debug_skip_region')
    # s[input_tile].compute_inline()

    # transform kernel
    #if isinstance(U.op, tvm.tensor.ComputeOp):
    #    kernel, G = U.op.input_tensors
    #    if isinstance(kernel.op, tvm.tensor.ComputeOp):
    #        s[kernel].compute_inline()

    #    s[G].compute_inline()
    #    k, eps, nu, c, kk, = s[U].op.axis
    #    # r_kh, r_kw = s[U].op.reduce_axis
    #    # s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
    #    # s[U].unroll(eps)
    #    # s[U].unroll(nu)
    #    # s[U].unroll(r_kh)
    #    # s[U].unroll(r_kw)
    #    # s[U].vectorize(kk)
    #    if autotvm.GLOBAL_SCOPE.in_tuning:
    #        # kernel transformation will be pre-computed during compilation, so we skip
    #        # this part to make tuning records correct
    #        s[U].pragma(k, 'debug_skip_region')

    # if autotvm.GLOBAL_SCOPE.in_tuning:
    #     # kernel transformation will be pre-computed during compilation, so we skip
    #     # this part to make tuning records correct
    #     s[output].pragma(s[output].axis[0], 'debug_skip_region')

    (k, b, eps, nu, kk, bb) = A_T_dot_M.op.axis
    s[A_T_dot_M].reorder(b, k, eps, nu, kk, bb)
    if UNROLL:
        [s[A_T_dot_M].unroll(ax) for ax in [eps, nu, kk]]
    if VECTORIZE:
        s[A_T_dot_M].vectorize(bb)

    if cfg['M_COMPUTE_AT'].val:
        s[M].compute_at(s[A_T_dot_M], b)
    (k, b, eps, nu, kk, bb) = Y.op.axis
    s[Y].reorder(b, k, eps, nu, kk, bb)
    if UNROLL:
        [s[Y].unroll(ax) for ax in [eps, nu, kk]]

    if VECTORIZE:
        s[Y].vectorize(bb)

    if cfg['A_T_dot_M_COMPUTE_AT'].val:
        s[A_T_dot_M].compute_at(s[Y], b)

    # Schedule V
    (b, c, eps, nu, bb) = B_T_dot_X.op.axis
    if UNROLL:
        [s[B_T_dot_X].unroll(ax) for ax in [eps, nu]]
    if VECTORIZE:
        s[B_T_dot_X].vectorize(bb)

    # if cfg['B_T_dot_X_REORDER_C'].val:
    #     s[B_T_dot_X].reorder(b, eps, nu, c, bb)

    #if cfg['input_tile_COMPUTE_AT'].val:
    #    s[input_tile].compute_at(s[B_T_dot_X], b)

    (b, eps, nu, c, bb) = V.op.axis
    if UNROLL:
        [s[V].unroll(ax) for ax in [eps, nu]]
    if VECTORIZE:
        s[V].vectorize(bb)
    if cfg['V_REORDER_C'].val:
        s[V].reorder(b, eps, nu, c, bb)

    if cfg['B_T_dot_X_COMPUTE_AT'].val:
        s[B_T_dot_X].compute_at(s[V], b)

    (k, b, eps, nu, kk, bb) = M.op.axis
    if cfg['V_COMPUTE_AT'].val:
        s[V].compute_at(s[M], b)
    s[M].reorder(b, k, eps, nu, kk, bb)
    K = get_const_int(M.op.reduce_axis[0].dom.extent)
    s[M].tensorize(kk, intrin_gemm(M=VK, N=VP, K=K))
    return s
コード例 #39
0
def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel,
                                  kernel_q, kernel_vec, conv_out, output,
                                  last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW")
    sch = _get_schedule(wkl, "NCHW")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    ba = sch.ba
    bc = sch.bc

    CC = s.cache_write(conv_out, "global")
    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    s[conv_out].vectorize(vc)

    s[CC].compute_at(s[conv_out], ow)
    n, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw, b1, b2 = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc)
    s[CC].unroll(b1)
    s[CC].unroll(b2)
    s[CC].vectorize(vc)

    ##### Schedule A
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, vw = s[data_vec].op.axis
    s[data_vec].vectorize(vw)
    if ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[data_vec].split(h, ba)
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule B
    co, _, _, _, _, vc = s[kernel_vec].op.axis
    s[kernel_vec].vectorize(vc)
    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[kernel_vec].split(co, bc)
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule C
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[last].split(co, bc)
        oaxis = oco
        paxis = ico

    s[last].parallel(paxis)
    s[last].pragma(oaxis, "parallel_launch_point")
    s[last].pragma(paxis, "parallel_stride_pattern")
    s[last].pragma(oaxis, "parallel_barrier_when_finish")

    return s
コード例 #40
0
ファイル: bitserial_conv2d.py プロジェクト: LANHUIYING/tvm
def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec,
                                  kernel, kernel_q, kernel_vec,
                                  conv_out, output, last):
    # no stride and padding info here
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape
    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, TH, TW, _, _ = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    ba = sch.ba
    bc = sch.bc

    ##### Schedule data packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    if ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[data_vec].split(h, ba)
        oaxis = oh
        paxis = ih
    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[kernel_vec].split(co, bc)
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")


    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis

    s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)

    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    if bc == 1:
        oaxis = oh
        paxis = oh
    else:
        oho, iho = s[last].split(oh, bc)
        oaxis = oho
        paxis = iho

    s[last].parallel(paxis)
    s[last].pragma(oaxis, "parallel_launch_point")
    s[last].pragma(paxis, "parallel_stride_pattern")
    s[last].pragma(oaxis, "parallel_barrier_when_finish")

    return s
コード例 #41
0
ファイル: generic.py プロジェクト: zachzzc/incubator-tvm
 def _compute_argsort(attrs, inputs, _):
     axis = get_const_int(attrs.axis)
     is_ascend = bool(get_const_int(attrs.is_ascend))
     dtype = attrs.dtype
     return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)]
コード例 #42
0
def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec, conv_out, output,
                                    last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule Data padding, and bitpacking
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, _, h, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    if cfg["tile_ah"].size[1] == 1:
        oaxis = oh
        paxis = oh
    else:
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule Kenerl bitpacking
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    if cfg["tile_bco"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule Convolution
    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    ci, dh, dw, ib, kb = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out,
                           [n, co, oh, ow, vc, vh, vw, dh, dw, kb, ib, ci])
    cfg["ann_reduce"].apply(s,
                            conv_out, [kb, ib, dh, dw],
                            axis_lens=[
                                get_const_int(kb.dom.extent),
                                get_const_int(ib.dom.extent),
                                get_const_int(dh.dom.extent),
                                get_const_int(dw.dom.extent)
                            ],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].vectorize(vc)

    # # Schedule output
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oco, ico = cfg["tile_oh"].apply(s, last, co)
    if cfg["tile_oh"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        oco, ico = s[last].split(co, bc)
        oaxis = oco
        paxis = ico

    s[last].parallel(oco)
    return s
コード例 #43
0
def concatenate_shape_func(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    return [_concatenate_shape_func(inputs, convert(axis))]