Exemple #1
0
def _create_axis_record(attrs, inputs):
    axes = attrs.axis if attrs.axis is None else list(
        get_const_tuple(attrs.axis))
    exclude = get_const_int(attrs.exclude) > 0
    keepdims = get_const_int(attrs.keepdims) > 0
    data_shape = inputs[0]
    shape_size = data_shape.shape[0].value
    axis_record = [-1] * shape_size
    if axes is None:
        axes = list(range(shape_size))

    for i, axis in enumerate(axes):
        if axis < 0:
            axes[i] = shape_size + axis

    if exclude:
        ex_axes = []
        for i in range(shape_size):
            if i not in axes:
                ex_axes.append(i)
        axes = ex_axes

    for i in range(shape_size):
        if i not in axes:
            axis_record[i] = i

    if not keepdims:
        tmp = []
        for i in axis_record:
            if i >= 0:
                tmp.append(i)
        axis_record = tmp

    return axis_record
Exemple #2
0
def split_shape_func(attrs, inputs, _):
    """
    Shape function for split op.
    """
    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
        indices_or_sections = get_const_int(attrs.indices_or_sections)
        assert indices_or_sections > 0, "Slice count must be > 0"
    else:
        indices_or_sections = list(get_const_tuple(attrs.indices_or_sections))
        assert sorted(indices_or_sections)[0] > 0 and indices_or_sections == sorted(
            indices_or_sections
        ), "split_indices must be sorted"

    axis = get_const_int(attrs.axis)

    if axis < 0:
        axis += get_const_int(inputs[0].shape[0])

    num_out = (
        indices_or_sections
        if isinstance(indices_or_sections, int)
        else len(indices_or_sections) + 1
    )
    if isinstance(indices_or_sections, int):
        indices_or_sections = [indices_or_sections]
    return [
        _split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis))
        for i in range(num_out)
    ]
Exemple #3
0
 def _compute_get_valid_counts(attrs, inputs, out_type):
     score_threshold = inputs[1]
     id_index = get_const_int(attrs.id_index)
     score_index = get_const_int(attrs.score_index)
     if attrs.score_threshold is not None:
         score_threshold = get_const_float(attrs.score_threshold)
     return topi_compute(inputs[0], score_threshold, id_index, score_index)
Exemple #4
0
    def _schedule(cfg, s, data_vec, weight_vec, output):
        s[data_vec].parallel(s[data_vec].op.axis[0])
        s[weight_vec].parallel(s[weight_vec].op.axis[0])

        y, x = s[output].op.axis
        wb, db, k = s[output].op.reduce_axis

        yo, yi = cfg["tile_y"].apply(s, output, y)
        xo, xi = cfg["tile_x"].apply(s, output, x)
        ko, ki = cfg["tile_k"].apply(s, output, k)

        cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi])
        cfg["ann_reduce"].apply(
            s,
            output,
            [db, wb],
            axis_lens=[
                get_const_int(db.dom.extent),
                get_const_int(wb.dom.extent)
            ],
            max_unroll=8,
            cfg=cfg,
        )
        cfg["ann_spatial"].apply(
            s,
            output,
            [yi, xi],
            axis_lens=[cfg["tile_y"].size[-1], cfg["tile_x"].size[-1]],
            max_unroll=8,
            cfg=cfg,
        )
        s[output].vectorize(xi)
        s[output].parallel(yo)
        return s
Exemple #5
0
 def _compute_argsort(attrs, inputs, _):
     axis = get_const_int(attrs.axis)
     is_ascend = bool(get_const_int(attrs.is_ascend))
     dtype = attrs.dtype
     return [
         topi_compute(inputs[0],
                      axis=axis,
                      is_ascend=is_ascend,
                      dtype=dtype)
     ]
Exemple #6
0
def expand_dim_shape_func(attrs, inputs, _):
    """
    Shape function for expand_dim op.
    """
    axis = get_const_int(attrs.axis)
    num_newaxis = get_const_int(attrs.num_newaxis)
    if axis < 0:
        axis = inputs[0].shape[0] + axis + 1
    ndim = inputs[0].shape[0] if inputs[0].shape else 0
    return [_expand_dim_shape_func(inputs[0], convert(ndim), convert(axis), convert(num_newaxis))]
Exemple #7
0
def gather_nd_shape_func(attrs, inputs, _):
    """
    Shape func for gather_nd operator.
    """
    batch_dims = get_const_int(attrs.batch_dims)
    index_rank = get_const_int(attrs.index_rank)

    assert index_rank > 0, "index_rank needs to be specified for dynamic gather_nd"

    return [_gather_nd_shape(inputs[0], inputs[1], convert(batch_dims), convert(index_rank))]
Exemple #8
0
    def _callback(op):
        if "conv2d_nhwc" not in op.tag:
            return

        ### extract tensors ###
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis
        # NOTE we can't inline data padding in the SIMD path, because it
        # introduces conditionals in the inner loop.
        data_pad = data_vec.op
        sched[data_pad].compute_inline()

        co, vc = cfg["tile_co"].apply(sched, conv, co)
        oh, vh = cfg["tile_oh"].apply(sched, conv, oh)
        ow, vw = cfg["tile_ow"].apply(sched, conv, ow)
        cfg["reorder_0"].apply(sched, conv,
                               [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
        cfg["ann_reduce"].apply(
            sched,
            conv,
            [kh, kw],
            axis_lens=[
                get_const_int(kh.dom.extent),
                get_const_int(kw.dom.extent)
            ],
            max_unroll=8,
            cfg=cfg,
        )
        cfg["ann_spatial"].apply(
            sched,
            conv,
            [vh, vw, vc],
            axis_lens=[
                cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1],
                cfg["tile_co"].size[-1]
            ],
            max_unroll=8,
            cfg=cfg,
        )

        kernel_scope = n  # this is the scope to attach global config inside this kernel

        # tune unroll
        sched[output].pragma(kernel_scope, "auto_unroll_max_step",
                             cfg["auto_unroll_max_step"].val)
        sched[output].pragma(kernel_scope, "unroll_explicit",
                             cfg["unroll_explicit"].val)
Exemple #9
0
 def _compute_topk(attrs, inputs, out_type):
     if attrs.k is not None:
         k = attrs.k
     else:
         k = inputs[1]
     axis = get_const_int(attrs.axis)
     ret_type = attrs.ret_type
     is_ascend = bool(get_const_int(attrs.is_ascend))
     dtype = attrs.dtype
     out = topi_compute(inputs[0], k, axis, ret_type, is_ascend, dtype)
     out = out if isinstance(out, list) else [out]
     return out
Exemple #10
0
 def _compute_multibox_transform_loc(attrs, inputs, _):
     """Compute definition of multibox_detection"""
     clip = bool(get_const_int(attrs.clip))
     threshold = get_const_float(attrs.threshold)
     variances = get_float_tuple(attrs.variances)
     return topi_compute(inputs[0], inputs[1], inputs[2], clip, threshold,
                         variances)
Exemple #11
0
 def _compute_multibox_prior(attrs, inputs, _):
     """Compute definition of multibox_prior"""
     sizes = get_float_tuple(attrs.sizes)
     ratios = get_float_tuple(attrs.ratios)
     steps = get_float_tuple(attrs.steps)
     offsets = get_float_tuple(attrs.offsets)
     clip = bool(get_const_int(attrs.clip))
     return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)]
Exemple #12
0
def stack_shape_func(attrs, inputs, _):
    """
    Shape func for stack.
    """
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += inputs[0].shape[0] + 1
    return [_stack_shape_func(inputs[0], convert(axis), convert(len(inputs)))]
Exemple #13
0
def repeat_shape_func(attrs, inputs, _):
    """
    Shape func for repeat.
    """
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis = inputs[0].shape[0] + axis
    return [_repeat_shape_func(inputs[0], attrs.repeats, convert(axis))]
Exemple #14
0
def take_shape_func(attrs, inputs, out_ndims):
    """
    Shape function for take op.
    """
    if attrs.axis is None:
        return [_take_no_axis_shape_func(inputs[1], out_ndims[0])]
    axis = get_const_int(attrs.axis)
    batch_dims = get_const_int(attrs.batch_dims)
    data_ndim = int(inputs[0].shape[0])
    if inputs[1].shape:
        indicies_ndim = int(inputs[1].shape[0])
    if axis < 0:
        axis += data_ndim
    assert 0 <= axis < data_ndim
    if batch_dims < 0:
        batch_dims += indicies_ndim
    return [_take_with_axis_shape_func(*inputs, convert(axis), convert(batch_dims), out_ndims[0])]
Exemple #15
0
def bitpack(data, bits, pack_type="int8", name="bitpack"):
    """Packs lowest dimension into format needed by VTA

    Parameters
    ----------
    pack_axis : int
        index of the axis to pack in data
    bit_axis : int
        index of axis to place bit axis in resulting packed data

    Returns
    -------
    packed : Tensor
        The packed tensor.
    """
    shape_vec = list(data.shape)
    if pack_type == "int8":
        data_width = 8
    elif pack_type == "int16":
        data_width = 16
    elif pack_type == "int32":
        data_width = 32
    else:
        raise RuntimeError("Unknown pack type %s" % pack_type)
    assert data_width % bits == 0
    lanes = data_width // bits

    # Data must be in multiples of the data_width
    assert utils.get_const_int(
        shape_vec[-1]) % lanes == 0, "Not a multiple of word size"
    shape_vec[-1] = shape_vec[-1] // lanes
    oshape = tuple(shape_vec)

    def _bitpack(*indices):
        ret = None
        mask = tvm.tir.const((1 << bits) - 1, pack_type)
        for k in range(lanes):
            idx = list(indices)
            idx[-1] = idx[-1] * lanes + k
            elem = data(*idx).astype(pack_type)
            if k == 0:
                ret = elem & mask
            else:
                val = (elem & mask) << tvm.tir.const(k * bits, pack_type)
                ret = ret | val
        return ret

    return te.compute(oshape, _bitpack, name=name, tag="bitpack")
Exemple #16
0
 def _compute_nms(attrs, inputs, out_type):
     max_output_size = inputs[3]
     iou_threshold = inputs[4]
     if attrs.max_output_size is not None:
         max_output_size = attrs.max_output_size
     if attrs.iou_threshold is not None:
         iou_threshold = get_const_float(attrs.iou_threshold)
     return_indices = bool(get_const_int(attrs.return_indices))
     force_suppress = bool(get_const_int(attrs.force_suppress))
     top_k = get_const_int(attrs.top_k)
     coord_start = get_const_int(attrs.coord_start)
     score_index = get_const_int(attrs.score_index)
     id_index = get_const_int(attrs.id_index)
     invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
     if return_indices:
         return topi_compute(
             inputs[0],
             inputs[1],
             inputs[2],
             max_output_size,
             iou_threshold,
             force_suppress,
             top_k,
             coord_start,
             score_index,
             id_index,
             return_indices,
             invalid_to_bottom,
         )
     return [
         topi_compute(
             inputs[0],
             inputs[1],
             inputs[2],
             max_output_size,
             iou_threshold,
             force_suppress,
             top_k,
             coord_start,
             score_index,
             id_index,
             return_indices,
             invalid_to_bottom,
         )
     ]
Exemple #17
0
 def _compute_proposal(attrs, inputs, out_type):
     scales = get_float_tuple(attrs.scales)
     ratios = get_float_tuple(attrs.ratios)
     feature_stride = attrs.feature_stride
     threshold = attrs.threshold
     rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
     rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
     rpn_min_size = attrs.rpn_min_size
     iou_loss = bool(get_const_int(attrs.iou_loss))
     return [
         topi_compute(
             inputs[0],
             inputs[1],
             inputs[2],
             scales,
             ratios,
             feature_stride,
             threshold,
             rpn_pre_nms_top_n,
             rpn_post_nms_top_n,
             rpn_min_size,
             iou_loss,
         )
     ]
Exemple #18
0
def test_util():
    x = tvm.tir.const(100, "int32")
    assert utils.get_const_int(x) == 100
    assert utils.get_const_tuple((x, x)) == (100, 100)
Exemple #19
0
def schedule_conv2d_winograd(cfg, s, output, pre_computed):
    """Schedule winograd template"""
    inverse = s[output].op.input_tensors[0]
    bgemm, A = s[inverse].op.input_tensors
    kernel_pack, data_pack_trans = s[bgemm].op.input_tensors
    data_pack = s[data_pack_trans].op.input_tensors[0]
    input_tile, B = s[data_pack].op.input_tensors
    pad_data = s[input_tile].op.input_tensors[0]

    # data transform
    s[B].compute_inline()
    s[A].compute_inline()

    # probably will improve real topology execution
    if autotvm.GLOBAL_SCOPE.in_tuning:
        # Padding to texture
        AA = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [input_tile])
        bind_data_copy(s[AA])

    s[input_tile].compute_inline()

    OL = s.cache_write(data_pack, "local")
    c, p, eps, nu, cb = s[data_pack].op.axis
    fused = s[data_pack].fuse(c, p, eps, nu)
    bx, tx = s[data_pack].split(fused, 128)
    s[data_pack].vectorize(cb)
    s[data_pack].bind(bx, te.thread_axis("blockIdx.x"))
    s[data_pack].bind(tx, te.thread_axis("threadIdx.x"))

    _, _, eps, nu, cb = s[OL].op.axis
    r_a, r_b = s[OL].op.reduce_axis
    s[OL].unroll(eps)
    s[OL].unroll(nu)
    s[OL].unroll(r_a)
    s[OL].unroll(r_b)
    s[OL].vectorize(cb)
    s[OL].compute_at(s[data_pack], tx)
    s[data_pack].set_scope(get_texture_storage(data_pack.shape))

    s[data_pack_trans].compute_inline()

    # transform kernel
    if not pre_computed:
        kernel, G = s[kernel_pack].op.input_tensors
        eps, nu, ci, co, cob = s[kernel_pack].op.axis
        if autotvm.GLOBAL_SCOPE.in_tuning:
            # skip this part during tuning to make recrods accurate
            # this part will be pre-computed during pre-compute optimization pass
            s[G].pragma(s[G].op.axis[0], "debug_skip_region")
            s[kernel_pack].pragma(eps, "debug_skip_region")
        else:
            s[G].compute_inline()
            r_a, r_b = s[kernel_pack].op.reduce_axis
            for axis in [eps, nu, r_a, r_b]:
                s[kernel_pack].unroll(axis)

            fused = s[kernel_pack].fuse(ci, co)
            bb, tt = s[kernel_pack].split(fused, 128)
            s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b, cob)
            s[kernel_pack].vectorize(cob)
            s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x"))
            s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x"))
    else:
        kernel = kernel_pack

    if isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag:
        # manage scheduling of datacopy
        pack_data = pad_data.op.input_tensors[0]
        bind_data_copy(s[pack_data])
        bind_data_copy(s[kernel])
    elif isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
        s[kernel].compute_inline()
    s[pad_data].compute_inline()

    ##### space definition begin #####
    cfg.define_knob("auto_unroll_max_step", [0, 4, 16])
    b1, b2, y, x, cb = s[bgemm].op.axis
    rcc = s[bgemm].op.reduce_axis[0]
    alpha = get_const_int(b1.dom.extent)

    cfg.define_split(
        "tile_y", y, num_outputs=3, filter=lambda entry: entry.size[2] <= 64 and entry.size[1] <= 16
    )

    min_x_div = 1
    for bn in range(4, 0, -1):
        if bgemm.shape[3] % bn == 0:
            min_x_div = bn
            break

    cfg.define_split(
        "tile_x",
        x,
        num_outputs=3,
        filter=lambda entry: entry.size[2] <= 64
        and entry.size[1] >= min_x_div
        and entry.size[1] <= 16,
    )
    cfg.define_split("tile_rc", rcc, num_outputs=2)
    # TODO: Uncomment the following lines when multi_filter will be introduced
    # cfg.multi_filter(
    # filter=lambda entity: entity["tile_y"].size[2] * entity["tile_x"].size[2] in range(32,1024)
    # )
    ##### space definition end #####

    # batch gemm
    OL = s.cache_write(bgemm, "local")
    if (
        autotvm.GLOBAL_SCOPE.in_tuning
        or isinstance(kernel.op, tvm.te.ComputeOp)
        and "filter_pack" in kernel.op.tag
    ):
        BB = s.cache_read(kernel_pack, get_texture_storage(kernel_pack.shape), [OL])
        bind_data_copy(s[BB])

    by = s[bgemm].fuse(b1, b2, y)

    # tile and bind spatial axes
    bgemm_scope, by = s[bgemm].split(by, nparts=1)
    by, vy, ty = cfg["tile_y"].apply(s, bgemm, by)
    bx, vx, tx = cfg["tile_x"].apply(s, bgemm, x)
    s[bgemm].bind(by, te.thread_axis("blockIdx.y"))
    s[bgemm].bind(bx, te.thread_axis("blockIdx.x"))
    s[bgemm].bind(vy, te.thread_axis("vthread"))
    s[bgemm].bind(vx, te.thread_axis("vthread"))
    s[bgemm].bind(ty, te.thread_axis("threadIdx.y"))
    s[bgemm].bind(tx, te.thread_axis("threadIdx.x"))
    s[bgemm].reorder(bgemm_scope, by, bx, vy, vx, ty, tx, cb)
    s[bgemm].vectorize(cb)
    s[bgemm].set_scope(get_texture_storage(bgemm.shape))

    # tile reduction axes
    s[OL].compute_at(s[bgemm], tx)
    b1, b2, y, x, cb = s[OL].op.axis
    (rcc, rcb) = s[OL].op.reduce_axis
    b = s[OL].fuse(b1, b2)
    s[OL].reorder(b, y, x, rcc, rcb, cb)
    # s[OL].unroll(rcb)
    s[OL].pragma(rcb, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
    s[OL].pragma(rcb, "unroll_explicit", True)
    s[OL].vectorize(cb)

    # schedule inverse, output and fusion
    if output.op in s.outputs:
        OL = None
    else:
        OL = output
        s[OL].set_scope("local")
        output = s.outputs[0]

    if len(s[output].op.axis) == 4:
        n, co, h, w = s[output].op.axis
        cb = None
    else:
        n, co, h, w, cb = s[output].op.axis
    inverse_scope, n = s[output].split(n, nparts=1)

    fused = s[output].fuse(n, co, h, w)
    bb, tt = s[output].split(fused, 128)
    if cb is not None:
        s[output].reorder(bb, tt, cb)
        s[output].vectorize(cb)

    s[output].bind(bb, te.thread_axis("blockIdx.x"))
    s[output].bind(tt, te.thread_axis("threadIdx.x"))

    if OL is not None:
        s[OL].compute_at(s[output], tt)

    co, p, vh, vw, cb = s[inverse].op.axis
    r_a, r_b = s[inverse].op.reduce_axis
    for axis in [vh, vw, r_a, r_b]:
        s[inverse].unroll(axis)
    s[inverse].vectorize(cb)
    s[inverse].compute_at(s[output], tt)

    return s
Exemple #20
0
 def _compute_sort(attrs, inputs, _):
     axis = get_const_int(attrs.axis)
     is_ascend = bool(get_const_int(attrs.is_ascend))
     return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend)]
Exemple #21
0
def concatenate_shape_func(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += inputs[0].shape[0]
    return [_concatenate_shape_func(inputs, convert(axis))]