Example 1
def test_reduce_combiner_simplify():
    ck = CanonicalChecker()
    dummy = tvm.var('dummy')
    comm_reducer = tvm.comm_reducer
    prod = comm_reducer(lambda x, y: x*y, lambda t0: tvm.const(1, t0))

    sum_or_prod = comm_reducer(
        lambda x, y: tvm.expr.Select(dummy < 0,
                                     x + y, x*y),
        lambda t0: tvm.expr.Select(dummy < 0,
                                   tvm.const(0, t0), tvm.const(1, t0)))
    sum_and_prod = comm_reducer(
        lambda x, y: (x[0] + y[0],
                      x[1]*y[1]),
        lambda t0, t1: (tvm.const(0, t0),
                        tvm.const(5, t0) - tvm.const(4, t0)))
    some_reducer1 = comm_reducer(
        lambda x, y: (x[0] + y[0],
                      x[0] + y[0] + x[1] + y[1],
                      x[0]*y[2] + y[0]*x[2],
                      x[1] + y[2],
                    4.0),
        lambda t0, t1, t2, t3, t4: (tvm.const(0, t0),
                                    tvm.const(1, t1),
                                    tvm.const(2, t2),
                                    tvm.const(3, t3),
                                    tvm.const(4, t4)))

    k = tvm.reduce_axis((0, 10), name="k")
    A = tvm.placeholder((10,), name='A')
    # Test that SimplifyCombiner makes use of vranges
    ck.analyzer.update(dummy, tvm.arith.ConstIntBound(-10, -4))
    ck.verify(sum_or_prod(A[k], k), tvm.sum(A[k], k))
    ck.analyzer.update(dummy, tvm.arith.ConstIntBound(5, 9), True)
    ck.verify(sum_or_prod(A[k], k), prod(A[k], k))
    ck.analyzer.update(dummy, tvm.arith.ConstIntBound(-10, 100), True)
    ck.verify(sum_and_prod((A[k], A[10-k]), k)[0], tvm.sum(A[k], k))
    ck.verify(sum_and_prod((A[k], A[10-k]), k)[1], prod(A[10-k], k))

    reference_simplified_sources = [[A[0]],
                                    [A[0], A[1]],
                                    [A[0], A[2]],
                                    [A[0], A[1], A[2], A[3]],
                                    [A[4]]]
    for j in range(5):
        # Here we use the j-th component of the result, so only it and the components it
        # depends on are left.
        simplified = ck.analyzer.canonical_simplify(
            some_reducer1((A[0], A[1], A[2], A[3], A[4]), k)[j])

        # Check that the remaining components are the expected ones.
        for lhs, rhs in zip(simplified.source, reference_simplified_sources[j]):
            assert tvm.ir_pass.Equal(lhs, rhs)

    # Test that components with side effects are not removed
    side_effect = lambda *xs: tvm.make.Call("int32", "dummy", xs, tvm.expr.Call.Intrinsic, None, 0)
    ck.verify(sum_and_prod((A[k], side_effect(A[10-k])), k)[0],
              sum_and_prod((A[k], side_effect(A[10-k])), k)[0])
    ck.verify(sum_and_prod((side_effect(A[k]), A[10-k]), k)[0],
              tvm.sum(side_effect(A[k]), k))
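For reference, tvm.comm_reducer (used throughout the test above) pairs a combine function with an identity element. A minimal sketch of a custom reducer built the same way (names here are illustrative, not from the original test):

mymax = tvm.comm_reducer(lambda x, y: tvm.max(x, y),
                         lambda t: tvm.min_value(t), name="mymax")
n = tvm.var("n")
X = tvm.placeholder((n,), name="X")
r = tvm.reduce_axis((0, n), name="r")
# Reducers created this way are called like tvm.sum: an expression plus axis.
Y = tvm.compute((1,), lambda _: mymax(X[r], axis=r), name="Y")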
Example 2
File: dense.py Project: bddppq/tvm
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_x", out_dim, num_outputs=2)
    cfg.define_split("tile_y", batch, num_outputs=2)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

    vec = cfg["tile_k"].size[-1]
    k = tvm.reduce_axis((0, in_dim // vec), "k")
    CC = tvm.compute((batch, out_dim, vec),
                     lambda z, y, x: tvm.sum(
                         data[z, k * vec + x].astype(out_dtype) *
                         weight[y, k * vec + x].astype(out_dtype), axis=k))

    kk = tvm.reduce_axis((0, vec), "kk")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                    tag="dense_nopack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)

    return C
Example 3
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="kh2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # wrap the recurrence in a scan over the time steps
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
Example 4
def test_tensor_reduce_multi_axis():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k1 = tvm.reduce_axis((0, n), "k")
    k2 = tvm.reduce_axis((0, m), "k")
    # tvm.sum accepts the reduction axes either as a tuple or as a list
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))
Example 5
def test_reduce_simplify():
    ck = CanonicalChecker()
    k = tvm.reduce_axis((0, 10), name="k")
    j = tvm.reduce_axis((-5, 3), name="j")
    A = tvm.placeholder((10,), name='A')
    ck.verify(tvm.sum(tvm.expr.Select(k + j < 12, k + j, 0), [k, j]),
              tvm.sum(k + j, [k, j]))
    ck.verify(tvm.sum(A[3], []), A[3])
    # This simplification is not typical: k ranges over [0, 10), so k / 10
    # is always 0 under the reduction.
    ck.verify(tvm.sum(k / 10, k), tvm.sum(tvm.const(0, "int32"), k))
Example 6
    # Fragment: inner compute of a bitserial conv2d. data_vec, kernel_vec, the
    # reduce axes (dh, dw, ci, b1, b2), HSTR, WSTR, dorefa and out_dtype come
    # from the enclosing function.
    def _conv(n, h, w, co, vh, vw, vc):
        b1b2 = (b1+b2).astype(out_dtype)
        if dorefa:
            return tvm.sum(
                (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) &
                              kernel_vec[co, dh, dw, ci, vc, b2].astype(out_dtype)) -
                 tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) &
                              ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2,
                axis=[dh, dw, ci, b1, b2])

        return tvm.sum(tvm.popcount(
            data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] &
            kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2,
                       axis=[dh, dw, ci, b1, b2])
Example 7
    # Fragment: the same bitserial inner compute with a different data layout;
    # `unipolar` selects the signed (+1/-1) variant.
    def _conv(n, co, h, w, vh, vw, vc):
        b1b2 = (b1+b2).astype(out_dtype)
        if unipolar:
            return tvm.sum((tvm.popcount(
                data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) &
                kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype))  -
                            tvm.popcount(
                                data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype)
                                & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2,
                           axis=[ci, dh, dw, b1, b2])

        return tvm.sum((tvm.popcount(
            data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] &
            kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2,
                       axis=[ci, dh, dw, b1, b2])
Example 8
    # Fragment: computes one pooled output element of ROI Align. rois,
    # spatial_scale, pooled_size_h/w, sample_ratio, dtype and _bilinear are
    # defined in the enclosing function.
    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype('int32')
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
        roi_start_h *= spatial_scale
        roi_end_h *= spatial_scale
        roi_start_w *= spatial_scale
        roi_end_w *= spatial_scale

        # force malformed ROIs to be 1x1
        roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype))
        roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        if sample_ratio > 0:
            roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32')
        else:
            roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32')
            roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32')

        count = roi_bin_grid_h * roi_bin_grid_w
        rh = tvm.reduce_axis((0, roi_bin_grid_h))
        rw = tvm.reduce_axis((0, roi_bin_grid_w))
        roi_start_h += ph * bin_h
        roi_start_w += pw * bin_w
        return tvm.sum(_bilinear(batch_index, c,
                                 roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                                 roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count,
                       axis=[rh, rw])
Example 9
File: bnn.py Project: bddppq/tvm
def binary_dense(data, weight):
    """Binary matrix multiplication using xor and bit-count.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim], dtype is uint32.

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim], dtype is uint32.

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim], dtype is float32.
    """
    assert data.dtype == 'uint32' and weight.dtype == 'uint32', \
        "dtype of data and weight should be uint32"
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim binary dense"
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), lambda i, j: \
                          tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \
                          tag='binary_dense')

    return tvm.compute((batch, out_dim), lambda i, j: \
                        32 * in_dim - 2. * matmul(i, j), \
                        tag=tag.ELEMWISE)
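The second compute turns the XOR popcount into a signed dot product: for +/-1 values packed one per bit, dot(a, b) = total_bits - 2 * popcount(a ^ b). A quick NumPy sanity check of that identity (a sketch, not part of the original code):

import numpy as np

bits = 64
a = np.random.choice([-1, 1], bits)
b = np.random.choice([-1, 1], bits)
# Matching bits contribute +1 to the dot product, mismatching bits -1.
mismatches = np.count_nonzero(np.unpackbits(np.packbits(a > 0) ^ np.packbits(b > 0)))
assert a.dot(b) == bits - 2 * mismatches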
Example 10
def matmul_v1(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # 2. get the config object
    cfg = autotvm.get_config()

    # 3. define search space
    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])

    # 4. schedule according to config
    yo, yi = s[C].split(y, cfg['tile_y'].val)
    xo, xi = s[C].split(x, cfg['tile_x'].val)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
Example 11
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
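Examples 10 and 11 declare the same kernel but differ in how the search space is defined (explicit define_knob values versus define_split over an axis). A hedged sketch of how such a template is typically registered and tuned, following the standard autotvm flow (the @autotvm.template decorator on matmul and the log file name are assumptions):

from tvm import autotvm

# assume `matmul` above was decorated with @autotvm.template
task = autotvm.task.create(matmul, args=(512, 512, 512, 'float32'), target='llvm')
measure_option = autotvm.measure_option(builder='local',
                                        runner=autotvm.LocalRunner(number=5))
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10, measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])

# rebuild with the best config found during tuning
with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create('llvm'):
        s, args = matmul(512, 512, 512, 'float32')
        func = tvm.build(s, args)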
Example 12
File: pooling.py Project: gwli/tvm
def global_pool(data, pool_type):
    """Perform global pooling on the data

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, in_height, in_width]

    pool_type : str
        Pool type, 'max' or 'avg'

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, 1, 1]
    """
    assert len(data.shape) == 4, "only support 4-dim pooling"
    batch, channel, height, width = data.shape

    dheight = tvm.reduce_axis((0, height))
    dwidth = tvm.reduce_axis((0, width))

    if pool_type == 'max':
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tvm.max(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
                            tag="global_pool_max")
    elif pool_type == 'avg':
        tsum = tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tvm.sum(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
                            tag="global_pool_sum")
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
                            tsum[n, c, h, w] / (height*width).astype(tsum.dtype), \
                            tag=tag.ELEMWISE)
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
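A short usage sketch for the function above (shapes are illustrative):

data = tvm.placeholder((1, 16, 7, 7), name='data')
gmax = global_pool(data, 'max')  # shape (1, 16, 1, 1): max over each 7x7 map
gavg = global_pool(data, 'avg')  # shape (1, 16, 1, 1): sum / (height*width)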
Example 13
def test_dot():
    nn = 12
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    k = tvm.reduce_axis((0, n), 'k')
    C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    fapi = lower(s, [A, B, C])

    def verify(target):
        if not tvm.module.enabled(target):
            print("Target %s is not enabled" % target)
            return
        f = tvm.codegen.build_module(fapi, target)
        # verify
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)

    verify("llvm")
Example 14
    # Fragment: N and the `feature` module come from the enclosing test.
    def get_gemm_feature(target):
        k = tvm.reduce_axis((0, N), 'k')
        A = tvm.placeholder((N, N), name='A')
        B = tvm.placeholder((N, N), name='B')
        C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
                        name='C')

        s = tvm.create_schedule(C.op)

        y, x = s[C].op.axis
        axes = list(s[C].tile(y, x, 8, 8)) + [k]
        perm = np.random.permutation(5)
        axes = [axes[x] for x in perm]
        s[C].reorder(*axes)

        if "gpu" in target.keys:
            pick = []
            # filter out reduction axis
            for i in range(len(perm)):
                if perm[i] != 4:
                    pick.append(axes[i])
            s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
            s[C].bind(pick[1], tvm.thread_axis("vthread"))
            s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))

        with target:
            feas = feature.get_itervar_feature(s, [A, B, C])
            feas = feature.flatten_itervar_feature(feas)
        return feas
Example 15
def packed_conv2d(data,
                  kernel,
                  padding,
                  strides,
                  out_dtype="int32"):
    """ Packed conv2d function.
    """
    if padding[0]:
        pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data")
    else:
        pad_data = data
    assert len(data.shape) == 6
    assert len(kernel.shape) == 6
    oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1)
    owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1)
    oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4])

    ishape = topi.util.get_const_tuple(data.shape)
    kshape = topi.util.get_const_tuple(kernel.shape)
    assert data.dtype == "int8", data.dtype
    assert kernel.dtype == "int8", kernel.dtype
    d_i = tvm.reduce_axis((0, kshape[2]), name='d_i')
    d_j = tvm.reduce_axis((0, kshape[3]), name='d_j')
    k_o = tvm.reduce_axis((0, ishape[1]), name='k_o')
    k_i = tvm.reduce_axis((0, ishape[-1]), name='k_i')
    hstride, wstride = strides
    res = tvm.compute(
        oshape,
        lambda b_o, c_o, i, j, b_i, c_i: tvm.sum(
            pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) *
            kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype),
            axis=[k_o, d_i, d_j, k_i]),
        name="res", tag="packed_conv2d")
    return res
Example 16
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i:
                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])
    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16,
                          offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func,
                                      binds={w: Wb})
Example 17
def test_conv_tiling():
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
Example 18
File: dense.py Project: bddppq/tvm
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_y", batch, num_outputs=3)
    cfg.define_split("tile_x", out_dim, num_outputs=3)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_pack_config(cfg, batch, out_dim, in_dim)

    packw_bn = cfg["tile_x"].size[-1]
    packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
    packw = tvm.compute(packw_shape,
                        lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight")

    k = tvm.reduce_axis((0, in_dim), name="k")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(
                        data[y, k].astype(out_dtype) *
                        packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
                        axis=k),
                    tag="dense_pack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
Example 19
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[1])
    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()
    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
Example 20
def dense_default(data, weight, bias=None):
    """The default implementation of dense in topi.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim]

    bias : tvm.Tensor, optional
        1-D with shape [out_dim]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim dense"
    if bias is not None:
        assert len(bias.shape) == 1
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), \
                         lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \
                         tag='dense')
    if bias is not None:
        matmul = tvm.compute((batch, out_dim), \
                             lambda i, j: matmul[i, j] + bias[j], \
                             tag=tag.BROADCAST)
    return matmul
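Because weight is laid out [out_dim, in_dim], the reduction above is a dot product of a data row with a weight row; in NumPy terms (a reference sketch, not part of the original):

import numpy as np

data_np = np.random.rand(8, 32).astype('float32')     # [batch, in_dim]
weight_np = np.random.rand(16, 32).astype('float32')  # [out_dim, in_dim]
bias_np = np.random.rand(16).astype('float32')
ref = data_np.dot(weight_np.T) + bias_np              # what dense_default computes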
Example 21
def test_local_gemm():
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    nn = 1024
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A', dtype='int32')
    B = tvm.placeholder((m, l), name='B', dtype='int32')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
                    name='CC')

    s = tvm.create_schedule(C.op)
    s[C].opengl()
    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opengl", name="gemm")
    print("------opengl code------")
    print(f.imported_modules[0].get_source(fmt="gl"))

    ctx = tvm.opengl()
    n, m, l = nn, nn, nn
    a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    f(a, b, c)

    tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
Example 22
def test_rfactor():
    n = tvm.var('n')
    k1 = tvm.reduce_axis((0, n), name="k1")
    k2 = tvm.reduce_axis((0, n), name="k2")
    A = tvm.placeholder((n, n, n), name='A')
    B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2]))
    # normal schedule
    s = tvm.create_schedule(B.op)
    BF = s.rfactor(B, k1)
    assert(tuple(BF.shape) == (n, n))
    assert(set(BF.op.body[0].axis) == set([k2]))
    assert(s[B].op.body[0].axis[0].dom.extent == n)
    assert(len(s[B].all_iter_vars) == 2)
    # schedule with split
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki)
    assert(BF.shape[0].value == 4)
    assert(BF.shape[1] == n)
    assert(BF.op.body[0].axis[0] == k2)
    assert(BF.op.body[0].axis[1].var == ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
    # schedule with factor_axis
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki, 1)
    assert(n == BF.shape[0])
    assert(BF.shape[1].value == 4)
    assert(BF.op.body[0].axis[0] == k2)
    assert(BF.op.body[0].axis[1].var == ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
Example 23
def test_in_bounds_conv_llvm(loop_tiling=False):
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    if loop_tiling:
        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
    print(lowered_func.body)
    ctx = tvm.cpu(0)

    f = tvm.build(s, [data, kernel, conv], "llvm")
    data_input = tvm.nd.array(np.random.uniform(
          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
    kernel_input = tvm.nd.array(np.random.uniform(
          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
    f(data_input, kernel_input, conv_out)
Example 24
def test_rfactor():
    n = tvm.convert(1027)
    A = tvm.placeholder((n,), name='A')
    k = tvm.reduce_axis((0, n))
    B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')
    # schedule
    s = tvm.create_schedule(B.op)
    kf, ki = s[B].split(k, nparts=4)
    BF = s.rfactor(B, kf)
    s[BF].parallel(BF.op.axis[0])
    # one line to build the function.
    def check_target(target="llvm"):
        if not tvm.module.enabled(target):
            return
        ctx = tvm.cpu(0)
        fapi = tvm.lower(s, args=[A, B])
        fsum = tvm.build(fapi,
                         target=target,
                         name="mysum")
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
        b  = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
        fsum(a, b)
        res = np.sum(a.asnumpy(), axis=0)
        tvm.testing.assert_allclose(
            b.asnumpy(), res, rtol=1e-4)

    check_target()
Example 25
File: conv2d.py Project: gwli/tvm
def _spatial_pack(data, kernel, stride, padding, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH-1, KW-1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2*HPAD
    TW = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO//VC, CI, KH, KW, VC)

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv_output')

    return output
Example 26
def test_make_sum():
    A = tvm.placeholder((2, 10), name='A')
    k = tvm.reduce_axis((0,10), "k")
    B = tvm.compute((2,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
    json_str = tvm.save_json(B)
    BB = tvm.load_json(json_str)
    assert B.op.body[0].combiner is not None
    assert BB.op.body[0].combiner is not None
Example 27
# Fragment: inner compute of a bitpacked conv2d; PadInput_q, Filter_q, the
# reduce axes (rc, ry, rx, b1, b2), stride_h, stride_w and out_dtype are
# defined in the enclosing function.
def _conv(nn, ff, yy, xx):
    b1b2 = (b1+b2).astype(out_dtype)
    return tvm.sum(
        ((tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] &
                       Filter_q[ff, rc, ry, rx, b2]) -
          tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] &
                       ~Filter_q[ff, rc, ry, rx, b2]))
         << (b1b2)).astype(out_dtype),
        axis=[rc, ry, rx, b2, b1]).astype(out_dtype)
Example 28
def dp4a(x_scope='local', y_scope='local', z_scope='local'):
    """
    Int8 dot product reduced by every 4 elements using __dp4a

    Parameters
    ----------
    x_scope : str, optional
        The storage scope of buffer for lhs
    y_scope : str, optional
        The storage scope of buffer for rhs
    z_scope : str, optional
        The storage scope of buffer for result

    Returns
    -------
    intrin : TensorIntrin
        The dp4a TensorIntrin that can be used in tensorizing schedule.
    """

    n = 4  # dp4a requires operands packed by 4
    x = tvm.placeholder((n,), name='x', dtype='int8')
    y = tvm.placeholder((n,), name='y', dtype='int8')

    k = tvm.reduce_axis((0, n), name='rc')

    z = tvm.compute((1,), lambda i: tvm.sum(
        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))

    def _intrin_func(ins, outs):
        def _instr(index):
            xx, yy = ins
            zz = outs[0]

            if index == 1:
                return zz.vstore(0, 0)

            ib = tvm.ir_builder.create()

            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)

            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
            ib.emit(zz.vstore(0, new_z))

            return ib.get()

        return _instr(0), _instr(1), _instr(2) # body, reset, update

    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
        scopes = {x: x_scope, y: y_scope, z: z_scope}
        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
                                    data_alignment=cfg.data_alignment,
                                    offset_factor=cfg.offset_factor,
                                    scope=scopes[t]) for t in [x, y, z]}

        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
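A hedged usage sketch: since the intrinsic reduces exactly four int8 values into one int32 accumulator, a schedule typically splits its reduction axis by 4 and tensorizes the inner part (the schedule s, tensor C and axis k below are illustrative):

ko, ki = s[C].split(k, factor=4)
s[C].tensorize(ki, dp4a('shared', 'shared', 'local'))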
Example 29
def test_rank_zero():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    scale = tvm.placeholder((), name='s')
    k = tvm.reduce_axis((0, m), name="k")
    T = tvm.compute((), lambda : tvm.sum(A[k] * scale(), axis=k))
    print(T)
    print(T.op.body)
    assert(tuple(T.shape) == ())
Example 30
def test_gemm_bound():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n, n), name='A')
    B = tvm.placeholder((n, n), name='B')
    k = tvm.reduce_axis((0, n), name='k')
    C = tvm.compute(
        (n, n),
        lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
        name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)

    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)

    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)

    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[BB.op.axis[0]].extent.value==64)
    assert(bounds[AA.op.axis[0]].extent.value==64)
    assert(bounds[CC.op.axis[0]].extent.value == 8)
    assert(bounds[CC.op.axis[1]].extent.value == 8)
Example 31
def test_tensorize_matmul():
    n = 1024
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda i, j:
                    tvm.sum(B[j, k] * A[i, k], axis=k), name='C')

    def check(factor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        yo, yi = s[C].split(y, factor=factor)
        gemv = intrin_gemv(factor, l)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])


    def check_rfactor(factor, rfactor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        s[C].reorder(yo, ro, yi, ri)
        gemv = intrin_gemv(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor_no_reset(factor, rfactor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        s[C].reorder(yo, ro, yi, ri)
        gemv = intrin_gemv_no_reset(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor_no_reset_multi_reduction(factor, rfactor):
        s = tvm.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        roo, roi = s[C].split(ro, factor=2)
        s[C].reorder(yo, roo, roi, yi, ri)
        gemv = intrin_gemv_no_reset(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    check(16)
    check_rfactor(16, 16)
    check_rfactor_no_reset(16, 16)
    check_rfactor_no_reset_multi_reduction(16, 16)
Example 32
    # Fragment: body of a VTA GEMM test; env, vta, simulator and util come
    # from the enclosing test module.
    def _run(env, remote):
        # declare
        o = 4
        n = 1
        m = 4
        x = tvm.placeholder((o, n, env.BATCH, env.BLOCK_IN),
                            name="x",
                            dtype=env.inp_dtype)
        w = tvm.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN),
                            name="w",
                            dtype=env.wgt_dtype)
        x_buf = tvm.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i),
                            "x_buf")
        w_buf = tvm.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN),
                            lambda *i: w(*i), "w_buf")
        ko = tvm.reduce_axis((0, n), name="ko")
        ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki")
        y_gem = tvm.compute(
            (o, m, env.BATCH, env.BLOCK_OUT),
            lambda bo, co, bi, ci: tvm.sum(x_buf[bo, ko, bi, ki].astype(
                env.acc_dtype) * w_buf[co, ko, ci, ki].astype(env.acc_dtype),
                                           axis=[ko, ki]),
            name="y_gem")
        y_shf = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT),
                            lambda *i: y_gem(*i) >> 8,
                            name="y_shf")
        y_max = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT),
                            lambda *i: tvm.max(y_shf(*i), 0), "y_max")  # relu
        y_min = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT),
                            lambda *i: tvm.min(y_max(*i),
                                               (1 << (env.INP_WIDTH - 1)) - 1),
                            "y_min")  #relu
        y = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT),
                        lambda *i: y_min(*i).astype(env.inp_dtype),
                        name="y")

        if not remote:
            return

        def verify(s):
            mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            ctx = remote.ext_dev(0)
            x_np = np.random.randint(-128,
                                     128,
                                     size=(o, n, env.BATCH,
                                           env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(-128,
                                     128,
                                     size=(m, n, env.BLOCK_OUT,
                                           env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, ctx)
            w_nd = tvm.nd.array(w_np, ctx)
            y_nd = tvm.nd.array(y_np, ctx)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b, i, :] += np.dot(
                            x_np[b, j, :].astype(env.acc_dtype),
                            w_np[i, j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1 <<
                                     (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

            if env.TARGET == "sim":
                simulator.clear_stats()
                f(x_nd, w_nd, y_nd)
                print(simulator.stats())
            else:
                f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.asnumpy())

        def test_schedule1():
            # default schedule with no smt
            s = tvm.create_schedule(y.op)
            # set the scope of the SRAM buffers
            s[x_buf].set_scope(env.inp_scope)
            s[w_buf].set_scope(env.wgt_scope)
            s[y_gem].set_scope(env.acc_scope)
            s[y_shf].set_scope(env.acc_scope)
            s[y_max].set_scope(env.acc_scope)
            s[y_min].set_scope(env.acc_scope)
            # set pragmas for DMA transfer and ALU ops
            s[x_buf].compute_at(s[y_gem], ko)
            s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
            s[w_buf].compute_at(s[y_gem], ko)
            s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
            s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
            s[y_max].pragma(s[y_max].op.axis[0], env.alu)
            s[y_min].pragma(s[y_min].op.axis[0], env.alu)
            s[y].pragma(s[y].op.axis[0], env.dma_copy)
            # tensorization
            s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1],
                             s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki)
            s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
            verify(s)

        def test_smt():
            # test smt schedule
            s = tvm.create_schedule(y.op)
            s[x_buf].set_scope(env.inp_scope)
            s[w_buf].set_scope(env.wgt_scope)
            s[y_gem].set_scope(env.acc_scope)
            s[y_shf].set_scope(env.acc_scope)
            s[y_max].set_scope(env.acc_scope)
            s[y_min].set_scope(env.acc_scope)
            abo, aco, abi, aci = s[y].op.axis
            abo1, abo2 = s[y].split(abo, nparts=2)
            s[y].bind(abo1, tvm.thread_axis("cthread"))
            s[y_gem].compute_at(s[y], abo1)
            s[y_shf].compute_at(s[y], abo1)
            s[y_max].compute_at(s[y], abo1)
            s[y_min].compute_at(s[y], abo1)
            s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1],
                             s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki)
            s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
            s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
            s[y_max].pragma(s[y_max].op.axis[0], env.alu)
            s[y_min].pragma(s[y_min].op.axis[0], env.alu)
            s[x_buf].compute_at(s[y_gem], ko)
            s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
            s[w_buf].compute_at(s[y_gem], ko)
            s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
            s[y].pragma(abo2, env.dma_copy)
            verify(s)

        test_schedule1()
        test_smt()
Example 33
def _decl_cl_spatialpack(data,
                         kernel,
                         stride,
                         padding,
                         layout,
                         out_dtype='float16'):
    batch, in_channel, in_height, in_width = [
        util.get_const_int(x) for x in data.shape
    ]
    num_filter, channel, kernel_h, kernel_w = [
        util.get_const_int(x) for x in kernel.shape
    ]
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        stride_h, stride_w = stride
    else:
        stride_h, stride_w = stride, stride

    out_channel = num_filter
    out_height = simplify((in_height - kernel_h + pad_top + pad_down) //
                          stride_h + 1)
    out_width = simplify((in_width - kernel_w + pad_left + pad_right) //
                         stride_w + 1)
    oshape = (batch, out_channel, out_height, out_width)
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(data, pad_before, pad_after, name="pad_temp")

    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    block_w = 0
    block_h = 0
    if stride_h == 2:
        if num_filter + kernel_h == 515:
            conv_tag = "4_4"
            block_h = 4
            block_w = 4
        else:
            conv_tag = "4_5"
            block_h = 4
            block_w = 5
    elif kernel_h == 3:
        if num_filter == 512:
            conv_tag = "2_7"
            block_h = 2
            block_w = 7
        else:
            conv_tag = "2_14"
            block_h = 2
            block_w = 14
    else:
        conv_tag = "1_16"
        block_h = 1
        block_w = 16

    c_h = out_height
    c_w = out_width

    if not out_height % block_h == 0:
        c_h = (out_height // block_h + 1) * block_h

    if not out_width % block_w == 0:
        c_w = (out_width // block_w + 1) * block_w

    nv = 16
    cshape = (batch, out_channel // nv, c_h, c_w, nv)
    kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv)

    kernel_vec = tvm.compute(
        kvshape,
        lambda co, ci, kh, kw, vc: kernel[co * nv + vc][ci][kh][kw],
        name='kernel_vec')

    conv = tvm.compute(
        cshape,
        lambda nn, ff, yy, xx, vc:\
          tvm.sum(
              temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
              kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
              axis=[rc, ry, rx]), tag=conv_tag, name='conv')

    output = tvm.compute(
        oshape,
        lambda nn, ff, yy, xx: conv[nn][ff // nv][yy][xx][ff % nv],
        name='output_unpack',
        tag=conv_tag)

    return output
Example 34
def dot_16x1x16_int8_int8_int32():
    """
    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
    This function takes two arrays of int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                out[i] = 0;
                for (int k = 0; k < 4; k++){
                    out[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = 16  # 16 int32 lanes in AVX512
    num_int8_elements = 4  # 4 int8 elements in int32
    data = tvm.placeholder((num_int8_elements, ), dtype='uint8', name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements),
                             dtype='int8',
                             name='kernel')
    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute(
        (int32_lanes, ),
        lambda i: tvm.sum(
            data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k),
        name="C")

    a_buffer = tvm.decl_buffer(data.shape,
                               dtype='uint8',
                               name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape,
                               dtype='int8',
                               name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x16')
            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], "int8x64")
            vec_one = tvm.const(1, "int16x32")
            pair_reduction = tvm.call_llvm_intrin(
                'int16x32', 'llvm.x86.avx512.pmaddubs.w.512',
                tvm.const(0, 'uint32'), vec_a, vec_b)
            quad_reduction = tvm.call_llvm_intrin(
                'int32x16', 'llvm.x86.avx512.pmaddw.d.512',
                tvm.const(0, 'uint32'), pair_reduction, vec_one)
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(
                    0, quad_reduction + outs[0].vload([0], 'int32x16')))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op,
                                      _intrin_func,
                                      binds={
                                          data: a_buffer,
                                          kernel: b_buffer
                                      })
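A hedged usage sketch: the intrinsic produces 16 int32 lanes from a 4-element reduction, so a matching schedule splits the output-channel axis by 16 and the reduction by 4 before tensorizing (the tensor and axis names below are illustrative):

oco, oci = s[C].split(oc, factor=16)
ko, ki = s[C].split(k, factor=4)
s[C].reorder(oco, ko, oci, ki)
s[C].tensorize(oci, dot_16x1x16_int8_int8_int32())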
Example 35
File: conv2d.py Project: zhiqiu/tvm
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    idxd = tvm.indexdiv
    idxm = tvm.indexmod

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute((C, idxd(P, VP), alpha, alpha, VP),
                             lambda c, b, eps, nu, bb:
                             data_pad[idxd(b*VP + bb, nH*nW), c,
                                      idxm(idxd(b*VP + bb, nW), nH) * m + eps,
                                      idxm(b*VP + bb, nW) * m + nu],
                             name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk:
                        tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
                                G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')

    # transform image
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha, alpha, idxd(P, VP), C, VP), lambda eps, nu, b, c, bb:
                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) *
                            B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b:
                    tvm.sum(U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] *
                            V[eps][nu][idxd(b, VP)][c][idxm(b, VP)], axis=c), name='M')

    # inverse transform
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
                            axis=[r_eps, r_nu]), name='Y')

    # unpack output
    output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                         Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m),
                              idxm(h, m), idxm(w, m)],
                         name='output', tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
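# For intuition, the tile arithmetic used above works out as follows
# (illustrative numbers only, not taken from the original source):
m, r = 2, 3
alpha = m + r - 1                              # 4: each tile reads a 4x4 input patch
H = W = 14
nH, nW = (H + m - 1) // m, (W + m - 1) // m    # 7 x 7 tiles
P = 1 * nH * nW                                # 49 tiles for batch size 1
assert (alpha, nH, nW, P) == (4, 7, 7, 49)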
Example No. 36
A0 = tvm.placeholder((n, ), name='A0', dtype='float32')
A1 = tvm.placeholder((n, ), name='A1', dtype='float32')
A2 = tvm.placeholder((n, ), name='A2', dtype='float32')

B0 = tvm.placeholder((n, ), name='B0', dtype='float32')
B1 = tvm.placeholder((n, ), name='B1', dtype='float32')
B2 = tvm.placeholder((n, ), name='B2', dtype='float32')

D = tvm.placeholder((n, ), name='D', dtype='float32')

# Reduction axis over the second point set; this definition is missing from
# the original snippet but is required by D_ij and the sum below.
j = tvm.reduce_axis((0, n), name='j')

D_ij = lambda i : (A0[i] - B0[j]) * (B0[j] - A0[i]) \
                + (A1[i] - B1[j]) * (B1[j] - A1[i]) \
                + (A2[i] - B2[j]) * (B2[j] - A2[i])
K_ij = lambda i: tvm.call_pure_extern("float32", "__expf", D_ij(i))
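# Note: D_ij(i) equals -((A0[i]-B0[j])**2 + (A1[i]-B1[j])**2 + (A2[i]-B2[j])**2),
# the negative squared distance between points a_i and b_j, so K_ij(i) is the
# Gaussian (RBF) kernel exp(-||a_i - b_j||^2).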

C0 = tvm.compute((n, ), lambda i: tvm.sum(K_ij(i) * D[j], axis=j), name="C0")

# Scheduled the computation
s0 = tvm.create_schedule(C0.op)
bx, tx = s0[C0].split(C0.op.axis[0], factor=192)
s0[C0].bind(bx, tvm.thread_axis("blockIdx.x"))
s0[C0].bind(tx, tvm.thread_axis("threadIdx.x"))

# Actually build the binary
fconv0 = tvm.build(s0, [A0, A1, A2, B0, B1, B2, D, C0],
                   tgt,
                   target_host=tgt_host,
                   name="myconv0")

# Benchmark
nits = 10
Example No. 37
    a_buf = tvm.compute(
        shape1_tiled,
        lambda ico, no, ni, ici: a[no * gemm_shape[0] + ni, ico * factor + ici],
        'a_buf')
    b_buf = tvm.compute(
        shape2_tiled,
        lambda ico, oco, oci, ici: b[oco * gemm_shape[2] + oci, ico * factor + ici],
        'b_buf')

    out_shape_tiled = (shape1_tiled[1], shape2_tiled[1], shape1_tiled[2],
                       shape2_tiled[2])
    ko = tvm.reduce_axis((0, shape1[1] // factor), 'ko')
    ki = tvm.reduce_axis((0, factor), 'ki')
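    # ko iterates over tiles of the reduction dimension; ki iterates inside
    # one tile of width `factor`.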

    out_buf = tvm.compute(
        out_shape_tiled,
        lambda xo, yo, xi, yi: tvm.sum(a_buf[ko, xo, xi, ki].astype(dtype_w) *
                                       b_buf[ko, yo, yi, ki].astype(dtype_w),
                                       axis=[ko, ki]), 'out_buf')
    out_acc = out_buf
    # nnpu.utils.MarkScope(out_acc, 'acc')
    # out_buf = tvm.compute(out_shape_tiled, lambda *i: out_acc(*i), 'out_host')
    # nnpu.utils.MarkScope(out_buf)
    out_host = tvm.compute(out_shape_tiled, lambda *i: out_buf(*i), 'out_host')

    # schedule
    s = nnpu.create_schedule(out_host.op)
    # al = s.cache_read(a_buf, env.get_scope('buffer1'), out_acc)
    # bl = s.cache_read(b_buf, env.get_scope('buffer2'), out_acc)
    al = a_buf
    bl = b_buf

    a_buffer_scope = 'buffer1'
Example No. 38
import os

import tvm

# Print the PID and pause so a debugger or profiler can be attached.
input(os.getpid())
tgt_host = "c"
device = "dpu"

M = tvm.var("M")
K = tvm.var("K")
N = tvm.var("N")

A = tvm.placeholder((M, K), name='A', dtype='float32')
B = tvm.placeholder((K, N), name='B', dtype='float32')

k = tvm.reduce_axis((0, K), 'k')
C = tvm.compute((M, N),
                lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                name='C')

s = tvm.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target_host=tgt_host, name='DPUGemm')

#print("------------------DPU_LOWER code---------------------")
#print(tvm.lower(s, [A, B, C], simple_mode=True))
"""
scale = 4
num_thread = 8
block_factor = scale * num_thread

block_x = tvm.thread_axis("blockIdx.x")
thread_x = tvm.thread_axis( "threadIdx.x")
block_y = tvm.thread_axis("blockIdx.y")
Example No. 39
def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2):
    """Perform the across channels local response normalisation
    on the input data.

    sum_sqr_up^i{x, y} = (bias + (alpha/size) *
                          sum_{j=max(0, i-size/2)}^{min(N-1, i+size/2)} (data^j{x, y})^2)^beta
    output^i{x, y} = data^i{x, y} / sum_sqr_up^i{x, y}
    where N is the number of input channels.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, height, width]

    size : int
        normalisation window size

    axis : int
        input data layout channel axis
        default value is 1 for NCHW format

    bias : float
        offset to avoid dividing by 0

    alpha : float
        scaling factor applied to the squared sum

    beta : float
        exponent

    Returns
    -------
    output : tvm.Tensor
        4-D output with same shape
    """
    assert len(data.shape) == 4, "only support 4-dim lrn"
    assert (size % 2) == 1, "size should be odd number"
    assert (axis == 1) or (axis == 3), "axis should be 1 or 3 for NCHW and NHWC"
    # Pad the channel axis by size // 2 on each side first.
    pad_before = [0, 0, 0, 0]
    pad_after = [0, 0, 0, 0]
    pad_after[axis] = pad_before[axis] = (size // 2)
    pad_data = pad(data, pad_before, pad_after, name="pad_data")

    rxs = tvm.reduce_axis((0, size), name='rxs')
    if axis == 1:
        #NCHW layout
        sqr_sum = tvm.compute(
            data.shape, lambda i, j, k, l: tvm.sum(pad_data[i, j + rxs, k, l] *
                                                   pad_data[i, j + rxs, k, l],
                                                   axis=rxs))
    elif axis == 3:
        #NHWC layout
        sqr_sum = tvm.compute(
            data.shape, lambda i, j, k, l: tvm.sum(pad_data[i, j, k, l + rxs] *
                                                   pad_data[i, j, k, l + rxs],
                                                   axis=rxs))

    sqr_sum_up = tvm.compute(
        data.shape, lambda i, j, k, l: tvm.power(
            (bias + (alpha * sqr_sum[i, j, k, l] / size)), beta))

    return topi.broadcast_div(data, sqr_sum_up)
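# A minimal usage sketch for lrn() above (hypothetical shapes; assumes the
# same tvm/topi imports as the surrounding snippets):
data = tvm.placeholder((1, 64, 56, 56), name='data')
out = lrn(data, size=5, axis=1, alpha=1e-4, beta=0.75, bias=2)
s = tvm.create_schedule(out.op)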
Example No. 40
def _depthwise_conv2d_NCHWc_cpu(cfg,
                                data,
                                kernel,
                                strides,
                                padding,
                                dilation,
                                layout,
                                out_layout,
                                out_dtype=None):
    out_dtype = data.dtype if out_dtype is None else out_dtype
    batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(
        data.shape)
    out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \
        = get_const_tuple(kernel.shape)

    strides = strides if isinstance(strides,
                                    (tuple, list)) else (strides, strides)
    HSTR, WSTR = strides
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (filter_height, filter_width))

    dh, dw = dilation if isinstance(dilation,
                                    (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "Does not support dilation"

    in_channel = in_channel_chunk * in_channel_block
    out_channel = out_channel_chunk * out_channel_block
    channel_multiplier = out_channel // in_channel

    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1

    # get workload and related schedule config
    wkl = _get_workload(
        tvm.placeholder((batch, in_channel, in_height, in_width),
                        dtype=data.dtype),
        tvm.placeholder((out_channel, in_channel, filter_height, filter_width),
                        dtype=kernel.dtype), strides, padding, out_dtype)
    if cfg.is_fallback:
        _fallback_schedule(cfg, wkl)

    # padding stage
    DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
    if DOPAD:
        pad_before = [0, 0, pad_top, pad_left, 0]
        pad_after = [0, 0, pad_down, pad_right, 0]
        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
    else:
        data_pad = data

    # depthconv stage
    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod

    kh = tvm.reduce_axis((0, filter_height), name='kh')
    kw = tvm.reduce_axis((0, filter_width), name='kw')
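    # Each output channel oc = oco * out_channel_block + oci reads input
    # channel oc // channel_multiplier; idxdiv/idxmod below split that input
    # channel back into its NCHWc chunk and block indices.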
    Output = tvm.compute(
        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
        lambda b, oco, oh, ow, oci: tvm.sum((data_pad[
            b,
            idxdiv(idxdiv(oco * out_channel_block + oci, channel_multiplier),
                   in_channel_block), oh * HSTR + kh, ow * WSTR + kw,
            idxmod(idxdiv(oco * out_channel_block + oci, channel_multiplier),
                   in_channel_block)].astype(out_dtype) * kernel[
                       oco, 0, kh, kw, 0, oci].astype(out_dtype)),
                                            axis=[kh, kw]),
        name='DepthwiseConv2d',
        tag="depthwise_conv2d_NCHWc")
    return Output
Example No. 41
def test_vectorize_commreduce():
    V = tvm.placeholder((128, ), name='V')
    ax = tvm.reduce_axis((0, 128), name='ax')
    O = tvm.compute((1, ), lambda _: tvm.sum(V[ax], axis=[ax]))
    s = tvm.create_schedule(O.op)
    s[O].vectorize(ax)  # vectorizing a reduce axis is illegal, so this should throw
Example No. 42
def intrin_libxsmm_tuned(ofmblock, ofw, ifmblock, stride_width, ifw, rco, ifh,
                         r, s, ifh_stride, ifw_stride, in_channel):
    last_input_width_index = (ofw - 1) * stride_width + s - 1
    A = tvm.placeholder((rco, r, s, ifmblock, ofmblock), name='w')
    B = tvm.placeholder((rco, r, last_input_width_index + 1, ifmblock),
                        name='b')
    k = tvm.reduce_axis((0, ifmblock), name='k')
    k_outer = tvm.reduce_axis((0, rco), name='k_outer')
    ry = tvm.reduce_axis((0, r), name='ry')
    rx = tvm.reduce_axis((0, s), name='rx')
    C = tvm.compute((ofw, ofmblock),
                    lambda m, n: tvm.sum(A[k_outer, ry, rx, k, n] * B[
                        k_outer, ry, rx + m * stride_width, k],
                                         axis=[k_outer, ry, rx, k]),
                    name='out')
    s1 = tvm.create_schedule(C.op)
    w, ofm = s1[C].op.axis
    kco, ky, kx, kci = s1[C].op.reduce_axis
    s1[C].reorder(kco, ky, kx, w, ofm, kci)
    xx_ptr = tvm.decl_buffer(A.shape,
                             A.dtype,
                             name="W",
                             offset_factor=1,
                             data_alignment=64)

    yy_ptr = tvm.decl_buffer(
        B.shape,
        B.dtype,
        name="some",
        offset_factor=1,
        strides=[tvm.var("s3"), tvm.var("s2"), ifmblock, 1],
        data_alignment=64)

    zz_ptr = tvm.decl_buffer(C.shape,
                             C.dtype,
                             name="OUT",
                             offset_factor=1,
                             data_alignment=64)

    def intrin_func(ins, outs):
        # tvm call extern is used to interface to libxsmm batch reduce kernel gemm implementation
        # rco*r*s is the number of batches
        init_and_compute = tvm.call_extern(
            "int32", "batch_reduce_kernel_init_update",
            ins[0].access_ptr("r"), ins[1].access_ptr("r"),
            outs[0].access_ptr("w"), rco * r * s, ofmblock, ifmblock,
            r, s, ifh_stride, ifw_stride, ofw, stride_width)
        reset = tvm.call_extern("int32", "batch_reduce_kernel_init",
                                outs[0].access_ptr("w"), ofmblock, ofw)
        body = tvm.call_extern(
            "int32", "batch_reduce_kernel_update",
            ins[0].access_ptr("r"), ins[1].access_ptr("r"),
            outs[0].access_ptr("w"), rco * r * s, ofmblock, ifmblock,
            ofw, stride_width, r, s, ifh_stride, ifw_stride)
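        # If rco already covers the whole input-channel dimension, the reduction
        # is never split across calls, so the fused init+update kernel can serve
        # as both body and update and no separate reset is required.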
        if math.ceil(in_channel / ifmblock) == rco:
            return init_and_compute, None, init_and_compute
        else:
            return init_and_compute, reset, body

    with tvm.build_config(data_alignment=64):
        return tvm.decl_tensor_intrin(C.op,
                                      intrin_func,
                                      name="GEMM",
                                      binds={
                                          A: xx_ptr,
                                          B: yy_ptr,
                                          C: zz_ptr
                                      })
Example No. 43
def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
    """Depthwise convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype: str, optional
        Output data type

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = Input.shape
    # shape of dilated kernel
    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape

    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # padding stage
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    # depthconv stage
    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod
    di = tvm.reduce_axis((0, filter_height), name='di')
    dj = tvm.reduce_axis((0, filter_width), name='dj')
    Output = tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda b, c, i, j: tvm.sum(
            (PaddedInput[b, idxdiv(c, channel_multiplier), i*stride_h+di*dilation_h,
                         j*stride_w+dj*dilation_w].astype(out_dtype) *
             Filter[idxdiv(c, channel_multiplier),
                    idxmod(c, channel_multiplier), di, dj].astype(out_dtype)),
            axis=[di, dj]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_nchw")
    return Output
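# Quick sanity check of the shape arithmetic above (illustrative numbers,
# not from the original source): a 32x32 input with a 3x3 filter, dilation 2,
# stride 1 and padding 2 keeps the spatial size.
dilated_kernel_h = (3 - 1) * 2 + 1              # 5
out_height = (32 - dilated_kernel_h + 2 + 2) // 1 + 1
assert (dilated_kernel_h, out_height) == (5, 32)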
Example No. 44
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout,
                   out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    ##### space definition begin #####
    tile_bna_candidates = [1, 2, 4, 8, 16]
    factors = get_factors(CO)
    cfg.define_knob('tile_bna',
                    [x for x in tile_bna_candidates if x in factors])
    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
    ##### space definition end #####

    if cfg.is_fallback:
        cfg['tile_bnb'].val = 4
        cfg['tile_bna'].val = 4
        while CO % cfg['tile_bna'].val != 0:
            cfg['tile_bna'].val //= 2
        cfg['yt'].val = 8
        cfg.fallback_split('tile_t1', [-1, 128])
        cfg.fallback_split('tile_t2', [-1, 128])
        cfg.fallback_split('c_unroll', [-1, 8])

    bna = cfg['tile_bna'].val
    bnb = cfg['tile_bnb'].val

    P_round = (P + bnb - 1) // bnb * bnb
    assert CO % bna == 0 and P_round % bnb == 0

    # pack input tile
    input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
         tvm.if_then_else(
             b * bnb + bb < P,
             data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
             [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute(
            (alpha, alpha, CO // bna, CI, bna),
            lambda eps, nu, co, ci, vco: tvm.sum(kernel[co * bna + vco][ci][
                r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                                                 axis=[r_kh, r_kw]),
            name='U')

    # transform image
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb),
                    lambda eps, nu, p, ci, vp: tvm.sum(input_tile[ci][p][r_a][
                        r_b][vp] * B[r_a][eps] * B[r_b][nu],
                                                       axis=[r_a, r_b]),
                    name='V')

    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod

    # batch gemm
    ci = tvm.reduce_axis((0, CI), name='c')
    M = tvm.compute(
        (alpha, alpha, CO, P_round),
        lambda eps, nu, co, p: tvm.sum(U[eps][nu][idxdiv(co, bna)][ci][idxmod(
            co, bna)] * V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)],
                                       axis=ci),
        name='M')

    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    Y = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='Y')

    # unpack output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w: Y[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(
            w, m),
                              idxmod(h, m),
                              idxmod(w, m)]
        # The following hack term is used to make the padding in batch gemm ("M")
        # effective, otherwise the padding will be eliminated by bound inference.
        # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding.
        + tvm.expr.Mul(tvm.const(0, out_dtype), M[alpha - 1][alpha - 1][CO - 1]
                       [P_round - 1]),
        name='output',
        tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
    return output
Example No. 45
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout,
                  out_dtype, pre_computed):
    """Compute declaration for winograd"""
    assert layout == 'NCHW'

    tile_size = _infer_tile_size(data, kernel)

    N, CI, H, W = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation
    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides

    if not pre_computed:  # kernel tensor is raw tensor, do strict check
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        CO, CI, KH, KW = get_const_tuple(kernel.shape)
        alpha = KW + tile_size - 1
        assert HSTR == 1 and WSTR == 1 and KH == KW
    else:
        # kernel tensor is pre-transformed. this op is created by alter op layout.
        # dilation is not supported
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        KH = KW = alpha + 1 - tile_size
        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1

    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")

    r = KW
    m = tile_size
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (H + pt + pb - KH) // HSTR + 1
    W = (W + pl + pr - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # transform kernel
    if not pre_computed:
        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
        kernel_pack = tvm.compute(
            (alpha, alpha, CI, CO),
            lambda eps, nu, ci, co: tvm.sum(kernel[co][ci][r_kh][r_kw] * G[eps]
                                            [r_kh] * G[nu][r_kw],
                                            axis=[r_kh, r_kw]),
            name='kernel_pack')
    else:
        kernel_pack = kernel

    idxdiv = tvm.indexdiv
    idxmod = tvm.indexmod
    # pack input tile
    input_tile = tvm.compute(
        (CI, P, alpha, alpha),
        lambda c, p, eps, nu: data_pad[idxdiv(p, (nH * nW))][c][idxmod(
            idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu],
        name='d')

    # transform data
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    data_pack = tvm.compute((alpha, alpha, CI, P),
                            lambda eps, nu, ci, p: tvm.sum(input_tile[ci][p][
                                r_a][r_b] * B[r_a][eps] * B[r_b][nu],
                                                           axis=[r_a, r_b]),
                            name='data_pack')

    # do batch gemm
    ci = tvm.reduce_axis((0, CI), name='ci')
    bgemm = tvm.compute((alpha, alpha, CO, P),
                        lambda eps, nu, co, p: tvm.sum(kernel_pack[eps][nu][
                            ci][co] * data_pack[eps][nu][ci][p],
                                                       axis=[ci]),
                        name='bgemm')

    # inverse transform
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    inverse = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='inverse')

    # output
    output = tvm.compute((N, CO, H, W),
                         lambda n, co, h, w: inverse[co, n * nH * nW + idxdiv(
                             h, m) * nW + idxdiv(w, m),
                                                     idxmod(h, m),
                                                     idxmod(w, m)],
                         name='output',
                         tag='conv2d_nchw_winograd')
    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)

    return output
Example No. 46
                               'N = ' + str(N) + '\n'
                               'dtype = "float32"\n'
                               'a = numpy.random.rand(M, K).astype(dtype)\n'
                               'b = numpy.random.rand(K, N).astype(dtype)\n',
                               stmt='answer = numpy.dot(a, b)',
                               number=np_repeat)
print("Numpy running time: %f" % (np_runing_time / np_repeat))

answer = numpy.dot(a.asnumpy(), b.asnumpy())

# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                name='C')

# Default schedule
s = tvm.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name='mmult')
assert func

c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
func(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
print('Baseline: %f' % evaluator(a, b, c).mean)

################################################################################################
Example No. 47
# Input placeholder tensors
data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
weight = tvm.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype)

# Copy buffers
data_buf = tvm.compute(data_shape,
                       lambda *i: data(*i),
                       "data_buf")
weight_buf = tvm.compute(weight_shape,
                         lambda *i: weight(*i),
                         "weight_buf")

# Reduction axes for the GEMM below (their definitions are elided in this
# snippet; the ranges assume the VTA tutorial's tiled data layout
# (batch//BATCH, channels//BLOCK_IN, BATCH, BLOCK_IN)).
ic = tvm.reduce_axis((0, data_shape[1]), name="ic")
ic_tns = tvm.reduce_axis((0, data_shape[3]), name="ic_tns")

# Declare matrix multiply computation
res_gemm = tvm.compute(output_shape,
                       lambda bo, co, bi, ci: tvm.sum(
                            data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) *
                            weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype),
                            axis=[ic, ic_tns]),
                       name="res_gem")

# Add shift stage for fixed-point normalization
res_shr = tvm.compute(output_shape,
                      lambda *i: res_gemm(*i) >> env.INP_WIDTH,
                      name="res_shr")

# Apply clipping between (0, input max value)
inp_max = (1<<(env.INP_WIDTH-1))-1
res_max = tvm.compute(output_shape,
                      lambda *i: tvm.max(res_shr(*i), 0),
                      "res_max")
res_min = tvm.compute(output_shape,
                      lambda *i: tvm.min(res_max(*i), inp_max),
                      "res_min")
Example No. 48
def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout,
                      out_dtype):
    """Convolution operator in NCHW[x]c layout for int8.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width] or
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
        filter_width, num_filter_block, in_channel_block]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding: int or a list/tuple of two ints
        padding size, or [pad_height, pad_width]

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        layout of data

    out_dtype : str
        The output type. This is used for mixed precision.

    Returns
    -------
    output : tvm.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """
    assert layout in ["NCHW", "NCHW4c"]
    ic_block_factor = 4
    oc_block_factor = 4
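    # Channels are packed in blocks of 4 so the innermost int8 reduction maps
    # to 4-way dot-product hardware (e.g. CUDA's dp4a instruction).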

    pre_computed = len(kernel.shape) == 6
    if not pre_computed:
        batch, channels, height, width = get_const_tuple(data.shape)
        assert channels % ic_block_factor == 0, \
            "Number of input channels should be multiple of {}".format(
                ic_block_factor)
        packed_data = tvm.compute(
            (batch, channels // ic_block_factor, height, width,
             ic_block_factor),
            lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w],
            name="packed_data")

        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(
            kernel.shape)
        assert out_channels % 4 == 0, \
            "Number of output channels should be multiple of {}".format(
                oc_block_factor)
        packed_kernel = tvm.compute(
            (out_channels // oc_block_factor, in_channels // ic_block_factor,
             kernel_h, kernel_w, oc_block_factor, ic_block_factor),
            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[
                oc_chunk * oc_block_factor + oc_block, ic_chunk *
                ic_block_factor + ic_block, kh, kw],
            name="packed_kernel")

    else:
        packed_data = data
        packed_kernel = kernel

    batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(
        packed_data.shape)
    oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
        packed_kernel.shape)

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_h, kernel_w))
    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")

    # compute the output shape
    out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top +
                  pad_down) // stride_h + 1
    out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left +
                 pad_right) // stride_w + 1

    oshape = (batch, oc_chunk, out_height, out_width, oc_block)

    icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk')
    icb = tvm.reduce_axis((0, ic_block), name='ic_block')
    kh = tvm.reduce_axis((0, kernel_h), name='kh')
    kw = tvm.reduce_axis((0, kernel_w), name='kw')

    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                       tvm.sum(pad_data[n, icc, oh*stride_h+kh*dilation_h, \
                               ow*stride_w+kw*dilation_w, icb]
                               .astype('int32') *
                               packed_kernel[oc_chunk, icc,
                                             kh, kw, oc_block, icb]
                               .astype('int32'),
                               axis=[icc, kh, kw, icb]))

    output = tvm.compute(oshape,
                         lambda n, oc_chunk, oh, ow, oc_block: conv[
                             n, oc_chunk, oh, ow, oc_block].astype(out_dtype),
                         tag="conv2d_NCHWc_int8")

    # num flop
    num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
        ic_chunk * ic_block * kernel_h * kernel_w * 2
    cfg.add_flop(num_flop)

    return output
Example No. 49
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout,
                  out_dtype, pre_computed):
    """Compute declaration for winograd"""
    assert layout == 'NCHW'

    tile_size = _infer_tile_size(data, kernel)

    N, CI, H, W = get_const_tuple(data.shape)

    if not pre_computed:  # kernel tensor is raw tensor, do strict check
        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))

        CO, CI, KH, KW = get_const_tuple(kernel.shape)
        HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
        HSTR, WSTR = (strides,
                      strides) if isinstance(strides, int) else strides
        assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
    else:  # kernel tensor is pre-transformed. this op is created by
        # alter op layout, do not check
        # dilation is not supported
        HSTR = WSTR = 1
        HPAD = WPAD = 1
        KH = KW = 3
        _, _, CI, CO = get_const_tuple(kernel.shape)

    data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD),
                      name="data_pad")

    if tile_size == 4:
        G_data = np.array(
            [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0],
             [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0],
             [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]],
            dtype=np.float32)

        B_data = np.array(
            [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0],
             [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]],
            out_dtype)

        A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1],
                           [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]],
                          out_dtype)
    elif tile_size == 2:
        G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2],
                           [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]],
                          np.float32)

        B_data = np.array(
            [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]],
            out_dtype)

        A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " +
                         str(tile_size))
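
    # G_data, B_data and A_data above are the standard Winograd transform
    # matrices (kernel, data and inverse transforms) for F(4x4, 3x3) and
    # F(2x2, 3x3) respectively.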

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    H = (H + 2 * HPAD - KH) // HSTR + 1
    W = (W + 2 * WPAD - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # transform kernel
    if not pre_computed:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
        kernel_pack = tvm.compute(
            (alpha, alpha, CI, CO),
            lambda eps, nu, ci, co: tvm.sum(kernel[co][ci][r_kh][r_kw] * G[eps]
                                            [r_kh] * G[nu][r_kw],
                                            axis=[r_kh, r_kw]),
            name='kernel_pack')
    else:
        kernel_pack = kernel

    # pack input tile
    input_tile = tvm.compute((CI, P, alpha, alpha),
                             lambda c, p, eps, nu: data_pad[p // (nH * nW)][c][
                                 p // nW % nH * m + eps][p % nW * m + nu],
                             name='d')

    # transform data
    B = const_matrix(B_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    data_pack = tvm.compute((alpha, alpha, CI, P),
                            lambda eps, nu, ci, p: tvm.sum(input_tile[ci][p][
                                r_a][r_b] * B[r_a][eps] * B[r_b][nu],
                                                           axis=[r_a, r_b]),
                            name='data_pack')

    # do batch gemm
    ci = tvm.reduce_axis((0, CI), name='ci')
    bgemm = tvm.compute((alpha, alpha, CO, P),
                        lambda eps, nu, co, p: tvm.sum(kernel_pack[eps][nu][
                            ci][co] * data_pack[eps][nu][ci][p],
                                                       axis=[ci]),
                        name='bgemm')

    # inverse transform
    A = const_matrix(A_data)
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    inverse = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='inverse')

    # output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w: inverse[co][n * nH * nW +
                                        (h // m) * nW + w // m][h % m][w % m],
        name='output',
        tag='conv2d_nchw_winograd')
    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)

    return output
Example No. 50
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout,
                   out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array(
            [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0],
             [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0],
             [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]],
            dtype=np.float32)

        B_data = np.array(
            [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0],
             [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]],
            out_dtype)

        A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1],
                           [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]],
                          out_dtype)
    elif tile_size == 2:
        G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2],
                           [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]],
                          np.float32)

        B_data = np.array(
            [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]],
            out_dtype)

        A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " +
                         str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    cfg.define_split('tile_p',
                     cfg.axis(P),
                     num_outputs=2,
                     filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k',
                     cfg.axis(K),
                     num_outputs=2,
                     filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute(
        (C, P // VP, alpha, alpha, VP),
        lambda c, b, eps, nu, bb: data_pad[(b * VP + bb) // (nH * nW)][c][
            (b * VP + bb) // nW % nH * m + eps][(b * VP + bb) % nW * m + nu],
        name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute(
            (alpha, alpha, K // VK, C, VK),
            lambda eps, nu, k, c, kk: tvm.sum(kernel[k * VK + kk][c][r_kh][
                r_kw].astype(out_dtype) * G[eps][r_kh] * G[nu][r_kw],
                                              axis=[r_kh, r_kw]),
            name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute(
        (alpha, alpha, P // VP, C, VP),
        lambda eps, nu, b, c, bb: tvm.sum(input_tile[c][b][r_eps][r_nu][
            bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu],
                                          axis=[r_eps, r_nu]),
        name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P),
                    lambda eps, nu, k, b: tvm.sum(U[eps][nu][k // VK][c][
                        k % VK] * V[eps][nu][b // VP][c][b % VP],
                                                  axis=c),
                    name='M')

    # inverse transform
    A = const_matrix(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m),
                    lambda k, b, vh, vw: tvm.sum(M[r_eps][r_nu][k][b] * A[
                        r_eps][vh] * A[r_nu][vw],
                                                 axis=[r_eps, r_nu]),
                    name='Y')

    # unpack output
    output = tvm.compute(
        (N, K, H, W),
        lambda n, k, h, w: Y[k][n * nH * nW +
                                (h // m) * nW + w // m][h % m][w % m],
        name='output',
        tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
Example No. 51
def gemm(env, mock=False):
    """Matrix-matrix multiply intrinsic

    Parameters
    ----------
    env : Environment
        The Environment

    mock : bool
        Whether to create a mock version.
    """
    wgt_lanes = env.WGT_ELEM_BITS // env.WGT_WIDTH
    assert wgt_lanes == env.BLOCK_OUT * env.BLOCK_IN
    wgt_shape = (env.BLOCK_OUT, env.BLOCK_IN)
    assert wgt_shape[0] * wgt_shape[1] == wgt_lanes

    inp_lanes = env.INP_ELEM_BITS // env.INP_WIDTH
    assert inp_lanes == env.BATCH * env.BLOCK_IN
    inp_shape = (env.BATCH, env.BLOCK_IN)
    assert inp_shape[0] * inp_shape[1] == inp_lanes

    out_lanes = env.ACC_ELEM_BITS // env.ACC_WIDTH
    assert out_lanes == env.BATCH * env.BLOCK_OUT
    out_shape = (env.BATCH, env.BLOCK_OUT)
    assert out_shape[0] * out_shape[1] == out_lanes

    wgt = tvm.placeholder((wgt_shape[0], wgt_shape[1]),
                          dtype="int%d" % env.WGT_WIDTH,
                          name=env.wgt_scope)
    inp = tvm.placeholder((inp_shape[0], inp_shape[1]),
                          dtype="int%d" % env.INP_WIDTH,
                          name=env.inp_scope)
    k = tvm.reduce_axis((0, wgt_shape[1]), name="k")
    out_dtype = "int%d" % env.ACC_WIDTH
    out = tvm.compute((out_shape[0], out_shape[1]),
                      lambda i, j: tvm.sum(inp[i, k].astype(out_dtype) * wgt[
                          j, k].astype(out_dtype),
                                           axis=[k]),
                      name="out")
    wgt_layout = tvm.decl_buffer(wgt.shape,
                                 wgt.dtype,
                                 env.wgt_scope,
                                 scope=env.wgt_scope,
                                 offset_factor=wgt_lanes,
                                 data_alignment=wgt_lanes)
    inp_layout = tvm.decl_buffer(inp.shape,
                                 inp.dtype,
                                 env.inp_scope,
                                 scope=env.inp_scope,
                                 offset_factor=inp_lanes,
                                 data_alignment=inp_lanes)
    out_layout = tvm.decl_buffer(out.shape,
                                 out.dtype,
                                 env.acc_scope,
                                 scope=env.acc_scope,
                                 offset_factor=out_lanes,
                                 data_alignment=out_lanes)

    def intrin_func(ins, outs):
        """Matrix-matrix multiply intrinsic function"""
        dinp, dwgt = ins
        dout = outs[0]

        def instr(index):
            """Generate matrix-matrix multiply VTA instruction"""
            irb = tvm.ir_builder.create()
            dev = env.dev
            irb.scope_attr(dev.vta_axis, "coproc_scope",
                           dev.get_task_qid(dev.QID_COMPUTE))
            irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop)
            if index in (0, 2):
                irb.emit(
                    tvm.call_extern("int32", "VTAUopPush", 0, 0,
                                    dout.access_ptr("rw", "int32"),
                                    dinp.access_ptr("r", "int32"),
                                    dwgt.access_ptr("r", "int32"), 0, 0, 0))
            else:
                irb.emit(
                    tvm.call_extern("int32", "VTAUopPush", 0, 1,
                                    dout.access_ptr("rw", "int32"), 0, 0, 0, 0,
                                    0))
            return irb.get()

        # return a triple of normal-set, reset, update
        nop = tvm.make.Evaluate(0)
        if mock:
            return (nop, nop, nop)
        return (instr(0), instr(1), instr(2))

    return tvm.decl_tensor_intrin(out.op,
                                  intrin_func,
                                  name="GEMM",
                                  binds={
                                      inp: inp_layout,
                                      wgt: wgt_layout,
                                      out: out_layout
                                  })
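
# The intrinsic returned above is meant to be applied with Schedule.tensorize;
# a hedged, illustrative sketch (stage and axis names are hypothetical):
#   gemm_intrin = gemm(env)
#   s[res_stage].tensorize(batch_inner_axis, gemm_intrin)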
Example No. 52
def rnn_matexp():
    n_num_step = 128
    n_num_hidden = 1152
    n_batch_size = 4
    detect_global_barrier = DETECT_GLOBAL_BARRIER

    num_step = tvm.var("num_step")
    num_hidden = tvm.convert(n_num_hidden)
    batch_size = tvm.convert(n_batch_size)
    num_thread_y = 8
    num_thread_x = 16 * 3
    num_sm = 24

    Whh = tvm.placeholder((num_hidden, num_hidden), name="Whh")
    s_init = tvm.compute((1, batch_size, num_hidden),
                         lambda _, i, j: 1.0, name="init")
    s_state = tvm.placeholder((num_step, batch_size, num_hidden))
    kh = tvm.reduce_axis((0, num_hidden), name="kh")
    s_update = tvm.compute(
        (num_step, batch_size, num_hidden),
        lambda t, i, j: tvm.sum(s_state[t-1, i, kh] * Whh[kh, j], axis=kh),
        name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)
    # schedule
    s = tvm.create_schedule(s_scan.op)
    CL = s_update
    SS = s.cache_read(s_state, "shared", [CL])
    SL = s.cache_read(SS, "local", [CL])
    WhhL = s.cache_read(Whh, "local", [CL])
    ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y)
    CLF = s.rfactor(CL, ko)

    block_x = tvm.thread_axis((0, num_sm), "blockIdx.x")
    thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
    thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
    if PERSIST_KERNEL:
        s[s_scan.op].env_threads([block_x, thread_y, thread_x])

    bx, xi = s[s_init].split(s_init.op.axis[2], nparts=num_sm)
    tx, xi = s[s_init].split(xi, nparts=num_thread_x)
    s[s_init].bind(bx, block_x)
    s[s_init].bind(tx, thread_x)

    bx, xi = s[s_update].split(s[CL].op.axis[2], nparts=num_sm)
    tx, xi = s[s_update].split(xi, nparts=num_thread_x)
    s[s_update].bind(bx, block_x)
    s[s_update].bind(tx, thread_x)
    s[CL].bind(s[CL].op.reduce_axis[0], thread_y)
    s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0])
    # Duplicate store predicate.
    s[CL].set_store_predicate(thread_y.equal(0))

    if PERSIST_KERNEL:
        s[WhhL].compute_at(s[s_scan], thread_x)
        s[WhhL].unroll(WhhL.op.axis[0])
    else:
        s[WhhL].compute_at(s[CLF], CLF.op.axis[3])

    kr, ki = s[CLF].split(CLF.op.reduce_axis[0], nparts=1)
    ko, ki = s[CLF].split(ki, factor=4)
    s[SS].compute_at(s[CLF], kr)
    s[SL].compute_at(s[CLF], ko)

    xo, xi = s[SS].split(SS.op.axis[2], factor=num_thread_x * num_thread_y * 3)
    ty, xi = s[SS].split(xi, nparts=num_thread_y)
    tx, xi = s[SS].split(xi, nparts=num_thread_x)
    s[SS].bind(ty, thread_y)
    s[SS].bind(tx, thread_x)

    def check_device(target):
        with tvm.build_config(
                detect_global_barrier=detect_global_barrier,
                auto_unroll_min_depth=2,
                auto_unroll_max_step=128,
                unroll_explicit=False):
            f = tvm.build(s, [s_scan, Whh], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        res_np = np.zeros(
            (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        Whh_np[:] = 2.0 / n_num_hidden
        Whh_np[:, n_num_hidden//2:] = 0

        res_a = tvm.nd.array(res_np, ctx)
        Whh_a = tvm.nd.array(Whh_np, ctx)
        # Skip first pass as it is compilation
        f(res_a, Whh_a)
        ctx.sync()
        # measure time cost of second step.
        tstart = time.time()
        f(res_a, Whh_a)
        ctx.sync()
        tgap = time.time() - tstart
        print("Time cost=%g" % tgap)
        # correctness
        if not SKIP_CHECK:
            res_gpu = res_a.asnumpy()
            res_cmp = np.ones_like(res_np).astype("float64")
            Whh_np = Whh_np.astype("float64")
            for t in range(1, n_num_step):
                res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
            for i in range(n_num_step):
                for j in range(n_num_hidden):
                    if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5:
                        print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j]))
            np.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
    check_device("cuda")
Example No. 53
def test_gemm():
    # graph
    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m),
                    lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
                    name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)

    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)
    s[AA].double_buffer()
    s[BB].double_buffer()
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)

    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    # lowering test
    s = s.normalize()

    # one line to build the function.
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return

        with tvm.target.create(device):
            f = tvm.build(s, [A, B, C])

        # launch the kernel.
        n = nn
        m = n
        l = n
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        print("%s: exec=%g sec/op" % (ctx, tcost))
        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.dot(a_np, b_np.T),
                                    rtol=1e-5)

    check_device("vulkan")
    check_device("nvptx -mcpu=sm_20")
    check_device("rocm")
    check_device("metal")
    check_device("opencl")
    check_device("cuda")
Example No. 54
def test_gemm():
    # graph
    nn = 2048
    n = tvm.var('n')
    n = tvm.convert(nn)
    m, l = n, n
    A = tvm.placeholder((l, n), name='A')
    B = tvm.placeholder((l, m), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute(
        (m, n),
        lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k),
        name='C')

    # schedule
    s = tvm.create_schedule(C.op)
    AA = s.cache_read(A, "shared", [C])
    BB = s.cache_read(B, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_thread)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    ko, ki = s[CC].split(k, factor=8)
    kt, ki = s[CC].split(ki, factor=1)
    s[CC].reorder(ko, kt, ki, yo, xo)
    s[AA].compute_at(s[CC], ko)
    s[BB].compute_at(s[CC], ko)
    s[AL].compute_at(s[CC], kt)
    s[BL].compute_at(s[CC], kt)
    # Schedule for A's shared memory load
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    s[AA].vectorize(xi)
    # Schedule for B's shared memory load
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)
    s[BB].vectorize(xi)
    s[AA].double_buffer()
    s[BB].double_buffer()
    # correctness
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
        # launch the kernel.
        n, m, l = nn, nn, nn
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        for i in range(2):
            f(a, b, c)
        np.testing.assert_allclose(
            c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)

    with tvm.build_config(auto_unroll_max_step=32,
                          auto_unroll_min_depth=0,
                          unroll_explicit=False):
        check_device("cuda")
Example No. 55
import tvm
n = 1024
m = 1024

A = tvm.placeholder((n, m), name='A')
l = tvm.reduce_axis((0, m), name='l')

B = tvm.compute((n, ), lambda i: tvm.sum(A[i, l], axis=l), name='B')

s = tvm.create_schedule(B.op)

print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

s[B].parallel(B.op.reduce_axis[0])
print(tvm.lower(s, [A, B], simple_mode=True))
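# For contrast, a minimal sketch (not in the original) that parallelizes the
# spatial axis instead. This is always legal, since each output element keeps
# its own reduction sequential; a true parallel reduction would normally be
# expressed via s.rfactor first.
s2 = tvm.create_schedule(B.op)
s2[B].parallel(B.op.axis[0])
print(tvm.lower(s2, [A, B], simple_mode=True))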
Example #56
def _compute_expsum(max_elem, *indices):
    # Softmax denominator: sum of exp(x - max_elem) over the reduce axis k2,
    # with the per-row maximum subtracted for numerical stability.
    eval_range = insert_reduce_index(indices, k2)
    return tvm.sum(tvm.exp(x[eval_range] - max_elem[indices]), axis=k2)
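# Since the fragment above depends on names from its enclosing function
# (x, k2, insert_reduce_index), here is a self-contained 2-D sketch of the
# same numerically stable pattern; shapes and names are assumed, not original:
import tvm
m, n = tvm.var('m'), tvm.var('n')
x = tvm.placeholder((m, n), name='x')
k1 = tvm.reduce_axis((0, n), name='k1')
max_elem = tvm.compute((m,), lambda i: tvm.max(x[i, k1], axis=k1), name='max_elem')
k2 = tvm.reduce_axis((0, n), name='k2')
expsum = tvm.compute((m,), lambda i: tvm.sum(tvm.exp(x[i, k2] - max_elem[i]), axis=k2), name='expsum')
softmax = tvm.compute((m, n), lambda i, j: tvm.exp(x[i, j] - max_elem[i]) / expsum[i], name='softmax')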
Example #57
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size=2):
    """Declare a winograd convolution - only tile_size=2 is currently supported"""
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if int(kernel.shape[2]) == 3:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI = get_const_tuple(kernel.shape)
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    K = CO
    C = CI
    H = (IH + pt + pb - 3) // HSTR + 1
    W = (IW + pl + pr - 3) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    def upround(x, align):
        return (x + align - 1) // align * align

    ALIGN = 16
    P_round = upround(P, ALIGN)
    K_round = upround(K, ALIGN)

    # CONFIG

    cfg.define_knob("data_transform_wgx", [1, 2, 4, 8, 16, 32, 64])
    cfg.define_knob("data_transform_wgy", [1, 2, 4, 8, 16, 32, 64])

    # Pack input tile
    input_tile = tvm.compute((N, C, H + 2, W + 2),
                             lambda n, c, h, w:
                             data_pad[n][c][h][w],
                             name='d')

    if pre_computed:
        U = kernel
    else:
        U = _decl_winograd_kernel_transform(kernel, tile_size, G)

    # V [alpha * alpha, C, P_round]
    # Perform the image transform
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha * alpha, C, P_round),
                    lambda epsnu, c, b:
                    tvm.sum(input_tile[b // (nH*nW)][c][b // nW % nH * m + r_eps][b % nW * m + r_nu]
                            * B[r_eps][epsnu // alpha] * B[r_nu][epsnu % alpha],
                            axis=[r_eps, r_nu]),
                    name='V')

    # Winograd GEMM is a wrapper around batched GEMM to convert U to a 3D Tensor
    _, M = decl_winograd_gemm(cfg, U, V)

    # Y [K, P, m, m]
    # Winograd output transform
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
                    tvm.sum(M[r_eps * alpha + r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
                            axis=[r_eps, r_nu]), name='Y')

    # Output [N, K, H, W]
    # Unpack back to NCHW format
    # The last term ensures alignment is not lost to bound inference
    output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                         Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m]
                         + tvm.const(0, out_dtype) * M[(alpha*alpha)-1][K_round-1][P_round-1],
                         name='output', tag='winograd_conv2d_output')

    return output
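# A quick sanity check (not in the original) on the tile arithmetic above:
# F(2x2, 3x3) is the only case this declaration supports.
m_, r_ = 2, 3
alpha_ = m_ + r_ - 1                         # 4: each input tile is 4x4
muls_direct = r_ * r_                        # 9 multiplies per output pixel, direct conv
muls_winograd = alpha_ * alpha_ / (m_ * m_)  # 16/4 = 4 multiplies per output pixel
print(alpha_, muls_direct, muls_winograd)    # 4 9 4.0 -> a 2.25x multiply reduction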
Example #58
import math

import numpy as np
import tvm
from tvm import autotvm
from topi.util import get_const_tuple

# Note: intrin_libxsmm_hxw and intrin_libxsmm_tuned are project-local helpers
# (LIBXSMM-backed tensorize intrinsics) defined elsewhere in the original file.
def conv_auto_tuned(ofmblock, ofw, ifmblock, stride_width, input_width,
                    in_channel, input_height, filter_height, filter_width,
                    ofh, stride_height, batch, out_channel):

    A1 = tvm.placeholder((batch, math.ceil(
        in_channel / ifmblock), input_height, input_width, ifmblock),
                         name='input')
    W1 = tvm.placeholder(
        (math.ceil(out_channel / ofmblock), math.ceil(in_channel / ifmblock),
         filter_height, filter_width, ifmblock, ofmblock),
        name='weight')

    rco1 = tvm.reduce_axis((0, math.ceil(in_channel / ifmblock)), name='rco1')
    ry1 = tvm.reduce_axis((0, filter_height), name='ry1')
    rx1 = tvm.reduce_axis((0, filter_width), name='rx1')
    rci1 = tvm.reduce_axis((0, ifmblock), name='rci1')
    cfg = autotvm.get_config()

    cfg.define_knob("pack", [0, 1])
    pack = False
    w_tile = []

    factor_found = False

    for i in range(6, min(ofw + 1, 29)):
        if ofw % i == 0:
            w_tile.append((i, ofw // i))
            factor_found = True

    if not factor_found:
        w_tile.append((ofw, 1))

    #tile factors for output width
    cfg.define_knob("tile_w", w_tile)

    # pack data when stride > 1 and pack flag set so that data for brgemm is continuous
    if filter_height == 1 and filter_width == 1 and stride_width > 1 \
            and stride_height > 1 and cfg['pack'].val == 1:
        A2 = tvm.compute(
            (batch, math.ceil(in_channel / ifmblock), ofh, ofw, ifmblock),
            lambda n, c, h, w, vlen1: A1[n, c, h * stride_height, w *
                                         stride_width, vlen1])
        B1 = tvm.compute(
            (batch, math.ceil(out_channel / ofmblock), ofh, ofw, ofmblock),
            lambda nn, ff, yy, xx, vlen1: tvm.sum(W1[
                ff, rco1, ry1, rx1, rci1, vlen1] * A2[nn, rco1, ry1 + yy, rx1 +
                                                      xx, rci1],
                                                  axis=[rco1, ry1, rx1, rci1]),
            name='output')
        pack = True
    else:
        # Compute the convolution
        B1 = tvm.compute(
            (batch, math.ceil(out_channel / ofmblock), ofh, ofw, ofmblock),
            lambda nn, ff, yy, xx, vlen1: tvm.sum(
                W1[ff, rco1, ry1, rx1, rci1, vlen1
                   ] * A1[nn, rco1, ry1 + stride_height * yy, rx1 +
                          stride_width * xx, rci1],
                axis=[rco1, ry1, rx1, rci1]),
            name='output')

    s = tvm.create_schedule(B1.op)
    n, ko, h, w, ki = s[B1].op.axis
    rco, ry, rx, rci = s[B1].op.reduce_axis
    cfg.define_split("tile_h", h, num_outputs=3)  #output height
    cfg.define_split("tile_c", rco, num_outputs=2)  #input channel dimension
    cfg.define_split("tile_k", ko, num_outputs=2)  #output channel dimension
    w_factor_inner, _ = cfg["tile_w"].val
    wo, wi = s[B1].split(w, w_factor_inner)  #tiling
    rco_o, rco_i = cfg["tile_c"].apply(s, B1, rco)
    ko_o, ko_i = cfg["tile_k"].apply(s, B1, ko)
    ho, hm, hi = cfg["tile_h"].apply(s, B1, h)

    s[B1].reorder(n, ko_o, ho, ko_i, rco_o, hm, wo, hi, rco_i, ry, rx, wi, ki,
                  rci)
    cfg.define_reorder("reorder_outer", [ko_i, rco_o, hm, wo], policy="all")
    cfg.add_flop(
        np.prod(get_const_tuple(B1.shape)) * in_channel * filter_height *
        filter_width * 2)
    cfg["reorder_outer"].apply(s, B1, [ko_i, rco_o, hm, wo])
    if (filter_height == 1 and filter_width == 1 and stride_width == 1
            and stride_height == 1) or pack:
        if cfg["tile_h"].size[1] > 1 and w_factor_inner == ofw:
            libxsmm_tensorize = intrin_libxsmm_hxw(
                ofmblock, w_factor_inner, ifmblock, 1, w_factor_inner,
                cfg["tile_c"].size[1], cfg["tile_h"].size[2],
                filter_height, filter_width, ofh, ofw,
                cfg["tile_h"].size[2], 1, out_channel, ofh, ofw, in_channel)
            s[B1].tensorize(hi, libxsmm_tensorize)
        else:
            libxsmm_tensorize = intrin_libxsmm_tuned(
                ofmblock, w_factor_inner, ifmblock, 1, w_factor_inner,
                cfg["tile_c"].size[1], cfg["tile_h"].size[2],
                filter_height, filter_width, ofh, ofw, in_channel)
            s[B1].tensorize(rco_i, libxsmm_tensorize)
    else:
        libxsmm_tensorize = intrin_libxsmm_tuned(
            ofmblock, w_factor_inner, ifmblock, stride_width, w_factor_inner,
            cfg["tile_c"].size[1], cfg["tile_h"].size[2],
            filter_height, filter_width, input_height, input_width, in_channel)
        s[B1].tensorize(rco_i, libxsmm_tensorize)

    par = s[B1].fuse(n, ko_o, ho)
    s[B1].parallel(par)
    if pack:
        n1, c1, h1, w1, v1 = s[A2].op.axis
        par2 = s[A2].fuse(n1, c1, h1)
        s[A2].parallel(par2)
        s[A2].vectorize(v1)

    s = s.normalize()

    return s, [W1, A1, B1]
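# A minimal driver sketch (not in the original; all sizes are assumed) showing
# how a schedule function like this is typically wrapped as an autotvm template:
@autotvm.template
def conv_template():
    # 3x3 conv, stride 1, batch 1, 64 -> 256 channels, 58x58 input, 56x56
    # output, vector blocks of 64 for both input and output channels
    return conv_auto_tuned(64, 56, 64, 1, 58, 64, 58, 3, 3, 56, 1, 1, 256)

task = autotvm.task.create(conv_template, args=(), target='llvm')
print(task.config_space)  # the knobs defined above: pack, tile_w, tile_h, ...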
Example #59
# === Start computation
N = tvm.var('N') # Data set size
D = tvm.var('D') # Feature number
L = tvm.var('L') # Label number

label = tvm.placeholder((N, L), name='label')
data = tvm.placeholder((N, D), name='data')
weight = tvm.placeholder((L, D + 1), name='weight')

data_expand = tvm.compute((N, D + 1), lambda n, d:
        tvm.select((d < D), data[n, d], tvm.const(1, dtype=data.dtype)),
        name='data_expand')

rd = tvm.reduce_axis((0, D + 1), name='rd')
dot = tvm.compute((N, L), lambda n, l:
        tvm.sum(weight[l, rd] * data_expand[n, rd], axis=rd),
        name='dot')

factor = tvm.compute((N, L), lambda n, l: 1 / (1 + tvm.exp(-dot[n, l])),
        name='factor')

def argmax_combine(x, y):
    lhs = tvm.select((x[1] > y[1]), x[0], y[0])
    rhs = tvm.select((x[1] > y[1]), x[1], y[1])
    return lhs, rhs

def argmax_identity(t0, t1):
    return tvm.const(-1, t0), tvm.min_value(t1)

argmax = tvm.comm_reducer(argmax_combine, argmax_identity, name='argmax')
dummy_idx = tvm.compute((L, ), lambda l: l, name='dummy_idx')
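# A plausible continuation (assumed; not part of the original snippet): combine
# the argmax reducer with dummy_idx to pick the most probable label per sample.
rl = tvm.reduce_axis((0, L), name='rl')
pred_idx, pred_val = tvm.compute((N, ), lambda n:
        argmax((dummy_idx[rl], factor[n, rl]), axis=rl),
        name='pred')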
Example #60
def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None):
    """Depthwise convolution nhwc forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Filter : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype: str, optional
        Output data type

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]
    """
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if dilation_h != 1 or dilation_w != 1:
        Filter = dilate(Filter, (dilation_h, dilation_w, 1, 1))

    batch, in_height, in_width, in_channel = Input.shape
    # shape of dilated kernel
    filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (filter_height, filter_width))
    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)

    # padding stage
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
    # depthconv stage
    di = tvm.reduce_axis((0, filter_height), name='di')
    dj = tvm.reduce_axis((0, filter_width), name='dj')
    Output = tvm.compute(
        (batch, out_height, out_width, out_channel),
        lambda b, i, j, c: tvm.sum(
            (PaddedInput[b, i*stride_h + di, j*stride_w + dj, c/channel_multiplier].astype(
                out_dtype) *
             Filter[di, dj, c/channel_multiplier, c%channel_multiplier].astype(out_dtype)),
            axis=[di, dj]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_nhwc")
    return Output
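# A minimal usage sketch (not in the original; sizes assumed): a 3x3 depthwise
# convolution over a 1x32x32x16 NHWC input with channel_multiplier = 1.
import tvm
Input = tvm.placeholder((1, 32, 32, 16), name='Input')
Filter = tvm.placeholder((3, 3, 16, 1), name='Filter')
Output = depthwise_conv2d_nhwc(Input, Filter, stride=(1, 1), padding='SAME', dilation=1)
s = tvm.create_schedule(Output.op)
f = tvm.build(s, [Input, Filter, Output], 'llvm')  # Output shape: (1, 32, 32, 16)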