Example #1
    def gen_ir(
        data_ptr,
        n_fft,
        hop_length,
        win_length,
        window_ptr,
        normalized,
        onesided,
        output_ptr,
    ):
        ib = tir.ir_builder.create()
        data = ib.buffer_ptr(data_ptr)
        window = ib.buffer_ptr(window_ptr)
        output = ib.buffer_ptr(output_ptr)
        # One GPU thread per output element; the grid covers all
        # batch * row * col elements.
        max_threads = _get_max_threads(output_ptr.shape[0] *
                                       output_ptr.shape[1])
        output_size = (output_ptr.shape[0] * output_ptr.shape[1] *
                       output_ptr.shape[2])
        with ib.new_scope():
            nthread_tx = max_threads
            nthread_bx = ceil_div(output_size, max_threads)
            tx = te.thread_axis("threadIdx.x")
            bx = te.thread_axis("blockIdx.x")
            ib.scope_attr(tx, "thread_extent", nthread_tx)
            ib.scope_attr(bx, "thread_extent", nthread_bx)
            tid = bx * max_threads + tx

            with ib.if_scope(tid < output_size):
                # Recover (batch, row, col) from the flat thread id.
                matrix_size = output_ptr.shape[1] * output_ptr.shape[2]
                batch = tir.floordiv(tid, matrix_size)
                row = tir.floordiv(tir.indexmod(tid, matrix_size),
                                   output_ptr.shape[2])
                col = tir.indexmod(tir.indexmod(tid, matrix_size),
                                   output_ptr.shape[2])
                # Initialize the real (index 0) and imaginary (index 1) parts.
                output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0)
                output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0)
                # Accumulate the windowed DFT terms over the frame.
                with ib.for_range(0, win_length) as wlen:
                    output[batch, row, col, 0] += (
                        window[wlen] * data[batch, col * hop_length + wlen] *
                        tir.cos(2 * pi * row * wlen / win_length))
                    output[batch, row, col, 1] -= (
                        window[wlen] * data[batch, col * hop_length + wlen] *
                        tir.sin(2 * pi * row * wlen / win_length))
                # Optionally scale by 1 / sqrt(n_fft).
                with ib.if_scope(normalized):
                    output[batch, row, col, 0] /= tir.sqrt(
                        tir.const(n_fft, "float32"))
                    output[batch, row, col, 1] /= tir.sqrt(
                        tir.const(n_fft, "float32"))

        return ib.get()
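
In TVM, an ir_builder kernel like this is normally wrapped into a TE operation via te.extern. A minimal sketch of that wiring (the tensor names and output_shape are assumptions, not part of the original snippet):

output = te.extern(
    output_shape,  # assumed precomputed as (batch, rows, frames, 2)
    [data, window],
    lambda ins, outs: gen_ir(ins[0], n_fft, hop_length, win_length,
                             ins[1], normalized, onesided, outs[0]),
    dtype=[data.dtype],  # real/imaginary parts share the input dtype
    name="stft_cuda",  # hypothetical name
)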
Example #2
def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
    """Low level IR to calculate adjacent difference in an 1-D array.

    Parameters
    ----------
    data : Buffer
        Input 1-D Buffer.

    output: Buffer
        A buffer to store adjacent difference, of the same shape as data. The adjacent difference
        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
        where i > 0 and i < len(data).

    binop: function, optional
        A binary op used to calculate the adjacent difference. The function takes two
        TIR expressions and produces a new TIR expression. By default it uses tvm.tir.Sub to
        compute the adjacent difference.
    """
    ib = tir.ir_builder.create()
    data_ptr = ib.buffer_ptr(data)
    output_ptr = ib.buffer_ptr(output)
    with ib.for_range(0, data.shape[0], kind="parallel") as i:
        with ib.if_scope(i == 0):
            output_ptr[0] = 0
        with ib.else_scope():
            output_ptr[i] = tir.Cast(output.dtype,
                                     binop(data_ptr[i], data_ptr[i - 1]))
    return ib.get()
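
For reference, the IR above matches this NumPy sketch of the adjacent difference (a hypothetical helper, not from the original source):

import numpy as np

def adjacent_diff_ref(data, binop=np.subtract):
    out = np.empty_like(data)
    out[0] = 0  # output[0] is defined as 0
    out[1:] = binop(data[1:], data[:-1])  # output[i] = binop(data[i], data[i-1])
    return out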
Example #3
def test_illegal_cast():
    analyzer = tvm.arith.Analyzer()
    # 77 is not a valid boolean value (only 0 and 1 are).
    try:
        analyzer.rewrite_simplify(1 * tir.Cast('bool', 77))
        assert False
    except tvm.TVMError:
        pass
    # 171 does not fit into a signed 8-bit integer (max 127).
    try:
        analyzer.rewrite_simplify(1 * tir.Cast('int8', 171))
        assert False
    except tvm.TVMError:
        pass
    # 2**32 overflows a 32-bit integer.
    try:
        analyzer.rewrite_simplify(1 * tir.Cast('int32', 2**32))
        assert False
    except tvm.TVMError:
        pass
Example #4
def batch_matmul_nkkm_f16(  # pylint: disable=invalid-name,missing-docstring
    B: int,
    N: int,
    M: int,
    K: int,
) -> Tuple[te.Tensor, te.Tensor, te.Tensor]:
    x = te.placeholder((B, N, K), name="X", dtype="float16")
    y = te.placeholder((B, K, M), name="Y", dtype="float16")
    k = te.reduce_axis((0, K), name="k")
    # Multiply in float32 so the float16 inputs are accumulated at
    # higher precision.
    z = te.compute(  # pylint: disable=invalid-name
        (B, N, M),
        lambda b, i, j: te.sum(
            tir.Cast("float32", x[b][i][k]) * tir.Cast("float32", y[b][k][j]),
            axis=[k]),
        name="Z",
    )
    return (x, y, z)
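
Such a TE workload can then be lowered to TensorIR for scheduling; a small usage sketch (the sizes are arbitrary, and te.create_prim_func requires a reasonably recent TVM):

import tvm
from tvm import te

x, y, z = batch_matmul_nkkm_f16(B=1, N=128, M=128, K=128)
mod = te.create_prim_func([x, y, z])  # TE tensors -> TensorIR PrimFunc
print(mod.script())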
Example #5
    def gen_ir(
        data_ptr,
        n_fft,
        hop_length,
        win_length,
        window_ptr,
        normalized,
        onesided,
        output_ptr,
        loop_kind,
    ):
        ib = tir.ir_builder.create()
        data = ib.buffer_ptr(data_ptr)
        window = ib.buffer_ptr(window_ptr)
        output = ib.buffer_ptr(output_ptr)
        # https://librosa.org/doc/0.7.2/_modules/librosa/core/spectrum.html#stft
        with ib.for_range(0,
                          output_ptr.shape[0] * output_ptr.shape[1],
                          kind="parallel") as batch_row:
            with ib.for_range(0, output_ptr.shape[2], kind=loop_kind) as col:
                # Split the flattened batch_row index back into (batch, row).
                batch = tir.floordiv(batch_row, output_ptr.shape[1])
                row = tir.floormod(batch_row, output_ptr.shape[1])
                # Initialize the real (index 0) and imaginary (index 1) parts.
                output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0)
                output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0)
                with ib.for_range(0, win_length) as wlen:
                    output[batch, row, col, 0] += (
                        window[wlen] * data[batch, col * hop_length + wlen] *
                        tir.cos(2 * pi * row * wlen / win_length))
                    output[batch, row, col, 1] -= (
                        window[wlen] * data[batch, col * hop_length + wlen] *
                        tir.sin(2 * pi * row * wlen / win_length))
                with ib.if_scope(normalized):
                    output[batch, row, col, 0] /= tir.sqrt(
                        tir.const(n_fft, "float32"))
                    output[batch, row, col, 1] /= tir.sqrt(
                        tir.const(n_fft, "float32"))

        return ib.get()
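
This CPU variant differs from Example #1 by parallelizing over the flattened (batch, row) loop and taking the inner column loop's kind as a parameter. A minimal wiring sketch (tensor names and output_shape are assumptions):

output = te.extern(
    output_shape,
    [data, window],
    lambda ins, outs: gen_ir(ins[0], n_fft, hop_length, win_length,
                             ins[1], normalized, onesided, outs[0],
                             "vectorize"),  # loop_kind could also be "serial"
    dtype=[data.dtype],
    name="stft_cpu",  # hypothetical name
)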
Example #6
def test_scalar_add():
    # All these types should be interchangeable with each other:
    # e.g. float16 + float32 upcasts the float16 to float32, and
    # when an int and a float are added together, the int is cast
    # to the float type.
    lhs_types = ["float32", "float16", "int32", "int64"]
    rhs_types = ["float32", "float16"]
    for lhs_type, rhs_type in itertools.product(lhs_types, rhs_types):
        # Input vars are float32; we cast them to exercise upcasting between the types.
        lhs_input = tir.Var("lhs", "float32")
        rhs_input = tir.Var("rhs", "float32")
        lhs = tir.Cast(lhs_type, lhs_input)
        rhs = tir.Cast(rhs_type, rhs_input)
        output = lhs + rhs
        output = tir.ret(output)
        output = tir.Evaluate(output)
        func = tir.PrimFunc([lhs_input, rhs_input], output)
        func = build_tir_func(func)
        out = func(1.0, 2.0)
        assert out == 3.0
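
The promotion rules described in the comment can also be checked directly on the TIR expressions, without building; a small sketch assuming TVM's standard dtype promotion:

a = tir.Cast("float16", tir.Var("a", "float32"))
b = tir.Var("b", "float32")
assert (a + b).dtype == "float32"  # the float16 operand is upcast

i = tir.Var("i", "int32")
assert (i + b).dtype == "float32"  # the int is cast to the float type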
Example #7
def conv2d_nhwc_f16(  # pylint: disable=invalid-name,missing-docstring
    N: int,
    H: int,
    W: int,
    CI: int,
    CO: int,
    kernel_size: int,
    stride: int = 1,
    padding: int = 0,
    dilation: int = 1,
    groups: int = 1,
):
    inputs = te.placeholder((N, H, W, CI), name="inputs", dtype="float16")
    weight = te.placeholder((kernel_size, kernel_size, CI // groups, CO),
                            name="weight",
                            dtype="float16")
    batch_size, in_h, in_w, _ = inputs.shape
    k_h, k_w, channel_per_group, out_channel = weight.shape
    out_channel_per_group = out_channel // groups

    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
    rh = te.reduce_axis((0, k_h), name="rh")
    rw = te.reduce_axis((0, k_w), name="rw")
    rc = te.reduce_axis((0, channel_per_group), name="rc")

    padded = topi.nn.pad(inputs, [0, padding, padding, 0])
    output = te.compute(
        (batch_size, out_h, out_w, out_channel),
        lambda n, h, w, co: te.sum(
            # co // out_channel_per_group selects the group; rc walks its
            # input-channel slice. Both operands are upcast to float32.
            tir.Cast(
                value=padded[n, h * stride + rh * dilation,
                             w * stride + rw * dilation,
                             co // out_channel_per_group * channel_per_group
                             + rc],
                dtype="float32",
            ) * tir.Cast(value=weight[rh, rw, rc, co], dtype="float32"),
            axis=[rh, rw, rc],
        ),
        name="conv2d_nhwc",
    )
    return (inputs, weight, output)
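
To inspect the loop nest this compute definition produces, it can be scheduled and lowered; a short sketch with arbitrary sizes:

import tvm
from tvm import te

inputs, weight, out = conv2d_nhwc_f16(1, 56, 56, 64, 64, 3, padding=1)
s = te.create_schedule(out.op)
print(tvm.lower(s, [inputs, weight, out], simple_mode=True))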
Example #8
def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
    """Low level IR to calculate adjacent difference in an 1-D array.

    Parameters
    ----------
    data : Buffer
        Input 1-D Buffer.

    output: Buffer
        A buffer to store adjacent difference, of the same shape as data. The adjacent difference
        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
        where i > 0 and i < len(data).

    binop: function, optional
        A binary op used to calculate the adjacent difference. The function takes two
        TIR expressions and produces a new TIR expression. By default it uses tvm.tir.Sub to
        compute the adjacent difference.
    """
    ib = tir.ir_builder.create()
    data_ptr = ib.buffer_ptr(data)
    output_ptr = ib.buffer_ptr(output)
    batch_size = data.shape[0]
    # One GPU thread per element, capped at the target's maximum
    # thread count.
    max_threads = tir.min(
        batch_size,
        tvm.target.Target.current(allow_none=False).max_num_threads)
    with ib.new_scope():
        nthread_tx = max_threads
        nthread_bx = ceil_div(batch_size, max_threads)
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        tid = bx * max_threads + tx
        with ib.if_scope(tid < batch_size):
            with ib.if_scope(tid == 0):
                output_ptr[tid] = 0
            with ib.else_scope():
                output_ptr[tid] = tir.Cast(
                    output.dtype, binop(data_ptr[tid], data_ptr[tid - 1]))
    return ib.get()
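
Because this variant reads tvm.target.Target.current(allow_none=False), it must be invoked inside a target scope; a minimal sketch (the buffer names are assumptions):

with tvm.target.Target("cuda"):
    out = te.extern(
        data.shape,
        [data],
        lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0]),
        dtype=data.dtype,
        name="adjacent_diff",  # hypothetical name
    )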
Example #9
def f_compute(i, j):
    # a, b, and k are captured from the enclosing scope; both operands
    # are upcast to float32 before the reduction.
    v_a = tir.Cast(dtype="float32", value=a[i, k])
    v_b = tir.Cast(dtype="float32", value=b[k, j])
    return te.sum(v_a * v_b, axis=[k])
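
A hypothetical enclosing context that makes the fragment self-contained (shapes and dtypes are assumptions):

a = te.placeholder((128, 128), name="a", dtype="float16")
b = te.placeholder((128, 128), name="b", dtype="float16")
k = te.reduce_axis((0, 128), name="k")
c = te.compute((128, 128), f_compute, name="c")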
Example #10
def apply(*args):
    # Clamp each argument to [0, 1], cast it to bool, and AND the results.
    bool_args = list(
        map(lambda x: tir.Cast('bool', _clamp_tvm(x, 0, 1)), args))
    return tir.all(*bool_args)
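
_clamp_tvm is defined elsewhere in the source file; a plausible definition consistent with its use here (purely hypothetical):

def _clamp_tvm(x, lo, hi):
    # Clamp x into [lo, hi] using TIR min/max.
    return tir.max(tir.min(x, hi), lo)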
Example #11
def apply(condition, true_expr, false_expr):
    # Normalize the condition to a boolean before the conditional;
    # tir.if_then_else only evaluates the branch that is selected.
    return tir.if_then_else(tir.Cast('bool', _clamp_tvm(condition, 0, 1)),
                            true_expr, false_expr)
Example #12
def apply(condition, true_expr, false_expr):
    # tir.Select may evaluate both branches, so both are forced to int
    # with _force_int (defined elsewhere) before selection.
    return tir.Select(tir.Cast('bool', _clamp_tvm(condition, 0, 1)),
                      _force_int(true_expr), _force_int(false_expr))