コード例 #1
0
ファイル: multibox.py プロジェクト: chisuhua/tvm-ppflow
def hybrid_multibox_prior(data, sizes, ratios, steps, offsets):
    """Hybrid routing for multibox_prior operator.

    For every spatial position (i, j) of the input feature map this writes
    ``num_sizes + num_ratios - 1`` boxes into the output:

    * the first ``num_sizes`` boxes use each entry of ``sizes`` in turn;
    * the remaining ``num_ratios - 1`` boxes use ``sizes[0]`` combined with
      ``ratios[1]`` ... ``ratios[num_ratios - 1]`` (``ratios[0]`` is never
      read by this function).

    Only ``data.shape[2]`` and ``data.shape[3]`` are used; the values stored
    in ``data`` are not read.

    NOTE(review): ``output_tensor``, ``parallel``, ``const_range``,
    ``float32`` and ``sqrt`` are not defined in this block; presumably they
    are intrinsics supplied by the TVM hybrid-script front end -- confirm
    against the enclosing module.

    Parameters
    ----------
    data : tvm.te.Tensor or numpy NDArray
        4-D tensor with shape [batch, channel, height, width]

    sizes : tvm ConsExpr
        Sizes for anchor boxes. Must support ``len()`` and integer indexing.

    ratios : tvm ConsExpr
        Ratios for anchor boxes. Must support ``len()`` and integer indexing.

    steps : tvm ConsExpr
        Priorbox step across y and x, -1 for auto calculation.
        Any non-positive entry selects the automatic value, which is
        ``1 / height`` for y and ``1 / width`` for x.

    offsets : tvm ConsExpr
        Priorbox center offsets, y and x respectively.

    Returns
    -------
    output : tvm.te.Tensor or numpy NDArray
        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
        The last axis holds
        ``(center_w - w, center_h - h, center_w + w, center_h + h)``,
        where ``w`` and ``h`` are the half-width and half-height of the box.
    """
    in_height = data.shape[2]
    in_width = data.shape[3]
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    num_boxes = in_height * in_width * (num_sizes + num_ratios - 1)
    output = output_tensor((1, num_boxes, 4), "float32")
    # A positive step is used as given; otherwise fall back to one cell of a
    # unit-sized grid (1 / height, 1 / width).
    steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height
    steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width
    offset_h = offsets[0]
    offset_w = offsets[1]

    # Need to define var out of const_range + if
    w = 0.0
    h = 0.0

    for i in parallel(in_height):
        center_h = (i + offset_h) * steps_h
        for j in range(in_width):
            center_w = (j + offset_w) * steps_w
            for k in const_range(num_sizes + num_ratios - 1):
                if k < num_sizes:
                    # Size-driven box: the half-width is rescaled by
                    # in_height / in_width, the half-height is size / 2.
                    w = float32(sizes[k] * in_height) / in_width / 2.0
                    h = sizes[k] / 2.0
                else:
                    # Ratio-driven box: always based on sizes[0]; the ratio
                    # index starts at 1, so ratios[0] is skipped.
                    w = float32(sizes[0] * in_height) / in_width \
                        * sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0
                    h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0
                # Flatten (i, j, k) into the box index along output axis 1.
                count = i * in_width * (num_sizes + num_ratios - 1) \
                    + j * (num_sizes + num_ratios - 1) + k
                output[0, count, 0] = center_w - w
                output[0, count, 1] = center_h - h
                output[0, count, 2] = center_w + w
                output[0, count, 3] = center_h + h

    return output
コード例 #2
0
    def gen_ir(
        data_ptr,
        n_fft,
        hop_length,
        win_length,
        window_ptr,
        normalized,
        onesided,
        output_ptr,
    ):
        """Build thread-indexed IR that fills the STFT output buffer.

        One flat index is formed from ``blockIdx.x`` and ``threadIdx.x`` and
        decomposed into ``(batch, row, col)`` over the first three output
        dimensions. For that element the windowed samples are accumulated
        against ``cos`` / ``sin`` of ``2 * pi * row * wlen / win_length`` into
        the entries ``[..., 0]`` and ``[..., 1]`` of the output respectively.

        Parameters
        ----------
        data_ptr : buffer
            Input signal, indexed as ``[batch, sample]``.
        n_fft : int
            Only used for the normalization factor ``sqrt(n_fft)``.
        hop_length : int
            Sample stride between consecutive output columns.
        win_length : int
            Number of window taps accumulated per output element.
        window_ptr : buffer
            Window coefficients, indexed as ``[wlen]``.
        normalized : bool or expression
            Condition under which both output parts are divided by
            ``sqrt(n_fft)``.
        onesided : bool
            Accepted for interface compatibility; not read in this function.
        output_ptr : buffer
            Destination, indexed as ``[batch, row, col, part]``.

        Returns
        -------
        stmt
            The statement produced by the IR builder.
        """
        builder = tir.ir_builder.create()
        data = builder.buffer_ptr(data_ptr)
        window = builder.buffer_ptr(window_ptr)
        output = builder.buffer_ptr(output_ptr)

        n_batch = output_ptr.shape[0]
        n_rows = output_ptr.shape[1]
        n_cols = output_ptr.shape[2]

        max_threads = _get_max_threads(n_batch * n_rows)
        output_size = n_batch * n_rows * n_cols

        with builder.new_scope():
            num_blocks = ceil_div(output_size, max_threads)
            thread_x = te.thread_axis("threadIdx.x")
            block_x = te.thread_axis("blockIdx.x")
            builder.scope_attr(thread_x, "thread_extent", max_threads)
            builder.scope_attr(block_x, "thread_extent", num_blocks)
            flat_idx = block_x * max_threads + thread_x

            # The last block may be partially filled, so guard the tail.
            with builder.if_scope(flat_idx < output_size):
                matrix_size = n_rows * n_cols
                batch = tir.floordiv(flat_idx, matrix_size)
                within_matrix = tir.indexmod(flat_idx, matrix_size)
                row = tir.floordiv(within_matrix, n_cols)
                col = tir.indexmod(within_matrix, n_cols)

                # Clear both parts before accumulating into them.
                for part in (0, 1):
                    output[batch, row, col, part] = tir.Cast(data_ptr.dtype, 0)

                with builder.for_range(0, win_length) as wlen:
                    windowed = window[wlen] * data[batch, col * hop_length + wlen]
                    phase = 2 * pi * row * wlen / win_length
                    output[batch, row, col, 0] += windowed * tir.cos(phase)
                    output[batch, row, col, 1] -= windowed * tir.sin(phase)

                with builder.if_scope(normalized):
                    scale = tir.sqrt(tir.const(n_fft, "float32"))
                    for part in (0, 1):
                        output[batch, row, col, part] /= scale

        return builder.get()
コード例 #3
0
ファイル: stft.py プロジェクト: junrushao1994/tvm
    def gen_ir(
        data_ptr,
        n_fft,
        hop_length,
        win_length,
        window_ptr,
        normalized,
        onesided,
        output_ptr,
        loop_kind,
    ):
        """Build loop-based IR that fills the STFT output buffer.

        The outer loop runs in parallel over the fused ``batch * row`` range
        (first two output dimensions); the inner loop walks the columns using
        ``loop_kind``. For each ``(batch, row, col)`` the windowed samples are
        accumulated against ``cos`` / ``sin`` of
        ``2 * pi * row * wlen / win_length`` into the entries ``[..., 0]`` and
        ``[..., 1]`` of the output respectively.

        Parameters
        ----------
        data_ptr : buffer
            Input signal, indexed as ``[batch, sample]``.
        n_fft : int
            Only used for the normalization factor ``sqrt(n_fft)``.
        hop_length : int
            Sample stride between consecutive output columns.
        win_length : int
            Number of window taps accumulated per output element.
        window_ptr : buffer
            Window coefficients, indexed as ``[wlen]``.
        normalized : bool or expression
            Condition under which both output parts are divided by
            ``sqrt(n_fft)``.
        onesided : bool
            Accepted for interface compatibility; not read in this function.
        output_ptr : buffer
            Destination, indexed as ``[batch, row, col, part]``.
        loop_kind : str
            Passed as ``kind`` to the column loop's ``for_range``.

        Returns
        -------
        stmt
            The statement produced by the IR builder.
        """
        ib = tir.ir_builder.create()
        data = ib.buffer_ptr(data_ptr)
        window = ib.buffer_ptr(window_ptr)
        output = ib.buffer_ptr(output_ptr)
        # https://librosa.org/doc/0.7.2/_modules/librosa/core/spectrum.html#stft
        with ib.for_range(0,
                          output_ptr.shape[0] * output_ptr.shape[1],
                          kind="parallel") as batch_row:
            with ib.for_range(0, output_ptr.shape[2], kind=loop_kind) as col:
                # batch and row are plain index expressions derived from the
                # fused loop variable; no scratch buffers are needed for them.
                batch = tir.floordiv(batch_row, output_ptr.shape[1])
                row = tir.floormod(batch_row, output_ptr.shape[1])
                # Clear both parts before accumulating into them.
                output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0)
                output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0)
                with ib.for_range(0, win_length) as wlen:
                    output[batch, row, col,
                           0] += (window[wlen] *
                                  data[batch, col * hop_length + wlen] *
                                  tir.cos(2 * pi * row * wlen / win_length))
                    output[batch, row, col,
                           1] -= (window[wlen] *
                                  data[batch, col * hop_length + wlen] *
                                  tir.sin(2 * pi * row * wlen / win_length))
                with ib.if_scope(normalized):
                    output[batch, row, col,
                           0] /= tir.sqrt(tir.const(n_fft, "float32"))
                    output[batch, row, col,
                           1] /= tir.sqrt(tir.const(n_fft, "float32"))

        return ib.get()
コード例 #4
0
def transformed_square_sum_square_root(a: ty.handle, d: ty.handle) -> None:
    # Computes D[b] = sqrt(sum over (i, j) of A[b, i, j] ** 2) for b in [0, 16),
    # with the two 256-wide reduction axes driven by a single fused loop.
    A = tir.match_buffer(a, [16, 256, 256])
    D = tir.match_buffer(d, [16])
    # Intermediate buffer holding the per-b sum of squares.
    C = tir.alloc_buffer([16])

    # i1_i2_fused_outer covers 256 * 256 = 65536 iterations; the inner loop
    # has extent 1 and is not bound to any block variable here.
    for i0, i1_i2_fused_outer, i1_i2_fused_inner in tir.grid(16, 65536, 1):
        with tir.block(
            [16, tir.reduce_axis(0, 256),
             tir.reduce_axis(0, 256)], "C") as [b, i, j]:
            tir.bind(b, i0)
            # Recover the two reduction indices from the fused loop variable.
            tir.bind(i, tir.floordiv(i1_i2_fused_outer, 256))
            tir.bind(j, tir.floormod(i1_i2_fused_outer, 256))
            tir.reads([C[b], A[b, i, j]])
            tir.writes([C[b]])
            with tir.init():
                C[b] = 0.0
            C[b] = C[b] + (A[b, i, j] * A[b, i, j])
    # Second stage: element-wise square root of the accumulated sums.
    for i0_1 in tir.serial(0, 16):
        with tir.block([16], "D") as [b_1]:
            tir.bind(b_1, i0_1)
            tir.reads([C[b_1]])
            tir.writes([D[b_1]])
            D[b_1] = tir.sqrt(C[b_1], dtype="float32")
コード例 #5
0
def square_sum_square_root_rfactor(a: ty.handle, d: ty.handle) -> None:
    # Computes D[b] = sqrt(sum over (i, j) of A[b, i, j] ** 2) for b in [0, 16)
    # in three stages: partial sums into C_rf, a reduction of C_rf into C,
    # then an element-wise square root into D.
    A = tir.match_buffer(a, [16, 256, 256])
    D = tir.match_buffer(d, [16])
    C = tir.alloc_buffer([16])
    # Partial-sum buffer; its leading axis (extent 1) is indexed by the inner
    # fused loop variable.
    C_rf = tir.alloc_buffer([1, 16])

    # Stage 1: accumulate squares of A into C_rf. The outer fused loop covers
    # 256 * 256 = 65536 iterations and is split back into (i, j) below.
    for i0, i1_i2_fused_outer, i1_i2_fused_inner in tir.grid(16, 65536, 1):
        with tir.block(
            [1, 16, tir.reduce_axis(0, 256),
             tir.reduce_axis(0, 256)], "C_rf") as [
                 vi1_i2_fused_inner,
                 b,
                 i,
                 j,
             ]:
            tir.bind(vi1_i2_fused_inner, i1_i2_fused_inner)
            tir.bind(b, i0)
            tir.bind(i, tir.floordiv(i1_i2_fused_outer, 256))
            tir.bind(j, tir.floormod(i1_i2_fused_outer, 256))
            with tir.init():
                C_rf[vi1_i2_fused_inner, b] = 0.0
            C_rf[vi1_i2_fused_inner,
                 b] = C_rf[vi1_i2_fused_inner, b] + (A[b, i, j] * A[b, i, j])

    # Stage 2: reduce C_rf over its leading axis (extent 1) into C.
    for i0_1, i1_i2_fused_inner_1 in tir.grid(16, 1):
        with tir.block([tir.reduce_axis(0, 1), 16],
                       "C") as [vi1_i2_fused_inner_1, b_1]:
            tir.bind(vi1_i2_fused_inner_1, i1_i2_fused_inner_1)
            tir.bind(b_1, i0_1)
            with tir.init():
                C[b_1] = 0.0
            C[b_1] = C[b_1] + C_rf[vi1_i2_fused_inner_1, b_1]

    # Stage 3: element-wise square root of the accumulated sums.
    for i0_2 in tir.serial(0, 16):
        with tir.block([16], "D") as [b_2]:
            tir.bind(b_2, i0_2)
            D[b_2] = tir.sqrt(C[b_2], dtype="float32")