def hybrid_multibox_prior(data, sizes, ratios, steps, offsets):
    """Hybrid-script kernel that emits the multibox prior (anchor) boxes.

    Every cell of the input feature map receives
    ``num_sizes + num_ratios - 1`` boxes: one per entry of ``sizes``,
    followed by one per entry of ``ratios[1:]`` built from ``sizes[0]``.
    A box is stored as ``(cx - half_w, cy - half_h, cx + half_w,
    cy + half_h)``, where ``(cx, cy)`` is the cell center.

    Parameters
    ----------
    data : tvm.te.Tensor or numpy NDArray
        4-D tensor with shape [batch, channel, height, width]; only the
        height and width extents are read.

    sizes : tvm ConsExpr
        Sizes for anchor boxes.

    ratios : tvm ConsExpr
        Ratios for anchor boxes.

    steps : tvm ConsExpr
        Priorbox step across y and x. An entry that is not positive is
        replaced by ``1 / height`` (y) or ``1 / width`` (x).

    offsets : tvm ConsExpr
        Priorbox center offsets, y and x respectively.

    Returns
    -------
    output : tvm.te.Tensor or numpy NDArray
        3-D float32 tensor with shape
        [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
    """
    feat_h = data.shape[2]
    feat_w = data.shape[3]
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    boxes_per_cell = num_sizes + num_ratios - 1
    total_boxes = feat_h * feat_w * boxes_per_cell
    output = output_tensor((1, total_boxes, 4), "float32")

    step_y = steps[0] * 1.0 if steps[0] > 0 else 1.0 / feat_h
    step_x = steps[1] * 1.0 if steps[1] > 0 else 1.0 / feat_w
    shift_y = offsets[0]
    shift_x = offsets[1]

    # The half extents are assigned inside the const_range + if below, so
    # they have to be defined out here first.
    half_w = 0.0
    half_h = 0.0

    for row in parallel(feat_h):
        cy = (row + shift_y) * step_y
        for col in range(feat_w):
            cx = (col + shift_x) * step_x
            for k in const_range(boxes_per_cell):
                if k < num_sizes:
                    # One box per size, at the first aspect ratio.
                    half_w = float32(sizes[k] * feat_h) / feat_w / 2.0
                    half_h = sizes[k] / 2.0
                else:
                    # Remaining boxes reuse sizes[0] with ratios[1:].
                    half_w = (
                        float32(sizes[0] * feat_h)
                        / feat_w
                        * sqrt(ratios[k - num_sizes + 1] * 1.0)
                        / 2.0
                    )
                    half_h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0
                # Flattened (row, col, k) position along the box axis.
                slot = row * feat_w * boxes_per_cell + col * boxes_per_cell + k
                output[0, slot, 0] = cx - half_w
                output[0, slot, 1] = cy - half_h
                output[0, slot, 2] = cx + half_w
                output[0, slot, 3] = cy + half_h

    return output
def gen_ir(
    data_ptr,
    n_fft,
    hop_length,
    win_length,
    window_ptr,
    normalized,
    onesided,
    output_ptr,
):
    """Build thread-bound TIR computing a windowed Fourier sum per output cell.

    One thread handles one ``(batch, row, col)`` cell of the output. For that
    cell it accumulates, over ``wlen`` in ``[0, win_length)``,
    ``window[wlen] * data[batch, col * hop_length + wlen]`` multiplied by
    ``cos`` (added into component 0) and ``sin`` (subtracted from
    component 1) of ``2 * pi * row * wlen / win_length``.

    Parameters
    ----------
    data_ptr : buffer
        Input buffer, indexed as ``[batch, sample]``.
    n_fft : int
        Used only for the normalization factor ``sqrt(n_fft)``.
    hop_length : int
        Sample distance between the starts of consecutive columns.
    win_length : int
        Number of window samples accumulated per cell.
    window_ptr : buffer
        Window buffer, indexed as ``[wlen]``.
    normalized : condition
        When it holds, both components are divided by ``sqrt(n_fft)``.
    onesided : unused
        Accepted for signature compatibility; not read in this function.
    output_ptr : buffer
        Output buffer, indexed as ``[batch, row, col, component]``; its first
        three extents define the iteration space.

    Returns
    -------
    stmt
        The statement produced by the IR builder.
    """
    ib = tir.ir_builder.create()
    data = ib.buffer_ptr(data_ptr)
    window = ib.buffer_ptr(window_ptr)
    output = ib.buffer_ptr(output_ptr)

    n_batch = output_ptr.shape[0]
    n_row = output_ptr.shape[1]
    n_col = output_ptr.shape[2]

    max_threads = _get_max_threads(n_batch * n_row)
    output_size = n_batch * n_row * n_col
    matrix_size = n_row * n_col

    with ib.new_scope():
        thread_x = te.thread_axis("threadIdx.x")
        block_x = te.thread_axis("blockIdx.x")
        ib.scope_attr(thread_x, "thread_extent", max_threads)
        ib.scope_attr(block_x, "thread_extent", ceil_div(output_size, max_threads))

        flat_idx = block_x * max_threads + thread_x
        # The launch is rounded up to whole blocks; mask the excess threads.
        with ib.if_scope(flat_idx < output_size):
            # Unflatten the thread id into (batch, row, col).
            within_matrix = tir.indexmod(flat_idx, matrix_size)
            batch = tir.floordiv(flat_idx, matrix_size)
            row = tir.floordiv(within_matrix, n_col)
            col = tir.indexmod(within_matrix, n_col)

            zero = tir.Cast(data_ptr.dtype, 0)
            output[batch, row, col, 0] = zero
            output[batch, row, col, 1] = zero

            with ib.for_range(0, win_length) as wlen:
                sample = window[wlen] * data[batch, col * hop_length + wlen]
                phase = 2 * pi * row * wlen / win_length
                output[batch, row, col, 0] += sample * tir.cos(phase)
                output[batch, row, col, 1] -= sample * tir.sin(phase)

            with ib.if_scope(normalized):
                scale = tir.sqrt(tir.const(n_fft, "float32"))
                output[batch, row, col, 0] /= scale
                output[batch, row, col, 1] /= scale

    return ib.get()
def gen_ir(
    data_ptr,
    n_fft,
    hop_length,
    win_length,
    window_ptr,
    normalized,
    onesided,
    output_ptr,
    loop_kind,
):
    """Build the loop-nest TIR that fills ``output_ptr`` with STFT values.

    The outer loop runs in parallel over the fused ``(batch, row)`` index and
    the inner loop walks the columns. For every ``(batch, row, col)`` cell the
    code accumulates, over ``wlen`` in ``[0, win_length)``,
    ``window[wlen] * data[batch, col * hop_length + wlen]`` multiplied by
    ``cos`` (added into component 0) and ``sin`` (subtracted from
    component 1) of ``2 * pi * row * wlen / win_length``.

    Parameters
    ----------
    data_ptr : buffer
        Input buffer, indexed as ``[batch, sample]``.
    n_fft : int
        Used only for the normalization factor ``sqrt(n_fft)``.
    hop_length : int
        Sample distance between the starts of consecutive columns.
    win_length : int
        Number of window samples accumulated per cell.
    window_ptr : buffer
        Window buffer, indexed as ``[wlen]``.
    normalized : condition
        When it holds, both components are divided by ``sqrt(n_fft)``.
    onesided : unused
        Accepted for signature compatibility; not read in this function.
    output_ptr : buffer
        Output buffer, indexed as ``[batch, row, col, component]``; its first
        three extents define the iteration space.
    loop_kind : str
        Loop kind forwarded to the column loop's ``for_range``.

    Returns
    -------
    stmt
        The statement produced by the IR builder.
    """
    ib = tir.ir_builder.create()
    data = ib.buffer_ptr(data_ptr)
    window = ib.buffer_ptr(window_ptr)
    output = ib.buffer_ptr(output_ptr)
    # https://librosa.org/doc/0.7.2/_modules/librosa/core/spectrum.html#stft
    with ib.for_range(
        0, output_ptr.shape[0] * output_ptr.shape[1], kind="parallel"
    ) as batch_row:
        with ib.for_range(0, output_ptr.shape[2], kind=loop_kind) as col:
            # batch and row are pure index expressions derived from the fused
            # loop variable, so they are used directly and need no storage.
            batch = tir.floordiv(batch_row, output_ptr.shape[1])
            row = tir.floormod(batch_row, output_ptr.shape[1])
            output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0)
            output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0)
            with ib.for_range(0, win_length) as wlen:
                output[batch, row, col, 0] += (
                    window[wlen]
                    * data[batch, col * hop_length + wlen]
                    * tir.cos(2 * pi * row * wlen / win_length)
                )
                output[batch, row, col, 1] -= (
                    window[wlen]
                    * data[batch, col * hop_length + wlen]
                    * tir.sin(2 * pi * row * wlen / win_length)
                )
            # Normalization is applied once, after the accumulation finishes.
            with ib.if_scope(normalized):
                output[batch, row, col, 0] /= tir.sqrt(tir.const(n_fft, "float32"))
                output[batch, row, col, 1] /= tir.sqrt(tir.const(n_fft, "float32"))
    return ib.get()
def transformed_square_sum_square_root(a: ty.handle, d: ty.handle) -> None:
    # Computes D[b] = sqrt(sum over i, j of A[b, i, j] ** 2) in two blocks:
    # block "C" accumulates the squares into C, block "D" takes the root.
    # NOTE(review): the loop names suggest this is the form after fusing the
    # two reduction loops and splitting with an inner factor of 1 -- confirm
    # against the test that uses it.
    A = tir.match_buffer(a, [16, 256, 256])
    D = tir.match_buffer(d, [16])
    # Intermediate buffer holding the per-batch sum of squares.
    C = tir.alloc_buffer([16])
    # i1_i2_fused_outer covers the flattened 256 * 256 reduction domain;
    # i1_i2_fused_inner has extent 1 and is not bound to any block axis.
    for i0, i1_i2_fused_outer, i1_i2_fused_inner in tir.grid(16, 65536, 1):
        with tir.block(
            [16, tir.reduce_axis(0, 256), tir.reduce_axis(0, 256)], "C") as [b, i, j]:
            tir.bind(b, i0)
            # Recover the two reduction indices from the fused loop variable.
            tir.bind(i, tir.floordiv(i1_i2_fused_outer, 256))
            tir.bind(j, tir.floormod(i1_i2_fused_outer, 256))
            tir.reads([C[b], A[b, i, j]])
            tir.writes([C[b]])
            with tir.init():
                C[b] = 0.0
            C[b] = C[b] + (A[b, i, j] * A[b, i, j])
    for i0_1 in tir.serial(0, 16):
        with tir.block([16], "D") as [b_1]:
            tir.bind(b_1, i0_1)
            tir.reads([C[b_1]])
            tir.writes([D[b_1]])
            D[b_1] = tir.sqrt(C[b_1], dtype="float32")
def square_sum_square_root_rfactor(a: ty.handle, d: ty.handle) -> None:
    # Computes D[b] = sqrt(sum over i, j of A[b, i, j] ** 2) in three blocks:
    # "C_rf" accumulates partial sums into C_rf, "C" reduces C_rf into C, and
    # "D" takes the square root.
    # NOTE(review): by its name this is presumably the expected result of
    # applying rfactor on the extent-1 inner loop -- confirm against the test
    # that uses it.
    A = tir.match_buffer(a, [16, 256, 256])
    D = tir.match_buffer(d, [16])
    C = tir.alloc_buffer([16])
    # Partial-sum buffer; its leading axis (extent 1) is indexed by the
    # block axis bound to i1_i2_fused_inner.
    C_rf = tir.alloc_buffer([1, 16])
    for i0, i1_i2_fused_outer, i1_i2_fused_inner in tir.grid(16, 65536, 1):
        with tir.block(
            [1, 16, tir.reduce_axis(0, 256), tir.reduce_axis(0, 256)], "C_rf") as [
                vi1_i2_fused_inner,
                b,
                i,
                j,
        ]:
            tir.bind(vi1_i2_fused_inner, i1_i2_fused_inner)
            tir.bind(b, i0)
            # Recover the two reduction indices from the fused loop variable.
            tir.bind(i, tir.floordiv(i1_i2_fused_outer, 256))
            tir.bind(j, tir.floormod(i1_i2_fused_outer, 256))
            with tir.init():
                C_rf[vi1_i2_fused_inner, b] = 0.0
            C_rf[vi1_i2_fused_inner, b] = C_rf[vi1_i2_fused_inner, b] + (A[b, i, j] * A[b, i, j])
    for i0_1, i1_i2_fused_inner_1 in tir.grid(16, 1):
        # Here the extent-1 axis is the reduction axis: the partial sums in
        # C_rf are folded into C.
        with tir.block([tir.reduce_axis(0, 1), 16], "C") as [vi1_i2_fused_inner_1, b_1]:
            tir.bind(vi1_i2_fused_inner_1, i1_i2_fused_inner_1)
            tir.bind(b_1, i0_1)
            with tir.init():
                C[b_1] = 0.0
            C[b_1] = C[b_1] + C_rf[vi1_i2_fused_inner_1, b_1]
    for i0_2 in tir.serial(0, 16):
        with tir.block([16], "D") as [b_2]:
            tir.bind(b_2, i0_2)
            D[b_2] = tir.sqrt(C[b_2], dtype="float32")