def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
    res = tvm.if_then_else((x * 4 + y) >= 466036,
                           tvm.if_then_else(24512 <= ((((x*4) + y) - 466036) % 24528),
                                            (((((x*4) + y)  - 466036) % 24528) -24512) % 16,
                                            x), y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x*4) + y) - 4) % 24528)),
                         (((x*4) + y)  - 4) % 16,
                         x), y)
    ck.verify(res, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)
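
The checks above rely on a small CanonicalChecker harness defined elsewhere in the test file; a minimal sketch of such a harness, assuming the pre-0.7 tvm.arith.Analyzer API (illustrative, not the exact upstream definition), could look like this:

class CanonicalChecker:
    def __init__(self):
        # Analyzer provides the canonical_simplify used by these tests.
        self.analyzer = tvm.arith.Analyzer()

    def verify(self, data, expected):
        # Simplify the input and compare it structurally with the expectation.
        res = self.analyzer.canonical_simplify(data)
        assert tvm.ir_pass.Equal(res, expected), \
            "\ndata={}\nres={}\nexpected={}".format(data, res, expected)
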
Example #2
File: nms.py Project: bddppq/tvm
def get_valid_counts_scan(data, partial_in, partial):
    """Low level IR to do scan.

    Parameters
    ----------
    data: Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    partial_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    partial_in = ib.buffer_ptr(partial_in)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    var = tvm.make.node("FloatImm", dtype="float32", value=2)
    new_range = num_anchors // elem_per_thread + 1
    iteration = log(cast(new_range, "float32")) // math.log(2)
    # Scan: Kogge-Stone adder
    with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))):
        with ib.for_range(0, iteration) as k:
            with ib.if_scope(k == 0):
                with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] = \
                    partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1]
                with ib.else_scope():
                    partial[bx * new_range] = partial_in[bx * new_range]
            with ib.else_scope():
                with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \
                                         tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] += \
                    partial[bx * new_range + tx - cast(power(var, k), "int32")]
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
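
For reference, the Kogge-Stone loop above computes an inclusive prefix sum of the per-thread partial counts within each batch row. A hedged NumPy sketch of the same doubling recurrence (names chosen here for illustration only):

import numpy as np

def kogge_stone_scan(partial_in):
    # Inclusive prefix sum per row via log2(n) doubling steps.
    partial = partial_in.copy()
    n = partial.shape[1]
    step = 1
    while step < n:
        # Every element at index >= step adds the value `step` positions back.
        partial[:, step:] += partial[:, :-step].copy()
        step *= 2
    return partial

# Sanity check against numpy's cumsum:
# x = np.random.randint(0, 5, (2, 8))
# assert (kogge_stone_scan(x) == np.cumsum(x, axis=1)).all()
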
def test_basic():
    a = tvm.var("a")
    b = tvm.var("b")
    c = tvm.var("c")
    m = tvm.arith.DetectClipBound(tvm.all(a * 1 < b * 6,
                                          a - 1 > 0), [a])
    assert tvm.ir_pass.Simplify(m[1] - (b * 6 - 1)).value == 0
    assert m[0].value == 2
    m = tvm.arith.DetectClipBound(tvm.all(a * 1 < b * 6,
                                          a - 1 > 0), [a, b])
    assert len(m) == 0
    m = tvm.arith.DetectClipBound(tvm.all(a + 10 * c <= 20,
                                          b - 1 > 0), [a, b])
    assert tvm.ir_pass.Simplify(m[1] - (20 - 10 * c)).value == 0
    assert tvm.ir_pass.Simplify(m[2] - 2).value == 0
def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m, ), name="A")
    Apad = tvm.compute((m + 2,), lambda i:
                       tvm.select(tvm.all(i >= 1, i <= m),
                                  A[i - 1], 0.0), "Apad")
    B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    def cb(src, dst, pad_before, pad_after, pad_value):
        assert(dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)

        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)
    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
Example #5
def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
    """Non-maximum supression.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.

    nms_threshold : float
        Non-maximum suppression threshold.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0)
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0)
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0) - i
        return i / u

    batch, num_bbox = get_const_tuple(out_buf.shape)
    max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(sorted_bbox_buf)
    p_out = ib.buffer_ptr(out_buf)
    nthread_tx = max_threads
    nthread_bx = num_bbox // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    i = bx * max_threads + tx
    with ib.for_range(0, batch, for_type="unroll", name="n") as b:
        base_idx = b * num_bbox
        with ib.if_scope(i < num_bbox):
            p_out[base_idx + i] = False
        with ib.for_range(0, num_bbox - 1) as l:
            with ib.if_scope(tvm.all(i < num_bbox, i > l, p_out[base_idx + l] == False)):
                iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5)
                with ib.if_scope(iou > nms_threshold):
                    p_out[base_idx + i] = True
        ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                              tvm.convert(['shared']),
                              tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
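
calculate_overlap above is the usual IoU with an inclusive +1.0 pixel convention; a hedged plain-Python reference for two boxes in [w_start, h_start, w_end, h_end] order (illustrative helper, not part of the original file):

def iou(box_a, box_b):
    # Intersection over union for inclusive integer coordinates.
    w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]) + 1.0)
    h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]) + 1.0)
    inter = w * h
    area_a = (box_a[2] - box_a[0] + 1.0) * (box_a[3] - box_a[1] + 1.0)
    area_b = (box_b[2] - box_b[0] + 1.0) * (box_b[3] - box_b[1] + 1.0)
    return inter / (area_a + area_b - inter)

# iou([0, 0, 9, 9], [5, 5, 14, 14]) == 25.0 / (100.0 + 100.0 - 25.0)
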
Example #6
 def select_array(i, j):
     now = tvm.const(0.0, dtype)
     for ii in range(row):
         for jj in range(col):
             now = tvm.expr.Select(tvm.all(i % row == ii, j % col == jj),
                                   tvm.const(matrix[ii][jj], dtype),
                                   now)
     return now
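
select_array builds a nested Select chain that reproduces the constant Python matrix as a TVM expression. A hedged usage sketch, assuming matrix, row, col and dtype are in scope as in the enclosing helper (the name const_matrix is illustrative):

const_matrix = tvm.compute((row, col), select_array, name="const_matrix")
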
Example #7
File: nms.py Project: bddppq/tvm
def get_valid_counts_upsweep(data, idx_in, idx, partial):
    """Low level IR of first step of scan: unsweep.

    Parameters
    ----------
    data: Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    idx_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    idx : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    data = ib.buffer_ptr(data)
    idx_in = ib.buffer_ptr(idx_in)
    idx = ib.buffer_ptr(idx)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    new_range = num_anchors // elem_per_thread + 1
    # Scan: Upsweep:
    with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)):
        with ib.for_range(0, elem_per_thread) as i:
            with ib.if_scope(bx * num_anchors + \
                             tx * elem_per_thread + i < batch_size * num_anchors):
                with ib.if_scope(i == 0):
                    partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * elem_per_thread]
                    idx[bx * num_anchors + tx * elem_per_thread] = \
                    idx_in[bx * num_anchors + tx * elem_per_thread]
                with ib.else_scope():
                    partial[bx * new_range + tx] += \
                    idx_in[bx * num_anchors + tx * elem_per_thread + i]
                    idx[bx * num_anchors + tx * elem_per_thread + i] = \
                    idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \
                    idx_in[bx * num_anchors + tx * elem_per_thread + i]
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
Example #8
def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
    """Copy output after applying nms to continuous memory.

    Parameters
    ----------
    sorted_bbox_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
        [w_start, h_start, w_end, h_end, score].

    remove_mask_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.

    out_buf : tvm.schedule.Buffer
        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
        [batch_index, w_start, h_start, w_end, h_end].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
    nthread_tx = batch
    tx = tvm.thread_axis("threadIdx.x")
    ib = tvm.ir_builder.create()
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    i = ib.allocate('int32', (1,), 'i', scope='local')
    i[0] = 0
    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
    p_remove = ib.buffer_ptr(remove_mask_buf)
    p_out = ib.buffer_ptr(out_buf)
    b = tx

    nkeep = ib.allocate('int32', (1,), 'nkeep', scope='local')
    nkeep[0] = 0 # number of bbox after nms

    with ib.for_range(0, num_bbox) as j:
        with ib.if_scope(p_remove[b * num_bbox + j] == False):
            nkeep[0] += 1
    with ib.if_scope(nkeep[0] > 0):
        with ib.for_range(0, tvm.ceil(
            tvm.const(rpn_post_nms_top_n, 'float32') / nkeep[0]).astype('int32')):
            with ib.for_range(0, num_bbox) as j:
                offset_j = (b * num_bbox + j) * 5
                offset_i = (b * rpn_post_nms_top_n + i[0]) * 5
                with ib.if_scope(tvm.all(i[0] < rpn_post_nms_top_n,
                                         p_remove[(b*num_bbox+j)] == False)):
                    p_out[offset_i] = tvm.expr.Cast('float32', b)
                    with ib.for_range(0, 4, for_type='unroll') as k:
                        p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
                    i[0] = i[0] + 1

    body = ib.get()
    return body
def test_schedule_bound_condition():
    A = tvm.placeholder((64,), name='A', dtype="float32")
    Apad = tvm.compute((66,), lambda i: tvm.select(tvm.all(i > 0, i < 65),
                                                   A[i - 1], tvm.const(0.)),
                       name='Apad')
    Apad2 = tvm.compute((66,), lambda i: Apad[i] * 2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse)
Example #10
def test_all():
    x = tvm.var('x')
    y = tvm.var('y')
    z = tvm.var('z')
    try:
        t = x and x
        assert False
    except ValueError:
        pass
    try:
        tvm.all()
        assert False
    except ValueError:
        pass
    assert str(tvm.all(x < y)) == '(%s < %s)' % (x.name, y.name)
    assert str(tvm.all(x < y, x > z)) == '((%s < %s) && (%s > %s))' % (
        x.name, y.name, x.name, z.name)
    assert str(tvm.all(x < y, y > z + 1, x < z * 2)) == \
        '(((%s < %s) && (%s > (%s + 1))) && (%s < (%s*2)))' % (
            x.name, y.name, y.name, z.name, x.name, z.name)
Example #11
 def _dilate(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if not equal_const_int(strides[i], 1):
             index_tuple.append(indices[i] // strides[i])
             not_zero.append((indices[i] % strides[i]).equal(0))
         else:
             index_tuple.append(indices[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
     return data(*index_tuple)
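
_dilate above is only the element expression of a dilation operator; a hedged sketch of the wrapper that is typically placed around it, assuming data, strides and n from the enclosing scope (output extent (d - 1) * stride + 1 per axis; names are illustrative):

output_shape = tuple((data.shape[i] - 1) * strides[i] + 1 for i in range(n))
dilated = tvm.compute(output_shape, _dilate, name="dilated", tag="dilate")
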
Example #12
def argsort_ir(data_buf, out_index_buf):
    """Batched odd-even transposition sort.

    Parameters
    ----------
    data_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]

    out_index_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Indices of data in sorted order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox = get_const_tuple(data_buf.shape)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data_buf)
    index_out = ib.buffer_ptr(out_index_buf)
    nthread_tx = max_threads
    nthread_bx = (num_bbox + 1) // 2 // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("vthread")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "virtual_thread", nthread_bx)
    tid = bx * nthread_tx + tx
    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
    temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")

    with ib.for_range(0, batch, for_type="unroll") as b:
        start = b * num_bbox
        for i in range(2):
            bbox_id = tid * 2 + i
            with ib.if_scope(bbox_id < num_bbox):
                index_out[start + bbox_id] = bbox_id
        with ib.for_range(0, num_bbox) as k:
            offset = start + 2 * tid + (k % 2)
            with ib.if_scope(
                tvm.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])):
                temp_data[0] = p_data[offset]
                p_data[offset] = p_data[offset + 1]
                p_data[offset + 1] = temp_data[0]
                temp_index[0] = index_out[offset]
                index_out[offset] = index_out[offset + 1]
                index_out[offset + 1] = temp_index[0]
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
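
The IR above is a batched odd-even transposition sort in descending score order; a hedged plain-Python sketch of the same comparison pattern for a single row (illustrative helper, independent of TVM):

def odd_even_sort_desc(data, index):
    # Sort data descending in place, permuting index the same way.
    n = len(data)
    for k in range(n):
        # Even passes compare pairs (0,1), (2,3), ...; odd passes (1,2), (3,4), ...
        for offset in range(k % 2, n - 1, 2):
            if data[offset] < data[offset + 1]:
                data[offset], data[offset + 1] = data[offset + 1], data[offset]
                index[offset], index[offset + 1] = index[offset + 1], index[offset]
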
Example #13
def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
    """declare the Im2Col method for conv2d"""
    _, CI, IH, IW = [x.value for x in data.shape]
    CO, _, KH, KW = [x.value for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    N = 1
    OH = (IH + 2*HPAD - KH) // HSTR + 1
    OW = (IW + 2*WPAD - KW) // WSTR + 1

    DO_PAD = (HPAD != 0 and WPAD != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    ALIGN = 16
    def upround(x, align):
        return (x + align - 1) // align * align

    # A [CO, CI * KH * KW]
    reduce_len = upround(CI * KH * KW, ALIGN)
    A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j:
                    kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A')

    # B [CI * KH * KW, N * OH * OW]
    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\
            tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
                       data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH]
                       [j % OW*WSTR + i % KW],
                       tvm.const(0, data_pad.dtype)), name='B')

    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]

    # C [CO, N * OH * OW]
    k = tvm.reduce_axis((0, gemm_l), name='k')
    C = tvm.compute((gemm_n, gemm_m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')

    # output
    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
    # otherwise the alignment above will be eliminated by bound inference
    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\
                 C[co][n * OH * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
                         name='output', tag='im2col_conv_output')

    return output
 def _dilate(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if not equal_const_int(strides[i], 1):
             index_tuple.append(indices[i] // strides[i])
             not_zero.append((indices[i] % strides[i]).equal(0))
         else:
             index_tuple.append(indices[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple),
                                 tvm.const(0.0, data.dtype))
     return data(*index_tuple)
Example #15
    def compute_temp(k, p, eps, nu):
        temp_expr = {}
        for j in range(4):
            t0 = M[0][j][k][p] + M[1][j][k][p]
            t1 = M[1][j][k][p] - M[2][j][k][p]
            temp_expr[(0, j)] = t0 + M[2][j][k][p]
            temp_expr[(1, j)] = t1 - M[3][j][k][p]

        now = tvm.const(0.0, "float32")
        for ii in range(2):
            for jj in range(4):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)], now)
        return now
Example #16
File: pad.py Project: bddppq/tvm
 def _pad(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
             index_tuple.append(indices[i])
         else:
             index_tuple.append(indices[i] - pad_before[i])
             not_zero.append(indices[i] >= pad_before[i])
             not_zero.append(indices[i] < data.shape[i] + pad_before[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), pad_value)
     return data(*index_tuple)
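
_pad above is the element expression used by the padding operator; a hedged sketch of the surrounding wrapper, assuming data, pad_before, pad_after and n from the enclosing scope (output extent grows by pad_before + pad_after per axis; names are illustrative):

out_shape = tuple(data.shape[i] + pad_before[i] + pad_after[i] for i in range(n))
padded = tvm.compute(out_shape, _pad, name="pad", tag="pad")
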
Example #17
 def _pad(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
             index_tuple.append(indices[i])
         else:
             index_tuple.append(indices[i] - pad_before[i])
             not_zero.append(indices[i] >= pad_before[i])
             not_zero.append(indices[i] < data.shape[i] + pad_before[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), pad_value)
     return data(*index_tuple)
Example #18
def test_cmp_load_store():
    n = 32
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) > B(*i), name='C')
    D = tvm.compute(C.shape, lambda *i: tvm.all(C(*i), A(*i) > 1), name="D")

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        s = tvm.create_schedule(D.op)
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        xo1, xo2 = s[C].split(xo, factor=13)
        s[C].parallel(xo2)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B, D], "llvm")
        ctx = tvm.cpu(0)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
        f(a, b, d)
        np.testing.assert_equal(
            d.asnumpy(),
            np.logical_and(a.asnumpy() > b.asnumpy(),
                           a.asnumpy() > 1))

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        s = tvm.create_schedule(D.op)
        for stage in [C, D]:
            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
            s[stage].bind(xo, tvm.thread_axis("blockIdx.x"))
            s[stage].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B, D], device)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
        f(a, b, d)
        np.testing.assert_equal(
            d.asnumpy(),
            np.logical_and(a.asnumpy() > b.asnumpy(),
                           a.asnumpy() > 1))

    check_llvm()
    for device in ["vulkan", "opencl", "cuda", "rocm", "metal"]:
        check_device(device)
def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    tdiv = tvm.truncdiv
    tmod = tvm.truncmod
    # simplification that takes condition into account.
    res = tvm.if_then_else(
        (x * 4 + y) >= 466036,
        tvm.if_then_else(24512 <= tmod(((x * 4) + y) - 466036, 24528),
                         tmod(tmod(((x * 4) + y) - 466036, 24528) - 24512, 16),
                         x), y)

    res2 = tvm.if_then_else(
        (x * 4) >= 466036 - y,
        tvm.if_then_else(24512 <= tmod(((x * 4) + y) - 466036, 24528),
                         tmod(tmod(((x * 4) + y) - 466036, 24528) - 24512, 16),
                         x), y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, tmod(((x * 4) + y) - 4, 24528)),
                         tmod(((x * 4) + y) - 4, 16), x), y)
    ck.verify(res, expected)
    ck.verify(res2, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), tmod(x + y + 100, 3),
                          tmod(x + 100, 3))
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), tmod(x + y + 1, 3),
                               tmod(x + 100, 3))
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(tdiv(x, 3) > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(tdiv(x, 3) < 2, x, 0), 0)
    ck.verify(res, 0)
Example #20
def dropout2d_compute(input, data, out_dtype=None):
    if out_dtype is None:
        out_dtype = input.dtype

    batch, species = input.shape

    output_data = lambda on, os: tvm.max(
            tvm.expr.Select(
                tvm.all(data[on,os] > 0.5),
                input[on, os].astype(out_dtype),
                0.0),
            #(input[on, os].astype(out_dtype), relay.const(0.0)),
            axis=[])

    return tvm.compute((batch, species), output_data, tag="dropout2d")
Example #21
 def _dilate(*indices):
     not_zero = []
     index_tuple = []
     idxdiv = tvm.indexdiv
     idxmod = tvm.indexmod
     for i in range(n):
         if not util.equal_const_int(strides[i], 1):
             index_tuple.append(idxdiv(indices[i], strides[i]))
             not_zero.append(idxmod(indices[i], strides[i]).equal(0))
         else:
             index_tuple.append(indices[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
     return data(*index_tuple)
Example #22
def lrn_sqr_nchw(data, size, axis):
    out_dtype = data.dtype
    radius = size // 2
    batch, in_channel, in_height, in_width = data.shape

    ls = tvm.reduce_axis((0, size), name='ls')

    sqr_out = lambda on, oc, oh, ow: tvm.sum(
            tvm.expr.Select(
                tvm.all(oc >= radius, oc < (in_channel+radius)),
                data[on, oc-radius+ls, oh, ow].astype(out_dtype) * data[on, oc-radius+ls, oh, ow].astype(out_dtype),
                0.0) ,
            axis=[ls])

    return tvm.compute((batch, in_channel, in_height, in_width), sqr_out, tag="lrn_sqrt_op")
Example #23
def test_schedule_bound_condition():
    A = tvm.placeholder((64, ), name='A', dtype="float32")
    Apad = tvm.compute(
        (66, ),
        lambda i: tvm.select(tvm.all(i > 0, i < 65), A[i - 1], tvm.const(0.)),
        name='Apad')
    Apad2 = tvm.compute((66, ), lambda i: Apad[i] * 2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert (isinstance(stmt.body.body.first.body.body.then_case,
                       tvm.stmt.IfThenElse))
Example #24
def test_const_fold3():
    def check_throws(f):
        try:
            f()
        except tvm.TVMError:
            pass
        else:
            raise AssertionError("Should have raised an exception but didn't.")

    # Test that using ints with logic operations is forbidden
    x = tvm.var("x")
    for val in [0, 1]:
        for func in [tvm.all, tvm.any]:
            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
            check_throws(lambda: func(x, tvm.const(val, 'uint1')))

    # Test const folding when both arguments are const
    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b), (tvm.any, lambda a, b: a or b)]:
        for v1 in [0, 1]:
            for v2 in [0, 1]:
                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
                                         tvm.const(py_func(v1, v2), 'uint1'))

    x = tvm.var("x", 'uint1')
    true = tvm.const(1, 'uint1')
    false = tvm.const(0, 'uint1')

    assert tvm.all(x, true).same_as(x)
    assert tvm.all(true, x).same_as(x)
    assert tvm.any(x, false).same_as(x)
    assert tvm.any(false, x).same_as(x)

    assert tvm.all(x, false).same_as(false)
    assert tvm.all(false, x).same_as(false)
    assert tvm.any(x, true).same_as(true)
    assert tvm.any(true, x).same_as(true)
Example #25
def relu4d_compute(input, out_dtype=None):
    if out_dtype is None:
        out_dtype = input.dtype

    batch, in_channel, in_height, in_width = input.shape

    output_data = lambda on, oc, oh, ow: tvm.max(
        tvm.expr.Select(tvm.all(input[on, oc, oh, ow] > 0), input[
            on, oc, oh, ow].astype(out_dtype), 0.0),
        #(input[on, oc, oh, ow].astype(out_dtype), relay.const(0.0)),
        axis=[])

    return tvm.compute((batch, in_channel, in_height, in_width),
                       output_data,
                       tag="relu4D")
Example #26
def test_const_fold3():
    def check_throws(f):
        try:
            f()
        except tvm.TVMError:
            pass
        else:
            raise AssertionError("Should have raised an exception but didn't.")

    # Test that using ints with logic operations is forbidden
    x = tvm.var("x")
    for val in [0, 1]:
        for func in [tvm.all, tvm.any]:
            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
            check_throws(lambda: func(x, tvm.const(val, 'uint1')))

    # Test const folding when both arguments are const
    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b), (tvm.any, lambda a, b: a or b)]:
        for v1 in [0, 1]:
            for v2 in [0, 1]:
                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
                                         tvm.const(py_func(v1, v2), 'uint1'))

    x = tvm.var("x", 'uint1')
    true = tvm.const(1, 'uint1')
    false = tvm.const(0, 'uint1')

    assert tvm.all(x, true).same_as(x)
    assert tvm.all(true, x).same_as(x)
    assert tvm.any(x, false).same_as(x)
    assert tvm.any(false, x).same_as(x)

    assert tvm.all(x, false).same_as(false)
    assert tvm.all(false, x).same_as(false)
    assert tvm.any(x, true).same_as(true)
    assert tvm.any(true, x).same_as(true)
Example #27
def test_cmp_load_store():
    n = 32
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) > B(*i), name='C')
    D = tvm.compute(C.shape, lambda *i: tvm.all(C(*i), A(*i) > 1), name="D")


    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        s = tvm.create_schedule(D.op)
        xo, xi = s[C].split(C.op.axis[0], factor=4)
        xo1, xo2 = s[C].split(xo, factor=13)
        s[C].parallel(xo2)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B, D], "llvm")
        ctx = tvm.cpu(0)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
        f(a, b, d)
        np.testing.assert_equal(
            d.asnumpy(), np.logical_and(a.asnumpy()> b.asnumpy(), a.asnumpy() > 1))

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        s = tvm.create_schedule(D.op)
        for stage in [C, D]:
            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
            s[stage].bind(xo, tvm.thread_axis("blockIdx.x"))
            s[stage].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B, D], device)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
        f(a, b, d)
        np.testing.assert_equal(
            d.asnumpy(), np.logical_and(a.asnumpy()> b.asnumpy(), a.asnumpy() > 1))


    check_llvm()
    for device in ["vulkan", "opencl", "cuda", "rocm", "metal"]:
        check_device(device)
Example #28
 def dilate_kernel(
         *indices
 ):  # This function is the same as topi.nn.dilate, but inlined
     not_zero = []
     index_tuple = []
     for i in range(len(dilate_args)):
         if not topi.util.equal_const_int(dilate_args[i], 1):
             index_tuple.append(indices[i] // dilate_args[i])
             not_zero.append((indices[i] % dilate_args[i]).equal(0))
         else:
             index_tuple.append(indices[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.select(not_zero, kernel(*index_tuple),
                           tvm.const(0.0, data.dtype))
     return kernel(*index_tuple)
Example #29
def gaussian_blur2d(M, N, k, dtype="float32"):
    A = tvm.placeholder((M, N), dtype=dtype, name="A")
    pad = k // 2
    number = k * k
    Apad = tvm.compute((M + 2 * pad, N + 2 * pad),
                       lambda i, j: tvm.if_then_else(
                           tvm.all(i >= pad, i < M + pad, j >= pad, j < N + pad
                                   ), A[i - pad, j - pad], 0.0),
                       name="Apad")
    rx = tvm.reduce_axis((0, k), name="rx")
    ry = tvm.reduce_axis((0, k), name="ry")
    B = tvm.compute(
        (M, N),
        lambda i, j: tvm.sum(Apad[i + rx, j + ry] / number, axis=[rx, ry]),
        name="B")
    return B.op, [A, B]
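
A hedged driver sketch for gaussian_blur2d, assuming an LLVM-enabled build of this TVM version (shapes and names are illustrative):

import numpy as np

op, (A, B) = gaussian_blur2d(256, 256, 3)
s = tvm.create_schedule(op)
f = tvm.build(s, [A, B], "llvm")

a = tvm.nd.array(np.random.uniform(size=(256, 256)).astype("float32"))
b = tvm.nd.array(np.zeros((256, 256), dtype="float32"))
f(a, b)  # b now holds the filtered image
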
Example #30
def poolingb(Image, Index, POutput):
    """
    Reverse (backward pass) of 2x2 max pooling.

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Index : tvm.tensor.Tensor
        Specifies where each Output[i, j, k, l] comes from, following the
        convention of NumPy and PyTorch. You will need this tensor to compute
        the gradient.
        ------------------------------------------
        For example, if Image is of shape [1, 4, 4, 1] (batch 1 and channel 1), then the slice
        Image[0, :, :, 0] is

        [[0.7243, 0.3236, 0.0124, 0.4314],
        [0.4104, 0.3997, 0.4534, 0.1791],
        [0.0973, 0.2673, 0.6907, 0.9207],
        [0.9268, 0.6590, 0.0312, 0.2364]]

        and Index is of shape [1, 2, 2, 1] and the slice Index[0, :, :, 0] is 

        [[ 0,  6],
        [12, 11]]

        because 0 = 0 * 4 + 0 (0, 0)
                6 = 1 * 4 + 2 (1, 2)
                12= 3 * 4 + 0 (3, 0)
                11= 2 * 4 + 3 (2, 3)
        --------------------------------------------
        4-D with shape [batch_size, out_height, out_width, in_channels]
    POutput:tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, in_channels]

    Returns
    -------
    PImage: tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    """
    _, _, W, _ = Image.shape

    PImage = tvm.compute(
        Image.shape, lambda n, i, j, c: tvm.if_then_else(
            tvm.all(i == Index[n, i // 2, j // 2, c] // W, j == Index[
                n, i // 2, j // 2, c] % W), POutput[n, i // 2, j // 2, c], 0.0)
    )

    return PImage
Example #31
def max_pool2d_nchw(input, pool_size, stride, padding, out_dtype=None):
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(pool_size, int) or len(pool_size) == 2
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(padding, int) or len(padding) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(padding, int):
        pad_h = pad_w = padding
    else:
        pad_h, pad_w = padding

    if isinstance(pool_size, int):
        kernel_h = kernel_w = pool_size
    else:
        kernel_h, kernel_w = pool_size

    batch, in_channel, in_height, in_width = input.shape

    out_channel = in_channel
    # In Caffe, ceil is used when the pooling output size does not divide
    # evenly, while the convolution operator uses floor.
    #out_height = math.ceil((in_height+2*pad_h-kernel_h)/stride_h+1)
    #out_width = math.ceil((in_width+2*pad_w-kernel_w)/stride_w+1)
    out_height = simplify((in_height + 2 * pad_h - kernel_h) // stride_h + 1)
    out_width = simplify((in_width + 2 * pad_w - kernel_w) // stride_w + 1)

    kh = tvm.reduce_axis((0, kernel_h), name='kh')
    kw = tvm.reduce_axis((0, kernel_w), name='kw')

    output_data = lambda on, oc, oh, ow: tvm.max(tvm.expr.Select(
        tvm.all((oh * stride_h + kh) >= pad_h, (oh * stride_h + kh) <
                (in_height + pad_h), (ow * stride_w + kw >= pad_w),
                (ow * stride_w + kw < in_width + pad_w)),
        input[on, oc, oh * stride_h + kh - pad_h, ow * stride_w + kw - pad_w
              ].astype(out_dtype), 0.0),
                                                 axis=[kh, kw])

    return tvm.compute((batch, out_channel, out_height, out_width),
                       output_data,
                       tag="max_pool2d_nchw")
Example #32
    def compute_output(n, k, h, w):
        b = n * nH * nW + (h // m) * nW + w // m
        eps = h % m
        nu = w % m
        output_expr = {}
        for i in range(2):
            t0 = temp[k][b][i][0] + temp[k][b][i][1]
            t1 = temp[k][b][i][1] - temp[k][b][i][2]
            output_expr[(i, 0)] = t0 + temp[k][b][i][2]
            output_expr[(i, 1)] = t1 - temp[k][b][i][3]

        now = tvm.const(0.0, "float32")
        for ii in range(2):
            for jj in range(2):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 output_expr[(ii, jj)], now)
        return now
Example #33
def make_conv2d_unoptimized(shapeX,
                            shapeF,
                            tgt,
                            tgt_host,
                            func_name,
                            dtype="float32"):
    in_size, in_size, in_channel, batch = shapeX
    kernel, kernel, in_channel, out_channel = shapeF
    pad = 1
    stride = 1

    A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
    W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
    out_size = (in_size - kernel + 2 * pad) // stride + 1
    # Pad input
    Apad = tvm.compute(
        (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
        lambda yy, xx, cc, nn: tvm.select(
            tvm.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad <
                    in_size), A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
        name='Apad')

    # Create reduction variables
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel), name='ry')
    rx = tvm.reduce_axis((0, kernel), name='rx')
    # Compute the convolution
    B = tvm.compute(
        (out_size, out_size, out_channel, batch),
        lambda yy, xx, ff, nn: tvm.sum(Apad[yy * stride + ry, xx * stride + rx,
                                            rc, nn] * W[ry, rx, rc, ff],
                                       axis=[ry, rx, rc]),
        name='B')

    s = tvm.create_schedule(B.op)

    s[Apad].bind(Apad.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[Apad].bind(Apad.op.axis[1], tvm.thread_axis("threadIdx.x"))

    s[B].bind(B.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[B].bind(B.op.axis[1], tvm.thread_axis("threadIdx.x"))

    f = tvm.build(s, [A, W, B], tgt, target_host=tgt_host, name=func_name)

    return _export_module(f, func_name, remote)
Example #34
def conv2db(Image, Filter, POutput):
    """
    Backward pass of a convolution with NHWC layout.

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Filter: tvm.tensor.Tensor
        4-D with shape [out_channels, in_channels, kernel_height, kernel_width]
    POutput:tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, out_channels]

    Returns
    -------
    PImage :tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    PFilter:tvm.tensor.Tensor, gradient of Filter
        4-D with shape (Filter.shape)
    """
    N, H, W, C = Image.shape
    K, _, Hk, Wk = Filter.shape

    rx = tvm.reduce_axis((0, H - (Hk - 1)), name='rx')
    ry = tvm.reduce_axis((0, W - (Wk - 1)), name='ry')
    rn = tvm.reduce_axis((0, N), name='rn')

    PFilter = tvm.compute(
        Filter.shape, lambda o, c, h, w: tvm.sum(Image[rn, h + rx, w + ry, c] *
                                                 POutput[rn, rx, ry, o],
                                                 axis=[rn, rx, ry]))

    rx_k = tvm.reduce_axis((0, Hk), name='rx_k')
    ry_k = tvm.reduce_axis((0, Wk), name='ry_k')
    ro = tvm.reduce_axis((0, K), name='ro')

    PImage = tvm.compute(
        Image.shape, lambda n, h, w, c: tvm.
        sum(Filter[ro, c, Hk - rx_k - 1, Wk - ry_k - 1] * tvm.if_then_else(
            tvm.all(h + rx_k >= Hk - 1, h + rx_k < H, w + ry_k >= Wk - 1, w +
                    ry_k < W), POutput[n, h + rx_k - (Hk - 1), w + ry_k -
                                       (Wk - 1), ro], 0.0),
            axis=[rx_k, ry_k, ro]))

    return (PImage, PFilter)
Example #35
def argsort_ir(data_buf, out_index_buf):
    """Batched odd-even transposition sort.

    Parameters
    ----------
    data_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]

    out_index_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Indices of data in sorted order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox = get_const_tuple(data_buf.shape)
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data_buf)
    index_out = ib.buffer_ptr(out_index_buf)
    temp_data = ib.allocate("float32", (1, ), name="temp_data", scope="local")
    temp_index = ib.allocate("int32", (1, ), name="temp_index", scope="local")
    idxm = tvm.indexmod
    with ib.for_range(0, batch, for_type="unroll") as b:
        start = b * num_bbox
        for i in range(2):
            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
                bbox_id = tid * 2 + i
                with ib.if_scope(bbox_id < num_bbox):
                    index_out[start + bbox_id] = bbox_id
        with ib.for_range(0, num_bbox) as k:
            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
                offset = start + 2 * tid + idxm(k, 2)
                with ib.if_scope(
                        tvm.all(offset + 1 < num_bbox,
                                p_data[offset] < p_data[offset + 1])):
                    temp_data[0] = p_data[offset]
                    p_data[offset] = p_data[offset + 1]
                    p_data[offset + 1] = temp_data[0]
                    temp_index[0] = index_out[offset]
                    index_out[offset] = index_out[offset + 1]
                    index_out[offset + 1] = temp_index[0]
    return ib.get()
Example #36
    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype('int32')
        roi_start_w = roi[1] * spatial_scale
        roi_start_h = roi[2] * spatial_scale
        roi_end_w = roi[3] * spatial_scale
        roi_end_h = roi[4] * spatial_scale

        roi_h = roi_end_h - roi_start_h
        roi_w = roi_end_w - roi_start_w
        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        hstart = ph * bin_h
        wstart = pw * bin_w
        hend = (ph + 1) * bin_h
        wend = (pw + 1) * bin_w
        hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height - 1)
        wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width - 1)
        hend = tvm.min(tvm.max(hend + roi_start_h, 0), height - 1)
        wend = tvm.min(tvm.max(wend + roi_start_w, 0), width - 1)
        non_empty = tvm.all(hstart < hend, wstart < wend)

        def min_value(dtype):
            return tvm.expr.Select(non_empty, tvm.min_value(dtype),
                                   tvm.const(0.0, dtype))

        stride_h = (hend - hstart) / 3.0
        stride_w = (wend - wstart) / 3.0
        hstart += stride_h
        wstart += stride_w
        stride_h = tvm.max(0.01, stride_h)
        stride_w = tvm.max(0.01, stride_w)
        _max = tvm.comm_reducer(lambda x, y: tvm.make._OpMax(x, y),
                                min_value,
                                name='max')
        rh = tvm.reduce_axis((0, tvm.expr.Select(non_empty, 2, 0)), 'rh')
        rw = tvm.reduce_axis((0, tvm.expr.Select(non_empty, 2, 0)), 'rw')
        return _max(_bilinear(batch_index, c, hstart + rh * stride_h,
                              wstart + rw * stride_w),
                    axis=[rh, rw])
Example #37
def conv2d_batch(B, N, M, K, L, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((B, N, M), dtype=dtype, name="A")
    W = tvm.placeholder((K, L), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding),
        lambda b, i, j: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M +
                    padding), A[b, i - padding, j - padding], 0.0),
        name="Apad")
    rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L),
                                                                 name="ry")
    Output = tvm.compute((B, N_out, M_out),
                         lambda b, i, j: tvm.sum(Apad[b, i * stride + rx, j *
                                                      stride + ry] * W[rx, ry],
                                                 axis=[rx, ry]),
                         name="Output")
    return Output.op, [A, W, Output]
Example #38
def lrn_nchw(data, size, axis, alpha, beta, bias):
    #default : size = 5, axis=1, alpha=0.0001, beta=0.75, bias=1
    #sqrt_out = lrn_sqrt()
    #pow_out = lrn_pow()
    #div_out = lrn_div()
    #return div_out
    out_dtype = data.dtype
    radius = size // 2
    batch, in_channel, in_height, in_width = data.shape
    ls = tvm.reduce_axis((0, size), name='ls')

    # pad and sqrt op:
    output_data1 = lambda on, oc, oh, ow: tvm.sum(tvm.expr.Select(
        tvm.all(oc >= radius, oc < (in_channel + radius)), data[
            on, oc - radius + ls, oh, ow].astype(out_dtype) * data[
                on, oc - radius + ls, oh, ow].astype(out_dtype), 0.0),
                                                  axis=[ls])

    sqr_out = tvm.compute((batch, in_channel, in_height, in_width),
                          output_data1,
                          tag="lrn_sqrt_op")

    #return sqr_out

    # pow op:
    output_data2 = lambda on, oc, oh, ow: tvm.power(
        (1 + (alpha / size * sqr_out[on, oc, oh, ow].astype(out_dtype))), beta)
    pow_out = tvm.compute((batch, in_channel, in_height, in_width),
                          output_data2,
                          tag="lrn_pow_op")

    #return pow_op

    # div op:
    output_data3 = lambda on, oc, oh, ow: tvm.expr.Div(
        data[on, oc, oh, ow].astype(out_dtype), pow_out[on, oc, oh, ow].astype(
            out_dtype))
    div_out = tvm.compute((batch, in_channel, in_height, in_width),
                          output_data3,
                          tag="lrn_div_op")

    return div_out
Example #39
    def compute_X_dot_A(k, b, eps, nu, kk, bb):
        temp_expr = {}

        for i in range(m):
            m1_add_m2 = A_T_dot_M[k][b][i][1][kk][bb] + A_T_dot_M[k][b][i][2][
                kk][bb]
            m1_sub_m2 = A_T_dot_M[k][b][i][1][kk][bb] - A_T_dot_M[k][b][i][2][
                kk][bb]
            m3_add_m4 = A_T_dot_M[k][b][i][3][kk][bb] + A_T_dot_M[k][b][i][4][
                kk][bb]
            m3_sub_m4 = A_T_dot_M[k][b][i][3][kk][bb] - A_T_dot_M[k][b][i][4][
                kk][bb]
            m5_add_m6 = A_T_dot_M[k][b][i][5][kk][bb] + A_T_dot_M[k][b][i][6][
                kk][bb]
            m5_sub_m6 = A_T_dot_M[k][b][i][5][kk][bb] - A_T_dot_M[k][b][i][6][
                kk][bb]
            s0 = A_T_dot_M[k][b][i][0][kk][bb] + m1_add_m2
            s5 = A_T_dot_M[k][b][i][7][kk][bb] + m1_sub_m2
            s1 = m1_sub_m2 + m5_sub_m6 * 16
            s4 = m1_add_m2 + m3_add_m4 * 16
            s2 = m1_add_m2 + 8 * m5_add_m6
            s3 = m1_sub_m2 + 8 * m3_sub_m4
            s0 = s0 + m5_add_m6 * 32
            s5 = s5 + m3_sub_m4 * 32
            s1 = s1 + m3_sub_m4 * 2
            s4 = s4 + m5_add_m6 * 2
            s0 = s0 + m3_add_m4
            s5 = s5 + m5_sub_m6
            s2 = s2 + m3_add_m4 * 4
            s3 = s3 + m5_sub_m6 * 4
            temp_expr[(i, 0)] = s0
            temp_expr[(i, 1)] = s1
            temp_expr[(i, 2)] = s2
            temp_expr[(i, 3)] = s3
            temp_expr[(i, 4)] = s4
            temp_expr[(i, 5)] = s5
        now = tvm.const(0.0, "float32")
        for ii in range(m):
            for jj in range(m):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)], now)
        return now
Example #40
    def compute_X_dot_B(b, eps, nu, c, bb):
        temp_expr = {}

        for i in range(alpha):
            wd0 = B_T_dot_X[b][c][i][0][bb] - B_T_dot_X[b][c][i][6][bb]
            d4_sub_d2 = B_T_dot_X[b][c][i][4][bb] - B_T_dot_X[b][c][i][2][bb]
            wd7 = B_T_dot_X[b][c][i][7][bb] - B_T_dot_X[b][c][i][1][bb]
            d3_sub_d5 = B_T_dot_X[b][c][i][3][bb] - B_T_dot_X[b][c][i][5][bb]
            wd1 = B_T_dot_X[b][c][i][2][bb] + B_T_dot_X[b][c][i][6][bb]
            wd2 = B_T_dot_X[b][c][i][1][bb] + B_T_dot_X[b][c][i][5][bb]
            wd4 = B_T_dot_X[b][c][i][5][bb] + B_T_dot_X[b][c][i][1][bb] * 0.25
            wd5 = B_T_dot_X[b][c][i][6][bb] - B_T_dot_X[b][c][i][4][bb] * 5
            wd3 = B_T_dot_X[b][c][i][6][bb] + B_T_dot_X[b][c][i][2][bb] * 0.25
            wd6 = B_T_dot_X[b][c][i][1][bb] + B_T_dot_X[b][c][i][5][bb] * 0.25

            wd0 = wd0 + d4_sub_d2 * 5.25
            wd7 = wd7 + d3_sub_d5 * 5.25

            wd1 = wd1 - B_T_dot_X[b][c][i][4][bb] * 4.25
            wd2 = wd2 - B_T_dot_X[b][c][i][3][bb] * 4.25

            wd3 = wd3 - B_T_dot_X[b][c][i][4][bb] * 1.25
            wd5 = wd5 + B_T_dot_X[b][c][i][2][bb] * 4
            wd4 = wd4 - B_T_dot_X[b][c][i][3][bb] * 1.25
            wd6 = wd6 - B_T_dot_X[b][c][i][3][bb] * 1.25

            temp_expr[(i, 0)] = wd0
            temp_expr[(i, 1)] = wd1 + wd2
            temp_expr[(i, 2)] = wd1 - wd2
            temp_expr[(i, 3)] = wd3 + wd4 * 2
            temp_expr[(i, 4)] = wd3 - wd4 * 2
            temp_expr[(i, 5)] = wd5 + wd6 * 2
            temp_expr[(i, 6)] = wd5 - wd6 * 2
            temp_expr[(i, 7)] = wd7

        now = tvm.const(0.0, "float32")
        for ii in range(alpha):
            for jj in range(alpha):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)], now)
        return now
def test_lower_floormod():
    data = get_ref_data()
    for dtype in ["int32", "int64", "int16"]:
        x = tvm.var("x", dtype=dtype)
        y = tvm.var("y", dtype=dtype)
        zero = tvm.const(0, dtype)
        # no constraints
        res = lower_intrin(tvm.floormod(x, y))
        check_value(res, x, y, data, lambda a, b: a % b)
        # rhs >= 0
        res = lower_intrin(tvm.expr.Select(y >= 0, tvm.floormod(x, y), zero))
        check_value(res, x, y, data, lambda a, b: a % b if b > 0 else 0)
        # lhs >= 0
        res = lower_intrin(
            tvm.expr.Select(tvm.all(y >= 0, x >= 0), tvm.floormod(x, y), zero))
        check_value(res, x, y, data, lambda a, b: a % b
                    if b > 0 and a >= 0 else 0)
        # const power of two
        res = lower_intrin(tvm.floormod(x, tvm.const(8, dtype=dtype)))
        check_value(res, x, y, [(a, b) for a, b in data if b == 8],
                    lambda a, b: a % b)
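
The reference lambdas above use Python's %, which is floor-mod. A hedged illustration of how floormod differs from truncated mod on negative operands (plain Python, independent of the lower_intrin/check_value helpers defined elsewhere in the test file):

import math

def floormod(a, b):
    # a - b * floor(a / b): the result takes the sign of the divisor, like Python's %.
    return a - b * math.floor(a / b)

def truncmod(a, b):
    # a - b * trunc(a / b): the result takes the sign of the dividend, like C's %.
    return a - b * int(a / b)

assert floormod(-7, 4) == 1 and (-7) % 4 == 1
assert truncmod(-7, 4) == -3
assert floormod(7, -4) == -1 and truncmod(7, -4) == 3
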
Example #42
def conv2d_channel(N, M, C, K, L, O, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((N, M, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (N + 2 * padding, M + 2 * padding, C),
        lambda i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M +
                    padding), A[i - padding, j - padding, k], 0.0),
        name="Apad")
    rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L),
                                                                 name="ry")
    rc = tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute(
        (N_out, M_out, O),
        lambda i, j, k: tvm.sum(Apad[i * stride + rx, j * stride + ry, rc] * W[
            rx, ry, rc, k],
                                axis=[rx, ry, rc]),
        name="Output")
    return Output.op, [A, W, Output]
Example #43
    def compute_B_T_dot_X(b, c, eps, nu, bb):
        temp_expr = {}
        for j in range(alpha):
            wd0 = input_tile[b][c][0][j][bb] - input_tile[b][c][6][j][bb]
            d4_sub_d2 = input_tile[b][c][4][j][bb] - input_tile[b][c][2][j][bb]
            wd7 = input_tile[b][c][7][j][bb] - input_tile[b][c][1][j][bb]
            d3_sub_d5 = input_tile[b][c][3][j][bb] - input_tile[b][c][5][j][bb]
            wd1 = input_tile[b][c][2][j][bb] + input_tile[b][c][6][j][bb]
            wd2 = input_tile[b][c][1][j][bb] + input_tile[b][c][5][j][bb]
            wd4 = input_tile[b][c][5][j][bb] + input_tile[b][c][1][j][bb] * 0.25
            wd5 = input_tile[b][c][6][j][bb] - input_tile[b][c][4][j][bb] * 5
            wd3 = input_tile[b][c][6][j][bb] + input_tile[b][c][2][j][bb] * 0.25
            wd6 = input_tile[b][c][1][j][bb] + input_tile[b][c][5][j][bb] * 0.25

            wd0 = wd0 + d4_sub_d2 * 5.25
            wd7 = wd7 + d3_sub_d5 * 5.25

            wd1 = wd1 - input_tile[b][c][4][j][bb] * 4.25
            wd2 = wd2 - input_tile[b][c][3][j][bb] * 4.25

            wd3 = wd3 - input_tile[b][c][4][j][bb] * 1.25
            wd5 = wd5 + input_tile[b][c][2][j][bb] * 4
            wd4 = wd4 - input_tile[b][c][3][j][bb] * 1.25
            wd6 = wd6 - input_tile[b][c][3][j][bb] * 1.25

            temp_expr[(0, j)] = wd0
            temp_expr[(1, j)] = wd1 + wd2
            temp_expr[(2, j)] = wd1 - wd2
            temp_expr[(3, j)] = wd3 + wd4 * 2
            temp_expr[(4, j)] = wd3 - wd4 * 2
            temp_expr[(5, j)] = wd5 + wd6 * 2
            temp_expr[(6, j)] = wd5 - wd6 * 2
            temp_expr[(7, j)] = wd7

        now = tvm.const(0.0, "float32")
        for ii in range(alpha):
            for jj in range(alpha):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)], now)
        return now
Example #44
    def _pool(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype('int32')
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[
            3], roi[4]

        roi_start_h = tvm.round(roi_start_h * spatial_scale).astype('int32')
        roi_start_w = tvm.round(roi_start_w * spatial_scale).astype('int32')
        roi_end_h = tvm.round(roi_end_h * spatial_scale).astype('int32')
        roi_end_w = tvm.round(roi_end_w * spatial_scale).astype('int32')

        # force malformed ROIs to be 1x1
        roi_h = tvm.max(roi_end_h - roi_start_h + 1, tvm.const(1, 'int32'))
        roi_w = tvm.max(roi_end_w - roi_start_w + 1, tvm.const(1, 'int32'))

        bin_h = roi_h.astype(dtype) / pooled_size_h
        bin_w = roi_w.astype(dtype) / pooled_size_w

        # use epsilon to prevent floating point precision loss in floor/ceil
        epsilon = tvm.const(0.00001, dtype)
        hstart = tvm.floor(ph * bin_h + epsilon).astype('int32')
        wstart = tvm.floor(pw * bin_w + epsilon).astype('int32')
        hend = tvm.ceil((ph + 1) * bin_h - epsilon).astype('int32')
        wend = tvm.ceil((pw + 1) * bin_w - epsilon).astype('int32')
        hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height)
        wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width)
        hend = tvm.min(tvm.max(hend + roi_start_h, 0), height)
        wend = tvm.min(tvm.max(wend + roi_start_w, 0), width)

        non_empty = tvm.all(hstart < hend, wstart < wend)
        min_value = lambda dtype: tvm.if_then_else(
            non_empty, tvm.min_value(dtype), tvm.const(0.0, dtype))
        # pylint: disable=unnecessary-lambda
        _max = tvm.comm_reducer(lambda x, y: tvm.make._OpMax(x, y),
                                min_value,
                                name='max')
        rh = tvm.reduce_axis((0, hend - hstart), 'rh')
        rw = tvm.reduce_axis((0, wend - wstart), 'rw')
        return _max(data[batch_index, c, hstart + rh, wstart + rw],
                    axis=[rh, rw])
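
# A plain-NumPy sketch of the same per-bin max pooling, for reference only: `feat` is
# assumed to be a [C, H, W] array and the ROI bounds are already scaled and rounded to
# ints, mirroring the computation in _pool above.
import numpy as np

def roi_pool_bin_ref(feat, c, ph, pw, roi_start_h, roi_start_w, roi_end_h, roi_end_w,
                     pooled_size_h, pooled_size_w):
    height, width = feat.shape[1], feat.shape[2]
    roi_h = max(roi_end_h - roi_start_h + 1, 1)  # force malformed ROIs to be 1x1
    roi_w = max(roi_end_w - roi_start_w + 1, 1)
    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w
    hstart = min(max(int(np.floor(ph * bin_h)) + roi_start_h, 0), height)
    wstart = min(max(int(np.floor(pw * bin_w)) + roi_start_w, 0), width)
    hend = min(max(int(np.ceil((ph + 1) * bin_h)) + roi_start_h, 0), height)
    wend = min(max(int(np.ceil((pw + 1) * bin_w)) + roi_start_w, 0), width)
    if hstart >= hend or wstart >= wend:         # empty bin contributes 0
        return 0.0
    return float(feat[c, hstart:hend, wstart:wend].max())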
Example #45
def conv3d(N, M, P, K, L, Q, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((N, M, P), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, Q), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    P_out = max(0, (P + padding * 2 - Q) // stride) + 1
    Apad = tvm.compute(
        (N + 2 * padding, M + 2 * padding, P + 2 * padding),
        lambda i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, k >= padding, i < N + padding,
                    j < M + padding, k < P + padding), A[
                        i - padding, j - padding, k - padding], 0.0),
        name="Apad")
    rx, ry, rz = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis(
        (0, L), name="ry"), tvm.reduce_axis((0, Q), name="rz")
    Output = tvm.compute(
        (N_out, M_out, P_out),
        lambda i, j, k: tvm.sum(Apad[i * stride + rx, j * stride + ry, k *
                                     stride + rz] * W[rx, ry, rz],
                                axis=[rx, ry, rz]),
        name="Output")
    return Output.op, [A, W, Output]
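
# Usage sketch (an assumption, using the same pre-0.7 tvm API as the examples above):
# build a default schedule for conv3d and inspect the lowered IR.
op, bufs = conv3d(32, 32, 32, 3, 3, 3, stride=1, padding=1)
s = tvm.create_schedule(op)
print(tvm.lower(s, bufs, simple_mode=True))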
def zero_pad2d(inputs, padding=0):
    """Zero padding for 2d tensor

    Args:
    -----------------------------
    inputs : tvm.tensor.Tensor
        shape [batch, channel, height, width]
    padding: (optional:0) int or tuple
        expected: (h_pad_up, h_pad_down, w_pad_up, w_pad_down)
    -----------------------------

    Returns:
    -----------------------------
    tvm.tensor.Tensor
        shape [batch, channel, padded_height, padded_width]
    -----------------------------
    """
    padding = (padding, padding, padding, padding) if isinstance(
        padding, (int, tvm.expr.IntImm)) else padding
    assert_print(isinstance(padding, tuple),
                 "type(padding)={}".format(type(padding)))
    if len(padding) == 2:
        padding = (padding[0], padding[0], padding[1], padding[1])
    assert_print(len(padding) == 4)

    padding_zero = 0.0 if "float" in inputs.dtype else 0

    batch_size, in_channel, height, width = inputs.shape
    return tvm.compute(
        (batch_size, in_channel, height +
         padding[0] + padding[1], width + padding[2] + padding[3]),
        lambda b, c, h, w: tvm.if_then_else(
            tvm.all(h >= padding[0], h < height + padding[0],
                    w >= padding[2], w < width + padding[2]),
            inputs[b, c, h - padding[0], w - padding[2]],
            padding_zero
        )
    )
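
# Usage sketch (an assumption, using the same pre-0.7 tvm API as the examples above;
# assert_print is expected to be defined in the original module):
inp = tvm.placeholder((1, 3, 32, 32), dtype="float32", name="inp")
padded = zero_pad2d(inp, padding=(1, 1, 2, 2))
# padded has shape [1, 3, 34, 36]: height grows by padding[0] + padding[1],
# width by padding[2] + padding[3].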
def test_copy_pad():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.compute((m + 2, l), lambda i, j:
                    tvm.select(tvm.all(i >= 1, i < m + 1),
                                       A[i - 1, j], 1.0), name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    def cb(src, dst, pad_before, pad_after, pad_value):
        assert tvm.ir_pass.Simplify(src.elem_offset).value == 0
        assert pad_before[0].value == 1
        assert pad_before[1].value == 0
        assert pad_after[0].value == 1
        assert pad_after[1].value == 0
        assert pad_value.value == 1.0
        return tvm.make.Evaluate(0)
    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
def test_cmp_simplify():
    ck = RewriteChecker()
    x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z")
    # const int bound
    ck.verify((x % 2 + 10).equal(0), tvm.const(0, "bool"))
    ck.verify(tvm.expr.NE(x % 2 + 10, 0), tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 > 1, tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 <= 1, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool"))

    # canonicalization
    ck.verify((x - 10).equal(0), x.equal(10))
    ck.verify((10 - x).equal(0), x.equal(10))
    ck.verify((x * y).equal(0), tvm.expr.Or(x.equal(0), y.equal(0)))

    # cmp bound
    ck.verify(x + y < x + z, y < z)
    ck.verify(x + y < z + x, y < z)
    ck.verify(y + x < x + z, y < z)
    ck.verify(y + x < z + x, y < z)
    ck.verify(y - x < z - x, y < z)
    ck.verify(x - y < x - z, z < y)

    ck.verify(x < z + x, tvm.expr.LT(0, z))
    ck.verify(x < x + z, tvm.expr.LT(0, z))

    ck.verify(100 < x + 1, tvm.expr.LT(99, x))
    ck.verify(1 < 100 - x, tvm.expr.LT(x, 99))
    ck.verify(x * 3 < y * 3, x < y)
    ck.verify(x * (-3) < y * (-3), y < x)
    ck.verify(x * 3 >= y * 3, y <= x)

    ck.verify(x * 4 >= 2, tvm.expr.LE(1, x))
    ck.verify(x * 2 >= 50, tvm.expr.LE(25, x))
    ck.verify(x / 2 < 3, x < 6)
    ck.verify(x * 4 <= 2, x <= 0)
    ck.verify(3 < x / 2, tvm.expr.LT(7, x))

    ck.verify(x / 4 * 4 < x, tvm.expr.LT(0, x % 4))
    ck.verify(x / 4 * 4 >= x, tvm.expr.LE(x % 4, 0))

    ck.verify(x / 4 * 4 < x + y, tvm.expr.LT(0, x % 4 + y))
    ck.verify(x / 4 * 4 < x - y, tvm.expr.LT(y, x % 4))

    ck.verify((x + 2) / 4 * 4 >= x, tvm.expr.LE((x + 2) % 4, 2))
    ck.verify((x + 2) / 4 * 4 >= x + y, tvm.expr.LE((x + 2) % 4 + y, 2))
    ck.verify((x + 2) / 4 * 4 >= x - y, tvm.expr.LE((x + 2) % 4 + (-2), y))


    ck.verify(tvm.min(x, 11) < 10, x < 10)
    ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool"))
    ck.verify(tvm.max(8, x) > 10, tvm.expr.LT(10, x))
    ck.verify(x + 1 < tvm.max(8, x), x < 7)

    ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True)
    ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True)
    ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True)

    ck.verify(x < 11, tvm.const(1, "bool"))
    ck.verify(x <= 10, tvm.const(1, "bool"))
    ck.verify(z <= 5, tvm.const(1, "bool"))
    ck.verify(x + y <= 10, tvm.const(1, "bool"))
    ck.verify(x + y >= -10, tvm.const(1, "bool"))
    ck.verify(z - 5 <= y + 10, tvm.const(1, "bool"))
    ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool"))
    ck.verify(x*y <= 0, tvm.const(1, "bool"))
    ck.verify((x + 1)*(y - 1) < 0, tvm.const(1, "bool"))
    ck.verify(y*y >= 0, tvm.const(1, "bool"))
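
# RewriteChecker is defined elsewhere in the test module.  A minimal sketch of such a
# helper (an assumption, mirroring how ck.analyzer and ck.verify are used above):
class RewriteChecker:
    def __init__(self):
        self.analyzer = tvm.arith.Analyzer()

    def verify(self, data, expected):
        res = self.analyzer.rewrite_simplify(data)
        assert tvm.ir_pass.Equal(res, expected), \
            "data={}, res={}, expected={}".format(data, res, expected)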
Example #49
def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp_score, threshold):
    """Low level IR routing for transform location data preparation.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    temp_valid_count : Buffer
        Output intermediate result buffer

    temp_cls_id : Buffer
        Output intermediate result buffer

    temp_score : Buffer
        Output buffer

    threshold : float
        Threshold to be a positive prediction.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]

    ib = tvm.ir_builder.create()

    cls_prob = ib.buffer_ptr(cls_prob)
    cls_id = ib.buffer_ptr(temp_cls_id)
    valid_count = ib.buffer_ptr(valid_count)
    temp_valid_count = ib.buffer_ptr(temp_valid_count)
    score = ib.buffer_ptr(temp_score)

    threshold = tvm.make.node("FloatImm", dtype="float32", value=threshold)

    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = (batch_size *  num_anchors) // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    with ib.if_scope(tid < batch_size * num_anchors):
        i = tid / num_anchors
        j = tid % num_anchors
        valid_count[i] = 0
        score[tid] = -1.0
        cls_id[tid] = 0
        with ib.for_range(0, num_classes - 1) as k:
            temp = cls_prob[i * num_classes * num_anchors + (k + 1) * num_anchors + j]
            cls_id[tid] = if_then_else(temp > score[tid], k + 1, cls_id[tid])
            score[tid] = tvm.max(temp, score[tid])
        with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)):
            cls_id[tid] = 0
        with ib.if_scope(cls_id[tid] > 0):
            temp_valid_count[tid] = 1
        with ib.else_scope():
            temp_valid_count[tid] = 0

        with ib.if_scope(tid < batch_size):
            with ib.for_range(0, num_anchors) as k:
                with ib.if_scope(k > 0):
                    temp_valid_count[tid * num_anchors + k] += \
                    temp_valid_count[tid * num_anchors + k - 1]
            valid_count[i] = temp_valid_count[tid * num_anchors + num_anchors - 1]

    return ib.get()
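
# Reference sketch in plain Python: the second half of the kernel above is a prefix sum
# over the per-anchor 0/1 flags, so the valid count of a batch is the last partial sum.
def valid_count_ref(flags, num_anchors):
    """flags is a flat list of 0/1 values, one per (batch, anchor)."""
    counts = []
    for b in range(len(flags) // num_anchors):
        running = 0
        for k in range(num_anchors):
            running += flags[b * num_anchors + k]
        counts.append(running)
    return counts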
Example #50
def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before, \
                axis_mul_after, axis, is_descend):
    """Low level IR routing subfunction 3/4 for Odd-Even-Transposition sorting.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    index : Buffer
        Buffer of number of valid output boxes.

    new_data : Buffer
        Buffer of flattened segmented data.

    new_index : Buffer
        Buffer of flattened segmented indices.

    loc : Buffer
        Buffer of start locations of each sorting segment.

    out_index : Buffer
        Output buffer of output box indexes sorted by score in a flattened segmented format.

    axis_mul_before : int
        The multiplication result of axis dimensions before axis.

    axis_mul_after : int
        The multiplication result of axis dimensions after axis.

    axis : int
        The axis used for sorting.

    is_descend : bool
        If the sorted data is in descending order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib = tvm.ir_builder.create()
    dshape = loc.shape
    fshape = data.shape[axis] * dshape[0]
    temp_data = ib.allocate(
        "float32", dshape, name="temp_data", scope="local")
    p_data = ib.buffer_ptr(data)
    p_index = ib.buffer_ptr(index)
    data_new = ib.buffer_ptr(new_data)
    index_new = ib.buffer_ptr(new_index)
    index_out = ib.buffer_ptr(out_index)
    sizes = ib.buffer_ptr(loc)
    nthread_tx = max_threads
    nthread_bx = fshape // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
            with ib.if_scope(tid == 0):
                start = 0
            with ib.else_scope():
                start = sizes[tid-1]
            # OddEvenTransposeSort
            with ib.for_range(0, p_index[tid], name="k") as k:
                with ib.for_range(0, p_index[tid] - 1, name="i") as i:
                    with ib.if_scope(i % 2 == k % 2):
                        with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) == is_descend)):
                            temp_data[tid] = data_new[i+start]
                            data_new[i+start] = data_new[i+start+1]
                            data_new[i+start+1] = temp_data[tid]
                            index_out[tid] = index_new[i+start]
                            index_new[i+start] = index_new[i+start+1]
                            index_new[i+start+1] = index_out[tid]
        with ib.if_scope(tid < 1):
            with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i:
                index_out[i] = index_new[i]
    with ib.else_scope():
        with ib.for_range(0, fshape, name="k", for_type="unroll") as k:
            with ib.if_scope(tvm.all(k % 2 == tid % 2, tid < fshape)):
                with ib.if_scope(k % 2 == 0):
                    with ib.if_scope(tvm.all(tid + 1 < fshape, (p_data[tid] < p_data[tid+1]) \
                                             == is_descend)):
                        data_new[tid] = p_data[tid+1]
                        index_out[tid] = index_new[tid+1]
                    with ib.else_scope():
                        data_new[tid] = p_data[tid]
                        index_out[tid] = index_new[tid]
                with ib.else_scope():
                    with ib.if_scope(tvm.all(tid + 1 < fshape, (data_new[tid] < data_new[tid+1]) \
                                             == is_descend)):
                        p_data[tid] = data_new[tid+1]
                        index_new[tid] = index_out[tid+1]
                    with ib.else_scope():
                        p_data[tid] = data_new[tid]
                        index_new[tid] = index_out[tid]
            with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)):
                with ib.if_scope(k % 2 == 0):
                    with ib.if_scope(tvm.all(tid > 0, (p_data[tid-1] < p_data[tid]) == is_descend)):
                        data_new[tid] = p_data[tid-1]
                        index_out[tid] = index_new[tid-1]
                    with ib.else_scope():
                        data_new[tid] = p_data[tid]
                        index_out[tid] = index_new[tid]
                with ib.else_scope():
                    with ib.if_scope(tvm.all(tid > 0, (data_new[tid-1] < data_new[tid]) \
                                             == is_descend)):
                        p_data[tid] = data_new[tid-1]
                        index_new[tid] = index_out[tid-1]
                    with ib.else_scope():
                        p_data[tid] = data_new[tid]
                        index_new[tid] = index_out[tid]
        with ib.if_scope(fshape % 2 == 1):
            with ib.if_scope(tid < 1):
                with ib.for_range(0, fshape, name="k") as k:
                    index_out[tid] = index_new[tid]
    body = ib.get()
    return body
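
# Reference sketch in plain Python: the odd-even transposition sort that the IR above
# parallelizes across threads.  In phase k, even k compares pairs (0,1), (2,3), ... and
# odd k compares (1,2), (3,4), ...; n phases suffice to sort n elements.
def odd_even_transposition_sort_ref(values, is_descend=False):
    data = list(values)
    n = len(data)
    for k in range(n):
        for i in range(k % 2, n - 1, 2):
            # same swap condition as the IR above; equal neighbours may also swap,
            # which does not change the result
            if (data[i] < data[i + 1]) == is_descend:
                data[i], data[i + 1] = data[i + 1], data[i]
    return data

# odd_even_transposition_sort_ref([3, 1, 2]) -> [1, 2, 3]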
Example #51
def sort_ir(data, output, axis, is_ascend):
    """Low level IR to do nms sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU.

    Parameters
    ----------
    data: Buffer
        Buffer of input data.

    output : Buffer
        Output buffer of indices of the sorted tensor, with the same shape as data.

    axis : int
        Axis along which to sort the input tensor.

    is_ascend : boolean
        Whether to sort in ascending or descending order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    size = 1
    axis_mul_before = 1
    axis_mul_after = 1
    shape = data.shape
    if axis < 0:
        axis = len(shape) + axis
    for i, value in enumerate(shape, 0):
        size *= value
        if i < axis:
            axis_mul_before *= value
        elif i > axis:
            axis_mul_after *= value
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    data = ib.buffer_ptr(data)
    output = ib.buffer_ptr(output)
    nthread_tx = max_threads
    nthread_bx = size // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("vthread")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "virtual_thread", nthread_bx)
    tid = bx * nthread_tx + tx
    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
    temp_index = ib.allocate("float32", (1,), name="temp_index", scope="local")
    is_ascend = tvm.make.node("IntImm", dtype="int32", value=is_ascend)

    with ib.for_range(0, axis_mul_before) as i:
        with ib.for_range(0, axis_mul_after) as j:
            current_sort_num = shape[axis]
            base_idx = i * shape[axis] * axis_mul_after + j
            with ib.if_scope(tid < shape[axis]):
                output[base_idx + tid * axis_mul_after] = tid.astype("float32")
            # OddEvenTransposeSort
            with ib.for_range(0, current_sort_num) as k:
                with ib.if_scope(tid < (current_sort_num + 1) // 2):
                    offset = base_idx + (2 * tid + (k % 2)) * axis_mul_after
                    with ib.if_scope(tvm.all(is_ascend == 1, \
                                             2 * tid + (k % 2) + 1 < current_sort_num, \
                                             data[offset] > data[offset + axis_mul_after])):
                        temp_data[0] = data[offset]
                        data[offset] = data[offset + axis_mul_after]
                        data[offset + axis_mul_after] = temp_data[0]
                        temp_index[0] = output[offset]
                        output[offset] = output[offset + axis_mul_after]
                        output[offset + axis_mul_after] = temp_index[0]
                    with ib.if_scope(tvm.all(is_ascend == 0, \
                                             2 * tid + (k % 2) + 1 < current_sort_num, \
                                             data[offset] < data[offset + axis_mul_after])):
                        temp_data[0] = data[offset]
                        data[offset] = data[offset + axis_mul_after]
                        data[offset + axis_mul_after] = temp_data[0]
                        temp_index[0] = output[offset]
                        output[offset] = output[offset + axis_mul_after]
                        output[offset + axis_mul_after] = temp_index[0]
                ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                      tvm.convert(['shared']),
                                      tvm.expr.Call.Intrinsic, None, 0))

    return ib.get()
Example #52
def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    ty = tvm.thread_axis("threadIdx.y")
    bx = tvm.thread_axis("blockIdx.x")
    by = tvm.thread_axis("blockIdx.y")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    nthread_ty = max_threads
    nthread_by = 6 // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(ty, "thread_extent", nthread_ty)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)
    i = bx * max_threads + tx
    j = by * max_threads + ty

    nms_threshold_node = tvm.make.node(
        "FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node(
        "IntImm", dtype="int32", value=1 if force_suppress else 0)
    with ib.for_range(0, batch_size, for_type="unroll", name="n") as n:
        with ib.if_scope(
            tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                    p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(
                tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                nms_topk, p_valid_count[n])
            with ib.if_scope(i < nkeep):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6
                           + i * 6 + j)] = p_data[(n * num_anchors * 6
                                                   + p_sort_result[n * num_anchors + i] * 6 + j)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.if_scope(i < p_valid_count[n] - nkeep):
                    with ib.if_scope(j < 6):
                        p_out[(n * num_anchors * 6
                               + (i + nkeep) * 6 + j)] = p_data[(n * num_anchors * 6
                                                                 + (i + nkeep) * 6 + j)]
            # Apply nms
            with ib.if_scope(i < p_valid_count[n]):
                offset_i = i * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_i] >= 0):
                    with ib.if_scope(j < p_valid_count[n]):
                        offset_j = j * 6
                        with ib.if_scope(tvm.all(j > i, p_out[n * num_anchors * 6
                                                              + offset_j] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_i] ==
                                                     p_out[n * num_anchors * 6 + offset_j])):
                                # Suppress when force_suppress == True or the two class ids are equal
                                iou = calculate_overlap(
                                    p_out, n * num_anchors * 6 + offset_i + 2,
                                    n * num_anchors * 6 + offset_j + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[
                                        n * num_anchors * 6 + offset_j] = -1.0
        with ib.else_scope():
            with ib.if_scope(i < p_valid_count[n]):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6
                           + i * 6 + j)] = p_data[n * num_anchors * 6 + i * 6 + j]
        # Set invalid entry to be -1
        with ib.if_scope(i < num_anchors - p_valid_count[n]):
            with ib.if_scope(j < 6):
                p_out[n * num_anchors * 6 + (i +
                                             p_valid_count[n]) * 6 + j] = -1.0
    body = ib.get()
    return body
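
# Reference sketch in plain Python: the IoU computed by calculate_overlap above, for two
# boxes given as [xmin, ymin, xmax, ymax].
def iou_ref(box_a, box_b):
    w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = w * h
    union = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) + \
            (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) - inter
    return 0.0 if union <= 0.0 else inter / union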
Example #53
def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold):
    """Low level IR routing for transform location data preparation.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    temp_flag : Buffer
        Output intermediate result buffer

    temp_id : Buffer
        Output intermediate result buffer

    temp_score_out : Buffer
        Output buffer

    threshold : float
        Threshold to be a positive prediction.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]

    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    score = ib.buffer_ptr(temp_score_out)
    cls_id = ib.buffer_ptr(temp_id)
    flag = ib.buffer_ptr(temp_flag)
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    nthread_tx = max_threads
    nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx
    p_cls_prob = ib.buffer_ptr(cls_prob)
    p_valid_count = ib.buffer_ptr(valid_count)

    with ib.if_scope(tid < batch_size * num_anchors):
        n = tid / num_anchors  # number of batches
        i = tid % num_anchors  # number of anchors
        score[i] = -1.0
        cls_id[i] = 0
        p_valid_count[n] = 0
        with ib.for_range(0, num_classes-1, name="k") as k:
            temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i]
            with ib.if_scope(temp > score[i]):
                cls_id[i] = k + 1
                score[i] = temp
        with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)):
            cls_id[i] = 0
        with ib.if_scope(cls_id[i] > 0):
            flag[i] = 1
        with ib.else_scope():
            flag[i] = 0

        with ib.if_scope(tid < batch_size):
            with ib.for_range(0, num_anchors, name="k") as k:
                with ib.if_scope(k > 0):
                    flag[tid * num_anchors +
                         k] += flag[tid * num_anchors + k - 1]
            p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1]

    body = ib.get()
    return body
Example #54
def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.

    loc_pred : Buffer
        Buffer of location regression predictions.

    anchor : Buffer
        Buffer of prior anchor boxes.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
        """Transform prior anchor box to output box through location predictions.
        """
        al = anchor[anchor_base_idx]
        at = anchor[anchor_base_idx + 1]
        ar = anchor[anchor_base_idx + 2]
        ab = anchor[anchor_base_idx + 3]
        aw = ar - al
        ah = ab - at
        ax = (al + ar) / 2.0
        ay = (at + ab) / 2.0
        px = loc[loc_base_idx]
        py = loc[loc_base_idx + 1]
        pw = loc[loc_base_idx + 2]
        ph = loc[loc_base_idx + 3]
        ox = px * vx * aw + ax
        oy = py * vy * ah + ay
        ow = tvm.exp(pw * vw) * aw / 2.0
        oh = tvm.exp(ph * vh) * ah / 2.0
        return tvm.select(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
               tvm.select(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
               tvm.select(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
               tvm.select(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)

    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]

    ib = tvm.ir_builder.create()
    p_cls_prob = ib.buffer_ptr(cls_prob)
    p_loc_pred = ib.buffer_ptr(loc_pred)
    p_anchor = ib.buffer_ptr(anchor)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    with ib.for_range(0, batch_size, for_type="parallel", name="n") as n:
        p_valid_count[n] = 0
        with ib.for_range(0, num_anchors, name="i") as i:
            # Find the predicted class id and probability
            score = ib.allocate('float32', (1,), name="score", scope="local")
            cls_id = ib.allocate('int32', (1,), name="id", scope="local")
            score[0] = -1.0
            cls_id[0] = 0
            with ib.for_range(0, num_classes, name="j") as j:
                with ib.if_scope(j > 0):
                    temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i]
                    cls_id[0] = tvm.select(temp > score[0], j, cls_id[0])
                    score[0] = tvm.max(temp, score[0])
            with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)):
                cls_id[0] = 0
            # [id, prob, xmin, ymin, xmax, ymax]
            # Remove background, restore original id
            with ib.if_scope(cls_id[0] > 0):
                out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6
                p_out[out_base_idx] = cls_id[0] - 1.0
                p_out[out_base_idx + 1] = score[0]
                offset = i * 4
                p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \
                p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset,
                                                        p_anchor, offset, clip, variances[0],
                                                        variances[1], variances[2], variances[3])
                p_valid_count[n] += 1

    return ib.get()
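
# Reference sketch in plain Python: the anchor decoding performed by transform_loc
# above.  anchor is [al, at, ar, ab], loc is [px, py, pw, ph], variances is
# (vx, vy, vw, vh).
import math

def decode_box_ref(anchor, loc, variances, clip=True):
    al, at, ar, ab = anchor
    aw, ah = ar - al, ab - at
    ax, ay = (al + ar) / 2.0, (at + ab) / 2.0
    px, py, pw, ph = loc
    vx, vy, vw, vh = variances
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = math.exp(pw * vw) * aw / 2.0
    oh = math.exp(ph * vh) * ah / 2.0
    box = [ox - ow, oy - oh, ox + ow, oy + oh]
    return [min(max(v, 0.0), 1.0) for v in box] if clip else box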
Example #55
def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]

    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0)
    with ib.for_range(0, batch_size, for_type="parallel", name="n") as n:
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                               nms_topk, p_valid_count[n])
            with ib.for_range(0, nkeep, name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6
                           + l * 6 + m)] = p_data[(n * num_anchors * 6
                                                   + p_sort_result[n * num_anchors + l] * 6 + m)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l:
                    with ib.for_range(0, 6, name="m") as m:
                        p_out[(n * num_anchors * 6
                               + (l + nkeep) * 6 + m)] = p_data[(n * num_anchors * 6
                                                                 + (l + nkeep) * 6 + m)]
            # Apply nms
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                offset_l = l * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0):
                    with ib.for_range(0, p_valid_count[n], name="m") as m:
                        offset_m = m * 6
                        with ib.if_scope(tvm.all(m > l, p_out[n * num_anchors * 6
                                                              + offset_m] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_l] ==
                                                     p_out[n * num_anchors * 6 + offset_m])):
                                # Suppress when force_suppress == True or the two class ids are equal
                                iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_l + 2,
                                                        n * num_anchors * 6 + offset_m + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_m] = -1.0
        with ib.else_scope():
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6
                           + l * 6 + m)] = p_data[n * num_anchors * 6 + l * 6 + m]
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l:
            with ib.for_range(0, 6, name="m") as m:
                p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0
    return ib.get()
Example #56
File: nms.py Project: bddppq/tvm
def nms_ir(data, sorted_index, valid_count, out, box_indices,
           max_output_size, iou_threshold, force_suppress,
           top_k, coord_start, id_index):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sorted_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        Index of the class categories, -1 to disable.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.expr.Select(u <= 0.0, 0.0, i / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    out = ib.buffer_ptr(out)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    k = bx * max_threads + tx

    iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold)
    top_k = tvm.make.node("IntImm", dtype="int32", value=top_k)
    coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start)
    id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
    force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll") as i:
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else( \
                    tvm.all(top_k > 0, top_k < valid_count[i]),
                    top_k, valid_count[i])
            with ib.for_range(0, nkeep) as j:
                with ib.if_scope(k < box_data_length):
                    out[(base_idx + j * box_data_length + k)] = \
                    data[(base_idx + sorted_index[i * num_anchors + j] \
                    * box_data_length + k)]
                box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
            with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])):
                with ib.for_range(0, valid_count[i] - nkeep) as j:
                    with ib.if_scope(k < box_data_length):
                        out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0
                    box_indices[i * num_anchors + (j + nkeep)] = -1
            # Apply nms
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(k < valid_count[i]):
                        offset_k = k * box_data_length
                        with ib.if_scope(tvm.all(k > j, out[base_idx + offset_k] >= 0, \
                                                 tvm.any(force_suppress > 0, id_index < 0, \
                                                         out[base_idx + offset_j] == \
                                                         out[base_idx + offset_k]))):
                            iou = calculate_overlap(out, base_idx + offset_k + coord_start,
                                                    base_idx + offset_j + coord_start)
                            with ib.if_scope(iou >= iou_threshold):
                                out[base_idx + offset_k] = -1.0
                                box_indices[i * num_anchors + k] = -1
                ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                      tvm.convert(['shared']),
                                      tvm.expr.Call.Intrinsic, None, 0))
        with ib.else_scope():
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(k < box_data_length):
                    out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
                box_indices[i * num_anchors + j] = j
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - valid_count[i]) as j:
            with ib.if_scope(k < box_data_length):
                out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
            box_indices[i * num_anchors + j + valid_count[i]] = -1
        # Only return max_output_size number of valid boxes
        num_valid_boxes[0] = 0
        with ib.if_scope(max_output_size > 0):
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(num_valid_boxes[0] == max_output_size):
                        with ib.if_scope(k < box_data_length):
                            out[base_idx + offset_j + k] = -1.0
                        box_indices[i * num_anchors + j] = -1
                    with ib.else_scope():
                        num_valid_boxes[0] += 1
                ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                      tvm.convert(['shared']),
                                      tvm.expr.Call.Intrinsic, None, 0))

    return ib.get()
Example #57
batch = 256  # assumed value; the batch definition is not included in this snippet
in_channel = 256
out_channel = 512
in_size = 14
kernel = 3
pad = 1
stride = 1

# Algorithm
A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
out_size = (in_size - kernel + 2*pad) // stride + 1
# Pad input
Apad = tvm.compute(
    (in_size + 2*pad, in_size + 2*pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.select(
        tvm.all(yy >= pad, yy - pad < in_size,
                xx >= pad, xx - pad < in_size),
        A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
    name='Apad')
# Create reduction variables
rc = tvm.reduce_axis((0, in_channel), name='rc')
ry = tvm.reduce_axis((0, kernel), name='ry')
rx = tvm.reduce_axis((0, kernel), name='rx')
# Compute the convolution
B = tvm.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: tvm.sum(
        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
        axis=[ry, rx, rc]),
    name='B')
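
# Usage sketch (an assumption, using the same pre-0.7 tvm API as the examples above):
# create a default schedule for B and inspect the lowered IR before applying any
# GPU-specific scheduling.
s = tvm.create_schedule(B.op)
print(tvm.lower(s, [A, W, B], simple_mode=True))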