# Shared imports assumed throughout these snippets (old tvm 0.x API); the
# topi operators below additionally assume `topi` and `from tvm import api`.
import numpy as np
import tvm


def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n + 1) // 2) as i:
            ib.emit(outs[0].vstore(
                i * 2,
                ins[0].vload(i * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (nn + max_threads - 1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(
                idx * 2,
                ins[0].vload(idx * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # Build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # Launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
def argsort_gpu(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array of indices
    having the same shape as the input array, indexing the data in sorted order.

    Parameters
    ----------
    data : tvm.Tensor
        The input array.

    valid_count : tvm.Tensor
        The number of valid elements to be sorted.

    axis : int
        Axis along which to sort the input tensor.

    is_ascend : boolean
        Whether to sort in ascending or descending order.

    flag : boolean
        Whether this argsort is used in the nms operator.

    Returns
    -------
    out : tvm.Tensor
        The output of this function.
    """
    sorted_data_buf = api.decl_buffer(data.shape, data.dtype, "sorted_data_buf",
                                      data_alignment=8)
    sorted_data = identity(data)
    if flag:
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = tvm.extern([data.shape],
                         [sorted_data, valid_count],
                         lambda ins, outs: sort_nms_ir(
                             ins[0], ins[1], outs[0], axis, is_ascend),
                         dtype="int32",
                         in_buffers=[sorted_data_buf, valid_count_buf],
                         out_buffers=[out_buf],
                         name="argsort_nms_gpu",
                         tag="argsort_nms_gpu")
    else:
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = tvm.extern([data.shape],
                         [sorted_data],
                         lambda ins, outs: sort_ir(
                             ins[0], outs[0], axis, is_ascend),
                         dtype=dtype,
                         in_buffers=[sorted_data_buf],
                         out_buffers=[out_buf],
                         name="argsort_gpu",
                         tag="argsort_gpu")
    return out
def argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
    """Performs sorting along the given axis and returns an array of indices
    having the same shape as the input array, indexing the data in sorted order.

    Parameters
    ----------
    data : tvm.Tensor
        The input array.

    valid_count : tvm.Tensor, optional
        The number of valid elements to be sorted.

    axis : int, optional
        Axis along which to sort the input tensor.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        DType of the output indices.

    Returns
    -------
    out : tvm.Tensor
        The output of this function.
    """
    if valid_count is not None:
        sorted_data = identity(data)
        sorted_data_buf = api.decl_buffer(data.shape, data.dtype,
                                          "sorted_data_buf", data_alignment=8)
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = tvm.extern([data.shape],
                         [sorted_data, valid_count],
                         lambda ins, outs: sort_nms_ir(
                             ins[0], ins[1], outs[0], axis, is_ascend),
                         dtype="int32",
                         in_buffers=[sorted_data_buf, valid_count_buf],
                         out_buffers=[out_buf],
                         name="argsort_nms_gpu",
                         tag="argsort_nms_gpu")
    else:
        value_buf = api.decl_buffer(data.shape, data.dtype, "value_buf",
                                    data_alignment=8)
        indices_buf = api.decl_buffer(data.shape, dtype, "out_buf",
                                      data_alignment=8)
        out = tvm.extern([data.shape, data.shape],
                         [data],
                         lambda ins, outs: sort_ir(
                             ins[0], outs[0], axis, is_ascend,
                             indices_out=outs[1]),
                         out_buffers=[value_buf, indices_buf],
                         name="argsort_gpu",
                         tag="argsort_gpu")[1]
    return out
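# A minimal usage sketch for the argsort_gpu variant above, exercising its
# valid_count=None path. Hedged: assumes a CUDA-enabled build and the sort_ir
# helper from the same file, which binds its own GPU threads, so a plain
# create_schedule suffices (mirroring the style of the GPU tests below).
data = tvm.placeholder((1, 5), name="data")
indices = argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="int32")
s = tvm.create_schedule(indices.op)
f = tvm.build(s, [data, indices], "cuda")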
def argsort_gpu(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array of indices
    having the same shape as the input array, indexing the data in sorted order.

    Parameters
    ----------
    data : tvm.Tensor
        The input array.

    valid_count : tvm.Tensor
        The number of valid elements to be sorted.

    axis : int
        Axis along which to sort the input tensor.

    is_ascend : boolean
        Whether to sort in ascending or descending order.

    flag : boolean
        Whether this argsort is used in the nms operator.

    Returns
    -------
    out : tvm.Tensor
        The output of this function.
    """
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    if flag:
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = tvm.extern([data.shape],
                         [data, valid_count],
                         lambda ins, outs: sort_nms_ir(
                             ins[0], ins[1], outs[0], axis, is_ascend),
                         dtype="int32",
                         in_buffers=[data_buf, valid_count_buf],
                         out_buffers=[out_buf],
                         name="argsort_nms_gpu",
                         tag="argsort_nms_gpu")
    else:
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = tvm.extern([data.shape],
                         [data],
                         lambda ins, outs: sort_ir(
                             ins[0], outs[0], axis, is_ascend),
                         dtype=dtype,
                         in_buffers=[data_buf],
                         out_buffers=[out_buf],
                         name="argsort_gpu",
                         tag="argsort_gpu")
    return out
def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True,
                               threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor composed of

    out : tvm.Tensor
        3-D tensor with shape (batch_size, num_anchors, 6).

    valid_count : tvm.Tensor
        1-D tensor with shape (batch_size,), number of valid anchor boxes.
    """
    batch_size = cls_prob.shape[0]
    num_anchors = anchor.shape[1]
    oshape = (batch_size, num_anchors, 6)
    # Define data alignment for the intermediate buffers.
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8)
    valid_count, out = \
        tvm.extern([(batch_size,), oshape],
                   [cls_prob, loc_pred, anchor],
                   lambda ins, outs: transform_loc_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       clip, threshold, variances),
                   dtype=[valid_count_dtype, cls_prob.dtype],
                   out_buffers=[valid_count_buf, out_buf],
                   tag="multibox_transform_loc")
    return [out, valid_count]
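# Hedged usage sketch for multibox_transform_loc_gpu with toy SSD-style shapes
# (all names and sizes here are illustrative, not from the original; loc_pred
# carries 4 regression offsets per anchor).
batch, num_classes, num_anchors = 1, 21, 100
cls_prob = tvm.placeholder((batch, num_classes, num_anchors), name="cls_prob")
loc_pred = tvm.placeholder((batch, num_anchors * 4), name="loc_pred")
anchor = tvm.placeholder((1, num_anchors, 4), name="anchor")
out, valid_count = multibox_transform_loc_gpu(cls_prob, loc_pred, anchor)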
def test_cpu():
    n = 1024
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')

    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 8
        ib = tvm.ir_builder.create()
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.for_range(0, n, name="i") as i:
            Cptr[i] = Aptr[i] + Bptr[i]
        body = ib.get()
        return body

    C = tvm.extern(A.shape, [A, B],
                   lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)

    def check_target(target):
        if not tvm.runtime.enabled(target):
            return
        # Build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # Launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("llvm")
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0], ins[1],
                         outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0], ins[1],
                         outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(),
                                np.array(sorted_index).astype(out.dtype),
                                rtol=1e-5)
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n / 2) as i:
            ib.emit(outs[0].vstore(
                i * 2,
                ins[0].vload(i * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # Build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # Launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_llvm()
def test_pack_buffer_intermediate():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1, name="B")

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func2", ins[0], outs[0])

    C = tvm.extern(B.shape, [B], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # Build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # Launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

        @tvm.register_func
        def my_extern_array_func2(aa, bb):
            assert aa.shape == a.shape
            tvm.testing.assert_allclose(aa.asnumpy(), a.asnumpy() + 1)
            aa.copyto(bb)

        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
def test_pack_buffer_simple():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func1", ins[0], outs[0])

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    @tvm.register_func
    def my_extern_array_func1(aa, bb):
        aa.copyto(bb)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # Build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # Launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy())

    check_target("stackvm")
    check_target("llvm")
def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
    """Get the top k elements in an input tensor along the given axis.

    Parameters
    ----------
    data : tvm.Tensor
        The input tensor.

    k : int, optional
        Number of top elements to select. Return all elements if k < 1.

    axis : int, optional
        Axis along which to sort the input tensor.

    ret_type : str, optional
        The return type [both, values, indices].
        "both": return both top k data and indices.
        "values": return top k data only.
        "indices": return top k indices only.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        The data type of the indices output.

    Returns
    -------
    out : tvm.Tensor or List[tvm.Tensor]
        The computed result.
    """
    assert ret_type in ["both", "values", "indices"]
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    out_shape = list(get_const_tuple(data.shape))
    if k >= 1:
        out_shape[axis] = k
    out_bufs = []
    if ret_type in ["both", "values"]:
        out_bufs.append(api.decl_buffer(out_shape, data.dtype, "value_buf",
                                        data_alignment=8))
    if ret_type in ["both", "indices"]:
        out_bufs.append(api.decl_buffer(out_shape, dtype, "indices_buf",
                                        data_alignment=8))
    out_shapes = [out_shape] * len(out_bufs)
    out = tvm.extern(out_shapes,
                     [data],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.topk", ins[0], *outs,
                         k, axis, ret_type, is_ascend),
                     in_buffers=[data_buf],
                     out_buffers=out_bufs,
                     name="topk_cpu",
                     tag="topk_cpu")
    return out
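# A minimal usage sketch for the topk operator above (hedged: assumes TVM was
# built with the sort contrib so the "tvm.contrib.sort.topk" packed function
# is registered; both outputs share the same extern op).
data = tvm.placeholder((2, 10), name="data")
values, indices = topk(data, k=3, axis=-1, ret_type="both", is_ascend=False)
s = tvm.create_schedule(values.op)
f = tvm.build(s, [data, values, indices], "llvm")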
def test_extern():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')

    def extern_func(ins, outs):
        assert isinstance(ins[0], tvm.schedule.Buffer)
        return tvm.call_packed("myadd", ins[0].data, outs[0].data, m)

    B = tvm.extern((m,), [A], extern_func)
    assert tuple(B.shape) == (m,)
def test_extern_multi_out():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    B = tvm.compute((m,), lambda i: A[i] * 10)

    def extern_func(ins, outs):
        assert isinstance(ins[0], tvm.schedule.Buffer)
        return tvm.call_packed(
            "myadd", ins[0].data, outs[0].data, outs[1].data, m)

    res = tvm.extern([A.shape, A.shape], [A, B], extern_func)
    assert len(res) == 2
    assert res[1].value_index == 1
def test_gpu():
    n = tvm.var('n')
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    fld = tvm.floordiv

    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 32
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", fld(n + max_threads - 1, max_threads))
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.if_scope(ib.likely(idx < n)):
            Cptr[idx] = Aptr[idx] + Bptr[idx]
        body = ib.get()
        return body

    C = tvm.extern(A.shape, [A, B],
                   lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)

    def check_target(target):
        n = 1024
        if not tvm.module.enabled(target):
            return
        # Build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # Launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("opencl")
    check_target("cuda")
def sparse_transpose(sparse_data, sparse_indices, sparse_indptr):
    """Transpose a square sparse matrix; `A` is an n-by-n sparse matrix in the
    CSR format.

    ** Currently only supports square matrices **

    Parameters
    ----------
    sparse_data : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'float32'.

    sparse_indices : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'int32'.

    sparse_indptr : tvm.Tensor
        1-D with shape [n+1], dtype of 'int32'.

    Returns
    -------
    out_data : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'float32'.

    out_indices : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'int32'.

    out_indptr : tvm.Tensor
        1-D with shape [n+1], dtype of 'int32'.
    """
    assert len(sparse_data.shape) == 1, "error in data dimension"
    assert len(sparse_indices.shape) == 1, "error in indices dimension"
    assert len(sparse_indptr.shape) == 1, "error in indptr dimension"

    nnz = get_const_tuple(sparse_data.shape)[0]
    n = get_const_tuple(sparse_indptr.shape)[0] - 1
    output_shape = [(nnz,), (nnz,), (n + 1,)]

    # TODO: Add BSR transpose support.
    output_data, output_indices, output_indptr = tvm.extern(
        shape=output_shape,
        inputs=[sparse_data, sparse_indices, sparse_indptr],
        fcompute=lambda ins, outs: _csr_transpose_ir(
            ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]),
        tag="sparse_transpose_csr",
        dtype=['float32', 'int32', 'int32'],
        name='out')

    return [output_data, output_indices, output_indptr]
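# Usage sketch for sparse_transpose on a random CSR matrix (hedged: scipy is
# used only to generate reference input shapes; assumes the _csr_transpose_ir
# helper from the same file and an llvm build).
import scipy.sparse as sp

X = sp.random(4, 4, density=0.5, format="csr", dtype="float32")
data = tvm.placeholder(X.data.shape, dtype="float32", name="data")
indices = tvm.placeholder(X.indices.shape, dtype="int32", name="indices")
indptr = tvm.placeholder(X.indptr.shape, dtype="int32", name="indptr")
out_data, out_indices, out_indptr = sparse_transpose(data, indices, indptr)
s = tvm.create_schedule(out_data.op)
f = tvm.build(s, [data, indices, indptr, out_data, out_indices, out_indptr], "llvm")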
def gen_copy_reduce_sum(isfwd):
    indptrN = tvm.var('indptrN')
    indicesN = tvm.var('indicesN')
    outN = tvm.var('outN')
    inN = tvm.var('inN')
    x_len = tvm.var('x_len')
    indices = tvm.placeholder((indicesN,), name='indices', dtype=tvm.int32)
    indptr = tvm.placeholder((indptrN,), name='indptr', dtype=tvm.int32)
    inbuf = tvm.placeholder((inN, x_len), name='inbuf', dtype=tvm.float32)
    # outbuf = tvm.placeholder((outN, x_len), name='outbuf')

    def gen(ins, outs):
        irb = tvm.ir_builder.create()
        outptr = irb.buffer_ptr(outs[0])
        gen_zero_out_tensor(irb, outs[0])
        block_size = 32
        x_len_s = topi.util.simplify(x_len)
        '''with irb.for_range(0, tvm.floordiv(x_len_s + (block_size - 1), block_size),
                           for_type="parallel", name='blkIdx') as blkIdx:
            def workload(irb, src, dst, eid, inptr):
                with irb.for_range(0, blkIdx * block_size, name='i') as i:  # for_type="vectorize"
                    with irb.if_scope(irb.likely(blkIdx * block_size + i < x_len_s)):
                        if isfwd:
                            outptr[dst * x_len_s + blkIdx * block_size + i] += \
                                inptr[src * x_len_s + blkIdx * block_size + i]
                        else:
                            outptr[src * x_len_s + blkIdx * block_size + i] += \
                                inptr[dst * x_len_s + blkIdx * block_size + i]
            gen_csr_iterate(irb, ins[0], ins[1], False, workload, inptr=ins[2])'''

        def for_each_edge(irb, src, dst, eid, inptr):
            def assign(idx):
                if isfwd:
                    outptr[dst * x_len_s + idx] += inptr[src * x_len_s + idx]
                else:
                    outptr[src * x_len_s + idx] += inptr[dst * x_len_s + idx]
            gen_vectorized_for_loop(irb, x_len_s, simd_size, assign)

        gen_csr_iterate(irb, ins[0], ins[1], not isfwd, for_each_edge, inptr=ins[2])
        '''def workload(irb, src, dst, eid, inptr):
            blkSize = 16
            # with irb.for_range(0, tvm.floordiv(x_len_s, blkSize), name='x_len.outer') as outer:  # for_type="vectorize"
            with irb.for_range(0, x_len_s, name='x_len.inner') as inner:
                if isfwd:
                    outptr[dst * x_len_s + inner] += inptr[src * x_len_s + inner]
                else:
                    outptr[src * x_len_s + inner] += inptr[dst * x_len_s + inner]
        gen_csr_iterate(irb, ins[0], ins[1], True, workload, inptr=ins[2])'''
        return irb.get()

    C = tvm.extern((outN, x_len), [indices, indptr, inbuf], gen,
                   dtype=tvm.float32, name="C")
    return C, indices, indptr, inbuf
def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True,
                           threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor
    """
    batch_size = cls_prob.shape[0]
    num_anchors = anchor.shape[1]
    oshape = (batch_size, num_anchors, 6)
    # Define data alignment for the intermediate buffers.
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8)
    valid_count, out = \
        tvm.extern([(batch_size,), oshape],
                   [cls_prob, loc_pred, anchor],
                   lambda ins, outs: transform_loc_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       clip, threshold, variances),
                   dtype=[valid_count_dtype, cls_prob.dtype],
                   out_buffers=[valid_count_buf, out_buf],
                   tag="multibox_transform_loc")
    return [out, valid_count]
def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1),
                       offsets=(0.5, 0.5), clip=False):
    """Generate prior (anchor) boxes from data, sizes and ratios.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, c_in, h_in, w_in].

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of float
        Priorbox center offsets, y and x respectively.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4].
    """
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
    out = tvm.extern(oshape, [data],
                     lambda ins, outs: multibox_prior_ir(
                         ins[0], outs[0], sizes, ratios, steps, offsets),
                     tag="multibox_prior")
    if clip:
        out = topi.clip(out, 0, 1)
    return out
def lesson1():
    ######################################################################
    # Use Extern Tensor Function
    # --------------------------
    # In the example below, we use :any:`tvm.extern` to add an extern
    # array function call. In the extern call, we declare the shape of
    # the output tensors. In the second argument we provide the list of
    # inputs.
    #
    # The user needs to provide a function describing how to compute the
    # result. The compute function takes a list of symbolic placeholders
    # for the inputs, a list of symbolic placeholders for the outputs,
    # and returns the executing statement.
    #
    # In this case we simply call a registered TVM function, which invokes
    # a CBLAS call. TVM does not control the internals of the extern array
    # function and treats it as a black box. We can further mix schedulable
    # TVM calls that add a bias term to the result.
    #
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = tvm.extern((n, m), [A, B],
                   lambda ins, outs: tvm.call_packed(
                       "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0],
                       False, False),
                   name="C")
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    ######################################################################
    # Verify the Result
    # -----------------
    # We can verify that the result matches what we expected.
    #
    ctx = tvm.cpu(0)
    f = tvm.build(s, [A, B, D, bias], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
    bb = 10.0
    f(a, b, d, bb)
    np.testing.assert_allclose(d.asnumpy(),
                               np.dot(a.asnumpy(), b.asnumpy()) + 10,
                               rtol=1e-5)
def gen_binary_op_dot_bwd_lhs(islhs):
    indptrN = tvm.var('indptrN')
    indicesN = tvm.var('indicesN')
    rhsDataN = tvm.var('rhsDataN')
    gradoutDataN = tvm.var('gradoutDataN')
    lhsgradoutDataN = tvm.var('lhsgradoutDataN')
    # xLen = tvm.var('xLen')
    xLen = 1  # fix-me: we eliminated the x_len dimension here
    dataLen = tvm.var('dataLen')
    indices = tvm.placeholder((indicesN,), name='indices', dtype=tvm.int32)
    indptr = tvm.placeholder((indptrN,), name='indptr', dtype=tvm.int32)
    rhsData = tvm.placeholder((rhsDataN, dataLen), name='rhsData', dtype=tvm.float32)
    gradoutData = tvm.placeholder((gradoutDataN,), name='gradoutData',
                                  dtype=tvm.float32)
    outMapping = tvm.placeholder((gradoutDataN,), name='outMapping',
                                 dtype=tvm.int32)
    # lhsgradoutData = tvm.placeholder((lhsgradoutDataN, xLen, dataLen),
    #                                  name='lhsgradoutData', dtype=tvm.float32)

    def gen_func(ins, outs):
        irb = tvm.ir_builder.create()
        gen_zero_out_tensor(irb, outs[0])
        indices, indptr, rhsData, gradoutData, outMapping = \
            ins[0], ins[1], ins[2], ins[3], ins[4]

        # with irb.for_range(0, xLen, name='i') as i:
        def for_each_edge(irb, src, dst, eid, rhsDataPtr, gradoutDataPtr,
                          lhsgradoutDataPtr, outMappingPtr):
            lhsIdx = topi.util.simplify(src * dataLen)
            outIdx = topi.util.simplify(outMappingPtr[eid])
            rhsIdx = topi.util.simplify(dst * dataLen)
            grad = gradoutDataPtr[outIdx]

            def fcompute(j):
                if islhs:
                    lhsgradoutDataPtr[lhsIdx + j] += grad * rhsDataPtr[rhsIdx + j]
                else:
                    lhsgradoutDataPtr[rhsIdx + j] += grad * rhsDataPtr[lhsIdx + j]

            gen_vectorized_for_loop(irb, dataLen, simd_size, fcompute)

        gen_csr_iterate(irb, indices, indptr, islhs, for_each_edge,
                        rhsDataPtr=rhsData, gradoutDataPtr=gradoutData,
                        lhsgradoutDataPtr=outs[0], outMappingPtr=outMapping)
        return irb.get()

    # outbuf = tvm.placeholder((outN, x_len), name='outbuf')
    C = tvm.extern((lhsgradoutDataN, dataLen),
                   [indices, indptr, rhsData, gradoutData, outMapping],
                   gen_func, dtype=tvm.float32, name="lhsgradoutData")
    return C, indices, indptr, rhsData, gradoutData, outMapping
def main():
    ctx = tvm.cpu(0)
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = tvm.extern((n, m), [A, B],
                   lambda ins, outs: tvm.call_packed(
                       "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0],
                       False, False),
                   name="C")
    D = tvm.compute(C.shape, lambda i, j: C(i, j) + bias, name="D")
    s = tvm.create_schedule(D.op)
    f = tvm.build(s, [A, B, D, bias], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
    bb = 10.0
    # Launch the kernel before checking the result.
    f(a, b, d, bb)
    print(d.asnumpy())
    tvm.testing.assert_allclose(d.asnumpy(),
                                np.dot(a.asnumpy(), b.asnumpy()) + 10,
                                rtol=1e-5)
rD = ((P * Q * N - 1) // block_len + 1) * block_len
rF = ((K - 1) // block_len + 1) * block_len
loop_len = (rD * rF // block_len // block_len - 1) // block_num + 1  # length of the main loop
index_len = loop_len * cD // 16 * 8 + 8
print(loop_len)
print(P, Q, cD)
OFFIND = np.ones((block_num, thread_num), dtype=np.int32)
with tvm.target.create('cuda'):
    D = tvm.placeholder((N, C, H, W), dtype='float16')
    F = tvm.placeholder((K, C, R, S), dtype='float16')
    LOAD_INDEX_D = tvm.placeholder((block_num, thread_num, index_len), dtype='int32')
    LOAD_INDEX_F = tvm.placeholder((block_num, thread_num, index_len), dtype='int32')
    O = tvm.extern((N, K, P, Q), [D, F, LOAD_INDEX_D, LOAD_INDEX_F],
                   lambda ins, outs: convolutionf16(
                       ins[0], ins[1], ins[2], ins[3], outs[0]),
                   name="conv", dtype='float16')
    s = schedule_conv_fp16()
    print(tvm.lower(s, [D, F, LOAD_INDEX_D, LOAD_INDEX_F, O],
                    name='convf16', simple_mode=True))
    f = tvm.build(s, [D, F, LOAD_INDEX_D, LOAD_INDEX_F, O],
                  target='cuda', name='conv')
    print("build finished")
ctx = tvm.context('cuda', 0)
a_np = np.float16(np.random.uniform(0., 1., size=(N, C, H, W)))
b_np = np.float16(np.random.uniform(0., 1., size=(K, C, R, S)))
c_np = np.zeros((N, K, P, Q), dtype=O.dtype)
d1_np = -1 * np.ones((block_num, thread_num, index_len), dtype=np.int32)
d2_np = -1 * np.ones((block_num, thread_num, index_len), dtype=np.int32)
# d1_np = np.zeros((rD, cD), dtype=np.int32)
# d2_np = np.zeros((rF, cF), dtype=np.int32)
print("now start compute index")
def non_max_suppression_gpu(data, valid_count, max_output_size=-1,
                            iou_threshold=0.5, force_suppress=False, top_k=-1,
                            coord_start=2, score_index=1, id_index=0,
                            return_indices=True, invalid_to_bottom=False):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    max_output_size : optional, int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        Index of the class categories, -1 to disable.

    return_indices : boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32",
                                      name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data=data, valid_count=valid_count,
                                  iou_threshold=iou_threshold,
                                  force_suppress=force_suppress,
                                  top_k=top_k, return_indices=False)
        np_data = np.random.uniform(dshape)
        np_valid_count = np.array([4])
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "cuda")
        ctx = tvm.gpu(0)
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    score_axis = score_index
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis],
                               tag=tag.ELEMWISE)
    sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1,
                          is_ascend=False)
    sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype,
                                      "sort_tensor_buf", data_alignment=8)
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf",
                               data_alignment=8)
    out_buf = api.decl_buffer(data.shape, data.dtype, "out_buf",
                              data_alignment=8)
    out, box_indices = \
        tvm.extern([data.shape, score_shape],
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       max_output_size, iou_threshold, force_suppress,
                       top_k, coord_start, id_index, score_index),
                   dtype=[data.dtype, "int32"],
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   name="nms",
                   tag="nms")
    if return_indices:
        return box_indices
    if invalid_to_bottom:
        output_buf = api.decl_buffer(data.shape, data.dtype, "output_buf",
                                     data_alignment=8)
        temp_flag_buf = api.decl_buffer(score_shape, valid_count_dtype,
                                        "temp_flag", data_alignment=8)
        temp_idx_buf = api.decl_buffer(score_shape, valid_count_dtype,
                                       "temp_idx", data_alignment=8)
        temp_flag, temp_idx = tvm.extern(
            [score_shape, score_shape], [out],
            lambda ins, outs: invalid_to_bottom_pre(ins[0], outs[0], outs[1]),
            dtype=["int32", "int32"],
            in_buffers=[out_buf],
            out_buffers=[temp_flag_buf, temp_idx_buf],
            name="invalid_to_bottom_phase_one")
        output = tvm.extern(
            [data.shape], [out, temp_flag, temp_idx],
            lambda ins, outs: invalid_to_bottom_ir(
                ins[0], ins[1], ins[2], outs[0]),
            dtype=[data.dtype],
            in_buffers=[out_buf, temp_flag_buf, temp_idx_buf],
            out_buffers=[output_buf],
            name="invalid_to_bottom",
            tag="invalid_to_bottom")
        return output
    return out
def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1):
    """Get valid count of bounding boxes given a score threshold.
    Also moves valid boxes to the top of the input data.

    Parameters
    ----------
    data : tvm.Tensor
        Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].

    score_threshold : optional, float
        Lower limit of score for valid bounding boxes.

    id_index : optional, int
        Index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    out_tensor : tvm.Tensor
        Rearranged data tensor.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    new_range = num_anchors // elem_per_thread + 1
    temp_flag_buf = api.decl_buffer((batch_size, num_anchors,), "int32",
                                    "temp_flag", data_alignment=8)
    temp_idx_buf = api.decl_buffer((batch_size, num_anchors,), "int32",
                                   "temp_idx", data_alignment=8)
    temp_partial_buf = api.decl_buffer((batch_size, new_range), "int32",
                                       "temp_partial", data_alignment=8)
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf",
                               data_alignment=8)

    temp_flag, temp_idx = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)],
                   [data],
                   lambda ins, outs: get_valid_counts_pre(
                       ins[0], outs[0], outs[1],
                       score_threshold, id_index, score_index),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_one")
    temp_idx_new, temp_partial = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)],
                   [data, temp_idx],
                   lambda ins, outs: get_valid_counts_upsweep(
                       ins[0], ins[1], outs[0], outs[1]),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_idx_buf, temp_partial_buf],
                   name="get_valid_counts_phase_two")
    temp_partial_new = \
        tvm.extern([(batch_size, new_range)],
                   [data, temp_partial],
                   lambda ins, outs: get_valid_counts_scan(
                       ins[0], ins[1], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_partial_buf],
                   name="get_valid_counts_phase_three")
    temp_idx_final = \
        tvm.extern([(batch_size, num_anchors)],
                   [data, temp_idx_new, temp_partial_new],
                   lambda ins, outs: get_valid_counts_downsweep(
                       ins[0], ins[1], ins[2], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_idx_buf],
                   name="get_valid_counts_phase_four")
    valid_count, out_tensor = \
        tvm.extern([(batch_size,), data.shape],
                   [data, temp_flag, temp_idx_final],
                   lambda ins, outs: get_valid_counts_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1]),
                   dtype=["int32", data.dtype],
                   in_buffers=[data_buf, temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_five",
                   tag="get_valid_counts_gpu")
    return [valid_count, out_tensor]
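# Hedged usage sketch for get_valid_counts_gpu: the operator reads
# max_num_threads from the current target, so it must be constructed under a
# CUDA target context (shapes here are illustrative).
with tvm.target.create("cuda"):
    data = tvm.placeholder((1, 2500, 6), name="data")
    valid_count, out_tensor = get_valid_counts_gpu(data, score_threshold=0.0)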
def dense_sw(data, w_data, w_indices, w_indptr, bias=None):
    # pylint: disable=invalid-name
    """The implementation of dense in topi, assuming a sparse weight.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [m, k]

    w_data : tvm.Tensor
        1-D with shape [nonzeros]

    w_indices : tvm.Tensor
        1-D with shape [nonzeros]

    w_indptr : tvm.Tensor
        1-D with shape [n+1]

    bias : tvm.Tensor, optional
        1-D with shape [n]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [m, n]
    """
    assert len(w_data.shape) == 1 and len(w_indices.shape) == 1 \
        and len(w_indptr.shape) == 1 and len(data.shape) == 2, \
        "only support 2-dim dense"
    assert isinstance(data, tvm.tensor.Tensor), \
        "data matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(data))
    if bias is not None:
        assert len(bias.shape) == 1
    dtype = data.dtype
    M, _ = data.shape
    N = simplify(w_indptr.shape[0] - 1)

    def dense_default_ir(data, w_data, w_indices, w_indptr, out):
        """Define IR for Dense."""
        dtype = data.dtype
        irb = tvm.ir_builder.create()
        data_ptr = irb.buffer_ptr(data)
        w_data_ptr = irb.buffer_ptr(w_data)
        w_indices_ptr = irb.buffer_ptr(w_indices)
        w_indptr_ptr = irb.buffer_ptr(w_indptr)
        out_ptr = irb.buffer_ptr(out)
        M, K = data.shape
        N = simplify(w_indptr.shape[0] - 1)
        with irb.for_range(0, M, for_type="vectorize", name='m') as m:
            with irb.for_range(0, N, for_type="parallel", name='n') as n:
                dot = irb.allocate(dtype, (1,), name='dot', scope='local')
                out_ptr[m * N + n] = tvm.const(0, dtype)
                dot[0] = tvm.const(0, dtype)
                row_start = w_indptr_ptr[n]
                row_elems = w_indptr_ptr[n + 1] - row_start
                with irb.for_range(0, row_elems, name='k') as k:
                    elem = row_start + k
                    dot[0] += w_data_ptr[elem] * data_ptr[w_indices_ptr[elem] + m * K]
                out_ptr[m * N + n] += dot[0]
        return irb.get()

    oshape = (M, N)
    matmul = tvm.extern(oshape, [data, w_data, w_indices, w_indptr],
                        lambda ins, outs: dense_default_ir(
                            ins[0], ins[1], ins[2], ins[3], outs[0]),
                        tag="dense", dtype=dtype, name='out')
    if bias is not None:
        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j],
                             tag=tag.BROADCAST)
    return matmul
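# Usage sketch for dense_sw with a random CSR weight matrix (hedged: scipy is
# used only to shape the inputs, and an llvm build is assumed).
import scipy.sparse as sp

M, K, N = 4, 8, 3
W = sp.random(N, K, density=0.5, format="csr", dtype="float32")
data = tvm.placeholder((M, K), name="data")
w_data = tvm.placeholder(W.data.shape, dtype="float32", name="w_data")
w_indices = tvm.placeholder(W.indices.shape, dtype="int32", name="w_indices")
w_indptr = tvm.placeholder(W.indptr.shape, dtype="int32", name="w_indptr")
out = dense_sw(data, w_data, w_indices, w_indptr)
s = tvm.create_schedule(out.op)
f = tvm.build(s, [data, w_data, w_indices, w_indptr, out], "llvm")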
def proposal_cuda(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride,
                  threshold, rpn_pre_nms_top_n, rpn_post_nms_top_n,
                  rpn_min_size, iou_loss):
    """Proposal operator.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        4-D with shape [batch, 2 * num_anchors, height, width]

    bbox_pred : tvm.Tensor
        4-D with shape [batch, 4 * num_anchors, height, width]

    im_info : tvm.Tensor
        2-D with shape [batch, 3]

    scales : list/tuple of float
        Scales of anchor windows.

    ratios : list/tuple of float
        Ratios of anchor windows.

    feature_stride : int
        The size of the receptive field each unit in the convolution layer of
        the rpn, for example the product of all strides prior to this layer.

    threshold : float
        Non-maximum suppression threshold.

    rpn_pre_nms_top_n : int
        Number of top scoring boxes to apply NMS. -1 to use all boxes.

    rpn_post_nms_top_n : int
        Number of top scoring boxes to keep after applying NMS to RPN proposals.

    rpn_min_size : int
        Minimum height or width in proposal.

    iou_loss : bool
        Usage of IoU loss.

    Returns
    -------
    out : tvm.Tensor
        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last
        dimension is in format of [batch_index, w_start, h_start, w_end, h_end].
    """
    batch, _, height, width = get_const_tuple(cls_prob.shape)
    num_anchors = len(scales) * len(ratios)
    num_bbox = height * width * num_anchors
    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) \
        if rpn_pre_nms_top_n > 0 else num_bbox

    bbox = tvm.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info],
                      lambda ins, outs: predict_bbox_ir(
                          ins[0], ins[1], ins[2], outs[0], scales, ratios,
                          feature_stride, rpn_min_size, iou_loss),
                      dtype=bbox_pred.dtype)
    score = tvm.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4],
                        tag='bbox_score')
    sorted_index = tvm.extern([score.shape], [score],
                              lambda ins, outs: argsort_ir(ins[0], outs[0]),
                              dtype='int32')
    sorted_bbox = tvm.compute((batch, rpn_pre_nms_top_n, 5),
                              lambda b, i, j: bbox[b, sorted_index[b, i], j],
                              tag='sorted_bbox')
    nms_remove_mask = tvm.extern((batch, rpn_pre_nms_top_n), [sorted_bbox],
                                 lambda ins, outs: nms_ir(
                                     ins[0], outs[0], threshold),
                                 dtype='bool')
    nms_out = tvm.extern((batch * rpn_post_nms_top_n, 5),
                         [sorted_bbox, nms_remove_mask],
                         lambda ins, outs: prepare_output_ir(
                             ins[0], ins[1], outs[0]),
                         dtype=sorted_bbox.dtype)
    return nms_out
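# Hedged usage sketch for proposal_cuda with toy Faster R-CNN shapes
# (9 anchors = 3 scales x 3 ratios; all sizes here are illustrative).
cls_prob = tvm.placeholder((1, 2 * 9, 14, 14), name="cls_prob")
bbox_pred = tvm.placeholder((1, 4 * 9, 14, 14), name="bbox_pred")
im_info = tvm.placeholder((1, 3), name="im_info")
out = proposal_cuda(cls_prob, bbox_pred, im_info,
                    scales=(8, 16, 32), ratios=(0.5, 1, 2),
                    feature_stride=16, threshold=0.7,
                    rpn_pre_nms_top_n=6000, rpn_post_nms_top_n=300,
                    rpn_min_size=16, iou_loss=False)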
# f = tvm.build(s, [A, B, bias, D], 'llvm')
a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx=ctx)
b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx=ctx)
d = tvm.nd.array(np.zeros(shape=(n, m), dtype=D.dtype), ctx=ctx)
bb = 10.0
f(a, b, d, bb)
np.testing.assert_allclose(d.asnumpy(),
                           np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)
print(d.shape)


@tvm.register_func('tvm.contrib.my_tvm_add_one')
def my_tvm_add_one(x, y):
    print('my tvm add one signatures: %s, %s' % (type(x), type(y)))
    tvm.nd.array(x.asnumpy() + 1).copyto(y)


A = tvm.placeholder((n,), name='A')
B = tvm.extern(A.shape, [A],
               lambda ins, outs: tvm.call_packed(
                   'tvm.contrib.my_tvm_add_one', ins[0], outs[0]),
               name='C')
s = tvm.create_schedule(B.op)
f = tvm.build(s, [A, B], 'llvm')
a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx=ctx)
b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx=ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)
print(b.shape)
def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend):
    """Generate the low-level IR that performs argsort on the GPU.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    data_buf : Buffer
        2-D Buffer of the input boxes' scores with shape
        [batch_size, num_anchors].

    index : tvm.Tensor
        1-D tensor for valid number of boxes.

    index_buf : Buffer
        Buffer of the number of valid boxes.

    output_buf : Buffer
        Output buffer of indices of the sorted tensor.

    axis : int
        The axis used for sorting.

    is_descend : bool
        If the sorted data is in descending order.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors].
    """
    ndim = len(data.shape)
    assert data.dtype == "float32", \
        "Currently only supports input dtype to be float32"
    assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim

    axis_mul_before = 1
    axis_mul_after = 1
    if axis < 0:
        axis = ndim + axis
    for i in range(0, ndim):
        if i < axis:
            axis_mul_before *= data.shape[i]
        elif i > axis:
            axis_mul_after *= data.shape[i]

    dshape = axis_mul_before * axis_mul_after
    fshape = data.shape[axis] * dshape
    loc_buf = api.decl_buffer(dshape, index.dtype, "sizes", data_alignment=8)
    new_index_buf = api.decl_buffer(fshape, index.dtype, "index_new",
                                    data_alignment=8)
    out_index_buf = api.decl_buffer(fshape, index.dtype, "index_out",
                                    data_alignment=8)
    new_data_buf = api.decl_buffer(dshape, data.dtype, "data_new",
                                   data_alignment=8)

    loc = \
        tvm.extern([(dshape,)],
                   [index],
                   lambda ins, outs: sort_pre_ir(
                       ins[0], outs[0], axis_mul_before, axis_mul_after),
                   dtype=[index.dtype],
                   in_buffers=index_buf,
                   out_buffers=[loc_buf],
                   tag="sorting_prepare")
    data_new, index_new = \
        tvm.extern([(dshape,), (fshape,)],
                   [data, index, loc],
                   lambda ins, outs: sort_pre_ir_data(
                       ins[0], ins[1], ins[2], outs[0], outs[1], axis,
                       axis_mul_before, axis_mul_after),
                   dtype=[data.dtype, index.dtype],
                   in_buffers=[data_buf, index_buf, loc_buf],
                   out_buffers=[new_data_buf, new_index_buf],
                   tag="sorting_data")
    index_out = \
        tvm.extern([(fshape,)],
                   [data, index, data_new, index_new, loc],
                   lambda ins, outs: sort_oet_ir(
                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0],
                       axis_mul_before, axis_mul_after, axis, is_descend),
                   dtype=[index.dtype],
                   in_buffers=[data_buf, index_buf, new_data_buf,
                               new_index_buf, loc_buf],
                   out_buffers=[out_index_buf],
                   tag="sorting_oet")
    out = \
        tvm.extern([data.shape],
                   [data, index, index_out, loc],
                   lambda ins, outs: sort_ir_out(
                       ins[0], ins[1], ins[2], ins[3], outs[0],
                       axis_mul_before, axis_mul_after, axis),
                   dtype=[index.dtype],
                   in_buffers=[data_buf, index_buf, out_index_buf, loc_buf],
                   out_buffers=output_buf,
                   tag="sorting_output")
    return out
def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False,
            nms_topk=-1):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32",
                                      name="valid_count")
        nms_threshold = 0.7
        force_suppress = True
        nms_topk = -1
        out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk)
        np_data = np.random.uniform(dshape)
        np_valid_count = np.array([4])
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf",
                               data_alignment=8)
    score_axis = 1
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis],
                               name="score_tensor")
    score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype,
                                       "score_tensor_buf", data_alignment=8)
    sort_tensor_dtype = "int32"
    sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                      "sort_tensor_buf", data_alignment=8)
    sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count,
                           valid_count_buf, sort_tensor_buf, score_axis, True)
    out = \
        tvm.extern(data.shape,
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], nms_threshold,
                       force_suppress, nms_topk),
                   dtype="float32",
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   tag="nms")
    return out
def measure_compute_mad(total_item, item_per_thread, base_type, bits, lanes,
                        target, target_host, remote, ctx, n_times):
    """Measure peak compute speed by computing mad for a type.

    The IR for measurement is::

        for each thread
            for i in 1..item_per_thread
                x = mad(x, x, y)
                y = mad(y, y, x)

    Parameters
    ----------
    total_item : int
        number of elements in the input array

    item_per_thread : int
        number of operations each thread does

    base_type : str
        can be "int", "float"

    bits : int
        can be 16, 32

    lanes : int
        lane of the vector type, can be 1, 2, 4, 8, 16

    target : :any:`tvm.target.Target`
        the target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        host compilation target

    remote : tvm.rpc.RPCSession
        if it is not None, use remote rpc session

    ctx : TVMContext
        the context of the array

    n_times : int
        number of runs for taking the mean

    Returns
    -------
    GOPS : float
        giga operations per second
    """
    n = total_item
    if bits >= 64 or lanes >= 16:
        n //= 2
    max_threads = target.max_num_threads
    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    def extern(ins, outs):  # pylint: disable=unused-argument
        """Construct the measurement function by building IR directly."""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", n // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        a = ib.allocate(dtype, (1), name='a', scope='local')
        b = ib.allocate(dtype, (1), name='b', scope='local')
        a[0] = outs[0].vload(idx, dtype)
        b[0] = outs[0].vload(idx, dtype)
        if base_type.find('float') != -1:
            mad_func = lambda x, y: (x * x + y)
        else:
            mad_func = lambda x, y: y * y + x
        for _ in range(item_per_thread // 4 // lanes):
            a[0] = mad_func(a[0], b[0])
            b[0] = mad_func(b[0], a[0])
        ib.emit(outs[0].vstore(idx, b[0]))
        return ib.get()

    y = tvm.extern((n,), [], extern, name="y", dtype=dtype)
    s = tvm.create_schedule(y.op)
    try:
        func = tvm.build(s, [y], target, target_host=target_host)
        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, ctx, number=n_times)
        y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx)
        time = time_f(y).mean
    except tvm._ffi.base.TVMError:
        # Build error (occurs when the device does not support half).
        return -1
    return 1.0 * (n * item_per_thread) / 1e9 / time
def conv2d_c16str1HMMA(cfg, data, kernel, data_shape, kernel_shape,
                       output_shape, dilation, dir_path):
    # print the sizes of the current layer
    print("data size %s:" % data.name, data_shape)
    print("kernel size in layer %s:" % kernel.name, kernel_shape)
    print(output_shape)
    fortype = "unroll"

    # block parameters
    blk_q = 8
    blk_p = 8
    blk_size = blk_p * blk_q
    ko_part = 2

    # tiling parameters
    block_row_warp = 2
    block_col_warp = 2
    warp_row_tile = 2
    warp_col_tile = 2

    # offset presets (shift pads each 16-wide row to avoid bank conflicts)
    shift = 8
    offset_D_im2col = (2 + blk_q) * (2 + blk_p) * 16
    offset_F = offset_D_im2col + (shift + 16) * blk_size
    npq = output_shape[1] * output_shape[2] // blk_size

    # shared memory usage
    output_copy = blk_size * blk_size
    im2col_use = offset_D_im2col + (shift + 16) * blk_size * 2
    shmem_use = max(output_copy, im2col_use)

    def convolutionfp16(D, F, shmem):
        # IR builder for constructing the main body
        ib = tvm.ir_builder.create()

        # id of the current warp and the shared-memory offset used when storing;
        # tidx (the thread index within a block) is assumed to be bound elsewhere
        warpid = tidx / 32
        warp_offset_output = warpid % block_row_warp * 16 * warp_row_tile \
            + warpid / block_row_warp * warp_col_tile * block_row_warp * warp_row_tile * 256

        # include the necessary header file
        include_file = tvm.call_intrin("float32", "include_cpp_head",
                                       dir_path + "/conv2d_HMMA.h")
        ib.emit(include_file)

        # declare the matrix fragments
        declare_a = tvm.call_intrin("float32", "wmma_fragment", "matrix_a",
                                    "half", "row_major", "a_frag", warp_col_tile)
        declare_b = tvm.call_intrin("float32", "wmma_fragment", "matrix_b",
                                    "half", "col_major", "b_frag", warp_row_tile)
        declare_c = tvm.call_intrin("float32", "wmma_fragment", "accumulator",
                                    "half", "c_frag", warp_col_tile, warp_row_tile)
        ib.emit(declare_a)
        ib.emit(declare_b)
        ib.emit(declare_c)

        # per-thread offsets into shared memory for loading data
        offset_D_warp = offset_D_im2col + tidx / 2 * (16 + shift) + tidx % 2 * 8
        offset_F_warp = offset_F + tidx / 2 * (16 + shift) + tidx % 2 * 8

        # IR template for thread synchronization
        sync = tvm.call_extern("float32", "__syncthreads")

        # main body of the computation:
        # set the pointers to the first addresses of D, shmem and F
        Dp = D.access_ptr("r")
        Sp = shmem.access_ptr("r")
        Fp = F.access_ptr("r")

        # load the first data tile from global memory; it is reused 9 times
        load_first_data = tvm.call_extern(
            "float32", "load_matrix_D", Dp, Sp,
            output_shape[0], data_shape[1], data_shape[2], data_shape[3],
            0, dilation, 0)
        ib.emit(load_first_data)

        # load the first filter tile from global memory
        load_filter = tvm.call_extern(
            "float32", "load_matrix_F", Fp, Sp, offset_F_warp,
            kernel_shape[0], kernel_shape[3],
            data_shape[0], data_shape[1], data_shape[2],
            tidx % 2 * 8, 0, 0)
        ib.emit(load_filter)

        # fill fragment c with zeros
        with ib.for_range(0, warp_col_tile, name="col_id_fi") as col_id_fi:
            with ib.for_range(0, warp_row_tile, name="row_id_fi") as row_id_fi:
                fill_O_zero = tvm.call_intrin("float", "wmma_fill_fragment",
                                              "c_frag", col_id_fi, row_id_fi,
                                              "half", 0.)
                ib.emit(fill_O_zero)
        ib.emit(sync)

        # do im2col for the first data tile
        im2col = tvm.call_extern("float32", "im2col", Sp, offset_D_warp, 0, 0)
        ib.emit(im2col)
        ib.emit(sync)

        with ib.for_range(0, data_shape[3] / 16, name="c_id", for_type=fortype) as c_id:
            with ib.for_range(0, 9, name="ker_id", for_type=fortype) as ker_id:
                # load the matrix fragments
                with ib.for_range(0, warp_col_tile, name="col") as col:
                    load_matrix_frag_F = tvm.call_intrin(
                        "float32", "wmma_load_matrix_sync", "a_frag", col, Sp,
                        offset_D_im2col + tidx / (32 * block_row_warp)
                        * (16 * warp_col_tile * (16 + shift))
                        + col * (16 * (16 + shift)),
                        16 + shift)
                    ib.emit(load_matrix_frag_F)
                with ib.for_range(0, warp_row_tile, name="row") as row:
                    load_matrix_frag_D = tvm.call_intrin(
                        "float32", "wmma_load_matrix_sync", "b_frag", row, Sp,
                        offset_F + tidx % (32 * block_row_warp) / 32
                        * (16 * warp_row_tile * (16 + shift))
                        + row * (16 * (16 + shift)),
                        16 + shift)
                    ib.emit(load_matrix_frag_D)
                ib.emit(sync)

                # now compute
                with ib.for_range(0, warp_col_tile, name="mma_col") as mma_col:
                    with ib.for_range(0, warp_row_tile, name="mma_row") as mma_row:
                        wmma_compute = tvm.call_intrin(
                            "float16", "wmma_mma_sync", "c_frag", "a_frag",
                            "b_frag", "c_frag", mma_col, mma_row)
                        ib.emit(wmma_compute)

                with ib.if_scope(ker_id < 8):
                    # load the filter for the next iteration
                    load_filter = tvm.call_extern(
                        "float32", "load_matrix_F", Fp, Sp, offset_F_warp,
                        kernel_shape[0], kernel_shape[3],
                        data_shape[0], data_shape[1], data_shape[2],
                        c_id * 16 + tidx % 2 * 8, ker_id + 1, 0)
                    ib.emit(load_filter)
                    # load the data for the next iteration
                    im2col = tvm.call_extern("float32", "im2col", Sp,
                                             offset_D_warp, ker_id + 1, 0)
                    ib.emit(im2col)
                ib.emit(sync)

            with ib.if_scope(c_id < data_shape[3] / 16 - 1):
                # load the next 9-iteration data tile from global memory
                load_data = tvm.call_extern(
                    "float32", "load_matrix_D", Dp, Sp,
                    output_shape[0], output_shape[1], output_shape[2],
                    data_shape[3], c_id * 16 + 16, dilation, 0)
                ib.emit(load_data)
                # load the filter for the next channel iteration
                load_filter = tvm.call_extern(
                    "float32", "load_matrix_F", Fp, Sp, offset_F_warp,
                    kernel_shape[0], data_shape[3],
                    data_shape[0], data_shape[1], data_shape[2],
                    c_id * 16 + 16 + tidx % 2 * 8, 0, 0)
                ib.emit(load_filter)
                ib.emit(sync)
                # move the first data tile from shmem to the im2col shmem
                im2col = tvm.call_extern("float32", "im2col", Sp,
                                         offset_D_warp, 0, 0)
                ib.emit(im2col)
                ib.emit(sync)

        # store the fragments into shared memory first
        with ib.for_range(0, warp_col_tile, name="col_id_st") as col_id_st:
            with ib.for_range(0, warp_row_tile, name="row_id_st") as row_id_st:
                store_O_fragment = tvm.call_intrin(
                    "float32", "wmma_store_matrix_sync", Sp,
                    warp_offset_output
                    + col_id_st * (256 * warp_row_tile * block_row_warp)
                    + row_id_st * 16,
                    "c_frag", col_id_st, row_id_st, 64)
                ib.emit(store_O_fragment)
        ib.emit(sync)

        return ib.get()

    shmem = tvm.extern(
        (shmem_use,), [data, kernel],
        lambda ins, outs: convolutionfp16(ins[0], ins[1], outs[0]),
        name="shmem", dtype='float16',
        out_buffers=tvm.decl_buffer((shmem_use,), dtype='float16',
                                    scope='shared', offset_factor=1))

    O = tvm.compute(
        output_shape,
        lambda dn, dp, dq, dk: shmem[dk % blk_size
                                     + dp / dilation % blk_p * blk_q * blk_size
                                     + dq / dilation % blk_q * blk_size],
        tag="conv2d_NHWC_HMMA",
        attrs={"blk_size": blk_size, "dilation": dilation, "version": 0})

    num_flop = data_shape[0] * output_shape[2] * output_shape[3] \
        * kernel_shape[0] * 2 * data_shape[3] \
        * kernel_shape[1] * kernel_shape[2]
    cfg.add_flop(num_flop)
    return O
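# To make the FLOP count above concrete: with hypothetical NHWC shapes
# data (N, H, W, C) = (1, 56, 56, 64), kernel (K, R, S, C) = (64, 3, 3, 64)
# and output (N, P, Q, K) = (1, 56, 56, 64), each output element costs
# 2 * C * R * S FLOPs (one multiply and one add per input-channel tap), so
num_flop = 1 * 56 * 56 * 64 * 2 * 64 * 3 * 3   # = 231,211,008 FLOPs per image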
def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True,
                               threshold=0.01,
                               variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor composed of

    out : tvm.Tensor
        3-D tensor with shape (batch_size, num_anchors, 6).

    valid_count : tvm.Tensor
        1-D tensor with shape (batch_size,), number of valid anchor boxes.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]
    oshape = (batch_size, num_anchors, 6)
    # Define data alignment for intermediate buffers
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf",
                              data_alignment=8)
    size = num_anchors
    temp_flag_buf = api.decl_buffer((size,), valid_count_dtype, "flag",
                                    data_alignment=8)
    temp_id_buf = api.decl_buffer((size,), valid_count_dtype, "cls_id",
                                  data_alignment=8)
    temp_score_buf = api.decl_buffer((size,), cls_prob.dtype, "score",
                                     data_alignment=8)

    valid_count, temp_flag, temp_id, temp_score = \
        tvm.extern([(batch_size,), (size,), (size,), (size,)], [cls_prob],
                   lambda ins, outs: transform_loc_pre(
                       ins[0], outs[0], outs[1], outs[2], outs[3], threshold),
                   dtype=[valid_count_dtype, valid_count_dtype,
                          valid_count_dtype, cls_prob.dtype],
                   out_buffers=[valid_count_buf, temp_flag_buf,
                                temp_id_buf, temp_score_buf],
                   tag="multibox_transform_loc_first_step")

    out = \
        tvm.extern([oshape],
                   [loc_pred, anchor, temp_flag, temp_id, temp_score],
                   lambda ins, outs: transform_loc_ir(
                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip,
                       variances, batch_size, num_classes, num_anchors),
                   dtype=[cls_prob.dtype],
                   out_buffers=[out_buf],
                   tag="multibox_transform_loc")
    return [out, valid_count]
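# A minimal usage sketch (shapes are illustrative; transform_loc_pre and
# transform_loc_ir must be defined in this module):
batch, num_classes, num_anchors = 1, 21, 1000
cls_prob = tvm.placeholder((batch, num_classes, num_anchors), name="cls_prob")
loc_pred = tvm.placeholder((batch, num_anchors * 4), name="loc_pred")
anchor = tvm.placeholder((1, num_anchors, 4), name="anchor")
out, valid_count = multibox_transform_loc_gpu(cls_prob, loc_pred, anchor)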
# The user will need to provide a function describing how to compute the result.
# The compute function takes a list of symbolic placeholders for the inputs,
# a list of symbolic placeholders for the outputs, and returns the executing statement.
#
# In this case we simply call a registered TVM function, which invokes a CBLAS call.
# TVM does not control the internals of the extern array function and treats it as a black box.
# We can further mix schedulable TVM calls that add a bias term to the result.
#
n = 1024
l = 128
m = 235
bias = tvm.var('bias', dtype="float32")
A = tvm.placeholder((n, l), name='A')
B = tvm.placeholder((l, m), name='B')
C = tvm.extern((n, m), [A, B],
               lambda ins, outs: tvm.call_packed(
                   "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False),
               name="C")
D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
s = tvm.create_schedule(D.op)

######################################################################
# Verify the Result
# -----------------
# We can verify that the result matches what we expected.
#
ctx = tvm.cpu(0)
f = tvm.build(s, [A, B, D, bias], "llvm")
a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
bb = 10.0
f(a, b, d, bb)
tvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5)
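# Because tvm.call_packed can invoke any PackedFunc by name, a plain Python
# function can be hooked into the pipeline the same way. A minimal sketch
# (the registered name "tvm.contrib.my_addone" is made up for illustration):
@tvm.register_func("tvm.contrib.my_addone")
def my_addone(x, y):
    # computes y = x + 1 on the Python side, then copies into the output
    tvm.nd.array(x.asnumpy() + 1).copyto(y)

X = tvm.placeholder((n,), name='X')
Y = tvm.extern(X.shape, [X],
               lambda ins, outs: tvm.call_packed("tvm.contrib.my_addone", ins[0], outs[0]),
               name="Y")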
def non_max_suppression_gpu(data, valid_count, max_output_size=-1,
                            iou_threshold=0.5, force_suppress=False, top_k=-1,
                            coord_start=2, score_index=1, id_index=0,
                            return_indices=True, invalid_to_bottom=False):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    max_output_size : optional, int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        Index of the class categories, -1 to disable.

    return_indices : boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data=data, valid_count=valid_count,
                                  iou_threshold=iou_threshold,
                                  force_suppress=force_suppress,
                                  top_k=top_k, return_indices=False)
        np_data = np.random.uniform(size=dshape).astype(data.dtype)
        np_valid_count = np.array([4], dtype="int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "cuda")
        ctx = tvm.gpu(0)
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]

    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    score_axis = score_index
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape,
                               lambda i, j: data[i, j, score_axis])
    sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1,
                          is_ascend=False, flag=True)

    sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype,
                                      "sort_tensor_buf", data_alignment=8)

    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf",
                               data_alignment=8)

    out_buf = api.decl_buffer(data.shape, data.dtype, "out_buf",
                              data_alignment=8)

    out, box_indices = \
        tvm.extern([data.shape, score_shape],
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       max_output_size, iou_threshold, force_suppress,
                       top_k, coord_start, id_index),
                   dtype=[data.dtype, "int32"],
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   name="nms",
                   tag="nms")

    if return_indices:
        return box_indices

    if invalid_to_bottom:
        output_buf = api.decl_buffer(data.shape, data.dtype, "output_buf",
                                     data_alignment=8)
        temp_flag_buf = api.decl_buffer(score_shape, valid_count_dtype,
                                        "temp_flag", data_alignment=8)
        temp_idx_buf = api.decl_buffer(score_shape, valid_count_dtype,
                                       "temp_idx", data_alignment=8)
        temp_flag, temp_idx = \
            tvm.extern([score_shape, score_shape], [out],
                       lambda ins, outs: invalid_to_bottom_pre(
                           ins[0], outs[0], outs[1]),
                       dtype=["int32", "int32"],
                       in_buffers=[out_buf],
                       out_buffers=[temp_flag_buf, temp_idx_buf],
                       name="invalid_to_bottom_phase_one")

        output = \
            tvm.extern([data.shape], [out, temp_flag, temp_idx],
                       lambda ins, outs: invalid_to_bottom_ir(
                           ins[0], ins[1], ins[2], outs[0]),
                       dtype=[data.dtype],
                       in_buffers=[out_buf, temp_flag_buf, temp_idx_buf],
                       out_buffers=[output_buf],
                       name="invalid_to_bottom",
                       tag="invalid_to_bottom")
        return output

    return out
def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend):
    """Generate the low-level IR that sorts a tensor on the GPU.

    Parameters
    ----------
    data: tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    data_buf: Buffer
        2-D buffer of the input boxes' scores with shape [batch_size, num_anchors].

    index : tvm.Tensor
        1-D tensor for valid number of boxes.

    index_buf : Buffer
        Buffer of the valid number of boxes.

    output_buf : Buffer
        Output buffer of the indices of the sorted tensor.

    axis : int
        The axis used for sorting.

    is_descend : bool
        Whether the sorted data is in descending order.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors].
    """
    ndim = len(data.shape)
    assert data.dtype == "float32", "Currently only supports input dtype to be float32"
    assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim

    axis_mul_before = 1
    axis_mul_after = 1
    if axis < 0:
        axis = ndim + axis
    for i in range(0, ndim):
        if i < axis:
            axis_mul_before *= data.shape[i]
        elif i > axis:
            axis_mul_after *= data.shape[i]

    dshape = axis_mul_before * axis_mul_after
    fshape = data.shape[axis] * dshape

    loc_buf = api.decl_buffer(dshape, index.dtype, "sizes", data_alignment=8)
    new_index_buf = api.decl_buffer(fshape, index.dtype, "index_new", data_alignment=8)
    out_index_buf = api.decl_buffer(fshape, index.dtype, "index_out", data_alignment=8)
    new_data_buf = api.decl_buffer(dshape, data.dtype, "data_new", data_alignment=8)

    loc = \
        tvm.extern([(dshape,)],
                   [index],
                   lambda ins, outs: sort_pre_ir(
                       ins[0], outs[0], axis_mul_before, axis_mul_after),
                   dtype=[index.dtype],
                   in_buffers=index_buf,
                   out_buffers=[loc_buf],
                   tag="sorting_prepare")

    data_new, index_new = \
        tvm.extern([(dshape,), (fshape,)],
                   [data, index, loc],
                   lambda ins, outs: sort_pre_ir_data(
                       ins[0], ins[1], ins[2], outs[0], outs[1], axis,
                       axis_mul_before, axis_mul_after),
                   dtype=[data.dtype, index.dtype],
                   in_buffers=[data_buf, index_buf, loc_buf],
                   out_buffers=[new_data_buf, new_index_buf],
                   tag="sorting_data")

    index_out = \
        tvm.extern([(fshape,)],
                   [data, index, data_new, index_new, loc],
                   lambda ins, outs: sort_oet_ir(
                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0],
                       axis_mul_before, axis_mul_after, axis, is_descend),
                   dtype=[index.dtype],
                   in_buffers=[data_buf, index_buf, new_data_buf,
                               new_index_buf, loc_buf],
                   out_buffers=[out_index_buf],
                   tag="sorting_oet")

    out = \
        tvm.extern([data.shape],
                   [data, index, index_out, loc],
                   lambda ins, outs: sort_ir_out(
                       ins[0], ins[1], ins[2], ins[3], outs[0],
                       axis_mul_before, axis_mul_after, axis),
                   dtype=[index.dtype],
                   in_buffers=[data_buf, index_buf, out_index_buf, loc_buf],
                   out_buffers=output_buf,
                   tag="sorting_output")
    return out
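# A minimal calling sketch (illustrative: the sort_pre_ir* / sort_oet_ir /
# sort_ir_out helpers must be available, and api is this module's buffer API alias):
score = tvm.placeholder((1, 5), name="score")
valid_count = tvm.placeholder((1,), dtype="int32", name="valid_count")
score_buf = api.decl_buffer(score.shape, score.dtype, "score_buf", data_alignment=8)
count_buf = api.decl_buffer(valid_count.shape, "int32", "count_buf", data_alignment=4)
idx_buf = api.decl_buffer((1, 5), "int32", "sorted_idx_buf", data_alignment=8)
sorted_idx = sort_gpu(score, score_buf, valid_count, count_buf, idx_buf,
                      axis=1, is_descend=True)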
def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False,
            nms_topk=-1):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data: tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        nms_threshold = 0.7
        force_suppress = True
        nms_topk = -1
        out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk)
        np_data = np.random.uniform(size=dshape).astype(data.dtype)
        np_valid_count = np.array([4], dtype="int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf",
                               data_alignment=8)
    score_axis = 1
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape,
                               lambda i, j: data[i, j, score_axis],
                               name="score_tensor")
    score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype,
                                       "score_tensor_buf", data_alignment=8)

    sort_tensor_dtype = "int32"
    sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                      "sort_tensor_buf", data_alignment=8)

    sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count,
                           valid_count_buf, sort_tensor_buf, score_axis, True)

    out = \
        tvm.extern(data.shape,
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], nms_threshold,
                       force_suppress, nms_topk),
                   dtype="float32",
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   tag="nms")
    return out
def csrmm_default(data, indices, indptr, weight, bias=None):
    # pylint: disable=invalid-name
    """The default implementation of csrmm in topi.

    Parameters
    ----------
    data : tvm.Tensor
        1-D with shape [nonzeros]

    indices : tvm.Tensor
        1-D with shape [nonzeros]

    indptr : tvm.Tensor
        1-D with shape [m+1]

    weight : tvm.Tensor
        2-D with shape [k, n]

    bias : tvm.Tensor, optional
        1-D with shape [m]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [m, n]
    """
    assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \
        and len(weight.shape) == 2, "only support 2-dim csrmm"
    assert isinstance(weight, tvm.tensor.Tensor), \
        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
    if bias is not None:
        assert len(bias.shape) == 1
    M = simplify(indptr.shape[0]-1)
    _, N = weight.shape

    def csrmm_default_ir(data, indices, indptr, weight, out):
        """define ir for csrmm"""
        irb = tvm.ir_builder.create()
        data_ptr = irb.buffer_ptr(data)
        indices_ptr = irb.buffer_ptr(indices)
        indptr_ptr = irb.buffer_ptr(indptr)
        weight_ptr = irb.buffer_ptr(weight)
        out_ptr = irb.buffer_ptr(out)
        M = simplify(indptr.shape[0]-1)
        _, N = weight.shape
        with irb.for_range(0, N, for_type="vectorize", name='n') as n:
            with irb.for_range(0, M, for_type="parallel", name='row') as row:
                dot = irb.allocate('float32', (1,), name='dot', scope='local')
                out_ptr[row*N+n] = 0.
                dot[0] = 0.
                row_start = indptr_ptr[row]
                row_end = indptr_ptr[row+1]
                row_elems = row_end-row_start
                with irb.for_range(0, row_elems, name='idx') as idx:
                    elem = row_start+idx
                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]*N+n]
                out_ptr[row*N+n] += dot[0]
        return irb.get()

    oshape = (M, N)
    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
                        lambda ins, outs: csrmm_default_ir(
                            ins[0], ins[1], ins[2], ins[3], outs[0]),
                        tag="csrmm", dtype='float32', name='out')
    if bias is not None:
        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[i],
                             tag=tag.BROADCAST)
    return matmul
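# A small end-to-end sketch on CPU (assumes scipy is available; the 3x4 CSR
# matrix and 4x2 dense weight below are hypothetical test data):
import numpy as np
import scipy.sparse as sp

m, k, n_cols = 3, 4, 2
a_sp = sp.random(m, k, density=0.5, format='csr', dtype='float32')
w_np = np.random.uniform(size=(k, n_cols)).astype('float32')

data = tvm.placeholder((a_sp.nnz,), dtype='float32', name='data')
indices = tvm.placeholder((a_sp.nnz,), dtype='int32', name='indices')
indptr = tvm.placeholder((m + 1,), dtype='int32', name='indptr')
weight = tvm.placeholder((k, n_cols), name='weight')

out = csrmm_default(data, indices, indptr, weight)
s = tvm.create_schedule(out.op)
f = tvm.build(s, [data, indices, indptr, weight, out], 'llvm')

ctx = tvm.cpu(0)
out_nd = tvm.nd.array(np.zeros((m, n_cols), dtype='float32'), ctx)
f(tvm.nd.array(a_sp.data, ctx),
  tvm.nd.array(a_sp.indices.astype('int32'), ctx),
  tvm.nd.array(a_sp.indptr.astype('int32'), ctx),
  tvm.nd.array(w_np, ctx), out_nd)
tvm.testing.assert_allclose(out_nd.asnumpy(), a_sp.dot(w_np), rtol=1e-4)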
def get_valid_counts_gpu(data, score_threshold=0):
    """Get valid count of bounding boxes given a score threshold.
    Also moves valid boxes to the top of input data.

    Parameters
    ----------
    data : tvm.Tensor
        Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].

    score_threshold : optional, float
        Lower limit of score for valid bounding boxes.

    Returns
    -------
    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    out_tensor : tvm.Tensor
        Rearranged data tensor.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    new_range = num_anchors // elem_per_thread + 1
    temp_flag_buf = api.decl_buffer(
        (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8)
    temp_idx_buf = api.decl_buffer(
        (batch_size, num_anchors,), "int32", "temp_idx", data_alignment=8)
    temp_partial_buf = api.decl_buffer(
        (batch_size, new_range), "int32", "temp_partial", data_alignment=8)
    data_buf = api.decl_buffer(
        data.shape, data.dtype, "data_buf", data_alignment=8)

    # Phase one: flag boxes above the threshold and record their indices.
    temp_flag, temp_idx = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)],
                   [data],
                   lambda ins, outs: get_valid_counts_pre(
                       ins[0], outs[0], outs[1], score_threshold),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_one")

    # Phases two to four: an upsweep/scan/downsweep prefix sum over the flags.
    temp_idx_new, temp_partial = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)],
                   [data, temp_idx],
                   lambda ins, outs: get_valid_counts_upsweep(
                       ins[0], ins[1], outs[0], outs[1]),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_idx_buf, temp_partial_buf],
                   name="get_valid_counts_phase_two")

    temp_partial_new = \
        tvm.extern([(batch_size, new_range)],
                   [data, temp_partial],
                   lambda ins, outs: get_valid_counts_scan(
                       ins[0], ins[1], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_partial_buf],
                   name="get_valid_counts_phase_three")

    temp_idx_final = \
        tvm.extern([(batch_size, num_anchors)],
                   [data, temp_idx_new, temp_partial_new],
                   lambda ins, outs: get_valid_counts_downsweep(
                       ins[0], ins[1], ins[2], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_idx_buf],
                   name="get_valid_counts_phase_four")

    # Phase five: compact the valid boxes to the top and count them.
    valid_count, out_tensor = \
        tvm.extern([(batch_size,), data.shape],
                   [data, temp_flag, temp_idx_final],
                   lambda ins, outs: get_valid_counts_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1]),
                   dtype=["int32", data.dtype],
                   in_buffers=[data_buf, temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_five",
                   tag="get_valid_counts_gpu")

    return [valid_count, out_tensor]
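# A minimal usage sketch (illustrative: a CUDA target scope is required so
# that tvm.target.current_target reports max_num_threads, and the
# get_valid_counts_* IR helpers must be defined in this module):
data = tvm.placeholder((1, 2500, 6), name="data")
with tvm.target.cuda():
    valid_count, out_tensor = get_valid_counts_gpu(data, score_threshold=0.0)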
def argsort(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array
    of indices having the same shape as an input array that index
    data in sorted order.

    Parameters
    ----------
    data : tvm.Tensor
        The input tensor.

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes, only used by ssd.

    axis : optional, int
        Axis along which to sort the input tensor.
        By default the flattened array is used.

    is_ascend : optional, boolean
        Whether to sort in ascending or descending order.

    dtype : optional, string
        DType of the output indices.

    flag : optional, boolean
        Whether valid_count is valid.

    Returns
    -------
    out : tvm.Tensor
        Sorted index tensor.

    Example
    --------
    .. code-block:: python

        # An example to use argsort
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        axis = 0
        is_ascend = False
        flag = False
        out = argsort(data, valid_count, axis=axis, is_ascend=is_ascend, flag=flag)
        np_data = np.random.uniform(size=dshape).astype(data.dtype)
        np_valid_count = np.array([4], dtype="int32")
        s = topi.generic.schedule_argsort(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    if flag:
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=8)
        out = \
            tvm.extern(data.shape,
                       [data, valid_count],
                       lambda ins, outs: tvm.call_packed(
                           "tvm.contrib.sort.argsort_nms", ins[0], ins[1],
                           outs[0], axis, is_ascend),
                       dtype="int32",
                       in_buffers=[data_buf, valid_count_buf],
                       out_buffers=out_buf,
                       name="argsort_nms_cpu",
                       tag="argsort_nms_cpu")
    else:
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = \
            tvm.extern(data.shape,
                       [data],
                       lambda ins, outs: tvm.call_packed(
                           "tvm.contrib.sort.argsort", ins[0],
                           outs[0], axis, is_ascend),
                       dtype=dtype,
                       in_buffers=[data_buf],
                       out_buffers=out_buf,
                       name="argsort_cpu",
                       tag="argsort_cpu")
    return out
def test_ib():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16,)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, name='a')
    w = shape[0]
    e = 16

    def build_nms_ir(ten_in, ten_out):
        """Build IR that adds an immediate value to each vector of the input."""
        ib = tvm.ir_builder.create()
        imm_value = 10
        ib.scope_attr(env.nnpu_axis, "coproc_scope", 0)
        p_in = ib.buffer_ptr(ten_in[0])
        p_out = ib.buffer_ptr(ten_out[0])
        with ib.for_range(0, w // e, name="i") as i:
            ib.emit(make_intrin_call(
                "void", 'VAddI',
                ten_out[0].access_ptr("w", 'uint32') + i * dtype_bytes(dtype_w),
                ten_in[0].access_ptr("r", 'uint32') + i * dtype_bytes(dtype_w),
                tvm.const(imm_value, 'float64'),
                env.cfg['vector_unit']['size'],
                3))
        stmt = ib.get()
        return stmt

    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    sph.MarkScope(a_buf)

    out = tvm.extern(a_buf.shape, [a_buf], build_nms_ir,
                     in_buffers=[tvm.decl_buffer(a_buf.shape, dtype_w,
                                                 data_alignment=dtype_bytes(dtype_w),
                                                 scope='local.nnpu_scratchpad0')],
                     out_buffers=[tvm.decl_buffer(a_buf.shape, dtype_w,
                                                  data_alignment=dtype_bytes(dtype_w),
                                                  scope='local.nnpu_scratchpad0')],
                     dtype=dtype_w, name="test_ir")
    sph.MarkScope(out)
    out_host, out_dram = nnpu.utils.CopyBufToH(out, 'out', sph)

    s = tvm.create_schedule([out_host.op])
    sph.Transform(s)
    print(tvm.lower(s, [a, out_host], simple_mode=True))
    print(nnpu.lower(s, [a, out_host], simple_mode=True))

    func = nnpu.build(s, [a, out_host], 'nnpu', 'llvm', name='nnpu_test')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16,), dtype=a.dtype, low=0, high=127)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros((16,)).astype(out_host.dtype), ctx)

    func(a_nd, b_nd)

    print('a = ')
    print(a_np)
    print('a + 10 = ')
    print(b_nd.asnumpy())
    return
def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride,
             threshold, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size,
             iou_loss):
    """Proposal operator.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        4-D with shape [batch, 2 * num_anchors, height, width]

    bbox_pred : tvm.Tensor
        4-D with shape [batch, 4 * num_anchors, height, width]

    im_info : tvm.Tensor
        2-D with shape [batch, 3]

    scales : list/tuple of float
        Scales of anchor windows.

    ratios : list/tuple of float
        Ratios of anchor windows.

    feature_stride : int
        The size of the receptive field each unit in the convolution layer of
        the rpn, for example the product of all strides prior to this layer.

    threshold : float
        Non-maximum suppression threshold.

    rpn_pre_nms_top_n : int
        Number of top scoring boxes to apply NMS. -1 to use all boxes.

    rpn_post_nms_top_n : int
        Number of top scoring boxes to keep after applying NMS to RPN proposals.

    rpn_min_size : int
        Minimum height or width in proposal.

    iou_loss : bool
        Usage of IoU loss.

    Returns
    -------
    out : tvm.Tensor
        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
        [batch_index, w_start, h_start, w_end, h_end].
    """
    batch, _, height, width = get_const_tuple(cls_prob.shape)
    num_anchors = len(scales) * len(ratios)
    num_bbox = height * width * num_anchors
    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox

    bbox = tvm.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info],
                      lambda ins, outs: predict_bbox_ir(
                          ins[0], ins[1], ins[2], outs[0], scales, ratios,
                          feature_stride, rpn_min_size, iou_loss),
                      dtype=bbox_pred.dtype)
    score = tvm.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score')
    sorted_index = tvm.extern([score.shape], [score],
                              lambda ins, outs: argsort_ir(ins[0], outs[0]),
                              dtype='int32')
    sorted_bbox = tvm.compute((batch, rpn_pre_nms_top_n, 5),
                              lambda b, i, j: bbox[b, sorted_index[b, i], j],
                              tag='sorted_bbox')
    nms_remove_mask = tvm.extern((batch, rpn_pre_nms_top_n), [sorted_bbox],
                                 lambda ins, outs: nms_ir(ins[0], outs[0], threshold),
                                 dtype='bool')
    nms_out = tvm.extern((batch * rpn_post_nms_top_n, 5),
                         [sorted_bbox, nms_remove_mask],
                         lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]),
                         dtype=sorted_bbox.dtype)
    return nms_out
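# A brief usage sketch (shapes and hyper-parameters are illustrative; the
# predict_bbox_ir, argsort_ir, nms_ir and prepare_output_ir helpers must be
# in scope). With 3 scales and 3 ratios there are 9 anchors per location:
cls_prob = tvm.placeholder((1, 2 * 9, 38, 50), name="cls_prob")
bbox_pred = tvm.placeholder((1, 4 * 9, 38, 50), name="bbox_pred")
im_info = tvm.placeholder((1, 3), name="im_info")
out = proposal(cls_prob, bbox_pred, im_info,
               scales=(8.0, 16.0, 32.0), ratios=(0.5, 1.0, 2.0),
               feature_stride=16, threshold=0.7,
               rpn_pre_nms_top_n=6000, rpn_post_nms_top_n=300,
               rpn_min_size=16, iou_loss=False)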