def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1),
                   offsets=(0.5, 0.5), clip=False):
    """Generate prior (anchor) boxes from data, sizes and ratios.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, c_in, h_in, w_in]

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of float
        Priorbox center offsets, y and x respectively.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
    """
    out = hybrid_multibox_prior(data, tvm.convert(sizes), tvm.convert(ratios),
                                tvm.convert(steps), tvm.convert(offsets))
    if clip:
        out = topi.clip(out, 0, 1)
    return out

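# Hedged usage sketch (not part of the original source): assuming the tvm/topi
# imports used above are in scope, multibox_prior can be applied to a placeholder
# feature map as below. The shape and size/ratio values are illustrative only.
def example_multibox_prior_usage():
    data = tvm.placeholder((1, 3, 32, 32), name='data')
    anchors = multibox_prior(data, sizes=(1.0, 0.5), ratios=(1.0, 2.0), clip=True)
    # Per the docstring, anchors has shape [1, 32 * 32 * (2 + 2 - 1), 4].
    return anchors
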
def test_func_with_invalid_tuple():
    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)

    ret_type = relay.TupleType(tvm.convert([tp1, tp1, tp1]))

    tf = relay.FuncType(tvm.convert([]), ret_type, tvm.convert([tp1]), tvm.convert([]))
    check_kind(tf)

def test_rfactor_argmax():
    def fcombine(x, y):
        lhs = tvm.make.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.make.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs

    def fidentity(t0, t1):
        return tvm.const(-1, t0), tvm.min_value(t1)

    argmax = tvm.comm_reducer(fcombine, fidentity, name='argmax')

    nn = 1027
    mm = 10
    n = tvm.convert(nn)
    m = tvm.convert(mm)
    A0 = tvm.placeholder((m, n), name='A0', dtype='int32')
    A1 = tvm.placeholder((m, n), name='A1', dtype='float32')
    k = tvm.reduce_axis((0, n))
    B0, B1 = tvm.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name='B')

    # schedule
    s = tvm.create_schedule(B0.op)
    nthread = 16
    ko, kf = s[B0].split(k, factor=nthread)
    BF0, BF1 = s.rfactor(B0, kf)
    bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread)
    s[B0].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B0].bind(ty, tvm.thread_axis("threadIdx.y"))
    tx = s[B0].op.reduce_axis[0]
    thread_x = tvm.thread_axis("threadIdx.x")
    s[B0].bind(tx, thread_x)
    s[BF0.op].compute_at(s[B0], tx)
    s[B0].set_store_predicate(thread_x.var.equal(0))

    def check_target(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return
        fapi = tvm.lower(s, args=[A0, A1, B0, B1])
        fargmax = tvm.build(fapi, target=device, name="argmax")

        np_idx = np.repeat(np.arange(nn, dtype='int32').reshape(1, nn), mm, axis=0)
        np_val = np.random.uniform(size=(mm, nn)).astype('float32')
        np_res = np.argmax(np_val, axis=1)

        nd_idx = tvm.nd.array(np_idx, ctx)
        nd_val = tvm.nd.array(np_val, ctx)
        nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
        nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
        fargmax(nd_idx, nd_val, nd_res0, nd_res1)
        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())

    check_target("cuda")
    check_target("vulkan")

def test_tuple_with_invalid_func():
    tensor_type = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')

    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
    tf = relay.FuncType(tvm.convert([]), tp1, tvm.convert([tp1]), tvm.convert([]))

    tup_ty = relay.TupleType(tvm.convert([tensor_type, tf]))
    check_kind(tup_ty)

def test_tuple_kind():
    # only contain type kinds
    tp = relay.TypeVar('tp', relay.Kind.Type)
    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
    tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
    fields = tvm.convert([tp, tf, tt])

    tup_ty = relay.TupleType(fields)
    assert check_kind(tup_ty) == relay.Kind.Type

def test_relation_kind():
    # only have type kinds for arguments
    tp = relay.TypeVar('tp', relay.Kind.Type)
    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
    tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
    args = tvm.convert([tf, tt, tp])

    tr = relay.TypeRelation(None, args, 2, None)
    assert check_kind(tr) == relay.Kind.Constraint

def test_tuple_type():
    tp = relay.TypeVar('tp', relay.Kind.Type)
    tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
    fields = tvm.convert([tp, tf, tt])

    tup_ty = relay.TupleType(fields)
    assert tup_ty.fields == fields
    str(tup_ty)
    check_json_roundtrip(tup_ty)

def test_make_smap():
    # save load json
    x = tvm.const(1, "int32")
    y = tvm.const(10, "int32")
    z = tvm.expr.Add(x, y)
    smap = tvm.convert({"z": z, "x": x})
    json_str = tvm.save_json(tvm.convert([smap]))
    arr = tvm.load_json(json_str)
    assert len(arr) == 1
    assert arr[0]["z"].a == arr[0]["x"]

def test_func_with_invalid_relation():
    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
    tp2 = relay.TypeVar('tp2', relay.Kind.Shape)
    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)

    func = tvm.get_env_func("tvm.relay.type_relation.Identity")
    tr = relay.TypeRelation(func, tvm.convert([tp2, tp3]), 1, None)

    tf = relay.FuncType(tvm.convert([tp1]), tp1, tvm.convert([tp1, tp2, tp3]), tvm.convert([tr]))
    check_kind(tf)

def test_ext_vec():
    ivec = tvm_ext.ivec_create(1, 2, 3)
    assert(isinstance(ivec, tvm_ext.IntVec))
    assert ivec[0] == 1
    assert ivec[1] == 2

    def ivec_cb(v2):
        assert(isinstance(v2, tvm_ext.IntVec))
        assert v2[2] == 3

    tvm.convert(ivec_cb)(ivec)

def test_ref_kind():
    # only contain type kinds
    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
    ft = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))

    rt1 = relay.RefType(tt)
    assert check_kind(rt1) == relay.Kind.Type
    rt2 = relay.RefType(ft)
    assert check_kind(rt2) == relay.Kind.Type
    rt3 = relay.RefType(relay.TupleType([rt1, rt2]))
    assert check_kind(rt3) == relay.Kind.Type

def test_invalid_func_kind():
    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)

    type_params = tvm.convert([tp1, tp2, tp3])
    type_constraints = tvm.convert([])
    arg_types = tvm.convert([tp1, tp2])
    ret_type = tp3

    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
    check_kind(tf)

def test_function():
    param_names = ['a', 'b', 'c', 'd']
    params = tvm.convert([relay.Var(n) for n in param_names])
    ret_type = relay.TupleType(tvm.convert([]))
    body = relay.Tuple(tvm.convert([]))
    type_params = tvm.convert([])
    fn = relay.Function(params, body, ret_type, type_params)
    assert fn.params == params
    assert fn.body == body
    assert fn.type_params == type_params
    assert fn.span == None
    str(fn)
    check_json_roundtrip(fn)

def test_func_type():
    type_params = tvm.convert([])
    type_constraints = tvm.convert([])  # TODO: fill me in
    arg_types = tvm.convert([])
    ret_type = relay.TensorType((1, 2, 3), 'float32')
    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
    assert tf.type_params == type_params
    assert tf.type_constraints == type_constraints
    assert tf.arg_types == arg_types
    assert tf.ret_type == ret_type
    assert tf.span == None
    # TODO make sure we can set span
    str(tf)
    check_json_roundtrip(tf)

def test_type_relation():
    tp = relay.TypeVar('tp', relay.Kind.Type)
    tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
    args = tvm.convert([tp, tf, tt])

    num_inputs = 2
    func = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
    attrs = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3, 4))

    tr = relay.TypeRelation(func, args, num_inputs, attrs)
    assert tr.args == args
    assert tr.num_inputs == num_inputs
    str(tr)
    check_json_roundtrip(tr)

def test_rfactor_threads():
    nn = 1027
    mm = 10
    n = tvm.convert(nn)
    m = tvm.convert(mm)
    A = tvm.placeholder((m, n), name='A')
    k = tvm.reduce_axis((0, n))
    nthread = 16
    B = tvm.compute((m,), lambda i: tvm.sum(A[i, k], axis=k, where=(i > 1)), name='B')

    # schedule
    s = tvm.create_schedule(B.op)
    ko, kf = s[B].split(k, factor=nthread)
    BF = s.rfactor(B, kf)
    bx, ty = s[B].split(s[B].op.axis[0], factor=nthread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(ty, tvm.thread_axis("threadIdx.y"))
    tx = s[B].op.reduce_axis[0]
    thread_x = tvm.thread_axis("threadIdx.x")
    s[B].bind(tx, thread_x)
    s[BF].compute_at(s[B], tx)
    s[B].set_store_predicate(thread_x.var.equal(0))

    # one line to build the function.
    def check_target(device, host="stackvm"):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return

        fapi = tvm.lower(s, args=[A, B])
        fsum = tvm.build(fapi, target=device, name="mysum")
        # launch the kernel.
        n = nn
        m = mm
        a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
        fsum(a, b)
        res = np.sum(a.asnumpy(), axis=1)
        res[:2] = 0
        tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4)

    check_target("vulkan")
    check_target("cuda")
    check_target("metal")
    check_target("opencl")

def test_multiple_func():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        fadd1 = m['fadd1']
        fadd2 = m['fadd2']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd1(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
        fadd2(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_llvm()

def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01,
                           variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor
    """
    return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor,
                                         tvm.const(clip, "bool"),
                                         tvm.const(threshold, "float32"),
                                         tvm.convert(variances))

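# Hedged usage sketch (not part of the original source): assuming the same tvm
# imports as above, the transform can be applied to placeholder tensors whose
# shapes follow the [batch, num_anchors, num_classes], [batch, num_anchors * 4],
# and [batch, num_anchors, 4] convention used by the tests later in this file.
def example_multibox_transform_loc_usage():
    num_anchors, num_classes = 3, 3
    cls_prob = tvm.placeholder((1, num_anchors, num_classes), name='cls_prob')
    loc_pred = tvm.placeholder((1, num_anchors * 4), name='loc_pred')
    anchor = tvm.placeholder((1, num_anchors, 4), name='anchor')
    # Returns a tuple of tensors (transformed detections plus a valid count).
    return multibox_transform_loc(cls_prob, loc_pred, anchor, threshold=0.01)
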
def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    px, x = s[B].split(B.op.axis[0], nparts=1)
    s[B].bind(px, tvm.thread_axis("pipeline"))

    # one line to build the function.
    def check_device(device, host="llvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("sdaccel")
    if "AWS_PLATFORM" in os.environ:
        check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))

    check_device("aocl_sw_emu")

def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    # one line to build the function.
    def check_device(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        np.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("cuda", "llvm")
    check_device("vulkan")
    check_device("opencl")

def run(dtype):
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A', dtype=dtype)
    B = tvm.compute(A.shape, lambda *i: tvm.popcount(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # simple schedule
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return
        target = tvm.target.create(device)
        if "cpu" not in target.keys:
            s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
            s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
        func = tvm.build(s, [A, B], device)
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx)
        b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx)
        func(a, b)
        np.testing.assert_allclose(
            b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5)

    check_device("llvm")
    check_device("cuda")
    check_device("opencl")
    check_device("metal")
    if dtype == "uint32":
        check_device("vulkan")

def test_conv():
    batch_size = 1
    input_channel = 3
    h = 224
    w = 224
    output_channel = 64
    kh = 7
    kw = 7
    h_padding = 1
    w_padding = 1
    oh = h + h_padding * 2 - kh + 1
    ow = w + w_padding * 2 - kw + 1
    dshape = (batch_size, input_channel, h, w)
    weight = relay.var("weight", shape=(output_channel, input_channel, kh, kw))
    data = relay.var("data", shape=dshape)

    conv2d = relay.nn.conv2d(
        data, weight,
        channels=output_channel,
        kernel_size=(kh, kw),
        padding=(1, 1))
    func = relay.Function([data, weight], relay.Tuple(tvm.convert([conv2d])))
    func = relay.ir_pass.infer_type(func)
    compute_count = relay.ir_pass.get_total_mac_number(func)
    expect_count = batch_size * input_channel * oh * ow * output_channel * kh * kw
    assert compute_count == expect_count

def test_rfactor():
    n = tvm.convert(1027)
    A = tvm.placeholder((n,), name='A')
    k = tvm.reduce_axis((0, n))
    B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B')
    # schedule
    s = tvm.create_schedule(B.op)
    kf, ki = s[B].split(k, nparts=4)
    BF = s.rfactor(B, kf)
    s[BF].parallel(BF.op.axis[0])

    # one line to build the function.
    def check_target(target="llvm"):
        if not tvm.module.enabled(target):
            return
        ctx = tvm.cpu(0)
        fapi = tvm.lower(s, args=[A, B])
        fsum = tvm.build(fapi, target=target, name="mysum")
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
        fsum(a, b)
        res = np.sum(a.asnumpy(), axis=0)
        tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4)

    check_target()

def test_tuple_type_alpha_equal():
    t1 = relay.TensorType((1, 2, 3), "float32")
    t2 = relay.TensorType((1, 2, 3, 4), "float32")
    tp1 = relay.TypeVar("v1", relay.Kind.Type)
    tp2 = relay.TypeVar("v2", relay.Kind.Type)

    tup1 = relay.TupleType(tvm.convert([t1, t2, tp1]))
    tup2 = relay.TupleType(tvm.convert([t1, t2, tp1]))
    tup3 = relay.TupleType(tvm.convert([t2, t1, tp1]))
    tup4 = relay.TupleType(tvm.convert([t1, t2, tp2]))

    # as long as types are alpha-equal and in same order,
    # tuples should be alpha-equal
    assert tup1 == tup2
    assert tup1 != tup3
    assert tup1 != tup4

def test_add():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)

    def check_c():
        mhost = tvm.build(s, [A, B, C], "c", name="fadd")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m['fadd']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_c()

def test_dot():
    nn = 12
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    k = tvm.reduce_axis((0, n), 'k')
    C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    fapi = lower(s, [A, B, C])

    def verify(target):
        if not tvm.module.enabled(target):
            print("Target %s is not enabled" % target)
            return
        f = tvm.codegen.build_module(fapi, target)
        # verify
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)

    verify("llvm")

def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n/2) as i:
            ib.emit(outs[0].vstore(
                i * 2, ins[0].vload(i * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_llvm()

def annotated():
    conv2d_1 = relay.nn.conv2d(
        data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
    _conv2d_1 = relay.annotation.on_device(conv2d_1, dev2)
    conv2d_2 = relay.nn.conv2d(
        data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
    _conv2d_2 = relay.annotation.on_device(conv2d_2, dev2)
    add = relay.add(conv2d_1, conv2d_2)
    _add = relay.annotation.on_device(add, dev1)
    conv2d_3 = relay.nn.conv2d(
        add, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
    _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2)

    func = relay.Function([data1, data2, weight],
                          relay.Tuple(tvm.convert([_conv2d_1, _conv2d_2,
                                                   _conv2d_3, _add,
                                                   conv2d_3])))
    func = relay.ir_pass.infer_type(func)
    func = relay.ir_pass.rewrite_annotated_ops(func, tvm.context(3).device_type)
    func = relay.ir_pass.infer_type(func)
    return relay.Function(relay.ir_pass.free_vars(func.body[4]), func.body[4])

def test_pack_buffer_intermediate():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1, name="B")

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func2", ins[0], outs[0])

    C = tvm.extern(B.shape, [B], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

        @tvm.register_func
        def my_extern_array_func2(aa, bb):
            assert aa.shape == a.shape
            tvm.testing.assert_allclose(aa.asnumpy(), a.asnumpy() + 1)
            aa.copyto(bb)

        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")

def test_depthwise_conv2d():
    batch_size = 1
    dshape = (batch_size, 64, 56, 56)
    weight_conv = relay.var("weight_depthwiseconv", shape=(64, 1, 3, 3))
    data1 = relay.var("data1", shape=dshape)
    data2 = relay.var("data2", shape=dshape)
    depthwise_conv2d_1 = relay.nn.conv2d(
        data1, weight_conv, kernel_size=(3, 3), padding=(1, 1), groups=64)
    depthwise_conv2d_2 = relay.nn.conv2d(
        data2, weight_conv, kernel_size=(3, 3), padding=(1, 1), groups=64)
    add = relay.add(depthwise_conv2d_1, depthwise_conv2d_2)
    func = relay.Function([data1, data2, weight_conv],
                          relay.Tuple(tvm.convert([depthwise_conv2d_1,
                                                   depthwise_conv2d_2,
                                                   add])))
    func = relay.ir_pass.infer_type(func)
    compute_count = relay.ir_pass.get_total_mac_number(func)
    assert compute_count == 2 * np.prod(dshape) * 3 * 3

def test_default_value():
    num_anchors = 3
    num_classes = 3
    np_cls_prob = np.array(
        [[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]).astype("float32")
    np_loc_preds = np.array(
        [[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4, -0.8]]).astype("float32")
    np_anchors = np.array(
        [[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2], [1.2, 1.2, 1.5, 1.5]]]).astype("float32")

    expected_np_out = np.array(
        [[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
          [0, 0.44999999, 1, 1, 1, 1],
          [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]])

    cls_prob = relay.var(
        "cls_prob", relay.ty.TensorType((1, num_anchors, num_classes), "float32"))
    loc_pred = relay.var(
        "loc_pred", relay.ty.TensorType((1, num_anchors * 4), "float32"))
    anchors = relay.var(
        "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))

    mtl = relay.vision.multibox_transform_loc(
        cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors)
    ret = relay.ir_pass.infer_type(mtl.astuple())
    ref_type = relay.ty.TupleType(
        tvm.convert([
            relay.ty.TensorType((1, num_anchors, 6), "float32"),
            relay.ty.TensorType((1, ), "int")
        ]))

    assert ret.checked_type == ref_type

    nms = relay.vision.non_max_suppression(mtl[0], mtl[1], return_indices=False)
    func = relay.Function([cls_prob, loc_pred, anchors], nms)
    func = relay.ir_pass.infer_type(func)
    ctx_list = [("llvm", tvm.cpu(0))]
    for target, ctx in ctx_list:
        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
        op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors)
        tvm.testing.assert_allclose(op_res1.asnumpy(), expected_np_out, rtol=1e-5)
        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
        op_res2 = intrp2.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors)
        tvm.testing.assert_allclose(op_res2.asnumpy(), expected_np_out, rtol=1e-5)

def test_rpc_module():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # If we don't want to do metal and only use cpu, just set target to be target
    f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.dylib")
    f.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk)
    xcode.codesign(path_dso1)

    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.dylib")
    f.export_library(path_dso2, xcode.create_dylib, arch=arch, sdk=sdk)
    xcode.codesign(path_dso2)

    # Start RPC test server that contains the compiled library.
    server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                                  destination=destination,
                                  options=['-quiet'],
                                  libs=[path_dso1, path_dso2])

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.metal(0)
    f1 = remote.load_module("dev_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    # CPU
    ctx = remote.cpu(0)
    f2 = remote.load_module("cpu_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

def test_gemm_bound():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n, n), name='A')
    B = tvm.placeholder((n, n), name='B')
    k = tvm.reduce_axis((0, n), name='k')
    C = tvm.compute(
        (n, n),
        lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
        name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)

    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)

    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[BB.op.axis[0]].extent.value == 64)
    assert(bounds[AA.op.axis[0]].extent.value == 64)
    assert(bounds[CC.op.axis[0]].extent.value == 8)
    assert(bounds[CC.op.axis[1]].extent.value == 8)

def argsort_ir(data_buf, out_index_buf):
    """Batched odd-even transposition sort.

    Parameters
    ----------
    data_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]

    out_index_buf : tvm.schedule.Buffer
        2-D with shape [batch, num_bbox]. Indices of data in sorted order.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_bbox = get_const_tuple(data_buf.shape)
    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data_buf)
    index_out = ib.buffer_ptr(out_index_buf)
    nthread_tx = max_threads
    nthread_bx = (num_bbox + 1) // 2 // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("vthread")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "virtual_thread", nthread_bx)
    tid = bx * nthread_tx + tx
    temp_data = ib.allocate("float32", (1, ), name="temp_data", scope="local")
    temp_index = ib.allocate("int32", (1, ), name="temp_index", scope="local")
    idxm = tvm.indexmod

    with ib.for_range(0, batch, for_type="unroll") as b:
        start = b * num_bbox
        for i in range(2):
            bbox_id = tid * 2 + i
            with ib.if_scope(bbox_id < num_bbox):
                index_out[start + bbox_id] = bbox_id
        with ib.for_range(0, num_bbox) as k:
            offset = start + 2 * tid + idxm(k, 2)
            with ib.if_scope(
                    tvm.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])):
                temp_data[0] = p_data[offset]
                p_data[offset] = p_data[offset + 1]
                p_data[offset + 1] = temp_data[0]
                temp_index[0] = index_out[offset]
                index_out[offset] = index_out[offset + 1]
                index_out[offset + 1] = temp_index[0]
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))

    return ib.get()

def test_map():
    a = tvm.var('a')
    b = tvm.var('b')
    amap = tvm.convert({a: 2, b: 3})
    assert a in amap
    assert len(amap) == 2
    dd = dict(amap.items())
    assert a in dd
    assert b in dd
    assert a + 1 not in amap

def test_map_save_load_json():
    a = tvm.var('a')
    b = tvm.var('b')
    amap = tvm.convert({a: 2, b: 3})
    json_str = tvm.save_json(amap)
    amap = tvm.load_json(json_str)
    assert len(amap) == 2
    dd = {kv[0].name: kv[1].value for kv in amap.items()}
    assert(dd == {"a": 2, "b": 3})

def test_return_func():
    def addy(y):
        def add(x):
            return tvm.convert(x + y)
        return add

    myf = tvm.convert(addy)
    f = myf(10)
    assert f(11).value == 21

def test_get_callback_with_node():
    x = tvm.convert(10)

    def test(y):
        assert y.handle != x.handle
        return y

    f2 = tvm.convert(test)

    # register into global function table
    @tvm.register_func
    def my_callback_with_node(y, f):
        assert y == x
        return f(y)

    # get it out from global function table
    f = tvm.get_global_func("my_callback_with_node")
    assert isinstance(f, tvm.Function)
    y = f(x, f2)
    assert(y.value == 10)

def test_call():
    op = relay.Var('f')
    arg_names = ['a', 'b', 'c', 'd']
    args = tvm.convert([relay.Var(n) for n in arg_names])
    call = relay.Call(op, args, None, None)
    assert call.op == op
    assert call.args == args
    assert call.span == None
    str(call)
    check_json_roundtrip(call)

def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n, ), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n + 1) // 2) as i:
            ib.emit(outs[0].vstore(
                i * 2, ins[0].vload(i * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (nn + max_threads - 1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(
                idx * 2, ins[0].vload(idx * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
    check_target("opencl")
    check_target("cuda")

def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None):
    def tvm_val_2_py_val(val):
        val = tvm.ir_pass.Substitute(val, var_dict)
        val = tvm.ir_pass.Simplify(val)
        assert isinstance(val, (tvm.expr.IntImm, tvm.expr.UIntImm))
        return val.value

    ctx = tvm.context(target, 0)
    op = None

    if sch is None:
        outs = func(*tuple(tvm.convert(i) if isinstance(i, list) else i for i in args))
        op = outs[0].op if isinstance(outs, list) else outs.op
        sch = tvm.create_schedule(op)
    else:
        assert outs is not None
        assert isinstance(outs, list)
        op = outs[0].op

    emu_args = []
    nd_args = []
    for i in args:
        if isinstance(i, tvm.tensor.Tensor):
            shape = [tvm_val_2_py_val(j) for j in i.shape]
            emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
            nd_args.append(tvm.nd.array(emu_args[-1], ctx))
        elif isinstance(i, tvm.expr.Var):
            emu_args.append(tvm_val_2_py_val(i))
            nd_args.append(emu_args[-1])
        else:
            assert isinstance(i, list)
            emu_args.append(numpy.array(i))

    compile_args = [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var))] + \
                   (outs if isinstance(outs, list) else [outs])
    module = tvm.build(sch, compile_args, target=target)
    assert module

    out_tensors = []
    for i in range(op.num_outputs):
        output = op.output(i)
        shape = [tvm_val_2_py_val(j) for j in output.shape]
        nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx))
        out_tensors.append(nd_args[-1])

    ref_data = func(*emu_args)
    if isinstance(ref_data, numpy.ndarray):
        ref_data = [ref_data]

    module(*nd_args)

    for nd, np in zip(out_tensors, ref_data):
        tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)

def test_bound_fusesplit2():
    m = tvm.var("m")
    l = tvm.convert(6)
    split = tvm.convert(3)
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = tvm.create_schedule(A2.op)
    fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1])
    xo, xi = s[A2].split(fused_axes, split)
    s[A1].compute_at(s[A2], xo)

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)

    vars = tvm.convert({xo.var: tvm.const(5, "int32")})
    assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[0]].min, vars)).value == 2)
    assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[1]].min, vars)).value == 3)
    assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[0]].extent, vars)).value == 1)
    assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[1]].extent, vars)).value == 3)

def annotated():
    add = relay.add(x, y)
    _add = relay.annotation.on_device(add, ctx2)
    sub = relay.subtract(add, z)
    _sub = relay.annotation.on_device(sub, ctx2)

    func = relay.Function([x, y, z],
                          relay.Tuple(tvm.convert([_add, _sub, sub])))
    func = relay.ir_pass.infer_type(func)
    func = relay.ir_pass.rewrite_annotated_ops(func, ctx1.device_type)
    return func

def test_const_range():
    @tvm.hybrid.script
    def foo(a, b):
        c = output_tensor(a.shape, a.dtype)
        d = output_tensor(a.shape, 'int32')

        for i in const_range(2):
            for j in const_range(5):
                c[i, j] = float32(int32(a[i, j]) + b[i, j])

        for i in const_range(len(b)):
            for j in const_range(len(b[0])):
                d[i, j] = int32(a[i, j] + b[i, j])

        return c, d

    a = tvm.placeholder((2, 5), name='a', dtype='float32')
    b = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]
    func, ins, outs = run_and_check(foo, [a, b])
    run_and_check(func, ins, outs=outs)

    @tvm.hybrid.script
    def goo(a, b):
        c = output_tensor(a.shape, a.dtype)
        len_b = len(b)
        for i in const_range(len_b * 2):
            if i < len_b:
                c[i] = a[i] + b[i]
            else:
                c[i - len_b] = a[i - len_b] + b[i - len_b]
        return c

    a = tvm.placeholder((5, ), name='a', dtype='int32')
    b = [1, 2, 3, 4, 5]
    c = goo(a, tvm.convert(b))
    sch = tvm.create_schedule(c.op)
    func, ins, outs = run_and_check(goo, [a, b])
    run_and_check(func, ins, outs=outs)

    @tvm.hybrid.script
    def hoo(a, b):
        c = output_tensor(a.shape, a.dtype)
        len_b = len(b)
        for i in range(a.shape[0]):
            for j in const_range(len(b)):
                d = a[i] * b[j]
                d += a[i] + b[j]
                c[i] = d
        return c

    a = tvm.placeholder((5, ), name='a', dtype='int32')
    b = [1, 2, 3, 4, 5]
    func, ins, outs = run_and_check(hoo, [a, b])
    run_and_check(func, ins, outs=outs)

def test_alignment():
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda i: A[i] * 3, name='B')
    s = tvm.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=8)
    s[B].vectorize(tx)
    f = tvm.build(s, [A, B], "llvm")

    for l in f.get_source().split("\n"):
        if "align" in l and "4 x float" in l:
            assert "align 32" in l

def test_llvm_intrin():
    ib = tvm.ir_builder.create()
    n = tvm.convert(4)
    A = ib.pointer("float32", name="A")
    args = [tvm.call_pure_intrin("handle", "tvm_address_of", A[0]), 0, 3, 1]
    ib.emit(tvm.make.Evaluate(
        tvm.make.Call("int32", "prefetch", args, tvm.expr.Call.Intrinsic, None, 0)))
    body = ib.get()
    func = tvm.ir_pass.MakeAPI(body, "prefetch", [A], 0, True)
    fcode = tvm.build(func, None, "llvm")

def test_split_infer_type():
    def verify_split(dshape, indices_or_sections, ret_type, axis=None):
        x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
        y = relay.split(x, indices_or_sections, axis=axis)
        y.astext()
        yy = relay.ir_pass.infer_type(y.astuple())
        assert yy.checked_type == ret_type

    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
    axis = tvm.var("axis")

    verify_split((5, 5, 2, 2), 5,
                 relay.ty.TupleType(tvm.convert([
                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
                     relay.ty.TensorType((5, 1, 2, 2), "float32")])),
                 axis=1)
    verify_split((d1, d2, d3, d4), 4,
                 relay.ty.TupleType(tvm.convert([
                     relay.ty.TensorType((d1, d2, d3 / 4, d4), "float32"),
                     relay.ty.TensorType((d1, d2, d3 / 4, d4), "float32"),
                     relay.ty.TensorType((d1, d2, d3 / 4, d4), "float32"),
                     relay.ty.TensorType((d1, d2, d3 / 4, d4), "float32")])),
                 axis=2)
    verify_split((d1, d2, d3, d4), (2, 4, 7),
                 relay.ty.TupleType(tvm.convert([
                     relay.ty.TensorType((d1, 2, d3, d4), "float32"),
                     relay.ty.TensorType((d1, 2, d3, d4), "float32"),
                     relay.ty.TensorType((d1, 3, d3, d4), "float32"),
                     relay.ty.TensorType((d1, (d2 - 7), d3, d4), "float32")])),
                 axis=1)

def test_bind():
    if not tvm.gpu(0).exist:
        print('[Warning] No GPU found! Skip bind test!')
        return

    @script
    def vec_add(a, b):
        c = output_tensor((1000, ), 'float32')
        for tx in bind('threadIdx.x', 1000):
            c[tx] = a[tx] + b[tx]
        return c

    a = tvm.placeholder((1000, ), dtype='float32', name='a')
    b = tvm.placeholder((1000, ), dtype='float32', name='b')
    func, ins, outs = run_and_check(vec_add, [a, b], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    @script
    def raw(a, b):
        c = output_tensor((1000, ), 'float32')
        for i in range(1000):
            c[i] = a[i] + b[i]
        return c

    c = raw(a, b)
    sch = tvm.create_schedule(c.op)
    x = tvm.thread_axis('threadIdx.x')
    sch[c].bind(c.op.axis[0], x)
    func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    # Test loop binds
    @tvm.hybrid.script
    def goo(a, b):
        c = output_tensor(a.shape, a.dtype)
        len_b = len(b)
        for i in const_range(len_b * 2):
            if i < len_b:
                c[i] = a[i] + b[i]
            else:
                c[i - len_b] = a[i - len_b] + b[i - len_b]
        return c

    a = tvm.placeholder((5, ), name='a', dtype='int32')
    b = [1, 2, 3, 4, 5]
    c = goo(a, tvm.convert(b))
    sch = tvm.create_schedule(c.op)
    func, ins, outs = run_and_check(goo, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)

def annotated():
    add = relay.add(x, y)
    _add1 = relay.annotation.on_device(add, ctx2)
    _add2 = relay.annotation.on_device(add, ctx2)
    sub = relay.subtract(add, z)

    func = relay.Function([x, y, z],
                          relay.Tuple(tvm.convert([_add1, _add2, sub])))
    func = relay.ir_pass.infer_type(func)
    func = relay.ir_pass.rewrite_annotated_ops(func, ctx1.device_type)
    func = relay.ir_pass.infer_type(func)
    return relay.Function(relay.ir_pass.free_vars(func.body[2]), func.body[2])

def annotated():
    add = relay.add(a, b)
    _add = relay.annotation.on_device(add, dev_ctx)
    mul = relay.multiply(c, d)
    _mul = relay.annotation.on_device(mul, cpu_ctx)
    sub = relay.subtract(add, mul)
    _sub = relay.annotation.on_device(sub, dev_ctx)

    func = relay.Function([a, b, c, d],
                          relay.Tuple(tvm.convert([_add, _mul, _sub, sub])))
    func = relay.ir_pass.infer_type(func)
    func = relay.ir_pass.rewrite_annotated_ops(func, dev_ctx.device_type)
    return func

def scalar_type(dtype):
    """Construct a Relay scalar type.

    Parameters
    ----------
    dtype : dtype
        The dtype of the scalar type.

    Returns
    -------
    scalar_type : relay.Type
        The scalar type.
    """
    return TensorType(tvm.convert([]), dtype)

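# Hedged usage sketch (not part of the original source): scalar_type builds a
# zero-rank TensorType, so the shape list is empty and only the dtype varies.
def example_scalar_type_usage():
    f32_scalar = scalar_type("float32")
    # Equivalent by construction to TensorType(tvm.convert([]), "float32"),
    # i.e. a tensor type with an empty shape.
    return f32_scalar
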
def test_attr():
    x = tvm.var('x')
    y = tvm.var('y')
    stmt = tvm.make.AttrStmt(y, "stride", 10, tvm.make.Evaluate(x + 1))
    assert stmt.node == y

    a = tvm.convert(1)
    assert a.value == 1
    try:
        a.no_field
        assert False
    except AttributeError:
        pass

def test_rpc_module():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # If we don't want to do metal and only use cpu, just set target to be target
    f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib2.so")
    f.export_library(path_dso1, ndk.create_shared)

    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.so")
    f.export_library(path_dso2, ndk.create_shared)

    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=60)

    print('Run CPU test ...')
    ctx = remote.cpu(0)
    remote.upload(path_dso2)
    f2 = remote.load_module("cpu_lib.so")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    print('Run GPU test ...')
    ctx = remote.cl(0)
    remote.upload(path_dso1)
    f1 = remote.load_module("dev_lib2.so")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

def test_scalar_dtype_inference():
    for data in [True, np.bool(1), np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1),
                 np.int8(1), np.int16(1), np.int32(1), np.int64(1),
                 np.float16(1), np.float32(1), np.float64(1)]:
        assert tvm.const(data).dtype == str(np.array(data).dtype)
    assert tvm.const(1).dtype == 'int32'
    assert tvm.const(1.0).dtype == 'float32'

    for data in [True, np.bool(1), np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1),
                 np.int8(1), np.int16(1), np.int32(1), np.int64(1),
                 np.float16(1), np.float32(1), np.float64(1)]:
        assert tvm.convert(data).dtype == str(np.array(data).dtype)
    assert tvm.convert(1).dtype == 'int32'
    assert tvm.convert(1.0).dtype == 'float32'

def annotated():
    add = relay.add(x, y)
    sqrt = relay.sqrt(add)
    log = relay.log(add)
    subtract = relay.subtract(sqrt, log)
    exp = relay.exp(subtract)
    _exp = relay.annotation.on_device(exp, cpu_ctx)

    func = relay.Function([x, y],
                          relay.Tuple(tvm.convert([_exp, exp])))
    func = relay.ir_pass.infer_type(func)
    func = relay.ir_pass.rewrite_annotated_ops(func, dev_ctx.device_type)
    return func

def test_gemm():
    n = 512
    k = 1024
    m = 256
    dshape1 = (n, k)
    dshape2 = (m, k)
    data1 = relay.var("data1", shape=dshape1)
    data2 = relay.var("data2", shape=dshape2)
    gemm = relay.nn.dense(data1, data2)
    func = relay.Function([data1, data2],
                          relay.Tuple(tvm.convert([gemm])))
    func = relay.ir_pass.infer_type(func)
    compute_count = relay.ir_pass.get_total_mac_number(func)
    expect_count = n * m * k
    assert compute_count == expect_count

def check_correct_assembly(type, elements, counts):
    n = tvm.convert(elements)
    A = tvm.placeholder(n, dtype=type, name='A')
    B = tvm.compute(A.shape, lambda i: tvm.popcount(A[i]), name='B')
    s = tvm.create_schedule(B.op)
    s[B].vectorize(s[B].op.axis[0])
    f = tvm.build(s, [A, B], target)

    # Verify we see the correct number of vpaddl and vcnt instructions in the assembly
    assembly = f.get_source('asm')
    matches = re.findall("vpaddl", assembly)
    assert (len(matches) == counts)
    matches = re.findall("vcnt", assembly)
    assert (len(matches) == 1)

def test_gemm():
    n = 512
    k = 1024
    m = 256
    dshape1 = (n, k)
    dshape2 = (m, k)
    data1 = relay.var("data1", shape=dshape1)
    data2 = relay.var("data2", shape=dshape2)
    gemm = relay.nn.dense(data1, data2)
    func = relay.Function([data1, data2],
                          relay.Tuple(tvm.convert([gemm])))
    func = run_opt_pass(func, transform.InferType())
    compute_count = analysis.get_total_mac_number(func)
    expect_count = n * m * k
    assert compute_count == expect_count

def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    AA = tvm.compute((n,), lambda *i: A(*i), name='A')
    BB = tvm.compute((n,), lambda *i: B(*i), name='B')
    T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T')
    C = tvm.compute(A.shape, lambda *i: T(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    xo1, xo2 = s[C].split(xo, factor=13)
    s[C].parallel(xo2)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xo2, "parallel_stride_pattern")
    s[C].pragma(xo2, "parallel_barrier_when_finish")
    s[C].vectorize(xi)

    def check_c():
        if not tvm.module.enabled("llvm"):
            return
        # Specifically allow offset to test codepath when offset is available
        Ab = tvm.decl_buffer(
            A.shape, A.dtype,
            elem_offset=tvm.var('Aoffset'),
            offset_factor=8,
            name='A')
        binds = {A: Ab}
        # BUILD and invoke the kernel.
        f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline")
        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
        mhost = tvm.codegen.build_module(fsplits[0], "c")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m["fadd_pipeline"]
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    with tvm.build_config(offset_factor=4):
        check_c()
