def check_cuda(dtype, n, lanes):
    """Compile and run a vectorized add-one kernel on GPU 0 via CUDA.

    Builds B[i] = A[i] + 1 over n elements of a lane-packed dtype
    ("<dtype>x<lanes>"), executes it, and checks the result against numpy.
    Prints a message and returns early when CUDA is unavailable or the
    device cannot handle the requested dtype.
    """
    # Bail out when there is no usable CUDA device.
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16":
        major, minor = parse_compute_version(tvm.gpu(0).compute_version)
        # fp16 starts from 5.3
        if major < 6 or (major == 5 and minor < 3):
            print("skip because gpu does not support fp16")
            return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return

    # Declare the vectorized element type, e.g. "float16x4".
    vec_dtype = "%sx%d" % (dtype, lanes)
    placeholder_a = tvm.placeholder((n, ), name='A', dtype=vec_dtype)
    compute_b = tvm.compute(
        (n, ),
        lambda i: placeholder_a[i] + tvm.const(1, placeholder_a.dtype),
        name='B')

    # Schedule: split the single axis and bind to block/thread indices
    # (num_thread, bx, tx come from the enclosing scope).
    sched = tvm.create_schedule(compute_b.op)
    outer, inner = sched[compute_b].split(compute_b.op.axis[0], factor=num_thread)
    sched[compute_b].bind(outer, bx)
    sched[compute_b].bind(inner, tx)

    kernel = tvm.build(sched, [placeholder_a, compute_b], "cuda")
    ctx = tvm.gpu(0)
    dev_in = tvm.nd.empty((n, ), placeholder_a.dtype, ctx).copyfrom(
        np.random.uniform(size=(n, lanes)))
    dev_out = tvm.nd.empty((n, ), compute_b.dtype, ctx)
    kernel(dev_in, dev_out)
    tvm.testing.assert_allclose(dev_out.asnumpy(), dev_in.asnumpy() + 1)
def skip_test(dtype, device):
    """Return True when the (dtype, device) pair should be skipped.

    Only fp16-on-CUDA is gated: the GPU's compute version is queried and
    the test is skipped on devices deemed too old. Every other combination
    returns False without touching the device.
    """
    needs_fp16_check = dtype == "float16" and device == "cuda"
    if not needs_fp16_check:
        return False
    major, minor = parse_compute_version(tvm.gpu(0).compute_version)
    # fp16 starts from 5.3
    # NOTE(review): `major == 5 and minor < 3` is already implied by
    # `major < 6`, so the second clause is dead code. If the intent was to
    # allow compute 5.3+, the condition should be
    # `major < 5 or (major == 5 and minor < 3)` — confirm before changing.
    if major < 6 or (major == 5 and minor < 3):
        print("skip because gpu does not support fp16")
        return True
    return False
# Abort early when the device lacks tensor cores — the wmma codegen below
# cannot run without them.
if not nvcc.have_tensorcore(dev.compute_version):
    raise Exception("the gpu has no tensorcore, skipping...")

# Default GEMM problem size and configuration; overridable from argv:
#   argv[1:4] -> M N L, argv[4] -> dtype, argv[5] -> layout
M, N, L = 512, 32, 512
dtype = "float16"
layout = "NN"
if len(sys.argv) >= 4:
    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
if len(sys.argv) >= 5:
    dtype = sys.argv[4]
if len(sys.argv) >= 6:
    layout = sys.argv[5]

# check whether current gpu arch supports current dtype's wmma codegen
# NOTE(review): the magic arguments (2, 0, 4) presumably select
# (CUDA device type, device 0, compute-version attribute) — confirm against
# the DeviceAttrKind enum before changing.
cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
major, minor = nvcc.parse_compute_version(cuda_compute_capability)
if dtype == "int8":
    # int8 wmma requires compute capability 7.2+
    assert major == 7 and minor >= 2
elif dtype == "int4" or dtype == "int1":
    # int4/int1 only support layout TN
    assert major == 7 and minor == 5 and layout == "TN"


def tune_and_evaluate(M, N, L, dtype, layout):
    # Create the autotvm tuning task for the tensorcore GEMM template and
    # route autotvm's debug logging to stdout.
    # NOTE(review): this function appears to continue beyond the visible
    # chunk (tuning/evaluation steps are not shown here).
    task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm",
                               args=(N, L, M, dtype, layout),
                               target="cuda")
    print(task.config_space)
    logging.getLogger("autotvm").setLevel(logging.DEBUG)
    logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))