Example 1
import numpy as np

import tvm
import tvm.testing
from tvm.contrib.nvcc import parse_compute_version, have_int8

# num_thread, bx and tx are module-level names in the original test; typical
# definitions would be num_thread = 8, bx = tvm.thread_axis("blockIdx.x") and
# tx = tvm.thread_axis("threadIdx.x").
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16":
        major, minor = parse_compute_version(tvm.gpu(0).compute_version)
        # fp16 starts from compute capability 5.3
        if major < 5 or (major == 5 and minor < 3):
            print("skip because gpu does not support fp16")
            return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    # vectorized element-wise add: each element of A is a <dtype x lanes> vector
    A = tvm.placeholder((n, ), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n, ),
                    lambda i: A[i] + tvm.const(1, A.dtype),
                    name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, bx)
    s[B].bind(xi, tx)
    fun = tvm.build(s, [A, B], "cuda")
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n, ), A.dtype,
                     ctx).copyfrom(np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n, ), B.dtype, ctx)
    fun(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
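In the original test, check_cuda is exercised over several dtype and lane-count combinations; a minimal sketch of how it might be driven (the exact combinations below are assumptions, not taken from the snippet):

check_cuda("float32", 64, 2)  # 2-lane float32 vector add
check_cuda("float16", 64, 2)  # only runs on compute capability >= 5.3
check_cuda("int8", 64, 4)     # only runs if the device supports int8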
Example 2
import logging
import sys

import tvm
from tvm import autotvm
from tvm.contrib import nvcc
from tvm.contrib.nvcc import parse_compute_version

def skip_test(dtype, device):
    if dtype == "float16" and device == "cuda":
        major, minor = parse_compute_version(tvm.gpu(0).compute_version)
        # fp16 starts from compute capability 5.3
        if major < 5 or (major == 5 and minor < 3):
            print("skip because gpu does not support fp16")
            return True
    return False
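# In the original code, skip_test() would typically be invoked at the start of
# each GPU test case, e.g. (assumed usage, not part of the snippet):
#     if skip_test(dtype, "cuda"):
#         return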
# "dev" is the CUDA device handle from the enclosing script, e.g. dev = tvm.gpu(0)
if not nvcc.have_tensorcore(dev.compute_version):
    raise Exception("the gpu has no tensorcore, skipping...")

M, N, L = 512, 32, 512
dtype = "float16"
layout = "NN"
if len(sys.argv) >= 4:
    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
if len(sys.argv) >= 5:
    dtype = sys.argv[4]
if len(sys.argv) >= 6:
    layout = sys.argv[5]

# check whether the current gpu arch supports wmma codegen for the current dtype
# (GetDeviceAttr args: device type 2 = CUDA GPU, device id 0, attribute 4 = kComputeVersion)
cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
major, minor = nvcc.parse_compute_version(cuda_compute_capability)
if dtype == "int8":
    assert major == 7 and minor >= 2
elif dtype == "int4" or dtype == "int1":
    # int4/int1 only support layout TN
    assert major == 7 and minor == 5 and layout == "TN"


def tune_and_evaluate(M, N, L, dtype, layout):
    task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm",
                               args=(N, L, M, dtype, layout),
                               target="cuda")
    print(task.config_space)

    logging.getLogger("autotvm").setLevel(logging.DEBUG)
    logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
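    # The snippet is truncated here. In the TVM auto-tensorcore tutorial this
    # function typically continues by tuning the task and logging the results;
    # a minimal sketch under that assumption (trial count and log file name
    # are illustrative, not from the original):
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=5))
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file("matmul.log")])
    # the best record can later be applied via autotvm.apply_history_best("matmul.log")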