def evaluate(name, s, bufs, target, dev_id, number=10, rpc_info=None, result_generator=None):
    """Build a schedule, run it once, optionally verify it, and time it.

    Args:
        name: Base name of the exported library file (``<name>.so``).
        s: Schedule handed to ``tvm.build``.
        bufs: Tensors handed to ``tvm.build``; one random array is generated
            per buffer from its ``shape`` and ``dtype``.
        target: Target passed to ``tvm.build`` and used to pick the device.
        dev_id: Index of the device to run on.
        number: Forwarded to ``func.time_evaluator``.
        rpc_info: Optional object exposing ``use_rpc``, ``target_host``,
            ``fcompile`` and ``get_remote()``.  When ``None`` everything runs
            locally.
        result_generator: Optional callable that receives the list of NumPy
            arrays and returns the expected content of the last buffer.

    Returns:
        The evaluator's mean time multiplied by 1e3, or ``float("inf")`` when
        any step inside the ``try`` raised (the exception is printed).
    """
    if rpc_info is not None:
        use_rpc = rpc_info.use_rpc
        target_host = rpc_info.target_host
        fcompile = rpc_info.fcompile
        remote = rpc_info.get_remote()
    else:
        # No RPC configuration: there is no remote session to query.
        use_rpc, target_host, fcompile, remote = None, None, None, None

    dev = (remote if remote else tvm).device(target, dev_id)
    np_arys = [
        np.random.uniform(-10, 10, size=to_tuple(buf.shape)).astype(buf.dtype)
        for buf in bufs
    ]
    tvm_arys = [tvm.nd.array(arr, dev) for arr in np_arys]

    func_file = f"{name}.so"
    lib_path = os.path.join(LIB_DIR, func_file)
    time_cost = float("inf")
    try:
        func = tvm.build(s, bufs, target=target, target_host=target_host)
        if use_rpc:
            # Ship the compiled module to the remote and use the remote copy.
            func.export_library(lib_path, fcompile)
            remote.upload(lib_path)
            func = remote.load_module(func_file)
        func(*tvm_arys)
        if result_generator is not None:
            print("Test whether computed...")
            result = tvm_arys[-1].asnumpy()
            # Compares the output with the random data it was initialised with.
            test_allclose(result, np_arys[-1], rtol=1e-3, print_diff=True)
            print("Test correctness...")
            expected = result_generator(np_arys)
            test_allclose(result, expected, rtol=1e-3, print_diff=True)
        evaluator = func.time_evaluator(func.entry_name, dev, number=number)
        time_cost = evaluator(*tvm_arys).mean * 1e3
    except Exception as e:
        # Best effort: a failing candidate is reported and scored as infinity.
        print(e)
    finally:
        # Drop the device arrays one by one, last first.
        while len(tvm_arys) > 0:
            del tvm_arys[-1]
        if os.path.exists(lib_path):
            try:
                os.remove(lib_path)
            except Exception as e:
                print(e)
    return time_cost
def test_conv2d_nchw():
    """Check ``conv2d_nchw`` against ``torch.nn.functional.conv2d`` on llvm.

    Random input, weight and bias are run through both implementations with
    stride=2, padding=1, dilation=2 and groups=3; the outputs are compared via
    ``test_allclose`` and the verdict is printed.  Nothing is returned.
    """
    #################################
    # test basic case
    inputs_np = np.random.uniform(-1, 1, size=[4, 6, 7, 7]).astype(np.float32) * 1000
    weight_np = np.random.uniform(-1, 1, size=[9, 2, 3, 3]).astype(np.float32) * 1000
    bias_np = np.random.uniform(-1, 1, size=[9]).astype(np.float32) * 1000

    # Reference result computed by PyTorch.
    inputs_torch = torch.tensor(inputs_np)
    weight_torch = torch.tensor(weight_np)
    bias_torch = torch.tensor(bias_np)
    output_torch = torch.nn.functional.conv2d(
        inputs_torch, weight_torch, bias_torch,
        stride=2, padding=1, dilation=2, groups=3)

    # Same device / tensor-expression API as the other tests in this file.
    tvm_ctx = tvm.device("llvm", 0)
    inputs_tvm = tvm.nd.array(inputs_np, tvm_ctx)
    weight_tvm = tvm.nd.array(weight_np, tvm_ctx)
    bias_tvm = tvm.nd.array(bias_np, tvm_ctx)
    output_tvm = tvm.nd.array(
        np.zeros(output_torch.shape).astype(np.float32), tvm_ctx)

    inputs_t = tvm.te.placeholder(inputs_np.shape, dtype="float32")
    weight_t = tvm.te.placeholder(weight_np.shape, dtype="float32")
    bias_t = tvm.te.placeholder(bias_np.shape, dtype="float32")
    output_t = conv2d_nchw(inputs_t, weight_t, bias_t,
                           stride=2, padding=1, dilation=2, groups=3)

    s = tvm.te.create_schedule(output_t.op)
    func = tvm.build(s, [inputs_t, weight_t, bias_t, output_t], "llvm")
    func(inputs_tvm, weight_tvm, bias_tvm, output_tvm)

    # The tolerance is scaled by the same factor of 1000 as the data.
    passed = test_allclose(output_tvm.asnumpy(), output_torch.numpy(),
                           rtol=1e-5 * 1000, print_diff=True)
    if passed == 1:
        print("Conv2d_nchw basic case passed!")
    else:
        print("Conv2d_nchw basic case failed!")
def test_bilinear():
    """Check ``bilinear`` against ``torch.nn.functional.bilinear`` on llvm.

    Two random inputs plus weight and bias are run through both
    implementations; the outputs are compared via ``test_allclose`` and the
    verdict is printed.  Nothing is returned.
    """
    #################################
    # test basic case
    inputs_np = np.random.random([2, 3, 2, 3, 17]).astype(np.float32) * 100
    another_np = np.random.random([2, 3, 2, 3, 8]).astype(np.float32) * 100
    weight_np = np.random.random([5, 17, 8]).astype(np.float32) * 100
    bias_np = np.random.random([5]).astype(np.float32) * 100

    # Reference result computed by PyTorch.
    inputs_torch = torch.tensor(inputs_np)
    another_torch = torch.tensor(another_np)
    weight_torch = torch.tensor(weight_np)
    bias_torch = torch.tensor(bias_np)
    output_torch = torch.nn.functional.bilinear(
        inputs_torch, another_torch, weight_torch, bias_torch)

    # Same device / tensor-expression API as the other tests in this file.
    tvm_ctx = tvm.device("llvm", 0)
    inputs_tvm = tvm.nd.array(inputs_np, tvm_ctx)
    another_tvm = tvm.nd.array(another_np, tvm_ctx)
    weight_tvm = tvm.nd.array(weight_np, tvm_ctx)
    bias_tvm = tvm.nd.array(bias_np, tvm_ctx)
    output_tvm = tvm.nd.array(
        np.zeros(output_torch.shape).astype(np.float32), tvm_ctx)

    inputs_t = tvm.te.placeholder(inputs_np.shape, dtype="float32")
    another_t = tvm.te.placeholder(another_np.shape, dtype="float32")
    weight_t = tvm.te.placeholder(weight_np.shape, dtype="float32")
    bias_t = tvm.te.placeholder(bias_np.shape, dtype="float32")
    output_t = bilinear(inputs_t, another_t, weight_t, bias_t)

    s = tvm.te.create_schedule(output_t.op)
    func = tvm.build(
        s, [inputs_t, another_t, weight_t, bias_t, output_t], "llvm")
    func(inputs_tvm, another_tvm, weight_tvm, bias_tvm, output_tvm)

    passed = test_allclose(output_tvm.asnumpy(), output_torch.numpy(),
                           rtol=1e-5, print_diff=True)
    if passed == 1:
        print("Bilinear basic case passed!")
    else:
        print("Bilinear basic case failed!")
def test_gemm_conv2d_nchw():
    """Check ``gemm_conv2d_nchw`` against ``torch.nn.functional.conv2d``.

    Uses a 1x1 kernel with stride=1, padding=0, dilation=1 and groups=1 on the
    llvm target.  The outputs are compared via ``test_allclose`` and the
    verdict is printed.  Nothing is returned.
    """
    #################################
    # test basic case
    inputs_np = np.random.random([1, 384, 27, 27]).astype(np.float32) * 100
    weight_np = np.random.random([64, 384, 1, 1]).astype(np.float32) * 100
    bias_np = np.random.random([64]).astype(np.float32) * 100

    # Reference result computed by PyTorch.
    inputs_torch = torch.tensor(inputs_np)
    weight_torch = torch.tensor(weight_np)
    bias_torch = torch.tensor(bias_np)
    output_torch = torch.nn.functional.conv2d(
        inputs_torch, weight_torch, bias_torch,
        stride=1, padding=0, dilation=1, groups=1)

    # Same device / tensor-expression API as the other tests in this file.
    tvm_ctx = tvm.device("llvm", 0)
    inputs_tvm = tvm.nd.array(inputs_np, tvm_ctx)
    weight_tvm = tvm.nd.array(weight_np, tvm_ctx)
    bias_tvm = tvm.nd.array(bias_np, tvm_ctx)
    output_tvm = tvm.nd.array(
        np.zeros(output_torch.shape).astype(np.float32), tvm_ctx)

    inputs_t = tvm.te.placeholder(inputs_np.shape, dtype="float32")
    weight_t = tvm.te.placeholder(weight_np.shape, dtype="float32")
    bias_t = tvm.te.placeholder(bias_np.shape, dtype="float32")
    output_t = gemm_conv2d_nchw(inputs_t, weight_t, bias_t,
                                stride=1, padding=0, dilation=1, groups=1)

    s = tvm.te.create_schedule(output_t.op)
    func = tvm.build(s, [inputs_t, weight_t, bias_t, output_t], "llvm")
    func(inputs_tvm, weight_tvm, bias_tvm, output_tvm)

    passed = test_allclose(output_tvm.asnumpy(), output_torch.numpy(),
                           rtol=1e-5, print_diff=True)
    if passed == 1:
        print("Gemm_conv2d_nchw basic case passed!")
    else:
        print("Gemm_conv2d_nchw basic case failed!")
def test_batch_norm():
    """Compare ``batch_normalization2d`` with ``torch.nn.functional.batch_norm``.

    The torch reference is fed the mean and variance of the data taken over
    dim 0.  The verdict is printed; nothing is returned.
    """
    data = np.random.random([100, 200]).astype(np.float32) * 100

    # Reference: torch batch norm driven by statistics of the data itself.
    data_torch = torch.tensor(data)
    mean = torch.mean(data_torch, dim=0)
    var = data_torch.var(dim=0)
    reference = torch.nn.functional.batch_norm(data_torch, mean, var)

    # Device-side input and a zero-filled output of the reference's shape.
    dev = tvm.device("llvm", 0)
    data_tvm = tvm.nd.array(data, dev)
    out_tvm = tvm.nd.array(np.zeros(reference.shape).astype(np.float32), dev)

    # Build the operator under test.
    data_t = tvm.te.placeholder(data.shape, dtype="float32")
    out_t = batch_normalization2d(data_t)
    sched = tvm.te.create_schedule(out_t.op)
    kernel = tvm.build(sched, [data_t, out_t], "llvm")
    kernel(data_tvm, out_tvm)

    passed = test_allclose(
        out_tvm.asnumpy(), reference.numpy(), rtol=1e-2, print_diff=True)
    print("Batch_norm basic case passed!" if passed == 1
          else "Batch_norm basic case failed!")
def test_variance():
    """Compare ``variance(..., dim=2)`` with ``torch.Tensor.var(dim=2)``.

    The verdict is printed; nothing is returned.
    """
    data = np.random.random([2, 3, 27, 3, 17]).astype(np.float32) * 100

    # Reference computed by PyTorch along dim 2.
    reference = torch.tensor(data).var(dim=2)

    # Device-side input and a zero-filled output of the reference's shape.
    dev = tvm.device("llvm", 0)
    data_tvm = tvm.nd.array(data, dev)
    out_tvm = tvm.nd.array(np.zeros(reference.shape).astype(np.float32), dev)

    # Build the operator under test.
    data_t = tvm.te.placeholder(data.shape, dtype="float32")
    out_t = variance(data_t, dim=2)
    sched = tvm.te.create_schedule(out_t.op)
    kernel = tvm.build(sched, [data_t, out_t], "llvm")
    kernel(data_tvm, out_tvm)

    passed = test_allclose(
        out_tvm.asnumpy(), reference.numpy(), rtol=1e-5, print_diff=True)
    print("Variance basic case passed!" if passed == 1
          else "Variance basic case failed!")
def check_result(configs, shape, target="cuda", dev_id=0):
    """Run a configured conv2d schedule and compare it with PyTorch.

    Args:
        configs: A ``(name, configs)`` pair; both parts are forwarded to
            ``schedule_with_config``.
        shape: 11-element sequence ``(batch, in_channel, H, W, out_channel,
            k, _, stride, padding, dilation, groups)``.  The seventh entry is
            ignored and the kernel is built as ``k x k``.
        target: Target used for the device and for ``tvm.build``.
        dev_id: Index of the device to run on.

    Prints ``"Passed!"`` or ``"Failed!"``; nothing is returned.
    """
    # tvm.device matches the device API used by the other blocks in this file.
    ctx = tvm.device(target, dev_id)
    name, configs = configs
    batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape

    A_np = np.random.uniform(-10, 10, size=[batch, in_channel, H, W]).astype("float32")
    A_tvm = tvm.nd.array(A_np, ctx)
    A_torch = torch.tensor(A_np)    # .cuda("cuda:" + str(dev_id))

    W_np = np.random.uniform(
        -10, 10, size=[out_channel, in_channel // groups, k, k]).astype("float32")
    W_tvm = tvm.nd.array(W_np, ctx)
    W_torch = torch.tensor(W_np)    # .cuda("cuda:" + str(dev_id))

    # The reference stays on the CPU; no bias is used.
    Output_torch = torch.nn.functional.conv2d(
        A_torch, W_torch,
        stride=stride, padding=padding, dilation=dilation, groups=groups)
    Output_np = np.zeros(Output_torch.shape).astype(np.float32)
    Output_tvm = tvm.nd.array(Output_np, ctx)

    s, bufs = schedule_with_config(name, configs)
    func = tvm.build(s, bufs, target)
    func(A_tvm, W_tvm, Output_tvm)

    passed = test_allclose(Output_tvm.asnumpy(), Output_torch.cpu().numpy(),
                           rtol=1e-5, print_diff=True)
    if passed == 1:
        print("Passed!")
    else:
        print("Failed!")
def test_conv2d_nchwc():
    """Compare ``conv2d_nchwc`` with the ``pyimpl.conv2d_nchwc`` reference.

    Both are called with stride=2, padding=1, dilation=2 and groups=3 on
    random data.  The verdict is printed; nothing is returned.
    """
    conv_kwargs = dict(stride=2, padding=1, dilation=2, groups=3)

    data = np.random.uniform(
        -1, 1, size=[4, 6, 8, 8, 4]).astype(np.float32) * 1000
    kernel_np = np.random.uniform(
        -1, 1, size=[9, 2, 3, 3, 4, 4]).astype(np.float32) * 1000
    offset = np.random.uniform(-1, 1, size=[9, 4]).astype(np.float32) * 1000

    # Reference result from the pure-Python implementation.
    reference = pyimpl.conv2d_nchwc(data, kernel_np, offset, **conv_kwargs)

    dev = tvm.device("llvm", 0)
    host_arrays = (data, kernel_np, offset)
    dev_inputs = [tvm.nd.array(arr, dev) for arr in host_arrays]
    dev_output = tvm.nd.array(
        np.zeros(reference.shape).astype(np.float32), dev)

    # One float32 placeholder per host array, in the same order.
    placeholders = [
        tvm.te.placeholder(arr.shape, dtype="float32") for arr in host_arrays
    ]
    out_t = conv2d_nchwc(*placeholders, **conv_kwargs)

    sched = tvm.te.create_schedule(out_t.op)
    kernel = tvm.build(sched, [*placeholders, out_t], "llvm")
    kernel(*dev_inputs, dev_output)

    # The tolerance is scaled by the same factor of 1000 as the data.
    passed = test_allclose(
        dev_output.asnumpy(), reference, rtol=1e-5 * 1000, print_diff=True)
    print("Conv2d_nchwc basic case passed!" if passed == 1
          else "Conv2d_nchwc basic case failed!")
def test_conv_transpose1d():
    """Compare ``conv_transpose1d`` with ``torch.nn.functional.conv_transpose1d``.

    Both are called with stride=2, padding=1, output_padding=1, dilation=1 and
    groups=3 on random data.  The verdict is printed; nothing is returned.
    """
    conv_kwargs = dict(
        stride=2, padding=1, output_padding=1, dilation=1, groups=3)

    data = np.random.random([4, 9, 10]).astype(np.float32) * 100
    kernel_np = np.random.random([9, 2, 3]).astype(np.float32) * 100
    offset = np.random.random([6]).astype(np.float32) * 100
    host_arrays = (data, kernel_np, offset)

    # Reference result computed by PyTorch.
    torch_args = [torch.tensor(arr) for arr in host_arrays]
    reference = torch.nn.functional.conv_transpose1d(
        *torch_args, **conv_kwargs)

    dev = tvm.device("llvm", 0)
    dev_inputs = [tvm.nd.array(arr, dev) for arr in host_arrays]
    dev_output = tvm.nd.array(
        np.zeros(reference.shape).astype(np.float32), dev)

    # One float32 placeholder per host array, in the same order.
    placeholders = [
        tvm.te.placeholder(arr.shape, dtype="float32") for arr in host_arrays
    ]
    out_t = conv_transpose1d(*placeholders, **conv_kwargs)

    sched = tvm.te.create_schedule(out_t.op)
    kernel = tvm.build(sched, [*placeholders, out_t], "llvm")
    kernel(*dev_inputs, dev_output)

    passed = test_allclose(
        dev_output.asnumpy(), reference.numpy(), rtol=1e-5, print_diff=True)
    print("Conv_transpose1d basic case passed!" if passed == 1
          else "Conv_transpose1d basic case failed!")
def test_depthwise_conv2d_nchw():
    """Compare ``depthwise_conv2d_nchw`` with a grouped torch ``conv2d``.

    The torch reference uses groups=6 with an ``[18, 1, 3, 3]`` weight; the
    same weight values are reshaped to ``[6, 3, 3, 3]`` before being handed to
    the operator under test.  The verdict is printed; nothing is returned.
    """
    data = np.random.random([4, 6, 7, 7]).astype(np.float32) * 100
    kernel_np = np.random.random([18, 1, 3, 3]).astype(np.float32) * 100
    offset = np.random.random([18]).astype(np.float32) * 100

    # Reference result computed by PyTorch from the un-reshaped weight.
    reference = torch.nn.functional.conv2d(
        torch.tensor(data), torch.tensor(kernel_np), torch.tensor(offset),
        stride=2, padding=1, dilation=2, groups=6)

    dev = tvm.device("llvm", 0)

    # for depthwise
    kernel_np = np.reshape(kernel_np, [6, 3, 3, 3])
    host_arrays = (data, kernel_np, offset)

    dev_inputs = [tvm.nd.array(arr, dev) for arr in host_arrays]
    dev_output = tvm.nd.array(
        np.zeros(reference.shape).astype(np.float32), dev)

    # One float32 placeholder per host array, in the same order.
    placeholders = [
        tvm.te.placeholder(arr.shape, dtype="float32") for arr in host_arrays
    ]
    out_t = depthwise_conv2d_nchw(
        *placeholders, stride=2, padding=1, dilation=2)

    sched = tvm.te.create_schedule(out_t.op)
    kernel = tvm.build(sched, [*placeholders, out_t], "llvm")
    kernel(*dev_inputs, dev_output)

    passed = test_allclose(
        dev_output.asnumpy(), reference.numpy(), rtol=1e-5, print_diff=True)
    print("Depthwise_conv2d_nchw basic case passed!" if passed == 1
          else "Depthwise_conv2d_nchw basic case failed!")
def test_block_circulant_matrix():
    """Compare ``block_circulant_matrix`` with a NumPy reference.

    Reference construction, per group of ``FFT`` consecutive rows:
      1. For every full ``FFT``-wide column block, entry ``m`` accumulates
         ``input[row n][block column (m + n) % FFT] / FFT`` over the ``FFT``
         rows of the group.
      2. Output row 0 of the group is that vector; each following row is the
         previous one rotated right by one position inside every block.
    Columns past the last full block are never written and stay zero.
    The verdict is printed; nothing is returned.
    """
    ROW, COL, FFT = 1024, 40, 16
    n_blocks = COL // FFT

    input_np = np.random.random([ROW, COL]).astype(np.float32)
    output_np = np.zeros([ROW, COL], dtype=np.float32)

    for group in range(ROW // FFT):
        base_row = FFT * group
        rotated = np.zeros([FFT], dtype=np.float32)
        vec = np.zeros([COL], dtype=np.float32)

        # Step 1: accumulate the first row of this group.
        for blk in range(n_blocks):
            base_col = blk * FFT
            for m in range(FFT):
                for n in range(FFT):
                    vec[base_col + m] += \
                        input_np[base_row + n][base_col + (m + n) % FFT] / FFT

        # Step 2: emit the row, rotating every block before all but the first.
        for j in range(FFT):
            if j >= 1:
                for blk in range(n_blocks):
                    lo, hi = FFT * blk, FFT * (blk + 1)
                    rotated[0] = vec[hi - 1]
                    rotated[1:FFT] = vec[lo:hi - 1]
                    vec[lo:hi] = rotated
            # Slice assignment copies the values, so later rotations of
            # ``vec`` do not touch rows already written.
            output_np[base_row + j][:] = vec

    dev = tvm.device('llvm', 0)
    input_tvm = tvm.nd.array(input_np, dev)
    output_tvm = tvm.nd.array(
        np.zeros(output_np.shape).astype(np.float32), dev)

    input_t = tvm.te.placeholder(input_np.shape, dtype='float32')
    output_t = block_circulant_matrix(input_t, FFT)
    sched = tvm.te.create_schedule(output_t.op)
    kernel = tvm.build(sched, [input_t, output_t], 'llvm')
    kernel(input_tvm, output_tvm)

    passed = test_allclose(
        output_tvm.asnumpy(), output_np, rtol=1e-5, print_diff=True)
    print("Block_circulant_matrix basic case passed" if passed == 1
          else "Block_circulant_matrix case failed")
def try_yolo_conv(batch_size=2, number=100):
    """Benchmark a scheduled conv layer against PyTorch and compare results.

    Steps:
      1. Build the ``SqueezeNetFire8Gemm`` compute, apply
         ``schedule_yolo_conv_x86``, print the lowered statement and time it
         on llvm with ``_evaluate``.
      2. Time an equivalent ``torch.nn.Conv2d`` on the CPU over ``number``
         forward passes after one warm-up pass.
      3. Run both implementations on the same random data and print whether
         the outputs agree under ``test_allclose``.

    Args:
        batch_size: Leading dimension prepended to the layer's input shape.
        number: Repetition count for both timings.

    Nothing is returned; all results are printed.
    """
    # get the compute
    yolo_conv = SqueezeNetFire8Gemm()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder(
        (batch_size, *input_shape), dtype="float32", name='inputs')
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)
    bias = yolo_conv.get_bias()

    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_x86(s, outputs, inputs, weight, bias)

    arg_bufs = [inputs, weight, bias, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)

    dev_id = 0
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, number=number)
    # The label says "Yolo conv24" although the layer is SqueezeNetFire8Gemm.
    print("Yolo conv24 use", time_cost, "ms")

    # For pytorch
    out_channel, in_channel, kernel_height, kernel_width = yolo_conv.weight_shape
    padding, stride, dilation, groups = (
        yolo_conv.padding, yolo_conv.stride,
        yolo_conv.dilation, yolo_conv.groups)
    conv2d_torch = torch.nn.Conv2d(
        in_channel, out_channel, (kernel_height, kernel_width),
        padding=padding, stride=stride, dilation=dilation, groups=groups)

    # warm up
    torch_inputs = torch.rand(batch_size, *input_shape)
    res = conv2d_torch(torch_inputs)

    # perf_counter is monotonic and high resolution, unlike wall-clock time.
    start = time.perf_counter()
    for _ in range(number):
        res = conv2d_torch(torch_inputs)
    elapsed = time.perf_counter() - start
    print("Pytorch on cpu use: {}ms".format(elapsed / number * 1e3))

    # To test the correctness.  Original author's note: currently the result
    # is wrong because of the schedule; if you change line 148 to
    # 'outer = s[write_cache].fuse(gemm_g, gemm_go)' the result is correct.
    # NOTE(review): "line 148" refers to a file position not visible here.
    ctx = tvm.device("llvm", 0)
    inputs_np = np.random.random(torch_inputs.shape).astype("float32") * 100
    weight_np = np.random.random(
        to_tuple(weight.shape)).astype(weight.dtype) * 100
    outputs_np = np.zeros(shape=to_tuple(outputs.shape), dtype=np.float32)
    bias_np = np.random.random(
        size=to_tuple(bias.shape)).astype(bias.dtype) * 100

    inputs_tvm = tvm.nd.array(inputs_np, ctx)
    weight_tvm = tvm.nd.array(weight_np, ctx)
    outputs_tvm = tvm.nd.array(outputs_np, ctx)
    bias_tvm = tvm.nd.array(bias_np, ctx)

    inputs_torch = torch.tensor(inputs_np)
    weight_torch = torch.tensor(weight_np)
    bias_torch = torch.tensor(bias_np)

    func_tvm = tvm.build(s, arg_bufs, "llvm")
    func_tvm(inputs_tvm, weight_tvm, bias_tvm, outputs_tvm)
    outputs_torch = torch.nn.functional.conv2d(
        inputs_torch, weight_torch, bias=bias_torch,
        padding=padding, stride=stride, dilation=dilation, groups=groups)

    the_same = test_allclose(outputs_tvm.asnumpy(), outputs_torch.numpy(),
                             rtol=1e-5, print_diff=True)
    if the_same:
        print("The same!")
    else:
        print("Not the same!")