def pixelcnn_autotvm(N, H, W, CO, CI, KH, KW, mask_type, bias, stride, padding, dilation):
    """AutoTVM template: build the PixelCNN conv workload and define its tuning space."""
    # assert N == 1, "Only consider batch_size = 1 in this template"
    # data = tvm.te.placeholder((N, CI, H, W), name='data')
    # kernel = tvm.te.placeholder((CO, CI, KH, KW), name='kernel')
    # conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=dilation, out_dtype='float32')
    convop, tensors = pixelcnn(N, H, W, CI, CO, KH, KW, mask_type, bias=bias,
                               stride=stride, padding=padding, dilation=dilation)
    s = tvm.te.create_schedule(convop)
    cfg = autotvm.get_config()

    ##### space definition begin #####
    # define the CUDA scheduling knobs over the last two tensors of the workload
    schedule_direct_cuda(cfg, s, *tensors[-2:])

    return s, [*tensors]
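
# A minimal tuning sketch (not part of the original file): register the template above
# under an assumed task name and tune it with AutoTVM's XGBoost tuner. The workload
# arguments, trial count, and log file name below are placeholders, and the exact
# registration API can differ slightly between TVM releases.
pixelcnn_template = autotvm.template("pixelcnn_autotvm")(pixelcnn_autotvm)

def tune_pixelcnn_cuda(n_trial=200, log_file="pixelcnn_autotvm.log"):
    # example workload: N=1, 32x32 feature map, 64 -> 64 channels, 3x3 mask-B kernel
    task = autotvm.task.create(
        "pixelcnn_autotvm",
        args=(1, 32, 32, 64, 64, 3, 3, "B", False, 1, 1, 1),
        target="cuda",
    )
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, timeout=20),
    )
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(
        n_trial=n_trial,
        measure_option=measure_option,
        callbacks=[autotvm.callback.log_to_file(log_file)],
    )
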
def tvm_PixelCNN_cpu(B, H, W, C, out_C, kernel_height, kernel_width, mask_type,
                     bias, dilation, stride, padding, number=10, dev=0):
    """Build the PixelCNN conv with the default schedule and time it on CPU (ms per run)."""
    Input = torch.rand([B, H, W, C], dtype=torch.float32)
    Kernel = torch.zeros([out_C, C, kernel_height, kernel_width], dtype=torch.float32)
    s, bufs = pixelcnn(B, H, W, C, out_C, kernel_height, kernel_width, mask_type, bias,
                       dilation=dilation, stride=stride, padding=padding)
    ctx = tvm.cpu(dev_id=dev)
    s = tvm.te.create_schedule(s)
    f = tvm.build(s, bufs, "llvm")
    im = tvm.nd.array(Input.numpy().astype(np.float32), ctx)
    fi = tvm.nd.array(Kernel.numpy().astype(np.float32), ctx)
    in_height = H
    in_width = W
    # output spatial size of a dilated, strided, padded convolution
    out_height = (H + 2 * padding - dilation * (kernel_height - 1) - 1) // stride + 1
    out_width = (W + 2 * padding - dilation * (kernel_width - 1) - 1) // stride + 1
    output_shape = (B, out_height, out_width, out_C)
    un = tvm.nd.array(np.zeros(output_shape).astype(np.float32), ctx)
    # time `number` runs and return the average latency in milliseconds
    start_time = time.time()
    for i in range(number):
        f(im, fi, un)
    end_time = time.time()
    return (end_time - start_time) * 1e3 / number
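
# A minimal usage sketch (assumed shapes, not from the original benchmark script):
# time a 3x3 mask-B PixelCNN convolution on a 1x32x32x64 input on CPU and print the
# average latency per run. All argument values here are placeholders.
if __name__ == "__main__":
    avg_ms = tvm_PixelCNN_cpu(
        B=1, H=32, W=32, C=64, out_C=64,
        kernel_height=3, kernel_width=3, mask_type="B",
        bias=False, dilation=1, stride=1, padding=1,
        number=10, dev=0,
    )
    print("PixelCNN conv2d (CPU, llvm): %.3f ms / run" % avg_ms)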