def try_yolo_conv_cuda(batch_size=1, dev_id=3, number=10):
    """Build, GPU-verify, and time the YOLO conv10 layer on a CUDA device.

    Lowers the layer with ``schedule_yolo_conv_cuda``, prints the lowered IR,
    checks it against the device's thread/shared-memory limits with
    ``VerifyGPUCode``, and prints the measured runtime in milliseconds.

    Args:
        batch_size: Leading dimension of the input placeholder.
        dev_id: CUDA device ordinal to verify against and benchmark on
            (default 3, matching the original hard-coded value).
        number: Number of timed runs passed to ``_evaluate`` (default 10).
    """
    # Build the compute graph for the conv layer.
    yolo_conv = YoloConvLayer10()
    # NOTE: "intput" is the (misspelled) project API name — do not "fix" here.
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    # Schedule and lower to TIR for inspection/verification.
    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight)
    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)

    # Query the target device's hardware limits for the GPU code checker.
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2],
    }
    verify = tvm.tir.ir_pass.VerifyGPUCode(stmt, kwargs)
    print(verify)

    # Benchmark the schedule on the device.
    time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, number)
    print("Yolo conv10 use", time_cost, "ms")
def try_yolo_conv_opencl(batch_size=1):
    """Build the YOLO conv10 layer for OpenCL and print the generated kernel.

    Constructs the compute, applies ``schedule_yolo_conv_opencl``, builds with
    the "opencl" target, and prints the OpenCL source of the first imported
    module so the emitted kernel can be inspected by eye.

    Args:
        batch_size: Leading dimension of the input placeholder.
    """
    # Build the compute graph for the conv layer.
    yolo_conv = YoloConvLayer10()
    # NOTE: "intput" is the (misspelled) project API name — do not "fix" here.
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    # Schedule and build for OpenCL.
    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_opencl(s, outputs, inputs, weight)
    arg_bufs = [inputs, weight, outputs]
    # (Removed a dead `stmt = tvm.lower(...)` whose only use was commented out;
    # uncomment below to inspect the lowered TIR.)
    # print(tvm.lower(s, arg_bufs, simple_mode=True))
    func = tvm.build(s, arg_bufs, "opencl")
    print(func.imported_modules[0].get_source())