def try_yolo_conv_cuda(batch_size=1, dev_id=3, number=10):
    """Build, verify, and time the YOLO conv layer 10 on a CUDA device.

    Lowers the scheduled compute to TIR, runs TVM's GPU-code verifier
    against the device's resource limits, then measures execution time.

    Args:
        batch_size: leading dimension of the input tensor.
        dev_id: CUDA device ordinal to verify against and run on
            (default 3 preserves the original hard-coded choice).
        number: number of timed runs passed to the evaluator.
    """
    # Get the compute definition for the layer.
    yolo_conv = YoloConvLayer10()
    # NOTE(review): "get_intput_shape" is the project class's actual
    # (typo'd) method name — do not "fix" the call without renaming it
    # at the definition site as well.
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)

    # Query the target device's hardware limits so the verifier can
    # check the generated kernel fits within them.
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.tir.ir_pass.VerifyGPUCode(stmt, kwargs)
    print(verify)

    time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, number)
    print("Yolo conv10 use", time_cost, "ms")

def try_yolo_conv_opencl(batch_size=1):
    """Build the YOLO conv layer 10 for OpenCL and dump the kernel source.

    Constructs the layer's compute, applies the OpenCL schedule, compiles
    the module, and prints the generated OpenCL kernel code.

    Args:
        batch_size: leading dimension of the input tensor.
    """
    layer = YoloConvLayer10()
    # NOTE(review): "get_intput_shape" is the project class's actual
    # (typo'd) method name — keep the call as-is.
    shape = (batch_size, *layer.get_intput_shape())
    data = tvm.te.placeholder(shape, dtype="float32")
    kernel = layer.get_weight()
    result = layer(data)

    sch = tvm.te.create_schedule(result.op)
    schedule_yolo_conv_opencl(sch, result, data, kernel)

    args = [data, kernel, result]
    # Lowered TIR kept around for debugging; uncomment to inspect.
    lowered = tvm.lower(sch, args, simple_mode=True)
    # print(lowered)

    module = tvm.build(sch, args, "opencl")
    # Show the OpenCL source of the first imported device module.
    print(module.imported_modules[0].get_source())