def find_gpu_target():
    """Return a GPU-enabled variant of the host target, if one is usable.

    Candidate GPU features are chosen from the host operating system and
    tried in order; the first one for which
    ``hl.host_supports_target_device`` reports success is returned.

    Returns:
        The host target with the first supported GPU feature enabled, or
        the unmodified host target (after printing a warning) when none
        of the candidates is supported.
    """
    host = hl.get_host_target()

    # Build the ordered list of GPU APIs worth trying on this OS.
    if host.os == hl.TargetOS.Windows:
        candidates = []
        # Prefer D3D12, but only on 64-bit builds (8-byte pointers):
        # D3D12Compute support is only available on 64-bit systems.
        if struct.calcsize("P") == 8:
            candidates.append(hl.TargetFeature.D3D12Compute)
        # OpenCL is the fallback when D3D12 is unavailable.
        candidates.append(hl.TargetFeature.OpenCL)
    elif host.os == hl.TargetOS.OSX:
        # OS X doesn't update its OpenCL drivers, so they tend to be
        # broken; use Metal instead. CUDA would also be a fine choice on
        # machines with NVidia GPUs.
        candidates = [hl.TargetFeature.Metal]
    else:
        candidates = [hl.TargetFeature.OpenCL]

    # To also try CUDA, append hl.TargetFeature.CUDA to `candidates` here.
    for feature in candidates:
        candidate = host.with_feature(feature)
        if hl.host_supports_target_device(candidate):
            return candidate

    print(
        "Requested GPU(s) are not supported. (Do you have the proper hardware and/or driver installed?)"
    )
    return host
Esempio n. 2
0
def stereoBM(left_image, right_image, width, height, SADWindowSize,
             minDisparity, numDisparities):
    """Build, profile and realize a block-matching stereo disparity pipeline.

    The two input images are wrapped in Funcs, passed through
    ``prefilterXSobel`` and handed to ``findStereoCorrespondence``. The
    resulting Func is dumped as lowered-statement HTML to ``disp.html``,
    passed to ``profile`` and finally realized at the size of
    ``left_image``.

    Args:
        left_image: Left input image, indexed as ``[x, y, c]``.
        right_image: Right input image, indexed as ``[x, y, c]``.
        width: Image width used to derive the rightmost matched column.
        height: Image height used to derive the bottom matched row.
        SADWindowSize: Side length of the matching window.
        minDisparity: Smallest disparity, forwarded to the matcher.
        numDisparities: Number of disparities, forwarded to the matcher.

    Returns:
        The result of ``disp.realize(W, H)``.
    """
    x, y, c = Var("x"), Var("y"), Var("c")
    left, right = Func("left"), Func("right")
    left[x, y, c] = left_image[x, y, c]
    right[x, y, c] = right_image[x, y, c]

    W, H = left_image.width(), left_image.height()

    # Sobel filtering
    filteredLeft = prefilterXSobel(left, W, H)
    filteredRight = prefilterXSobel(right, W, H)

    # Stereo block-matching
    # Floor division keeps the half-window (and every bound derived from
    # it) an integer; plain `/` yields a float on Python 3.
    win2 = SADWindowSize // 2
    maxDisparity = numDisparities - 1
    xmin = maxDisparity + win2
    xmax = width - win2 - 1
    ymin = win2
    ymax = height - win2 - 1
    x_tile_size, y_tile_size = 32, 32
    disp = findStereoCorrespondence(filteredLeft, filteredRight, SADWindowSize,
                                    minDisparity, numDisparities, xmin, xmax,
                                    ymin, ymax, x_tile_size, y_tile_size)

    # Write to pretty html
    args = h.ArgumentsVector()
    disp.compile_to_lowered_stmt("disp.html", args, h.HTML)

    # Compile / Profile
    profile(disp, W, H)

    return disp.realize(W, H)
    def schedule_for_cpu(self):
        """Apply the CPU schedule to the pipeline stages and JIT-compile it.

        Schedules ``self.lut``, ``self.curved``, ``self.sharpen`` and
        ``self.padded`` using the Vars ``c``, ``x``, ``y``, ``yo`` and
        ``yi`` from the enclosing scope, then JIT-compiles ``self.curved``
        for the host target. Returns ``None``.
        """
        # The look-up table is computed once, ahead of everything else.
        self.lut.compute_root()

        # Make the color channel the innermost loop, promise there are
        # exactly three channels, and unroll across them.
        (self.curved
            .reorder(c, x, y)
            .bound(c, 0, 3)
            .unroll(c))

        # Look-up tables don't vectorize well, so curved is instead
        # parallelized over strips of 16 scanlines.
        (self.curved
            .split(y, yo, yi, 16)
            .parallel(yo))

        # sharpen is computed per scanline of curved, so values already
        # computed within the same 16-scanline strip can be reused.
        self.sharpen.compute_at(self.curved, yi)

        # sharpen is 16-bit, hence the 8-wide vectorization.
        self.sharpen.vectorize(x, 8)

        # The padded input is computed at the same granularity as
        # sharpen but stored per strip; the cast to 16-bit stays inlined
        # into sharpen.
        (self.padded
            .store_at(self.curved, yo)
            .compute_at(self.curved, yi))

        # The padding is 8-bit, hence the 16-wide vectorization.
        self.padded.vectorize(x, 16)

        # JIT-compile the pipeline for the CPU.
        host_target = hl.get_host_target()
        self.curved.compile_jit(host_target)
Esempio n. 4
0
def stereoBM(left_image, right_image, width, height,
             SADWindowSize, minDisparity, numDisparities):
    """Build, profile and realize a block-matching stereo disparity pipeline.

    The two input images are wrapped in Funcs, passed through
    ``prefilterXSobel`` and handed to ``findStereoCorrespondence``. The
    resulting Func is dumped as lowered-statement HTML to ``disp.html``,
    passed to ``profile`` and finally realized at the size of
    ``left_image``.

    Args:
        left_image: Left input image, indexed as ``[x, y, c]``.
        right_image: Right input image, indexed as ``[x, y, c]``.
        width: Image width used to derive the rightmost matched column.
        height: Image height used to derive the bottom matched row.
        SADWindowSize: Side length of the matching window.
        minDisparity: Smallest disparity, forwarded to the matcher.
        numDisparities: Number of disparities, forwarded to the matcher.

    Returns:
        The result of ``disp.realize(W, H)``.
    """
    x, y, c = Var("x"), Var("y"), Var("c")
    left, right = Func("left"), Func("right")
    left[x, y, c] = left_image[x, y, c]
    right[x, y, c] = right_image[x, y, c]

    W, H = left_image.width(), left_image.height()

    # Sobel filtering
    filteredLeft = prefilterXSobel(left, W, H)
    filteredRight = prefilterXSobel(right, W, H)

    # Stereo block-matching
    # Floor division keeps the half-window (and every bound derived from
    # it) an integer; plain `/` yields a float on Python 3.
    win2 = SADWindowSize // 2
    maxDisparity = numDisparities - 1
    xmin = maxDisparity + win2
    xmax = width - win2 - 1
    ymin = win2
    ymax = height - win2 - 1
    x_tile_size, y_tile_size = 32, 32
    disp = findStereoCorrespondence(filteredLeft, filteredRight, SADWindowSize,
                                    minDisparity, numDisparities, xmin, xmax,
                                    ymin, ymax, x_tile_size, y_tile_size)

    # Write to pretty html
    args = h.ArgumentsVector()
    disp.compile_to_lowered_stmt("disp.html", args, h.HTML)

    # Compile / Profile
    profile(disp, W, H)

    return disp.realize(W, H)
Esempio n. 5
0
    def schedule_for_gpu(self):
        """Apply the GPU schedule to the pipeline stages and JIT-compile it.

        Schedules ``self.lut``, ``self.curved`` and ``self.padded`` with
        GPU block/thread directives (using the Vars ``i``, ``c``, ``x``,
        ``y``, ``xi`` and ``yi`` from the enclosing scope), then
        JIT-compiles ``self.curved`` for the host target with OpenCL
        enabled. ``self.sharpen`` is given no schedule here, so it stays
        inlined into curved.

        The GPU decision is made per hl.Func: when one hl.Func runs on
        the CPU and the next on the GPU, Halide does the copy-to-GPU
        under the hood. Nothing in this pipeline needs the CPU.
        """
        # As in the CPU schedule, the look-up table is computed once at
        # the start of the pipeline.
        self.lut.compute_root()

        # Compute the LUT on the GPU in 16-wide one-dimensional thread
        # blocks: split the index into blocks of 16, then map the two
        # resulting Vars onto GPU blocks and GPU threads (CUDA's blocks
        # and threads, or OpenCL's thread groups and threads).
        block = hl.Var("block")
        thread = hl.Var("thread")
        self.lut.split(i, block, thread, 16)
        self.lut.gpu_blocks(block).gpu_threads(thread)

        # The split + gpu_blocks + gpu_threads pattern is common enough
        # that hl.Func.gpu_tile exists as a shorthand: like hl.Func.tile,
        # but tile coordinates become GPU blocks and the coordinates
        # within each tile become GPU threads.

        # Color channels innermost; promise there are exactly three and
        # unroll across them.
        self.curved.reorder(c, x, y).bound(c, 0, 3).unroll(c)

        # Compute curved in 2D 8x8 tiles using the GPU (the gpu_tile
        # shorthand described above, applied in two dimensions).
        self.curved.gpu_tile(x, y, xi, yi, 8, 8)

        # Compute the padded input as needed per GPU block, storing the
        # intermediate result in shared memory.
        self.padded.compute_at(self.curved, x)

        # Use the GPU threads for the x and y coordinates of the padded
        # input.
        self.padded.gpu_threads(x, y)

        # CUDA and OpenCL are not enabled by default: start from the
        # host target and switch one of them on before compile_jit.
        # Otherwise the CPU will very slowly pretend it's a GPU, using
        # one thread per output pixel.
        target = hl.get_host_target()

        # OpenCL is selected here because it tends to give better
        # performance than CUDA, even with NVidia's drivers: NVidia's
        # open source LLVM backend doesn't seem to do all the same
        # optimizations their proprietary compiler does. Set the flag to
        # False to try CUDA instead.
        prefer_opencl = True
        if prefer_opencl:
            feature, banner = hl.TargetFeature.OpenCL, "(Using OpenCL)"
        else:
            feature, banner = hl.TargetFeature.CUDA, "(Using CUDA)"
        target.set_feature(feature)
        print(banner)

        # Setting hl.TargetFeature.Debug on the target as well would log
        # every OpenCL/CUDA API call the pipeline makes, which helps to
        # find slow stages or CPU -> GPU copies. It hurts performance,
        # so it is left off.

        self.curved.compile_jit(target)
Esempio n. 6
0
def test_target():
    """Exercise the hl.Target Python bindings.

    Covers construction, string conversion and validation, feature
    manipulation, and the capability queries (vector size, GPU, buffer
    size, device API and type support).
    """
    Feature, OS, Arch = hl.TargetFeature, hl.TargetOS, hl.TargetArch

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty = hl.Target("")
    assert host == from_empty, "Default ctor failure"
    assert host.supported()

    # A default-constructed Target stringifies with every part unknown.
    unknown = hl.Target()
    text = unknown.to_string()
    assert text == "arch_unknown-0-os_unknown"

    # That string must *not* validate: validate_target_string returns
    # false if any of arch-bits-os are undefined. Round-tripping it
    # through Target(text) is deliberately not attempted, since creating
    # a Target with unknown portions assert-fails.
    is_valid = hl.Target.validate_target_string
    assert not is_valid(text)

    # repr() and str()
    assert str(unknown) == "arch_unknown-0-os_unknown"
    assert repr(unknown) == "<halide.Target arch_unknown-0-os_unknown>"

    assert unknown.os == OS.OSUnknown
    assert unknown.arch == Arch.ArchUnknown
    assert unknown.bits == 0

    # Full specification round-trip:
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    text = target.to_string()
    assert text == "x86-32-linux-sse41"
    assert is_valid(text)

    # Full specification (without features) round-trip:
    target = hl.Target(OS.Linux, Arch.X86, 32)
    text = target.to_string()
    assert text == "x86-32-linux"
    assert is_valid(text)

    # Full specification round-trip, crazy features
    crazy_features = [
        Feature.JIT, Feature.SSE41, Feature.AVX, Feature.AVX2, Feature.CUDA,
        Feature.OpenCL, Feature.OpenGL, Feature.OpenGLCompute, Feature.Debug,
    ]
    target = hl.Target(OS.Android, Arch.ARM, 32, crazy_features)
    text = target.to_string()
    assert text == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(text)

    # Expected validation failures.
    rejected = (
        "host-unknowntoken",
        "x86-23",
        # bits == 0 is allowed only if arch_unknown and os_unknown are
        # specified, and no features are set
        "x86-0",
        "0-arch_unknown-os_unknown-sse41",
        # "host" is only supported as the first token
        "opencl-host",
    )
    for bad in rejected:
        assert not is_valid(bad)

    # set_feature
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert target.has_feature(Feature.SSE41)
    assert not target.has_feature(Feature.AVX)
    target.set_feature(Feature.AVX)
    target.set_feature(Feature.SSE41, False)
    assert target.has_feature(Feature.AVX)
    assert not target.has_feature(Feature.SSE41)

    # set_features
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert target.has_feature(Feature.SSE41)
    assert not target.has_feature(Feature.AVX)
    target.set_features([Feature.SSE41], False)
    target.set_features([Feature.AVX, Feature.AVX2], True)
    assert target.has_feature(Feature.AVX)
    assert target.has_feature(Feature.AVX2)
    assert not target.has_feature(Feature.SSE41)

    # with_feature
    base = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    extended = base.with_feature(Feature.NoAsserts)
    extended = extended.with_feature(Feature.NoBoundsQuery)
    assert extended.to_string() == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature; NoBoundsQuery is not set on `base`, so removing it
    # is a no-op.
    base = hl.Target(OS.Linux, Arch.X86, 32,
                     [Feature.SSE41, Feature.NoAsserts])
    reduced = base.without_feature(Feature.NoAsserts)
    reduced = reduced.without_feature(Feature.NoBoundsQuery)
    assert reduced.to_string() == "x86-32-linux-sse41"

    # natural_vector_size: SSE4.1 is 16 bytes wide
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    expected_lanes = (
        (hl.UInt(8), 16),
        (hl.Int(16), 8),
        (hl.UInt(32), 4),
        (hl.Float(32), 4),
    )
    for element_type, lanes in expected_lanes:
        assert target.natural_vector_size(element_type) == lanes

    # has_gpu_feature
    with_gpu = hl.Target(OS.Linux, Arch.X86, 32, [Feature.OpenCL])
    without_gpu = hl.Target(OS.Linux, Arch.X86, 32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = hl.Target(OS.Linux, Arch.X86, 64, [Feature.LargeBuffers])
    small = hl.Target(OS.Linux, Arch.X86, 64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not small.has_large_buffers()
    assert small.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = hl.Target(OS.Linux, Arch.X86, 64, [Feature.CUDA])
    without_cuda = hl.Target(OS.Linux, Arch.X86, 64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not without_cuda.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = hl.Target(OS.OSX, Arch.X86, 64, [Feature.Metal])
    without_metal = hl.Target(OS.OSX, Arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64))
    assert without_metal.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = hl.Target(OS.OSX, Arch.X86, 64, [Feature.Metal])
    without_metal = hl.Target(OS.OSX, Arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not without_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    opencl_feature = hl.target_feature_for_device_api(hl.DeviceAPI.OpenCL)
    assert opencl_feature == Feature.OpenCL

    # A feature list whose items cannot be converted must be rejected.
    try:
        hl.Target(OS.Linux, Arch.X86, 32, ["this is a string"])
    except TypeError as err:
        assert "incompatible constructor arguments" in str(err)
    else:
        assert False, 'Did not see expected exception!'
Esempio n. 7
0
def test_target():
    """Exercise the hl.Target Python bindings.

    Covers construction, string conversion and validation (through the
    module-level ``hl.validate_target_string``), feature manipulation,
    and the capability queries (vector size, GPU, buffer size, device
    API and type support).
    """
    Feature, OS, Arch = hl.TargetFeature, hl.TargetOS, hl.TargetArch

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty = hl.Target("")
    assert host == from_empty, "Default ctor failure"
    assert host.supported()

    # to_string roundtripping of a default-constructed Target.
    unknown = hl.Target()
    text = unknown.to_string()
    assert text == "arch_unknown-0-os_unknown"
    is_valid = hl.validate_target_string
    assert is_valid(text)
    reparsed = hl.Target(text)

    # equality
    assert reparsed == unknown

    # repr() and str()
    assert str(unknown) == "arch_unknown-0-os_unknown"
    assert repr(unknown) == "<halide.Target arch_unknown-0-os_unknown>"

    assert unknown.os == OS.OSUnknown
    assert unknown.arch == Arch.ArchUnknown
    assert unknown.bits == 0

    # Full specification round-trip:
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    text = target.to_string()
    assert text == "x86-32-linux-sse41"
    assert is_valid(text)

    # Full specification (without features) round-trip:
    target = hl.Target(OS.Linux, Arch.X86, 32)
    text = target.to_string()
    assert text == "x86-32-linux"
    assert is_valid(text)

    # Full specification round-trip, crazy features
    crazy_features = [
        Feature.JIT, Feature.SSE41, Feature.AVX, Feature.AVX2, Feature.CUDA,
        Feature.OpenCL, Feature.OpenGL, Feature.OpenGLCompute, Feature.Debug,
    ]
    target = hl.Target(OS.Android, Arch.ARM, 32, crazy_features)
    text = target.to_string()
    assert text == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(text)

    # Expected validation failures.
    rejected = (
        "host-unknowntoken",
        "x86-23",
        # bits == 0 is allowed only if arch_unknown and os_unknown are
        # specified, and no features are set
        "x86-0",
        "0-arch_unknown-os_unknown-sse41",
        # "host" is only supported as the first token
        "opencl-host",
    )
    for bad in rejected:
        assert not is_valid(bad)

    # set_feature
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert target.has_feature(Feature.SSE41)
    assert not target.has_feature(Feature.AVX)
    target.set_feature(Feature.AVX)
    target.set_feature(Feature.SSE41, False)
    assert target.has_feature(Feature.AVX)
    assert not target.has_feature(Feature.SSE41)

    # set_features
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert target.has_feature(Feature.SSE41)
    assert not target.has_feature(Feature.AVX)
    target.set_features([Feature.SSE41], False)
    target.set_features([Feature.AVX, Feature.AVX2], True)
    assert target.has_feature(Feature.AVX)
    assert target.has_feature(Feature.AVX2)
    assert not target.has_feature(Feature.SSE41)

    # with_feature
    base = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    extended = base.with_feature(Feature.NoAsserts)
    extended = extended.with_feature(Feature.NoBoundsQuery)
    assert extended.to_string() == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature; NoBoundsQuery is not set on `base`, so removing it
    # is a no-op.
    base = hl.Target(OS.Linux, Arch.X86, 32,
                     [Feature.SSE41, Feature.NoAsserts])
    reduced = base.without_feature(Feature.NoAsserts)
    reduced = reduced.without_feature(Feature.NoBoundsQuery)
    assert reduced.to_string() == "x86-32-linux-sse41"

    # natural_vector_size: SSE4.1 is 16 bytes wide
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    expected_lanes = (
        (hl.UInt(8), 16),
        (hl.Int(16), 8),
        (hl.UInt(32), 4),
        (hl.Float(32), 4),
    )
    for element_type, lanes in expected_lanes:
        assert target.natural_vector_size(element_type) == lanes

    # has_gpu_feature
    with_gpu = hl.Target(OS.Linux, Arch.X86, 32, [Feature.OpenCL])
    without_gpu = hl.Target(OS.Linux, Arch.X86, 32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = hl.Target(OS.Linux, Arch.X86, 64, [Feature.LargeBuffers])
    small = hl.Target(OS.Linux, Arch.X86, 64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not small.has_large_buffers()
    assert small.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = hl.Target(OS.Linux, Arch.X86, 64, [Feature.CUDA])
    without_cuda = hl.Target(OS.Linux, Arch.X86, 64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not without_cuda.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = hl.Target(OS.OSX, Arch.X86, 64, [Feature.Metal])
    without_metal = hl.Target(OS.OSX, Arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64))
    assert without_metal.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = hl.Target(OS.OSX, Arch.X86, 64, [Feature.Metal])
    without_metal = hl.Target(OS.OSX, Arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not without_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    opencl_feature = hl.target_feature_for_device_api(hl.DeviceAPI.OpenCL)
    assert opencl_feature == Feature.OpenCL

    # A feature list whose items cannot be converted must be rejected
    # with this exact converter error message.
    try:
        hl.Target(OS.Linux, Arch.X86, 32, ["this is a string"])
    except TypeError as err:
        assert str(err) == "No registered converter was able to produce a C++ rvalue of type Halide::Target::Feature from this Python object of type str"
    else:
        assert False, 'Did not see expected exception!'
Esempio n. 8
0
    def schedule_for_gpu(self):
        """Apply the GPU schedule to the pipeline stages and JIT-compile it.

        Schedules ``self.lut``, ``self.curved`` and ``self.padded`` with
        GPU block/thread directives (using the Vars ``i``, ``c``, ``x``,
        ``y``, ``xi`` and ``yi`` from the enclosing scope), then
        JIT-compiles ``self.curved`` for the host target with OpenCL
        enabled. ``self.sharpen`` is given no schedule here, so it stays
        inlined into curved.

        The GPU decision is made per hl.Func: when one hl.Func runs on
        the CPU and the next on the GPU, Halide does the copy-to-GPU
        under the hood. Nothing in this pipeline needs the CPU.
        """
        # As in the CPU schedule, the look-up table is computed once at
        # the start of the pipeline.
        self.lut.compute_root()

        # Compute the LUT on the GPU in 16-wide one-dimensional thread
        # blocks: split the index into blocks of 16, then map the two
        # resulting Vars onto GPU blocks and GPU threads (CUDA's blocks
        # and threads, or OpenCL's thread groups and threads).
        block = hl.Var("block")
        thread = hl.Var("thread")
        self.lut.split(i, block, thread, 16)
        self.lut.gpu_blocks(block).gpu_threads(thread)

        # The split + gpu_blocks + gpu_threads pattern is common enough
        # that hl.Func.gpu_tile exists as a shorthand: like hl.Func.tile,
        # but tile coordinates become GPU blocks and the coordinates
        # within each tile become GPU threads.

        # Color channels innermost; promise there are exactly three and
        # unroll across them.
        self.curved.reorder(c, x, y).bound(c, 0, 3).unroll(c)

        # Compute curved in 2D 8x8 tiles using the GPU (the gpu_tile
        # shorthand described above, applied in two dimensions).
        self.curved.gpu_tile(x, y, xi, yi, 8, 8)

        # Compute the padded input as needed per GPU block, storing the
        # intermediate result in shared memory.
        self.padded.compute_at(self.curved, x)

        # Use the GPU threads for the x and y coordinates of the padded
        # input.
        self.padded.gpu_threads(x, y)

        # CUDA and OpenCL are not enabled by default: start from the
        # host target and switch one of them on before compile_jit.
        # Otherwise the CPU will very slowly pretend it's a GPU, using
        # one thread per output pixel.
        target = hl.get_host_target()

        # OpenCL is selected here because it tends to give better
        # performance than CUDA, even with NVidia's drivers: NVidia's
        # open source LLVM backend doesn't seem to do all the same
        # optimizations their proprietary compiler does. Set the flag to
        # False to try CUDA instead.
        prefer_opencl = True
        if prefer_opencl:
            feature, banner = hl.TargetFeature.OpenCL, "(Using OpenCL)"
        else:
            feature, banner = hl.TargetFeature.CUDA, "(Using CUDA)"
        target.set_feature(feature)
        print(banner)

        # Setting hl.TargetFeature.Debug on the target as well would log
        # every OpenCL/CUDA API call the pipeline makes, which helps to
        # find slow stages or CPU -> GPU copies. It hurts performance,
        # so it is left off.

        self.curved.compile_jit(target)