def find_gpu_target():
    """Return the host target with the first usable GPU feature enabled.

    A per-OS list of candidate GPU features is tried in order. The first
    candidate for which ``hl.host_supports_target_device`` reports support
    is returned. If none is supported, a warning is printed and the plain
    host target is returned instead.
    """
    host = hl.get_host_target()
    host_os = host.os

    if host_os == hl.TargetOS.Windows:
        # Prefer D3D12 and fall back to OpenCL. D3D12Compute is only offered
        # when pointers are 8 bytes wide, i.e. on 64-bit systems.
        pointer_is_64bit = struct.calcsize("P") == 8
        candidates = [hl.TargetFeature.D3D12Compute] if pointer_is_64bit else []
        candidates.append(hl.TargetFeature.OpenCL)
    elif host_os == hl.TargetOS.OSX:
        # OS X doesn't update its OpenCL drivers, so they tend to be broken.
        # CUDA would also be a fine choice on machines with NVidia GPUs.
        candidates = [hl.TargetFeature.Metal]
    else:
        candidates = [hl.TargetFeature.OpenCL]
    # To also try CUDA, append hl.TargetFeature.CUDA to `candidates` here.

    for feature in candidates:
        gpu_target = host.with_feature(feature)
        if hl.host_supports_target_device(gpu_target):
            return gpu_target

    print(
        "Requested GPU(s) are not supported. (Do you have the proper hardware and/or driver installed?)"
    )
    return host
def stereoBM(left_image, right_image, width, height, SADWindowSize, minDisparity, numDisparities):
    """Build, profile and realize a block-matching stereo disparity pipeline.

    Args:
        left_image: Left input image, indexed as ``[x, y, c]``.
        right_image: Right input image, indexed as ``[x, y, c]``.
        width: Image width used to derive the horizontal search bound.
        height: Image height used to derive the vertical search bound.
        SADWindowSize: Side length of the matching window.
        minDisparity: Forwarded to ``findStereoCorrespondence``.
        numDisparities: Number of disparities; the maximum disparity is
            ``numDisparities - 1``.

    Returns:
        The result of realizing the disparity Func over the dimensions
        reported by ``left_image`` (not over ``width``/``height``).

    Side effects:
        Writes the lowered statement to ``disp.html`` and calls ``profile``.
    """
    x, y, c = Var("x"), Var("y"), Var("c")
    left, right = Func("left"), Func("right")
    left[x, y, c] = left_image[x, y, c]
    right[x, y, c] = right_image[x, y, c]

    W, H = left_image.width(), left_image.height()

    # Sobel filtering
    filteredLeft = prefilterXSobel(left, W, H)
    filteredRight = prefilterXSobel(right, W, H)

    # Stereo block-matching.
    # Floor division keeps the half-window (and every bound derived from it)
    # an integer; plain `/` yields a float under Python 3.
    win2 = SADWindowSize // 2
    maxDisparity = numDisparities - 1
    xmin = maxDisparity + win2
    xmax = width - win2 - 1
    ymin = win2
    ymax = height - win2 - 1

    x_tile_size, y_tile_size = 32, 32
    disp = findStereoCorrespondence(filteredLeft, filteredRight, SADWindowSize, minDisparity, numDisparities,
                                    xmin, xmax, ymin, ymax, x_tile_size, y_tile_size)

    # Write to pretty html
    args = h.ArgumentsVector()
    disp.compile_to_lowered_stmt("disp.html", args, h.HTML)

    # Compile / Profile
    profile(disp, W, H)

    disp_image = disp.realize(W, H)
    return disp_image
def schedule_for_cpu(self):
    """Apply the CPU schedule to the pipeline and JIT-compile it for the host.

    Schedules ``self.lut``, ``self.curved``, ``self.sharpen`` and
    ``self.padded`` in place, then compiles ``self.curved`` for the target
    returned by ``hl.get_host_target()``. Returns ``None``.
    """
    lut = self.lut
    curved = self.curved
    sharpen = self.sharpen
    padded = self.padded

    # The look-up table is computed once, ahead of everything else.
    lut.compute_root()

    # Color channels go innermost; promise there are exactly three of them
    # and unroll across them.
    curved.reorder(c, x, y).bound(c, 0, 3).unroll(c)

    # Look-up tables don't vectorize well, so curved is only parallelized,
    # in strips of 16 scanlines.
    curved.split(y, yo, yi, 16).parallel(yo)

    # sharpen is computed per scanline of curved, reusing values already
    # computed within the same 16-scanline strip. It is 16-bit, so it is
    # vectorized 8-wide.
    sharpen.compute_at(curved, yi)
    sharpen.vectorize(x, 8)

    # The padded input is computed at the same granularity as sharpen but
    # stored per strip; the cast to 16-bit stays inlined into sharpen.
    # padded is 8-bit, so it is vectorized 16-wide.
    padded.store_at(curved, yo).compute_at(curved, yi)
    padded.vectorize(x, 16)

    # JIT-compile the pipeline for the CPU.
    host = hl.get_host_target()
    curved.compile_jit(host)
def schedule_for_gpu(self):
    """Apply the GPU schedule to the pipeline and JIT-compile it.

    The look-up table is scheduled in 16-wide one-dimensional thread
    blocks, ``curved`` in 8x8 GPU tiles, and ``padded`` per tile of
    ``curved``; ``sharpen`` is left inlined into ``curved``. The host target
    is then extended with OpenCL (or CUDA when ``use_opencl`` below is set
    to False) and ``self.curved`` is compiled for it.
    """
    # The CPU/GPU decision is made per hl.Func; copies between the two are
    # handled by Halide. Nothing in this pipeline needs to stay on the CPU.
    lut = self.lut
    curved = self.curved
    padded = self.padded

    # As in the CPU schedule, the LUT is computed once at the start.
    lut.compute_root()

    # Split the LUT index into blocks of 16, then map the outer/inner Vars
    # onto GPU blocks and threads (CUDA terms; OpenCL calls them thread
    # groups and threads).
    block_var = hl.Var("block")
    thread_var = hl.Var("thread")
    lut.split(i, block_var, thread_var, 16)
    lut.gpu_blocks(block_var).gpu_threads(thread_var)
    # This pattern is common enough that gpu_tile is a shorthand for it:
    #   lut.gpu_tile(i, ii, 16)

    # Color channels go innermost; promise there are exactly three of them
    # and unroll across them.
    curved.reorder(c, x, y).bound(c, 0, 3).unroll(c)

    # Compute curved in 2D 8x8 tiles on the GPU. Equivalent to:
    #   curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    #         .gpu_blocks(xo, yo)
    #         .gpu_threads(xi, yi)
    curved.gpu_tile(x, y, xi, yi, 8, 8)

    # sharpen stays inlined into curved.

    # The padded input is computed as needed per GPU block of curved, with
    # GPU threads covering its x and y coordinates.
    padded.compute_at(curved, x)
    padded.gpu_threads(x, y)

    # CUDA and OpenCL are not enabled by default: start from the host
    # target and switch one of them on before handing it to compile_jit.
    gpu_target = hl.get_host_target()

    # Set to False to try CUDA instead. OpenCL is the default here because
    # the original author found it tends to perform better, even with
    # NVidia's drivers.
    use_opencl = True
    if use_opencl:
        gpu_feature, banner = hl.TargetFeature.OpenCL, "(Using OpenCL)"
    else:
        gpu_feature, banner = hl.TargetFeature.CUDA, "(Using CUDA)"
    gpu_target.set_feature(gpu_feature)
    print(banner)

    # To log every OpenCL/CUDA API call made by the pipeline (useful for
    # spotting slow stages and CPU -> GPU copies, but bad for performance),
    # also enable the Debug flag:
    #   gpu_target.set_feature(hl.TargetFeature.Debug)

    curved.compile_jit(gpu_target)
def test_target():
    """Exercise hl.Target: construction, string forms, validation,
    feature manipulation and capability queries."""
    Target = hl.Target
    OS, Arch, Feature = hl.TargetOS, hl.TargetArch, hl.TargetFeature
    is_valid = hl.Target.validate_target_string

    def linux_x86(*ctor_args):
        # Linux/x86 target; bits and the optional feature list pass through
        # unchanged so the same constructor overloads are exercised.
        return Target(OS.Linux, Arch.X86, *ctor_args)

    def osx_x86_64(*ctor_args):
        # 64-bit OSX/x86 target; an optional feature list passes through.
        return Target(OS.OSX, Arch.X86, 64, *ctor_args)

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty = Target("")
    assert host == from_empty, "Default ctor failure"
    assert host.supported()

    # String form of a default-constructed target.
    unknown = Target()
    unknown_str = unknown.to_string()
    assert unknown_str == "arch_unknown-0-os_unknown"

    # This must *not* validate: validate_target_string returns false if any
    # of arch-bits-os are undefined. For the same reason no round-trip
    # through Target(unknown_str) is attempted: creating a Target with
    # unknown portions will assert-fail.
    assert not is_valid(unknown_str)

    # repr() and str()
    assert str(unknown) == "arch_unknown-0-os_unknown"
    assert repr(unknown) == "<halide.Target arch_unknown-0-os_unknown>"

    assert unknown.os == OS.OSUnknown
    assert unknown.arch == Arch.ArchUnknown
    assert unknown.bits == 0

    # Full specification round-trip:
    spec = linux_x86(32, [Feature.SSE41]).to_string()
    assert spec == "x86-32-linux-sse41"
    assert is_valid(spec)

    # Full specification (without features) round-trip:
    spec = linux_x86(32).to_string()
    assert spec == "x86-32-linux"
    assert is_valid(spec)

    # Full specification round-trip, crazy features
    crazy = Target(OS.Android, Arch.ARM, 32, [
        Feature.JIT,
        Feature.SSE41,
        Feature.AVX,
        Feature.AVX2,
        Feature.CUDA,
        Feature.OpenCL,
        Feature.OpenGL,
        Feature.OpenGLCompute,
        Feature.Debug,
    ])
    spec = crazy.to_string()
    assert spec == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(spec)

    # Expected failures:
    rejected_specs = (
        "host-unknowntoken",
        "x86-23",
        # bits == 0 is allowed only if arch_unknown and os_unknown are
        # specified, and no features are set
        "x86-0",
        "0-arch_unknown-os_unknown-sse41",
        # "host" is only supported as the first token
        "opencl-host",
    )
    for bad_spec in rejected_specs:
        assert not is_valid(bad_spec)

    # set_feature
    tgt = linux_x86(32, [Feature.SSE41])
    assert tgt.has_feature(Feature.SSE41)
    assert not tgt.has_feature(Feature.AVX)
    tgt.set_feature(Feature.AVX)
    tgt.set_feature(Feature.SSE41, False)
    assert tgt.has_feature(Feature.AVX)
    assert not tgt.has_feature(Feature.SSE41)

    # set_features
    tgt = linux_x86(32, [Feature.SSE41])
    assert tgt.has_feature(Feature.SSE41)
    assert not tgt.has_feature(Feature.AVX)
    tgt.set_features([Feature.SSE41], False)
    tgt.set_features([Feature.AVX, Feature.AVX2], True)
    assert tgt.has_feature(Feature.AVX)
    assert tgt.has_feature(Feature.AVX2)
    assert not tgt.has_feature(Feature.SSE41)

    # with_feature
    base = linux_x86(32, [Feature.SSE41])
    derived = base.with_feature(Feature.NoAsserts)
    derived = derived.with_feature(Feature.NoBoundsQuery)
    spec = derived.to_string()
    assert spec == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature
    base = linux_x86(32, [Feature.SSE41, Feature.NoAsserts])
    # Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
    derived = base.without_feature(Feature.NoAsserts)
    derived = derived.without_feature(Feature.NoBoundsQuery)
    spec = derived.to_string()
    assert spec == "x86-32-linux-sse41"

    # natural_vector_size
    # SSE4.1 is 16 bytes wide
    sse_target = linux_x86(32, [Feature.SSE41])
    vector_cases = (
        (hl.UInt, 8, 16),
        (hl.Int, 16, 8),
        (hl.UInt, 32, 4),
        (hl.Float, 32, 4),
    )
    for make_type, type_bits, expected_lanes in vector_cases:
        assert sse_target.natural_vector_size(make_type(type_bits)) == expected_lanes

    # has_gpu_feature
    with_gpu = linux_x86(32, [Feature.OpenCL])
    without_gpu = linux_x86(32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = linux_x86(64, [Feature.LargeBuffers])
    regular = linux_x86(64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not regular.has_large_buffers()
    assert regular.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = linux_x86(64, [Feature.CUDA])
    without_cuda = linux_x86(64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not without_cuda.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = osx_x86_64([Feature.Metal])
    without_metal = osx_x86_64()
    assert not with_metal.supports_type(hl.Float(64))
    assert without_metal.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = osx_x86_64([Feature.Metal])
    without_metal = osx_x86_64()
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not without_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    opencl_feature = hl.target_feature_for_device_api(hl.DeviceAPI.OpenCL)
    assert opencl_feature == Feature.OpenCL

    # Constructing with a feature list that cannot be converted must raise.
    try:
        linux_x86(32, ["this is a string"])
    except TypeError as err:
        assert "incompatible constructor arguments" in str(err)
    else:
        assert False, 'Did not see expected exception!'
def test_target():
    """Exercise hl.Target: construction, string round-tripping, validation,
    feature manipulation and capability queries."""
    Target = hl.Target
    OS, Arch, Feature = hl.TargetOS, hl.TargetArch, hl.TargetFeature
    is_valid = hl.validate_target_string

    def linux_x86(*ctor_args):
        # Linux/x86 target; bits and the optional feature list pass through
        # unchanged so the same constructor overloads are exercised.
        return Target(OS.Linux, Arch.X86, *ctor_args)

    def osx_x86_64(*ctor_args):
        # 64-bit OSX/x86 target; an optional feature list passes through.
        return Target(OS.OSX, Arch.X86, 64, *ctor_args)

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty = Target("")
    assert host == from_empty, "Default ctor failure"
    assert host.supported()

    # to_string roundtripping of a default-constructed target.
    unknown = Target()
    unknown_str = unknown.to_string()
    assert unknown_str == "arch_unknown-0-os_unknown"
    assert is_valid(unknown_str)
    reparsed = Target(unknown_str)

    # equality
    assert reparsed == unknown

    # repr() and str()
    assert str(unknown) == "arch_unknown-0-os_unknown"
    assert repr(unknown) == "<halide.Target arch_unknown-0-os_unknown>"

    assert unknown.os == OS.OSUnknown
    assert unknown.arch == Arch.ArchUnknown
    assert unknown.bits == 0

    # Full specification round-trip:
    spec = linux_x86(32, [Feature.SSE41]).to_string()
    assert spec == "x86-32-linux-sse41"
    assert is_valid(spec)

    # Full specification (without features) round-trip:
    spec = linux_x86(32).to_string()
    assert spec == "x86-32-linux"
    assert is_valid(spec)

    # Full specification round-trip, crazy features
    crazy = Target(OS.Android, Arch.ARM, 32, [
        Feature.JIT,
        Feature.SSE41,
        Feature.AVX,
        Feature.AVX2,
        Feature.CUDA,
        Feature.OpenCL,
        Feature.OpenGL,
        Feature.OpenGLCompute,
        Feature.Debug,
    ])
    spec = crazy.to_string()
    assert spec == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(spec)

    # Expected failures:
    rejected_specs = (
        "host-unknowntoken",
        "x86-23",
        # bits == 0 is allowed only if arch_unknown and os_unknown are
        # specified, and no features are set
        "x86-0",
        "0-arch_unknown-os_unknown-sse41",
        # "host" is only supported as the first token
        "opencl-host",
    )
    for bad_spec in rejected_specs:
        assert not is_valid(bad_spec)

    # set_feature
    tgt = linux_x86(32, [Feature.SSE41])
    assert tgt.has_feature(Feature.SSE41)
    assert not tgt.has_feature(Feature.AVX)
    tgt.set_feature(Feature.AVX)
    tgt.set_feature(Feature.SSE41, False)
    assert tgt.has_feature(Feature.AVX)
    assert not tgt.has_feature(Feature.SSE41)

    # set_features
    tgt = linux_x86(32, [Feature.SSE41])
    assert tgt.has_feature(Feature.SSE41)
    assert not tgt.has_feature(Feature.AVX)
    tgt.set_features([Feature.SSE41], False)
    tgt.set_features([Feature.AVX, Feature.AVX2], True)
    assert tgt.has_feature(Feature.AVX)
    assert tgt.has_feature(Feature.AVX2)
    assert not tgt.has_feature(Feature.SSE41)

    # with_feature
    base = linux_x86(32, [Feature.SSE41])
    derived = base.with_feature(Feature.NoAsserts)
    derived = derived.with_feature(Feature.NoBoundsQuery)
    spec = derived.to_string()
    assert spec == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature
    base = linux_x86(32, [Feature.SSE41, Feature.NoAsserts])
    # Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
    derived = base.without_feature(Feature.NoAsserts)
    derived = derived.without_feature(Feature.NoBoundsQuery)
    spec = derived.to_string()
    assert spec == "x86-32-linux-sse41"

    # natural_vector_size
    # SSE4.1 is 16 bytes wide
    sse_target = linux_x86(32, [Feature.SSE41])
    vector_cases = (
        (hl.UInt, 8, 16),
        (hl.Int, 16, 8),
        (hl.UInt, 32, 4),
        (hl.Float, 32, 4),
    )
    for make_type, type_bits, expected_lanes in vector_cases:
        assert sse_target.natural_vector_size(make_type(type_bits)) == expected_lanes

    # has_gpu_feature
    with_gpu = linux_x86(32, [Feature.OpenCL])
    without_gpu = linux_x86(32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = linux_x86(64, [Feature.LargeBuffers])
    regular = linux_x86(64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not regular.has_large_buffers()
    assert regular.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = linux_x86(64, [Feature.CUDA])
    without_cuda = linux_x86(64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not without_cuda.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = osx_x86_64([Feature.Metal])
    without_metal = osx_x86_64()
    assert not with_metal.supports_type(hl.Float(64))
    assert without_metal.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = osx_x86_64([Feature.Metal])
    without_metal = osx_x86_64()
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not without_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    opencl_feature = hl.target_feature_for_device_api(hl.DeviceAPI.OpenCL)
    assert opencl_feature == Feature.OpenCL

    # Constructing with a feature list that cannot be converted must raise.
    try:
        linux_x86(32, ["this is a string"])
    except TypeError as err:
        assert str(err) == "No registered converter was able to produce a C++ rvalue of type Halide::Target::Feature from this Python object of type str"
    else:
        assert False, 'Did not see expected exception!'