def test_compiletime_error():
    """Check that realizing into a buffer of the wrong type raises RuntimeError.

    ``f`` is defined with ``hl.u16`` but realized into a ``hl.UInt(8)`` buffer;
    the test passes only when ``f.realize`` raises ``RuntimeError`` and the
    message reports the uint16/uint8 mismatch.
    """
    x, y = hl.Var('x'), hl.Var('y')
    f = hl.Func('f')
    f[x, y] = hl.u16(x + y)

    # Deliberate type-mismatch error: the destination buffer is uint8.
    buf = hl.Buffer(hl.UInt(8), [2, 2])

    caught = None
    try:
        f.realize(buf)
    except RuntimeError as err:
        caught = err

    # No exception at all is a failure in its own right.
    assert caught is not None, 'Did not see expected exception!'
    assert 'Output buffer f has type uint16 but type of the buffer passed in is uint8' in str(caught)
def main():
    """Build the local Laplacian pipeline with its default parameters and
    pass it, together with its image parameter, to ``filter_test_image``."""
    image_param = hl.ImageParam(hl.UInt(16), 3, 'input')

    # Number of intensity levels.
    num_levels = hl.Param(int_t, 'levels', 8)

    # Parameters controlling the filter.
    alpha_param = hl.Param(float_t, 'alpha', 1.0 / 7.0)
    beta_param = hl.Param(float_t, 'beta', 1.0)

    pipeline = get_local_laplacian(image_param, num_levels, alpha_param, beta_param)
    filter_test_image(pipeline, image_param)
def test_runtime_error():
    """Check that a bounds violation detected at realize time raises RuntimeError.

    ``f`` is bounded to ``x`` in [0, 1) but realized into a 10-element buffer;
    the test passes only when ``f.realize`` raises ``RuntimeError`` whose
    message mentions that the bounds do not cover the required region.
    """
    x = hl.Var('x')
    f = hl.Func('f')
    f[x] = hl.u8(x)
    f.bound(x, 0, 1)

    # Deliberate runtime error: the buffer is larger than the declared bound.
    buf = hl.Buffer(hl.UInt(8), [10])

    caught = None
    try:
        f.realize(buf)
    except RuntimeError as err:
        caught = err

    # No exception at all is a failure in its own right.
    assert caught is not None, 'Did not see expected exception!'
    assert 'do not cover required region' in str(caught)
def generate_compiled_file(local_laplacian):
    """Compile ``local_laplacian`` to files named "local_laplacian".

    The pipeline arguments are declared in the order levels, alpha, beta,
    input, and the target comes from ``hl.get_target_from_environment()``.

    To run the result you need to copy the process executable from the C++
    apps/local_laplacian folder (after building it, of course).
    """
    arguments = ArgumentsVector()
    # Each row is (name, flag, type). The flag is True only for the image
    # input -- presumably it marks buffer arguments; confirm against Argument.
    for name, flag, arg_type in (
        ('levels', False, int_t),
        ('alpha', False, float_t),
        ('beta', False, float_t),
        ('input', True, hl.UInt(16)),
    ):
        arguments.append(Argument(name, flag, arg_type))

    target = hl.get_target_from_environment()
    local_laplacian.compile_to_file("local_laplacian", arguments, "local_laplacian", target)
    print("Generated compiled file for local_laplacian function.")
def test_all(vector_width, target):
    """Run every boundary condition through ``realize_and_check``.

    For each (boundary condition, checker) pair the condition is applied five
    ways: to a Func with explicit bounds, to the image with explicit bounds,
    to the image with undefined bounds in the first dimension, to the image
    with undefined bounds in the second dimension, and to the image with no
    bounds argument at all.

    ``vector_width`` and ``target`` are forwarded unchanged to
    ``realize_and_check``.
    """
    W = 32
    H = 32

    # Source image: a ramp over (column, row), wrapped to 8 bits.
    source = hl.Buffer(hl.UInt(8), [W, H])
    for row in range(H):
        for col in range(W):
            source[col, row] = (col + row * W) & 0xff

    source_func = hl.Func()
    source_func[x, y] = source[x, y]

    conditions = [
        (hl.BoundaryConditions.constant_exterior, check_constant_exterior),
        (hl.BoundaryConditions.repeat_edge, check_repeat_edge),
        (hl.BoundaryConditions.repeat_image, check_repeat_image),
        (hl.BoundaryConditions.mirror_image, check_mirror_image),
        (hl.BoundaryConditions.mirror_interior, check_mirror_interior),
    ]

    for bc, checker in conditions:
        # Each case: keyword arguments for the boundary condition, followed by
        # the (min, extent) pairs handed to realize_and_check for the two
        # dimensions. A dimension whose bounds are left undefined is checked
        # over the image's own range instead of the test range.
        cases = [
            ({'f': source_func, 'bounds': [(0, W), (0, H)]},
             test_min, test_extent, test_min, test_extent),
            ({'f': source, 'bounds': [(0, W), (0, H)]},
             test_min, test_extent, test_min, test_extent),
            ({'f': source, 'bounds': [(hl.Expr(), hl.Expr()), (0, H)]},
             0, W, test_min, test_extent),
            ({'f': source, 'bounds': [(0, W), (hl.Expr(), hl.Expr())]},
             test_min, test_extent, 0, H),
            ({'f': source},
             test_min, test_extent, test_min, test_extent),
        ]

        # Only constant_exterior takes the extra 'exterior' value.
        takes_exterior = bc == hl.BoundaryConditions.constant_exterior

        for bc_kwargs, min0, extent0, min1, extent1 in cases:
            if takes_exterior:
                bc_kwargs['exterior'] = test_exterior
            realize_and_check(
                bc(**bc_kwargs),
                checker,
                source,
                min0, extent0,
                min1, extent1,
                vector_width, target)
def test_basics():
    """Smoke-test expression building, Func definition, scheduling and JIT.

    Most statements are bare expressions that are evaluated only to confirm
    they can be built without raising; the "ping" prints mark progress
    through the test.
    """
    image = hl.ImageParam(hl.UInt(16), 2, 'input')
    x, y = hl.Var('x'), hl.Var('y')

    blur_x = hl.Func('blur_x')
    blur_xx = hl.Func('blur_xx')
    blur_y = hl.Func('blur_y')

    # A cast must report the type it was cast to.
    one = hl.cast(hl.Int(32), 1)
    assert one.type() == hl.Int(32)
    print("yy type:", one.type())

    # Index the image with Vars, constants and derived expressions.
    z = x + 1
    image[x, y]
    image[0, 0]
    image[z, y]
    image[x + 1, y]
    print("ping 0.2")
    image[x, y] + image[x + 1, y]

    # Deliberately disabled in the original: arithmetic on blur_x before it
    # has a definition.
    # NOTE(review): the extent of this block was reconstructed from a
    # whitespace-collapsed source -- confirm it covers these four statements.
    if False:
        aa = blur_x[x, y]
        bb = blur_x[x, y + 1]
        aa + bb
        blur_x[x, y] + blur_x[x, y + 1]

    print("ping 0.3")
    (image[x, y] + image[x + 1, y]) / 2
    print("ping 0.4")
    blur_x[x, y]
    print("ping 0.4.1")
    blur_xx[x, y] = image[x, y]
    print("ping 0.5")
    blur_x[x, y] = (image[x, y] + image[x + 1, y] + image[x + 2, y]) / 3
    print("ping 1")
    blur_y[x, y] = (blur_x[x, y] + blur_x[x, y + 1] + blur_x[x, y + 2]) / 3

    # Schedule: tile and vectorize the output, compute blur_x per tile.
    xi, yi = hl.Var('xi'), hl.Var('yi')
    print("ping 2")
    blur_y.tile(x, y, xi, yi, 8, 4).parallel(y).vectorize(xi, 8)
    blur_x.compute_at(blur_y, x).vectorize(x, 8)

    blur_y.compile_jit()
    print("Compiled to jit")
def test_scalar_funcs():
    """Smoke-test zero-dimensional ImageParams and Funcs indexed with ``()``.

    The bare expressions are evaluated only to confirm they can be built;
    the test finishes by JIT-compiling ``g``.
    """
    scalar_in = hl.ImageParam(hl.UInt(16), 0, 'input')
    f = hl.Func('f')
    g = hl.Func('g')

    # References and arithmetic before anything has a definition.
    scalar_in[()]
    (scalar_in[()] + scalar_in[()]) / 2
    f[()]
    g[()]

    # Definitions: f from the input, g from f.
    f[()] = (scalar_in[()] + scalar_in[()] + scalar_in[()]) / 3
    g[()] = (f[()] + f[()] + f[()]) / 3

    g.compile_jit()
def main():
    """Build the local Laplacian pipeline, then either compile it to a file
    or run it on the test image, depending on the local ``generate`` switch."""
    image_param = hl.ImageParam(hl.UInt(16), 3, 'input')

    # Number of intensity levels.
    num_levels = hl.Param(int_t, 'levels', 8)

    # Parameters controlling the filter.
    alpha_param = hl.Param(float_t, 'alpha', 1.0 / 7.0)
    beta_param = hl.Param(float_t, 'beta', 1.0)

    local_laplacian = get_local_laplacian(image_param, num_levels, alpha_param, beta_param)

    # Leave this False to run the jit immediately and get instant
    # gratification; set it to True to produce the compiled file instead.
    generate = False
    if not generate:
        filter_test_image(local_laplacian, image_param)
    else:
        generate_compiled_file(local_laplacian)
def test_looplevel():
    """Check that a LoopLevel passed to ``simplestub.generate`` can be set later.

    An empty ``hl.LoopLevel`` is passed as ``compute_level``, and only after
    the consumer Func exists is it pointed at the consumer's ``x`` loop. The
    result is verified by ``_realize_and_check``.
    """
    x, y = hl.Var('x'), hl.Var('y')
    jit_target = hl.get_jit_target_from_environment()

    # Inputs for the stub: a constant-filled buffer and a simple Func.
    input_buffer = hl.Buffer(hl.UInt(8), [4, 4])
    input_buffer.fill(123)

    input_func = hl.Func("func_input")
    input_func[x, y] = x + y

    # Placeholder loop level; given its real value further down.
    deferred_level = hl.LoopLevel()
    stub_output = simplestub.generate(
        jit_target, input_buffer, input_func, 3.5, compute_level=deferred_level)

    consumer = hl.Func('computed_output')
    consumer[x, y] = stub_output[x, y] + 3

    # The consumer now exists, so the placeholder can be resolved.
    deferred_level.set(hl.LoopLevel(consumer, x))

    _realize_and_check(consumer, 3)
def test_make_interleaved():
    """Check the layout of ``hl.Buffer.make_interleaved`` and of its NumPy view.

    For a width/height/channels buffer every dimension must start at 0, and
    the strides must be ``channels``, ``width * channels`` and 1 respectively.
    The NumPy array built from the buffer must report the matching shape,
    strides and dtype.
    """
    w, h, c = 7, 13, 3

    b = hl.Buffer.make_interleaved(type=hl.UInt(8), width=w, height=h, channels=c)

    # (extent, stride) expected for dimensions 0, 1 and 2.
    expected_layout = ((w, c), (h, w * c), (c, 1))
    for axis, (extent, stride) in enumerate(expected_layout):
        dim = b.dim(axis)
        assert dim.min() == 0
        assert dim.extent() == extent
        assert dim.stride() == stride

    a = np.array(b, copy=False)
    assert a.shape == (w, h, c)
    assert a.strides == (c, w * c, 1)
    assert a.dtype == np.uint8
def test_int_promotion():
    """Verify that (Exprlike op literal) keeps the Exprlike's type.

    The literal must be matched to the type of the Exprlike rather than the
    result being promoted to int32. All types that use
    add_binary_operators() should be tested here. All the binary ops are
    handled the same, so + is good enough.
    """
    x = hl.Var('x')

    def check_both_orders(make_operand):
        # The operand is produced afresh for each side so a FuncRef is
        # rebuilt exactly as a literal `f[x]` expression would rebuild it.
        _check_is_u16(make_operand() + 2)
        _check_is_u16(2 + make_operand())

    # Exprlike = FuncRef
    f = hl.Func('f')
    f[x] = hl.u16(x)
    check_both_orders(lambda: f[x])

    # Exprlike = Expr
    e = hl.Expr(f[x])
    check_both_orders(lambda: e)

    # Exprlike = Param
    p = hl.Param(hl.UInt(16))
    check_both_orders(lambda: p)
def addone():
    """Add one to a 4x4 array of ones with a Halide Func and print the result.

    Three ways of obtaining the output are kept side by side; only the last
    one, which realizes into a buffer wrapping a NumPy array, is enabled.
    """
    # Feed input.
    input_data = np.ones((4, 4), dtype=np.uint8)
    source = hl.Buffer(input_data)

    i, j = hl.Var("i"), hl.Var("j")
    plus_one = hl.Func("B")
    plus_one[i, j] = source[i, j] + 1

    # Output, variant 1 (disabled): let realize() allocate the result.
    if False:
        output = plus_one.realize(4, 4)
        print("out: \n", np.asanyarray(output))

    # Output, variant 2 (disabled): realize into a freshly made hl.Buffer.
    if False:
        output = hl.Buffer(hl.UInt(8), [4, 4])
        plus_one.realize(output)
        print("out: \n", np.asanyarray(output))

    # Output, variant 3 (enabled): realize into a buffer wrapping a
    # Fortran-ordered NumPy array, then print that array.
    if True:
        output_data = np.empty(input_data.shape, dtype=input_data.dtype, order="F")
        output = hl.Buffer(output_data)
        plus_one.realize(output)
        print("out: \n", output_data)
def get_blur(input):
    """Build and schedule a two-pass, three-tap blur over a 2-D image parameter.

    The input is wrapped with ``hl.BoundaryConditions.repeat_edge`` and
    widened with ``hl.u16``. It is then averaged over three horizontally
    adjacent samples (``blur_x``) and three vertically adjacent ``blur_x``
    samples (``blur_y``), and the final value is cast to ``hl.UInt(8)``.

    Args:
        input: an ``hl.ImageParam`` (or a subclass of it) with two dimensions.

    Returns:
        The scheduled ``blur_y`` Func.

    Raises:
        AssertionError: if ``input`` is not an ImageParam or is not 2-D.
    """
    # isinstance() instead of an exact type() comparison, so subclasses of
    # hl.ImageParam are accepted as well.
    assert isinstance(input, hl.ImageParam)
    assert input.dimensions() == 2

    x, y = hl.Var("x"), hl.Var("y")

    # Boundary condition first, so the +1/+2 taps below never read outside
    # the image.
    clamped_input = hl.BoundaryConditions.repeat_edge(input)

    input_uint16 = hl.Func("input_uint16")
    input_uint16[x, y] = hl.u16(clamped_input[x, y])

    blur_x = hl.Func("blur_x")
    blur_y = hl.Func("blur_y")

    blur_x[x, y] = (input_uint16[x, y] + input_uint16[x + 1, y] + input_uint16[x + 2, y]) / 3
    blur_y[x, y] = hl.cast(hl.UInt(8), (blur_x[x, y] + blur_x[x, y + 1] + blur_x[x, y + 2]) / 3)

    # Schedule: tile and vectorize the output, and compute blur_x inside each
    # tile's x loop.
    xi, yi = hl.Var("xi"), hl.Var("yi")
    blur_y.tile(x, y, xi, yi, 8, 4).parallel(y).vectorize(xi, 8)
    blur_x.compute_at(blur_y, x).vectorize(x, 8)

    return blur_y
def yuv_to_rgb(input):
    """Return a Func converting a 3-channel YUV ``input`` to RGB.

    Channels 0, 1 and 2 of ``input`` are read as Y, U and V. The output is
    first defined as zero (uint16) everywhere, then channels 0, 1 and 2 are
    overwritten with the saturating-cast R, G and B expressions. The pure
    definition and all three updates are parallelized over y and vectorized
    over x by 16.
    """
    print(' yuv_to_rgb')

    output = hl.Func("yuv_to_rgb_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    luma = input[x, y, 0]
    chroma_u = input[x, y, 1]
    chroma_v = input[x, y, 2]

    # Pure definition, followed by one update per output channel.
    output[x, y, c] = hl.cast(hl.UInt(16), 0)
    output[x, y, 0] = hl.u16_sat(luma + 1.403 * chroma_v)
    output[x, y, 1] = hl.u16_sat(luma - 0.344 * chroma_u - 0.714 * chroma_v)
    output[x, y, 2] = hl.u16_sat(luma + 1.77 * chroma_u)

    # Schedule the pure stage, then give every update stage the same schedule.
    output.compute_root().parallel(y).vectorize(x, 16)
    for stage in range(3):
        output.update(stage).parallel(y).vectorize(x, 16)

    return output
def main():
    """Tutorial: a multi-stage blur pipeline and two ways to handle boundaries.

    The same horizontal-then-vertical blur is built twice. The first run
    realizes it over a region shrunk by one pixel on every side and saves
    "blurry_parrot_1.png". The second run wraps the input in a clamp-to-edge
    Func, realizes over the full image and saves "blurry_parrot_2.png".

    Returns:
        0 on completion.
    """
    # First we'll declare some Vars to use below.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/rgb.png")

    def build_blur(source):
        """Build the four-stage blur over ``source`` and return its last Func.

        Each hl.Func here refers to the previous one with ordinary indexing
        syntax. A hl.Func may only refer to hl.Funcs that already have a
        definition; that restriction rules out pipelines with loops in them,
        so Halide pipelines are always feed-forward graphs of Funcs.
        """
        # Upgrade to 16-bit, so we can do math without it overflowing.
        widened = hl.Func("input_16")
        widened[x, y, c] = hl.cast(hl.UInt(16), source[x, y, c])

        # Blur it horizontally.
        horizontal = hl.Func("blur_x")
        horizontal[x, y, c] = (widened[x - 1, y, c] + 2 * widened[x, y, c] +
                               widened[x + 1, y, c]) / 4

        # Blur it vertically.
        vertical = hl.Func("blur_y")
        vertical[x, y, c] = (horizontal[x, y - 1, c] + 2 * horizontal[x, y, c] +
                             horizontal[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        narrowed = hl.Func("output")
        narrowed[x, y, c] = hl.cast(hl.UInt(8), vertical[x, y, c])
        return narrowed

    # ---- Part 1: realize over a domain that stays inside the input ----

    # Take a color 8-bit input and run the blur straight over it.
    image = hl.Buffer(imageio.imread(image_path))
    assert image.type() == hl.UInt(8)
    pipeline = build_blur(image)

    # Realizing over the input's own domain would NOT work:
    #
    #   result = pipeline.realize(image.width(), image.height(), 3)
    #
    # The horizontal pass reaches one pixel outwards in x and the vertical
    # pass one pixel outwards in y, so that request needs pixels outside the
    # input. Halide detects this by injecting code at the top of the pipeline
    # that computes the region of the input that will be read; when the
    # pipeline starts it runs that code first, sees the out-of-bounds read and
    # refuses to continue. No bounds checks happen in the inner loop, where
    # they would be slow.
    #
    # One option is to realize over a domain shifted inwards by one pixel, so
    # the routine is never asked to read out of bounds:
    shrunk = hl.Buffer(hl.UInt(8), [image.width() - 2, image.height() - 2, 3])
    shrunk.set_min([1, 1])
    pipeline.realize(shrunk)

    # Save the result. It should look like a slightly blurry parrot, two
    # pixels narrower and two pixels shorter than the input image.
    imageio.imsave("blurry_parrot_1.png", shrunk)
    print("Created blurry_parrot_1.png")

    # This is usually the fastest way to deal with boundaries: don't write
    # code that reads out of bounds :) The more general solution comes next.

    # ---- Part 2: the same pipeline behind a boundary condition ----

    # Take a color 8-bit input again.
    image = hl.Buffer(imageio.imread(image_path))
    assert image.type() == hl.UInt(8)

    # This time the input is wrapped in a hl.Func that prevents reading out
    # of bounds.
    clamped = hl.Func("clamped")

    # Clamp x to the range [0, image.width() - 1], and y likewise.
    clamped_x = hl.clamp(x, 0, image.width() - 1)
    clamped_y = hl.clamp(y, 0, image.height() - 1)

    # Load from the input at the clamped coordinates. However 'clamped' is
    # evaluated, it never reads outside the input. This clamp-to-edge
    # boundary condition is the simplest one to express in Halide.
    clamped[x, y, c] = image[clamped_x, clamped_y, c]

    # The same thing can be written more concisely with a helper from the
    # BoundaryConditions namespace:
    #
    #   clamped = hl.BoundaryConditions.repeat_edge(image)
    #
    # Those helpers matter for other boundary conditions because they are
    # expressed in the way Halide can best understand and optimize.

    # The rest of the pipeline is the same, but reads from 'clamped' rather
    # than from the input image directly.
    pipeline = build_blur(clamped)

    # With a boundary condition in place it is safe to evaluate the output
    # over the same domain as the input.
    full_size = pipeline.realize(image.width(), image.height(), 3)

    # Save the result. It should look like a slightly blurry parrot, but this
    # time it will be the same size as the input.
    imageio.imsave("blurry_parrot_2.png", full_size)
    print("Created blurry_parrot_2.png")

    print("Success!")
    return 0
def _read_object_header(path, num_bytes):
    """Return up to ``num_bytes`` leading bytes of the file at ``path``.

    Returns None when the file cannot be opened or read (``OSError``), so the
    caller can report that the object file was not generated. The result may
    be shorter than ``num_bytes`` if the file itself is shorter.
    """
    try:
        # The context manager closes the file even if read() fails.
        with open(path, "rb") as object_file:
            return object_file.read(num_bytes)
    except OSError:
        return None


def main():
    """Tutorial: cross-compile one pipeline for several targets and check the output.

    The "brighter" pipeline is compiled for the host, for 32-bit ARM Android,
    for 64-bit x86 Windows and for 32-bit ARM iOS. The first bytes of each
    cross-compiled object file are then compared with the magic numbers
    expected for that format.

    Returns:
        0 on success, -1 if an expected object file could not be read.

    Raises:
        Exception: if an object file's header does not match the expected
            magic bytes.
    """
    # We'll define the simple one-stage pipeline that we used in lesson 10.
    brighter = hl.Func("brighter")
    x, y = hl.Var("x"), hl.Var("y")

    # Declare the arguments.
    offset = hl.Param(hl.UInt(8))
    input_param = hl.ImageParam(hl.UInt(8), 2)
    args = [input_param, offset]

    # Define the hl.Func.
    brighter[x, y] = input_param[x, y] + offset

    # Schedule it.
    brighter.vectorize(x, 16).parallel(y)

    # The following line is what we did in lesson 10. It compiles an
    # object file suitable for the system that you're running this
    # program on. For example, if you compile and run this file on
    # 64-bit linux on an x86 cpu with sse4.1, then the generated code
    # will be suitable for 64-bit linux on x86 with sse4.1.
    brighter.compile_to_file("lesson_11_host", args, "lesson_11_host")

    # We can also compile object files suitable for other cpus and
    # operating systems. You do this by passing a target as an extra,
    # final argument to compile_to_file.
    create_android = True
    create_windows = True
    create_ios = True

    if create_android:
        # Let's use this to compile a 32-bit arm android version of this code:
        target = hl.Target()
        target.os = hl.TargetOS.Android  # The operating system
        target.arch = hl.TargetArch.ARM  # The CPU architecture
        target.bits = 32  # The bit-width of the architecture
        arm_features = []  # A list of features to set
        target.set_features(arm_features)
        # Pass the target as the last argument.
        brighter.compile_to_file("lesson_11_arm_32_android", args,
                                 "lesson_11_arm_32_android", target)

    if create_windows:
        # And now a Windows object file for 64-bit x86 with AVX and SSE 4.1:
        target = hl.Target()
        target.os = hl.TargetOS.Windows
        target.arch = hl.TargetArch.X86
        target.bits = 64
        target.set_features([hl.TargetFeature.AVX, hl.TargetFeature.SSE41])
        brighter.compile_to_file("lesson_11_x86_64_windows", args,
                                 "lesson_11_x86_64_windows", target)

    if create_ios:
        # And finally an iOS mach-o object file for one of Apple's 32-bit
        # ARM processors - the A6. It's used in the iPhone 5. The A6 uses
        # a slightly modified ARM architecture called ARMv7s. We specify
        # this using the target features field. Support for Apple's
        # 64-bit ARM processors is very new in llvm, and still somewhat
        # flaky.
        target = hl.Target()
        target.os = hl.TargetOS.IOS
        target.arch = hl.TargetArch.ARM
        target.bits = 32
        target.set_features([hl.TargetFeature.ARMv7s])
        brighter.compile_to_file("lesson_11_arm_32_ios", args,
                                 "lesson_11_arm_32_ios", target)

    # Now let's check these files are what they claim, by examining
    # their first few bytes.

    if create_android:
        # 32-arm android object files start with the magic bytes:
        # uint8_t []
        arm_32_android_magic = [
            0x7f, ord('E'), ord('L'), ord('F'),  # ELF format
            1,  # 32-bit
            1,  # 2's complement little-endian
            1,  # Current version of elf
        ]

        header_bytes = _read_object_header("lesson_11_arm_32_android.o",
                                           len(arm_32_android_magic))
        if header_bytes is None:
            print("Android object file not generated")
            return -1

        # Unpack however many bytes were actually read; a short file then
        # simply fails the comparison below instead of crashing in unpack().
        header = list(unpack("B" * len(header_bytes), header_bytes))
        if header != arm_32_android_magic:
            # Show which positions matched before failing.
            print([got == want for got, want in zip(header, arm_32_android_magic)])
            raise Exception(
                "Unexpected header bytes in 32-bit arm object file.")

    if create_windows:
        # 64-bit windows object files start with the magic 16-bit value 0x8664
        # (presumably referring to x86-64)
        # uint8_t []
        win_64_magic = [0x64, 0x86]

        header_bytes = _read_object_header("lesson_11_x86_64_windows.obj",
                                           len(win_64_magic))
        if header_bytes is None:
            print("Windows object file not generated")
            return -1

        header = list(unpack("B" * len(header_bytes), header_bytes))
        if header != win_64_magic:
            raise Exception(
                "Unexpected header bytes in 64-bit windows object file.")

    if create_ios:
        # 32-bit arm iOS mach-o files start with the following magic bytes:
        # uint32_t []
        arm_32_ios_magic = [
            0xfeedface,  # Mach-o magic bytes
            12,  # CPU type is ARM
            11,  # CPU subtype is ARMv7s
            1,  # It's a relocatable object file.
        ]
        ios_header_size = 4 * len(arm_32_ios_magic)

        header_bytes = _read_object_header("lesson_11_arm_32_ios.o", ios_header_size)
        if header_bytes is None:
            print("ios object file not generated")
            return -1

        # "<4I" reads four little-endian 32-bit words regardless of the host's
        # byte order; the object file is for a little-endian ARM target. A
        # short read is reported as a header mismatch rather than a
        # struct.error.
        header = None
        if len(header_bytes) == ios_header_size:
            header = list(unpack("<4I", header_bytes))
        if header != arm_32_ios_magic:
            raise Exception(
                "Unexpected header bytes in 32-bit arm ios object file.")

    # It looks like the object files we produced are plausible for
    # those targets. We'll count that as a success for the purposes
    # of this tutorial. For a real application you'd then need to
    # figure out how to integrate Halide into your cross-compilation
    # toolchain. There are several small examples of this in the
    # Halide repository under the apps folder. See HelloAndroid and
    # HelloiOS here:
    # https://github.com/halide/Halide/tree/master/apps/
    print("Success!")
    return 0
def main(): # Declare some Vars to use below. x, y = hl.Var("x"), hl.Var("y") # Load a grayscale image to use as an input. image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png") input_data = imageio.imread(image_path) if True: # making the image smaller to go faster input_data = input_data[:160, :150] assert input_data.dtype == np.uint8 input = hl.Buffer(input_data) # You can define a hl.Func in multiple passes. Let's see a toy # example first. if True: # The first definition must be one like we have seen already # - a mapping from Vars to an hl.Expr: f = hl.Func("f") f[x, y] = x + y # We call this first definition the "pure" definition. # But the later definitions can include computed expressions on # both sides. The simplest example is modifying a single point: f[3, 7] = 42 # We call these extra definitions "update" definitions, or # "reduction" definitions. A reduction definition is an # update definition that recursively refers back to the # function's current value at the same site: if False: e = f[x, y] + 17 print("f[x, y] + 17", e) print("(f[x, y] + 17).type()", e.type()) print("(f[x, y]).type()", f[x, y].type()) f[x, y] = f[x, y] + 17 # If we confine our update to a single row, we can # recursively refer to values in the same column: f[x, 3] = f[x, 0] * f[x, 10] # Similarly, if we confine our update to a single column, we # can recursively refer to other values in the same row. f[0, y] = f[0, y] / f[3, y] # The general rule is: Each hl.Var used in an update definition # must appear unadorned in the same position as in the pure # definition in all references to the function on the left- # and right-hand sides. So the following definitions are # legal updates: # x is used, so all uses of f must have x as the first argument. f[x, 17] = x + 8 # y is used, so all uses of f must have y as the second argument. 
f[0, y] = y * 8 f[x, x + 1] = x + 8 f[y / 2, y] = f[0, y] * 17 # But these ones would cause an error: # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'. # f[y, y + 1) = y + 8 <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'. # f[y, x) = y - x <- Arguments to f on the left-hand-side are in the wrong places. # f[3, 4) = x + y <- Free variables appear on the right-hand-side # but not the left-hand-side. # We'll realize this one just to make sure it compiles. The # second-to-last definition forces us to realize over a # domain that is taller than it is wide. f.realize(100, 101) # For each realization of f, each step runs in its entirety # before the next one begins. Let's trace the loads and # stores for a simpler example: g = hl.Func("g") g[x, y] = x + y # Pure definition g[2, 1] = 42 # First update definition g[x, 0] = g[x, 1] # Second update definition g.trace_loads() g.trace_stores() g.realize(4, 4) # Reading the log, we see that each pass is applied in turn. The # equivalent Python is: result = np.empty((4, 4), dtype=np.int) # Pure definition for yy in range(4): for xx in range(4): result[yy][xx] = xx + yy # First update definition result[1][2] = 42 # Second update definition for xx in range(4): result[0][xx] = result[1][xx] # end of section # Putting update passes inside loops. if True: # Starting with this pure definition: f = hl.Func("f") f[x, y] = x + y # Say we want an update that squares the first fifty rows. We # could do this by adding 50 update definitions: # f[x, 0) = f[x, 0) * f[x, 0) # f[x, 1) = f[x, 1) * f[x, 1) # f[x, 2) = f[x, 2) * f[x, 2) # ... # f[x, 49) = f[x, 49) * f[x, 49) # Or equivalently using a compile-time loop in our C++: # for (int i = 0 i < 50 i++) { # f[x, i) = f[x, i) * f[x, i) # # But it's more manageable and more flexible to put the loop # in the generated code. 
We do this by defining a "reduction # domain" and using it inside an update definition: r = hl.RDom([(0, 50)]) f[x, r] = f[x, r] * f[x, r] halide_result = f.realize(100, 100) # The equivalent Python is: py_result = np.empty((100, 100), dtype=np.int) for yy in range(100): for xx in range(100): py_result[yy][xx] = xx + yy for xx in range(100): for rr in range(50): # The loop over the reduction domain occurs inside of # the loop over any pure variables used in the update # step: py_result[rr][xx] = py_result[rr][xx] * py_result[rr][xx] # Check the results match: for yy in range(100): for xx in range(100): assert halide_result[xx, yy] == py_result[yy][xx], \ "halide_result(%d, %d) = %d instead of %d" % ( xx, yy, halide_result[xx, yy], py_result[yy][xx]) # Now we'll examine a real-world use for an update definition: # computing a histogram. if True: # Some operations on images can't be cleanly expressed as a pure # function from the output coordinates to the value stored # there. The classic example is computing a histogram. The # natural way to do it is to iterate over the input image, # updating histogram buckets. Here's how you do that in Halide: histogram = hl.Func("histogram") # Histogram buckets start as zero. histogram[x] = 0 # Define a multi-dimensional reduction domain over the input image: r = hl.RDom([(0, input.width()), (0, input.height())]) # For every point in the reduction domain, increment the # histogram bucket corresponding to the intensity of the # input image at that point. 
histogram[input[r.x, r.y]] += 1 halide_result = histogram.realize(256) # The equivalent Python is: py_result = np.empty((256), dtype=np.int) for xx in range(256): py_result[xx] = 0 for r_y in range(input.height()): for r_x in range(input.width()): py_result[input_data[r_x, r_y]] += 1 # Check the answers agree: for xx in range(256): assert py_result[xx] == halide_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) # Scheduling update steps if True: # The pure variables in an update step and can be # parallelized, vectorized, split, etc as usual. # Vectorizing, splitting, or parallelize the variables that # are part of the reduction domain is trickier. We'll cover # that in a later lesson. # Consider the definition: f = hl.Func("x") f[x, y] = x * y # Set the second row to equal the first row. f[x, 1] = f[x, 0] # Set the second column to equal the first column plus 2. f[1, y] = f[0, y] + 2 # The pure variables in each stage can be scheduled # independently. To control the pure definition, we schedule # as we have done in the past. The following code vectorizes # and parallelizes the pure definition only. f.vectorize(x, 4).parallel(y) # We use hl.Func::update(int) to get a handle to an update step # for the purposes of scheduling. The following line # vectorizes the first update step across x. We can't do # anything with y for this update step, because it doesn't # use y. f.update(0).vectorize(x, 4) # Now we parallelize the second update step in chunks of size # 4. yo, yi = hl.Var("yo"), hl.Var("yi") f.update(1).split(y, yo, yi, 4).parallel(yo) halide_result = f.realize(16, 16) # Here's the equivalent (serial) C: py_result = np.empty((16, 16), dtype=np.int) # Pure step. Vectorized in x and parallelized in y. 
for yy in range(16): # Should be a parallel for loop for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] py_result[yy][xx[0]] = xx[0] * yy py_result[yy][xx[1]] = xx[1] * yy py_result[yy][xx[2]] = xx[2] * yy py_result[yy][xx[3]] = xx[3] * yy # First update. Vectorized in x. for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] py_result[1][xx[0]] = py_result[0][xx[0]] py_result[1][xx[1]] = py_result[0][xx[1]] py_result[1][xx[2]] = py_result[0][xx[2]] py_result[1][xx[3]] = py_result[0][xx[3]] # Second update. Parallelized in chunks of size 4 in y. for yo in range(4): # Should be a parallel for loop for yi in range(4): yy = yo * 4 + yi py_result[yy][1] = py_result[yy][0] + 2 # Check the C and Halide results match: for yy in range(16): for xx in range(16): assert halide_result[xx, yy] == py_result[yy][xx], \ "halide_result(%d, %d) = %d instead of %d" % ( xx, yy, halide_result[xx, yy], py_result[yy][xx]) # That covers how to schedule the variables within a hl.Func that # uses update steps, but what about producer-consumer # relationships that involve compute_at and store_at? Let's # examine a reduction as a producer, in a producer-consumer pair. if True: # Because an update does multiple passes over a stored array, # it's not meaningful to inline them. So the default schedule # for them does the closest thing possible. It computes them # in the innermost loop of their consumer. 
Consider this # trivial example: producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 producer[x] += 1 consumer[x] = 2 * producer[x] halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) for xx in range(10): producer_storage = np.empty((1), dtype=np.int) # Pure step for producer producer_storage[0] = xx * 17 # Update step for producer producer_storage[0] = producer_storage[0] + 1 # Pure step for consumer py_result[xx] = 2 * producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) # For all other compute_at/store_at options, the reduction # gets placed where you would expect, somewhere in the loop # nest of the consumer. # Now let's consider a reduction as a consumer in a # producer-consumer pair. This is a little more involved. if True: if True: # Case 1: The consumer references the producer in the pure step # only. producer, consumer = hl.Func("producer"), hl.Func("consumer") # The producer is pure. producer[x] = x * 17 consumer[x] = 2 * producer[x] consumer[x] += 1 # The valid schedules for the producer in this case are # the default schedule - inlined, and also: # # 1) producer.compute_at(x), which places the computation of # the producer inside the loop over x in the pure step of the # consumer. # # 2) producer.compute_root(), which computes all of the # producer ahead of time. # # 3) producer.store_root().compute_at(x), which allocates # space for the consumer outside the loop over x, but fills # it in as needed inside the loop. # # Let's use option 1. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] = 2 * producer_storage[0] # Update step for the consumer for xx in range(10): py_result[xx] += 1 # All of the pure step is evaluated before any of the # update step, so there are two separate loops over x. # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) if True: # Case 2: The consumer references the producer in the update step # only producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside the update # step of the producer, because that's the only step that # uses the producer. producer.compute_at(consumer, x) # Note however, that we didn't say: # # producer.compute_at(consumer.update(0), x). # # Scheduling is done with respect to Vars of a hl.Func, and # the Vars of a hl.Func are shared across the pure and # update steps. 
halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): py_result[xx] = xx # Update step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] += producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) if True: # Case 3: The consumer references the producer in # multiple steps that share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = producer[x] * x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside both the # pure and the update step of the producer. So there ends # up being two separate realizations of the producer, and # redundant work occurs. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] = producer_storage[0] * xx # Update step for the consumer for xx in range(10): # Another copy of the pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] += producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) if True: # Case 4: The consumer references the producer in # multiple steps that do not share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x, y] = x * y consumer[x, y] = x + y consumer[x, 0] = producer[x, x - 1] consumer[0, y] = producer[y, y - 1] # In this case neither producer.compute_at(consumer, x) # nor producer.compute_at(consumer, y) will work, because # either one fails to cover one of the uses of the # producer. So we'd have to inline producer, or use # producer.compute_root(). # Let's say we really really want producer to be # compute_at the inner loops of both consumer update # steps. Halide doesn't allow multiple different # schedules for a single hl.Func, but we can work around it # by making two wrappers around producer, and scheduling # those instead: # Attempt 2: producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(), hl.Func(), hl.Func() producer_wrapper_1[x, y] = producer[x, y] producer_wrapper_2[x, y] = producer[x, y] consumer_2[x, y] = x + y consumer_2[x, 0] += producer_wrapper_1[x, x - 1] consumer_2[0, y] += producer_wrapper_2[y, y - 1] # The wrapper functions give us two separate handles on # the producer, so we can schedule them differently. 
producer_wrapper_1.compute_at(consumer_2, x) producer_wrapper_2.compute_at(consumer_2, y) halide_result = consumer_2.realize(10, 10) # The equivalent Python is: py_result = np.empty((10, 10), dtype=np.int) # Pure step for the consumer for yy in range(10): for xx in range(10): py_result[yy][xx] = xx + yy # First update step for consumer for xx in range(10): producer_wrapper_1_storage = np.empty((1), dtype=np.int) producer_wrapper_1_storage[0] = xx * (xx - 1) py_result[0][xx] += producer_wrapper_1_storage[0] # Second update step for consumer for yy in range(10): producer_wrapper_2_storage = np.empty((1), dtype=np.int) producer_wrapper_2_storage[0] = yy * (yy - 1) py_result[yy][0] += producer_wrapper_2_storage[0] # Check the results match for yy in range(10): for xx in range(10): assert halide_result[xx, yy] == py_result[yy][xx], \ "halide_result(%d, %d) = %d instead of %d" % ( xx, yy, halide_result[xx, yy], py_result[yy][xx]) if True: # Case 5: Scheduling a producer under a reduction domain # variable of the consumer. # We are not just restricted to scheduling producers at # the loops over the pure variables of the consumer. If a # producer is only used within a loop over a reduction # domain (hl.RDom) variable, we can also schedule the # producer there. producer, consumer = hl.Func("producer"), hl.Func("consumer") r = hl.RDom([(0, 5)]) producer[x] = x * 17 consumer[x] = x + 10 consumer[x] += r + producer[x + r] producer.compute_at(consumer, r) halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer. for xx in range(10): py_result[xx] = xx + 10 # Update step for the consumer. for xx in range(10): # The loop over the reduction domain is always the inner loop. for rr in range(5): # We've schedule the storage and computation of # the producer here. We just need a single value. producer_storage = np.empty((1), dtype=np.int) # Pure step of the producer. 
producer_storage[0] = (xx + rr) * 17 # Now use it in the update step of the consumer. py_result[xx] += rr + producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) # A real-world example of a reduction inside a producer-consumer chain. if True: # The default schedule for a reduction is a good one for # convolution-like operations. For example, the following # computes a 5x5 box-blur of our grayscale test image with a # hl.clamp-to-edge boundary condition: # First add the boundary condition. clamped = hl.BoundaryConditions.repeat_edge(input) # Define a 5x5 box that starts at (-2, -2) r = hl.RDom([(-2, 5), (-2, 5)]) # Compute the 5x5 sum around each pixel. local_sum = hl.Func("local_sum") local_sum[x, y] = 0 # Compute the sum as a 32-bit integer local_sum[x, y] += clamped[x + r.x, y + r.y] # Divide the sum by 25 to make it an average blurry = hl.Func("blurry") blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25) halide_result = blurry.realize(input.width(), input.height()) # The default schedule will inline 'clamped' into the update # step of 'local_sum', because clamped only has a pure # definition, and so its default schedule is fully-inlined. # We will then compute local_sum per x coordinate of blurry, # because the default schedule for reductions is # compute-innermost. Here's the equivalent Python: #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0] local_sum = np.empty((1), dtype=np.int32) py_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()]) for yy in range(input.height()): for xx in range(input.width()): # FIXME this loop is quite slow # Pure step of local_sum local_sum[0] = 0 # Update step of local_sum for r_y in range(-2, 2 + 1): for r_x in range(-2, 2 + 1): # The clamping has been inlined into the update step. 
clamped_x = min(max(xx + r_x, 0), input.width() - 1) clamped_y = min(max(yy + r_y, 0), input.height() - 1) local_sum[0] += input[clamped_x, clamped_y] # Pure step of blurry # py_result(x, y) = (uint8_t)(local_sum[0] / 25) #py_result[xx, yy] = cast_to_uint8(local_sum[0] / 25) # hl.cast done internally py_result[xx, yy] = int(local_sum[0] / 25) # Check the results match for yy in range(input.height()): for xx in range(input.width()): assert halide_result[xx, yy] == py_result[xx, yy], \ "halide_result(%d, %d) = %d instead of %d" % ( xx, yy, halide_result[xx, yy], py_result[xx, yy]) # Reduction helpers. if True: # There are several reduction helper functions provided in # Halide.h, which compute small reductions and schedule them # innermost into their consumer. The most useful one is # "sum". f1 = hl.Func("f1") r = hl.RDom([(0, 100)]) f1[x] = hl.sum(r + x) * 7 # Sum creates a small anonymous hl.Func to do the reduction. It's # equivalent to: f2, anon = hl.Func("f2"), hl.Func("anon") anon[x] = 0 anon[x] += r + x f2[x] = anon[x] * 7 # So even though f1 references a reduction domain, it is a # pure function. The reduction domain has been swallowed to # define the inner anonymous reduction. halide_result_1 = f1.realize(10) halide_result_2 = f2.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) for xx in range(10): anon = np.empty((1), dtype=np.int) anon[0] = 0 for rr in range(100): anon[0] += rr + xx py_result[xx] = anon[0] * 7 # Check they all match. for xx in range(10): assert halide_result_1[xx] == py_result[xx], \ "halide_result_1(%d) = %d instead of %d" % (xx, halide_result_1[xx], py_result[xx]) assert halide_result_2[xx] == py_result[xx], \ "halide_result_2(%d) = %d instead of %d" % (xx, halide_result_2[xx], py_result[xx]) print("Success!") return 0
def test_type():
    """Check construction, derivation, predicates, limits and printing of hl.Type."""
    # A default-constructed Type reports the Handle code with no bits or lanes.
    blank = hl.Type()
    assert blank.code() == hl.TypeCode.Handle
    assert blank.bits() == 0
    assert blank.lanes() == 0

    # The explicit constructor and the hl.Int() shorthand compare equal.
    int32 = hl.Int(32)
    assert hl.Type(hl.TypeCode.Int, 32, 1) == int32
    assert int32.code() == hl.TypeCode.Int
    assert int32.bits() == 32
    assert int32.lanes() == 1
    assert int32.bytes() == 4

    # with_code / with_bits / with_lanes yield new types; int32 itself is
    # still usable for comparison afterwards.
    recoded = int32.with_code(hl.TypeCode.UInt)
    assert recoded == hl.UInt(32)

    narrowed = int32.with_bits(16)
    assert narrowed == hl.Int(16)
    assert narrowed != int32

    widened = int32.with_lanes(8)
    assert widened == hl.Int(32, 8)
    assert widened != int32
    assert widened.element_of() == hl.Int(32)

    # One instance of each kind of type for the predicate checks below.
    bool_t = hl.Bool()
    float32 = hl.Float(32)
    handle = hl.Handle()
    int32 = hl.Int(32)
    uint32 = hl.UInt(32)
    int32x8 = hl.Int(32, 8)
    uint64 = hl.UInt(64)
    int64 = hl.Int(64)

    must_hold = (
        bool_t.is_bool,
        float32.is_float,
        handle.is_handle,
        int32.is_int,
        uint32.is_uint,
        int64.is_int,
        uint64.is_uint,
    )
    for predicate in must_hold:
        assert predicate()

    assert not int32x8.is_scalar()
    assert int32x8.is_vector()

    other_handle = hl.Handle()
    assert handle.same_handle_type(other_handle)

    assert float32.can_represent(bool_t)
    assert not bool_t.can_represent(float32)

    # Numeric limits, written as powers of two.
    i32_max, i32_min = 2**31 - 1, -2**31
    u32_max = 2**32 - 1
    i64_max, i64_min = 2**63 - 1, -2**63

    assert bool_t.max() == 1
    assert bool_t.min() == 0
    assert int32.max() == i32_max
    assert int32.min() == i32_min

    # (bound method, value) pairs that must be accepted ...
    accepted = (
        (bool_t.is_max, 1),
        (bool_t.is_min, 0),
        (int32.is_max, i32_max),
        (int32.is_min, i32_min),
        (uint32.is_max, u32_max),
        (uint32.is_min, 0),
        (int64.is_max, i64_max),
        (int64.is_min, i64_min),
        (uint64.is_min, 0),
    )
    for check, value in accepted:
        assert check(value)

    # ... and pairs that must be rejected.
    rejected = (
        (bool_t.is_max, 2),
        (bool_t.is_min, -1),
        (uint32.is_max, u32_max + 1),
        (uint32.is_min, -1),
        (uint32.is_min, 1),
    )
    for check, value in rejected:
        assert not check(value)

    # The UInt(64) maximum (0xFFFFFFFFFFFFFFFF) is deliberately not checked;
    # the original author left that assertion disabled.
    # NOTE(review): confirm whether the bindings accept that value today.

    # str() and repr()
    for halide_type, text in ((int32, "int32"), (int32x8, "int32x8")):
        assert str(halide_type) == text
        assert repr(halide_type) == "<halide.Type %s>" % text
def main():
    """Build and run a single-stage Halide pipeline that brightens an image.

    The input is ``rgb.png`` from the tutorial images directory, located
    relative to this file. Every value is multiplied by 1.5, limited to 255
    and converted back to 8-bit; the result is written to ``brighter.png``
    in the current working directory.

    Returns:
        0 on success.
    """
    # First we'll load the input image we wish to brighten.
    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/rgb.png")

    # We create a hl.Buffer object to wrap the numpy array. The local is not
    # called `input` so that the Python builtin stays usable in this scope.
    source = hl.Buffer(imageio.imread(image_path))
    assert source.type() == hl.UInt(8)

    # Next we define our hl.Func object that represents our one pipeline
    # stage.
    brighter = hl.Func("brighter")

    # Our hl.Func will have three arguments, representing the position
    # in the image and the color channel. Halide treats color
    # channels as an extra dimension of the image.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    # Normally we'd probably write the whole function definition on
    # one line. Here we'll break it apart so we can explain what
    # we're doing at every step.

    # For each pixel of the input image.
    value = source[x, y, c]
    assert isinstance(value, hl.Expr)

    # Cast it to a floating point value.
    value = hl.cast(hl.Float(32), value)

    # Multiply it by 1.5 to brighten it.
    value = value * 1.5

    # Limit it to at most 255, so we don't get overflow when we
    # cast it back to an 8-bit unsigned int.
    value = hl.min(value, 255.0)

    # Cast it back to an 8-bit unsigned integer.
    value = hl.cast(hl.UInt(8), value)

    # Define the function.
    brighter[x, y, c] = value

    # The equivalent one-liner to all of the above is:
    #
    # brighter[x, y, c] = hl.cast(hl.UInt(8), hl.min(source[x, y, c] * 1.5, 255))
    #
    # The shorter version leaves out the explicit cast to float and passes
    # the integer constant 255 to hl.min instead of 255.0.

    # Remember. All we've done so far is build a representation of a
    # Halide program in memory. We haven't actually processed any
    # pixels yet. We haven't even compiled that Halide program yet.

    # So now we'll realize the hl.Func. The size of the output image
    # should match the size of the input image. If we just wanted to
    # brighten a portion of the input image we could request a
    # smaller size. If we request a larger size Halide will throw an
    # error at runtime telling us we're trying to read out of bounds
    # on the input image.
    output_image = brighter.realize(
        [source.width(), source.height(), source.channels()])
    assert output_image.type() == hl.UInt(8)

    # Save the output for inspection. It should look like a bright parrot.
    # python3-imageio versions <2.5 expect a numpy array
    imageio.imsave("brighter.png", np.asanyarray(output_image))
    print("Created brighter.png result file.")

    print("Success!")
    return 0
def main():
    """Walk through Halide's scalar types and the type promotion rules.

    Each section asserts the type produced by a construction, a cast or an
    arithmetic combination of expressions.

    Returns:
        0 once every check has passed.
    """
    # All Exprs have a scalar type, and all Funcs evaluate to one or more
    # scalar types: unsigned and signed integers of several bit widths,
    # single and double precision floats, and opaque handles. This list
    # holds all the legal types.
    int_widths = (8, 16, 32, 64)
    float_widths = (32, 64)
    valid_halide_types = (
        [hl.UInt(bits) for bits in int_widths]
        + [hl.Int(bits) for bits in int_widths]
        + [hl.Float(bits) for bits in float_widths]
        + [hl.Handle()]
    )

    # ---- Constructing and inspecting types ----

    # The properties of a type can be examined programmatically.
    assert hl.UInt(8).bits() == 8
    assert hl.Int(8).is_int()

    # Types can also be constructed as a function of other types.
    byte_type = hl.UInt(8)
    doubled = byte_type.with_bits(byte_type.bits() * 2)
    assert doubled == hl.UInt(16)

    # Any hl.Expr can be queried for its type. An hl.Expr wrapping a hl.Var
    # has type hl.Int(32).
    x = hl.Var("x")
    assert hl.Expr(x).type() == hl.Int(32)

    # hl.sin of an integer expression yields a hl.Float(32).
    assert hl.sin(x).type() == hl.Float(32)

    # hl.cast converts an hl.Expr from one type to another.
    assert hl.cast(hl.UInt(8), x).type() == hl.UInt(8)

    # A defined hl.Func can be queried for the types it produces.
    single_valued = hl.Func("f1")
    single_valued[x] = hl.cast(hl.UInt(8), x)
    assert single_valued.output_types()[0] == hl.UInt(8)

    tuple_valued = hl.Func("f2")
    tuple_valued[x] = (x, hl.sin(x))
    assert tuple_valued.output_types()[0] == hl.Int(32)
    assert tuple_valued.output_types()[1] == hl.Float(32)

    # ---- Type promotion rules ----

    # When Exprs of different types are combined (e.g. with '+' or '*'),
    # Halide applies a system of type promotion rules. To demonstrate them
    # we make one Expr of each numeric type, keyed by bit width.
    x = hl.Var("x")
    uint = {bits: hl.cast(hl.UInt(bits), x) for bits in int_widths}
    sint = {bits: hl.cast(hl.Int(bits), x) for bits in int_widths}
    flt = {bits: hl.cast(hl.Float(bits), x) for bits in float_widths}

    # The rules are applied in the order they are listed below.

    # 1) It is an error to cast or use arithmetic operators on Exprs of
    #    type hl.Handle(), so the handle type is skipped here.
    # 2) If the types are the same, then no type conversions occur.
    for halide_type in valid_halide_types:
        if not halide_type.is_handle():
            operand = hl.cast(halide_type, x)
            assert (operand + operand).type() == operand.type()

    promotions = [
        # 3) If one type is a float but the other is not, the non-float
        #    argument is promoted to a float (possibly losing precision
        #    for large integers).
        (uint[8] + flt[32], hl.Float(32)),
        (flt[32] + sint[64], hl.Float(32)),
        (uint[16] + flt[64], hl.Float(64)),
        (flt[64] + sint[32], hl.Float(64)),
        # 4) If both types are float, the narrower argument is promoted
        #    to the wider bit-width.
        (flt[64] + flt[32], hl.Float(64)),
        # The rules above handle all the floating-point cases. The
        # remaining rules handle the integer cases.
        # 5) If one of the expressions is an integer constant, it is
        #    coerced to the type of the other expression. If that would
        #    overflow (e.g. `u8 + 257`), Halide triggers an error.
        (uint[32] + 3, hl.UInt(32)),
        (3 + sint[16], hl.Int(16)),
        # 6) If both types are unsigned integers, or both are signed
        #    integers, the narrower argument is promoted to the wider type.
        (uint[32] + uint[8], hl.UInt(32)),
        (sint[16] + sint[64], hl.Int(64)),
        # 7) If one type is signed and the other is unsigned, both
        #    arguments are promoted to a signed integer with the greater
        #    of the two bit widths.
        (uint[8] + sint[32], hl.Int(32)),
        (uint[32] + sint[8], hl.Int(32)),
        #    Note that this may silently overflow the unsigned type when
        #    the bit widths are the same.
        (uint[32] + sint[32], hl.Int(32)),
    ]
    for combined, expected_type in promotions:
        assert combined.type() == expected_type

    if False:  # evaluate<X> not yet exposed to python
        # Disabled: result32 and result16 are never assigned because the
        # calls that would produce them are not available here.
        #
        # When an unsigned hl.Expr is converted to a wider signed type, it
        # is first zero-extended to a wider unsigned type and then
        # reinterpreted as a signed integer. I.e. casting the hl.UInt(8)
        # value 255 to an hl.Int(32) produces 255, not -1.
        #int32_t result32 = evaluate<int>(hl.cast<int32_t>(hl.cast<uint8_t>(255)))
        assert result32 == 255

        # When a signed type is explicitly converted to a wider unsigned
        # type with hl.cast (the promotion rules never do this
        # automatically), it is first sign-extended to the wider signed
        # type and then reinterpreted as an unsigned integer. I.e. casting
        # the hl.Int(8) value -1 to a hl.UInt(16) produces 65535, not 255.
        #uint16_t result16 = evaluate<uint16_t>(hl.cast<uint16_t>(hl.cast<int8_t>(-1)))
        assert result16 == 65535

    # ---- The type hl.Handle() ----

    # hl.Handle is used to represent opaque pointers. Handles are always
    # stored as 64-bit, regardless of the compilation target. The main use
    # of an hl.Expr of type hl.Handle is to pass it through Halide to other
    # external code.
    assert hl.Handle().bits() == 64

    # ---- Generic code ----

    # The main explicit use of Type is to write Halide code parameterized
    # by a type. The `average` helper is defined elsewhere in this file;
    # the checks below require its result to have the operands' type.
    x = hl.Var("x")
    generic_cases = (
        (hl.cast(hl.Float(32), x), 3.0, hl.Float(32)),
        (x, 3, hl.Int(32)),
        (hl.cast(hl.UInt(8), x), hl.cast(hl.UInt(8), 3), hl.UInt(8)),
    )
    for lhs, rhs, expected_type in generic_cases:
        assert average(lhs, rhs).type() == expected_type

    print("Success!")

    return 0
def main(): # Declare some Vars to use below. x, y = hl.Var("x"), hl.Var("y") # Load a grayscale image to use as an input. image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png") input_data = imread(image_path) if True: # making the image smaller to go faster input_data = input_data[:160, :150] assert input_data.dtype == np.uint8 input = hl.Buffer(input_data) # You can define a hl.Func in multiple passes. Let's see a toy # example first. if True: # The first definition must be one like we have seen already # - a mapping from Vars to an hl.Expr: f = hl.Func("f") f[x, y] = x + y # We call this first definition the "pure" definition. # But the later definitions can include computed expressions on # both sides. The simplest example is modifying a single point: f[3, 7] = 42 # We call these extra definitions "update" definitions, or # "reduction" definitions. A reduction definition is an # update definition that recursively refers back to the # function's current value at the same site: if False: e = f[x, y] + 17 print("f[x, y] + 17", e) print("(f[x, y] + 17).type()", e.type()) print("(f[x, y]).type()", f[x, y].type()) f[x, y] = f[x, y] + 17 # If we confine our update to a single row, we can # recursively refer to values in the same column: f[x, 3] = f[x, 0] * f[x, 10] # Similarly, if we confine our update to a single column, we # can recursively refer to other values in the same row. f[0, y] = f[0, y] / f[3, y] # The general rule is: Each hl.Var used in an update definition # must appear unadorned in the same position as in the pure # definition in all references to the function on the left- # and right-hand sides. So the following definitions are # legal updates: f[x, 17] = x + 8 # x is used, so all uses of f must have x as the first argument. f[0, y] = y * 8 # y is used, so all uses of f must have y as the second argument. 
f[x, x + 1] = x + 8 f[y / 2, y] = f[0, y] * 17 # But these ones would cause an error: # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'. # f[y, y + 1) = y + 8 <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'. # f[y, x) = y - x <- Arguments to f on the left-hand-side are in the wrong places. # f[3, 4) = x + y <- Free variables appear on the right-hand-side but not the left-hand-side. # We'll realize this one just to make sure it compiles. The # second-to-last definition forces us to realize over a # domain that is taller than it is wide. f.realize(100, 101) # For each realization of f, each step runs in its entirety # before the next one begins. Let's trace the loads and # stores for a simpler example: g = hl.Func("g") g[x, y] = x + y # Pure definition g[2, 1] = 42 # First update definition g[x, 0] = g[x, 1] # Second update definition g.trace_loads() g.trace_stores() g.realize(4, 4) # Reading the log, we see that each pass is applied in turn. The equivalent C is: result = np.empty((4, 4), dtype=np.int) # Pure definition for yy in range(4): for xx in range(4): result[yy][xx] = xx + yy # First update definition result[1][2] = 42 # Second update definition for xx in range(4): result[0][xx] = result[1][xx] # end of section # Putting update passes inside loops. if True: # Starting with this pure definition: f = hl.Func("f") f[x, y] = x + y # Say we want an update that squares the first fifty rows. We # could do this by adding 50 update definitions: # f[x, 0) = f[x, 0) * f[x, 0) # f[x, 1) = f[x, 1) * f[x, 1) # f[x, 2) = f[x, 2) * f[x, 2) # ... # f[x, 49) = f[x, 49) * f[x, 49) # Or equivalently using a compile-time loop in our C++: # for (int i = 0 i < 50 i++) { # f[x, i) = f[x, i) * f[x, i) # # But it's more manageable and more flexible to put the loop # in the generated code. 
We do this by defining a "reduction # domain" and using it inside an update definition: r = hl.RDom(0, 50) f[x, r] = f[x, r] * f[x, r] halide_result = f.realize(100, 100) # The equivalent C is: c_result = np.empty((100, 100), dtype=np.int) for yy in range(100): for xx in range(100): c_result[yy][xx] = xx + yy for xx in range(100): for rr in range(50): # The loop over the reduction domain occurs inside of # the loop over any pure variables used in the update # step: c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx] # Check the results match: for yy in range(100): for xx in range(100): if halide_result(xx, yy) != c_result[yy][xx]: raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result[yy][xx])) return -1 # Now we'll examine a real-world use for an update definition: # computing a histogram. if True: # Some operations on images can't be cleanly expressed as a pure # function from the output coordinates to the value stored # there. The classic example is computing a histogram. The # natural way to do it is to iterate over the input image, # updating histogram buckets. Here's how you do that in Halide: histogram = hl.Func("histogram") # Histogram buckets start as zero. histogram[x] = 0 # Define a multi-dimensional reduction domain over the input image: r = hl.RDom(0, input.width(), 0, input.height()) # For every point in the reduction domain, increment the # histogram bucket corresponding to the intensity of the # input image at that point. 
histogram[input[r.x, r.y]] += 1 halide_result = histogram.realize(256) # The equivalent C is: c_result = np.empty((256), dtype=np.int) for xx in range(256): c_result[xx] = 0 for r_y in range(input.height()): for r_x in range(input.width()): c_result[input_data[r_x, r_y]] += 1 # Check the answers agree: for xx in range(256): if c_result[xx] != halide_result(xx): raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 # Scheduling update steps if True: # The pure variables in an update step and can be # parallelized, vectorized, split, etc as usual. # Vectorizing, splitting, or parallelize the variables that # are part of the reduction domain is trickier. We'll cover # that in a later lesson. # Consider the definition: f = hl.Func("x") f[x, y] = x * y # Set the second row to equal the first row. f[x, 1] = f[x, 0] # Set the second column to equal the first column plus 2. f[1, y] = f[0, y] + 2 # The pure variables in each stage can be scheduled # independently. To control the pure definition, we schedule # as we have done in the past. The following code vectorizes # and parallelizes the pure definition only. f.vectorize(x, 4).parallel(y) # We use hl.Func::update(int) to get a handle to an update step # for the purposes of scheduling. The following line # vectorizes the first update step across x. We can't do # anything with y for this update step, because it doesn't # use y. f.update(0).vectorize(x, 4) # Now we parallelize the second update step in chunks of size # 4. yo, yi = hl.Var("yo"), hl.Var("yi") f.update(1).split(y, yo, yi, 4).parallel(yo) halide_result = f.realize(16, 16) # Here's the equivalent (serial) C: c_result = np.empty((16, 16), dtype=np.int) # Pure step. Vectorized in x and parallelized in y. 
for yy in range(16): # Should be a parallel for loop for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] c_result[yy][xx[0]] = xx[0] * yy c_result[yy][xx[1]] = xx[1] * yy c_result[yy][xx[2]] = xx[2] * yy c_result[yy][xx[3]] = xx[3] * yy # First update. Vectorized in x. for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] c_result[1][xx[0]] = c_result[0][xx[0]] c_result[1][xx[1]] = c_result[0][xx[1]] c_result[1][xx[2]] = c_result[0][xx[2]] c_result[1][xx[3]] = c_result[0][xx[3]] # Second update. Parallelized in chunks of size 4 in y. for yo in range(4): # Should be a parallel for loop for yi in range(4): yy = yo * 4 + yi c_result[yy][1] = c_result[yy][0] + 2 # Check the C and Halide results match: for yy in range(16): for xx in range(16): if halide_result(xx, yy) != c_result[yy][xx]: raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result[yy][xx])) return -1 # That covers how to schedule the variables within a hl.Func that # uses update steps, but what about producer-consumer # relationships that involve compute_at and store_at? Let's # examine a reduction as a producer, in a producer-consumer pair. if True: # Because an update does multiple passes over a stored array, # it's not meaningful to inline them. So the default schedule # for them does the closest thing possible. It computes them # in the innermost loop of their consumer. 
Consider this # trivial example: producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 producer[x] += 1 consumer[x] = 2 * producer[x] halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) for xx in range(10): producer_storage = np.empty((1), dtype=np.int) # Pure step for producer producer_storage[0] = xx * 17 # Update step for producer producer_storage[0] = producer_storage[0] + 1 # Pure step for consumer c_result[xx] = 2 * producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 # For all other compute_at/store_at options, the reduction # gets placed where you would expect, somewhere in the loop # nest of the consumer. # Now let's consider a reduction as a consumer in a # producer-consumer pair. This is a little more involved. if True: if True: # Case 1: The consumer references the producer in the pure step only. producer, consumer = hl.Func("producer"), hl.Func("consumer") # The producer is pure. producer[x] = x * 17 consumer[x] = 2 * producer[x] consumer[x] += 1 # The valid schedules for the producer in this case are # the default schedule - inlined, and also: # # 1) producer.compute_at(x), which places the computation of # the producer inside the loop over x in the pure step of the # consumer. # # 2) producer.compute_root(), which computes all of the # producer ahead of time. # # 3) producer.store_root().compute_at(x), which allocates # space for the consumer outside the loop over x, but fills # it in as needed inside the loop. # # Let's use option 1. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] = 2 * producer_storage[0] # Update step for the consumer for xx in range(10): c_result[xx] += 1 # All of the pure step is evaluated before any of the # update step, so there are two separate loops over x. # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 if True: # Case 2: The consumer references the producer in the update step only producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside the update # step of the producer, because that's the only step that # uses the producer. producer.compute_at(consumer, x) # Note however, that we didn't say: # # producer.compute_at(consumer.update(0), x). # # Scheduling is done with respect to Vars of a hl.Func, and # the Vars of a hl.Func are shared across the pure and # update steps. 
halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): c_result[xx] = xx # Update step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] += producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 if True: # Case 3: The consumer references the producer in # multiple steps that share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = producer[x] * x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside both the # pure and the update step of the producer. So there ends # up being two separate realizations of the producer, and # redundant work occurs. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] = producer_storage[0] * xx # Update step for the consumer for xx in range(10): # Another copy of the pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] += producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 if True: # Case 4: The consumer references the producer in # multiple steps that do not share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x, y] = x * y consumer[x, y] = x + y consumer[x, 0] = producer[x, x - 1] consumer[0, y] = producer[y, y - 1] # In this case neither producer.compute_at(consumer, x) # nor producer.compute_at(consumer, y) will work, because # either one fails to cover one of the uses of the # producer. So we'd have to inline producer, or use # producer.compute_root(). # Let's say we really really want producer to be # compute_at the inner loops of both consumer update # steps. Halide doesn't allow multiple different # schedules for a single hl.Func, but we can work around it # by making two wrappers around producer, and scheduling # those instead: # Attempt 2: producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func( ), hl.Func(), hl.Func() producer_wrapper_1[x, y] = producer[x, y] producer_wrapper_2[x, y] = producer[x, y] consumer_2[x, y] = x + y consumer_2[x, 0] += producer_wrapper_1[x, x - 1] consumer_2[0, y] += producer_wrapper_2[y, y - 1] # The wrapper functions give us two separate handles on # the producer, so we can schedule them differently. 
producer_wrapper_1.compute_at(consumer_2, x) producer_wrapper_2.compute_at(consumer_2, y) halide_result = consumer_2.realize(10, 10) # The equivalent C is: c_result = np.empty((10, 10), dtype=np.int) # Pure step for the consumer for yy in range(10): for xx in range(10): c_result[yy][xx] = xx + yy # First update step for consumer for xx in range(10): producer_wrapper_1_storage = np.empty((1), dtype=np.int) producer_wrapper_1_storage[0] = xx * (xx - 1) c_result[0][xx] += producer_wrapper_1_storage[0] # Second update step for consumer for yy in range(10): producer_wrapper_2_storage = np.empty((1), dtype=np.int) producer_wrapper_2_storage[0] = yy * (yy - 1) c_result[yy][0] += producer_wrapper_2_storage[0] # Check the results match for yy in range(10): for xx in range(10): if halide_result(xx, yy) != c_result[yy][xx]: print("halide_result(%d, %d) = %d instead of %d", xx, yy, halide_result(xx, yy), c_result[yy][xx]) return -1 if True: # Case 5: Scheduling a producer under a reduction domain # variable of the consumer. # We are not just restricted to scheduling producers at # the loops over the pure variables of the consumer. If a # producer is only used within a loop over a reduction # domain (hl.RDom) variable, we can also schedule the # producer there. producer, consumer = hl.Func("producer"), hl.Func("consumer") r = hl.RDom(0, 5) producer[x] = x * 17 consumer[x] = x + 10 consumer[x] += r + producer[x + r] producer.compute_at(consumer, r) halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer. for xx in range(10): c_result[xx] = xx + 10 # Update step for the consumer. for xx in range(10): for rr in range( 5 ): # The loop over the reduction domain is always the inner loop. # We've schedule the storage and computation of # the producer here. We just need a single value. producer_storage = np.empty((1), dtype=np.int) # Pure step of the producer. 
producer_storage[0] = (xx + rr) * 17 # Now use it in the update step of the consumer. c_result[xx] += rr + producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 # A real-world example of a reduction inside a producer-consumer chain. if True: # The default schedule for a reduction is a good one for # convolution-like operations. For example, the following # computes a 5x5 box-blur of our grayscale test image with a # hl.clamp-to-edge boundary condition: # First add the boundary condition. clamped = hl.repeat_edge(input) # Define a 5x5 box that starts at (-2, -2) r = hl.RDom(-2, 5, -2, 5) # Compute the 5x5 sum around each pixel. local_sum = hl.Func("local_sum") local_sum[x, y] = 0 # Compute the sum as a 32-bit integer local_sum[x, y] += clamped[x + r.x, y + r.y] # Divide the sum by 25 to make it an average blurry = hl.Func("blurry") blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25) halide_result = blurry.realize(input.width(), input.height()) # The default schedule will inline 'clamped' into the update # step of 'local_sum', because clamped only has a pure # definition, and so its default schedule is fully-inlined. # We will then compute local_sum per x coordinate of blurry, # because the default schedule for reductions is # compute-innermost. Here's the equivalent C: #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0] local_sum = np.empty((1), dtype=np.int32) c_result = hl.Buffer(hl.UInt(8), input.width(), input.height()) for yy in range(input.height()): for xx in range(input.width()): # FIXME this loop is quite slow # Pure step of local_sum local_sum[0] = 0 # Update step of local_sum for r_y in range(-2, 2 + 1): for r_x in range(-2, 2 + 1): # The clamping has been inlined into the update step. 
clamped_x = min(max(xx + r_x, 0), input.width() - 1) clamped_y = min(max(yy + r_y, 0), input.height() - 1) local_sum[0] += input(clamped_x, clamped_y) # Pure step of blurry #c_result(x, y) = (uint8_t)(local_sum[0] / 25) #c_result[xx, yy] = cast_to_uint8(local_sum[0] / 25) c_result[xx, yy] = int(local_sum[0] / 25) # hl.cast done internally # Check the results match for yy in range(input.height()): for xx in range(input.width()): if halide_result(xx, yy) != c_result(xx, yy): raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result(xx, yy))) return -1 # Reduction helpers. if True: # There are several reduction helper functions provided in # Halide.h, which compute small reductions and schedule them # innermost into their consumer. The most useful one is # "sum". f1 = hl.Func("f1") r = hl.RDom(0, 100) f1[x] = hl.sum(r + x) * 7 # Sum creates a small anonymous hl.Func to do the reduction. It's equivalent to: f2, anon = hl.Func("f2"), hl.Func("anon") anon[x] = 0 anon[x] += r + x f2[x] = anon[x] * 7 # So even though f1 references a reduction domain, it is a # pure function. The reduction domain has been swallowed to # define the inner anonymous reduction. halide_result_1 = f1.realize(10) halide_result_2 = f2.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) for xx in range(10): anon = np.empty((1), dtype=np.int) anon[0] = 0 for rr in range(100): anon[0] += rr + xx c_result[xx] = anon[0] * 7 # Check they all match. for xx in range(10): if halide_result_1(xx) != c_result[xx]: print("halide_result_1(%d) = %d instead of %d", x, halide_result_1(x), c_result[x]) return -1 if halide_result_2(xx) != c_result[xx]: print("halide_result_2(%d) = %d instead of %d", x, halide_result_2(x), c_result[x]) return -1 # A complex example that uses reduction helpers. 
if False: # non-sense to port SSE code to python, skipping this test # Other reduction helpers include "product", "minimum", # "maximum", "hl.argmin", and "argmax". Using hl.argmin and argmax # requires understanding tuples, which come in a later # lesson. Let's use minimum and maximum to compute the local # spread of our grayscale image. # First, add a boundary condition to the input. clamped = hl.Func("clamped") x_clamped = hl.clamp(x, 0, input.width() - 1) y_clamped = hl.clamp(y, 0, input.height() - 1) clamped[x, y] = input[x_clamped, y_clamped] box = hl.RDom(-2, 5, -2, 5) # Compute the local maximum minus the local minimum: spread = hl.Func("spread") spread[x, y] = (maximum(clamped(x + box.x, y + box.y)) - minimum(clamped(x + box.x, y + box.y))) # Compute the result in strips of 32 scanlines yo, yi = hl.Var("yo"), hl.Var("yi") spread.split(y, yo, yi, 32).parallel(yo) # Vectorize across x within the strips. This implicitly # vectorizes stuff that is computed within the loop over x in # spread, which includes our minimum and maximum helpers, so # they get vectorized too. spread.vectorize(x, 16) # We'll apply the boundary condition by padding each scanline # as we need it in a circular buffer (see lesson 08). clamped.store_at(spread, yo).compute_at(spread, yi) halide_result = spread.realize(input.width(), input.height()) # The C equivalent is almost too horrible to contemplate (and # took me a long time to debug). This time I want to time # both the Halide version and the C version, so I'll use sse # intrinsics for the vectorization, and openmp to do the # parallel for loop (you'll need to compile with -fopenmp or # similar to get correct timing). #ifdef __SSE2__ # Don't include the time required to allocate the output buffer. c_result = hl.Buffer(hl.UInt(8), input.width(), input.height()) #ifdef _OPENMP t1 = datetime.now() #endif # Run this one hundred times so we can average the timing results. 
for iters in range(100): pass # #pragma omp parallel for # for yo in range((input.height() + 31)/32): # y_base = hl.min(yo * 32, input.height() - 32) # # # Compute clamped in a circular buffer of size 8 # # (smallest power of two greater than 5). Each thread # # needs its own allocation, so it must occur here. # # clamped_width = input.width() + 4 # clamped_storage = np.empty((clamped_width * 8), dtype=np.uint8) # # for yi in range(32): # y = y_base + yi # # uint8_t *output_row = &c_result(0, y) # # # Compute clamped for this scanline, skipping rows # # already computed within this slice. # int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2) # int max_y_clamped = (y + 2) # for (int cy = min_y_clamped cy <= max_y_clamped cy++) { # # Figure out which row of the circular buffer # # we're filling in using bitmasking: # uint8_t *clamped_row = clamped_storage + (cy & 7) * clamped_width # # # Figure out which row of the input we're reading # # from by clamping the y coordinate: # int clamped_y = std::hl.min(std::hl.max(cy, 0), input.height()-1) # uint8_t *input_row = &input(0, clamped_y) # # # Fill it in with the padding. # for (int x = -2 x < input.width() + 2 ): # int clamped_x = std::hl.min(std::hl.max(x, 0), input.width()-1) # *clamped_row++ = input_row[clamped_x] # # # # # Now iterate over vectors of x for the pure step of the output. # for (int x_vec = 0 x_vec < (input.width() + 15)/16 x_vec++) { # int x_base = std::hl.min(x_vec * 16, input.width() - 16) # # # Allocate storage for the minimum and maximum # # helpers. One vector is enough. 
# __m128i minimum_storage, maximum_storage # # # The pure step for the maximum is a vector of zeros # maximum_storage = (__m128i)_mm_setzero_ps() # # # The update step for maximum # for (int max_y = y - 2 max_y <= y + 2 max_y++) { # uint8_t *clamped_row = clamped_storage + (max_y & 7) * clamped_width # for (int max_x = x_base - 2 max_x <= x_base + 2 max_): # __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + max_x + 2)) # maximum_storage = _mm_max_epu8(maximum_storage, v) # # # # # The pure step for the minimum is a vector of # # ones. Create it by comparing something to # # itself. # minimum_storage = (__m128i)_mm_cmpeq_ps(_mm_setzero_ps(), # _mm_setzero_ps()) # # # The update step for minimum. # for (int min_y = y - 2 min_y <= y + 2 min_y++) { # uint8_t *clamped_row = clamped_storage + (min_y & 7) * clamped_width # for (int min_x = x_base - 2 min_x <= x_base + 2 min_): # __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + min_x + 2)) # minimum_storage = _mm_min_epu8(minimum_storage, v) # # # # # Now compute the spread. # __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage) # # # Store it. # _mm_storeu_si128((__m128i *)(output_row + x_base), spread) # # # # del clamped_storage # # end of hundred iterations # Skip the timing comparison if we don't have openmp # enabled. Otherwise it's unfair to C. #ifdef _OPENMP t2 = datetime.now() # Now run the Halide version again without the # jit-compilation overhead. Also run it one hundred times. for iters in range(100): spread.realize(halide_result) t3 = datetime.now() # Report the timings. On my machine they both take about 3ms # for the 4-megapixel input (fast!), which makes sense, # because they're using the same vectorization and # parallelization strategy. However I find the Halide easier # to read, write, debug, modify, and port. print("Halide spread took %f ms. 
C equivalent took %f ms" % ((t3 - t2).total_seconds() * 1000, (t2 - t1).total_seconds() * 1000)) #endif # _OPENMP # Check the results match: for yy in range(input.height()): for xx in range(input.width()): if halide_result(xx, yy) != c_result(xx, yy): raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result(xx, yy))) return -1 #endif # __SSE2__ else: print("(Skipped the SSE2 section of the code, " "since non-sense in python world.)") print("Success!") return 0
def focus_stack_pipeline():
    """Build the focus-stacking Halide pipeline and its input parameters.

    An 8-bit, 3-dimensional input image is converted with ``f32`` and
    decomposed into ``number_of_layers`` pyramid levels using the
    ``gaussian`` / ``reduce_layer`` / ``expand_layer`` / ``laplacian``
    helpers.  For each level, previously accumulated energies and Laplacian
    coefficients (supplied as extra inputs) are combined with the newly
    computed ones through ``energy_maxes`` and ``merge_laplacian``.  The
    final Gaussian level is merged twice, once using ``entropy`` and once
    using ``deviation``.

    Returns:
        dict: ``'pipeline'`` is an ``hl.Pipeline`` over all merged Laplacian
        layers followed by all merged energy layers; ``'inputs'`` is the list
        ``[image] + lap_inputs + max_energy_inputs`` of ``hl.ImageParam``.
    """
    start_w, start_h = 3000, 2000
    number_of_layers = 5

    def estimate_extents(param, width, height, channels=None):
        # Estimate every dimension of an ImageParam as starting at 0.
        param.dim(0).set_estimate(0, width)
        param.dim(1).set_estimate(0, height)
        if channels is not None:
            param.dim(2).set_estimate(0, channels)

    def float_param(dimensions, name, width, height):
        # Create a named float32 ImageParam with its extent estimates set.
        param = hl.ImageParam(hl.Float(32), dimensions, name)
        estimate_extents(param, width, height,
                         3 if dimensions == 3 else None)
        return param

    # Each level is half the size (rounded up) of the level below it.
    layer_sizes = [[start_w, start_h]]
    for _ in range(number_of_layers):
        w, h = layer_sizes[-1]
        layer_sizes.append([int(math.ceil(w / 2.0)),
                            int(math.ceil(h / 2.0))])

    # Add last size in once more to get the 2nd top lap layer (gaussian) for
    # the energy/deviation split.
    layer_sizes.append(layer_sizes[-1])

    input_image = hl.ImageParam(hl.UInt(8), 3)
    estimate_extents(input_image, start_w, start_h, 3)

    lap_inputs = []
    max_energy_inputs = []
    for i in range(number_of_layers + 1):
        w, h = layer_sizes[i]
        lap_inputs.append(float_param(3, "lap{}".format(i), w, h))

        if i == number_of_layers:
            # Last (top - small) layer: add the last laplacian (really
            # direct from gaussian) layer in twice.  We output one maxed on
            # entropies and one maxed on deviations.
            lap_inputs.append(float_param(3, "lap{}".format(i + 1), w, h))
            # The "entroy" spelling is the parameter's runtime name and is
            # kept as-is so anything looking it up by name keeps working.
            max_energy_inputs.append(
                float_param(2, "entroy{}".format(i), w, h))
            max_energy_inputs.append(
                float_param(2, "deviation{}".format(i), w, h))
        else:
            max_energy_inputs.append(
                float_param(2, "max_energy{}".format(i), w, h))

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    hist_index = hl.Var('hist_index')

    # Use the declared start size instead of repeating the literals.
    clamped = f32(x, y, c, mirror(input_image, start_w, start_h))
    f = hl.Func("input32")
    f[x, y, c] = clamped[x, y, c]

    energy_outputs = []
    gaussian_layers = [f]
    laplacian_layers = []
    merged_laps = []
    for layer_num in range(number_of_layers):
        w, h = layer_sizes[layer_num]

        start_layer = gaussian_layers[-1]
        # Blur the image
        gaussian_layer = gaussian(x, y, c, start_layer)

        # Reduce the layer size and add it into the list
        next_layer = reduce_layer(x, y, c, gaussian_layer)
        gaussian_layers.append(next_layer)

        # Expand back up
        expanded = expand_layer(x, y, c, next_layer)

        # Generate the laplacian from the
        # original - blurred/reduced/expanded version
        laplacian_layer = laplacian(x, y, c, start_layer, expanded)
        laplacian_layers.append(laplacian_layer)

        # Calculate energies for the gaussian layer
        prev_energies = mirror(max_energy_inputs[layer_num], w, h)
        next_energies = region_energy(x, y, c, laplacian_layer)
        prev_laplacian = mirror(lap_inputs[layer_num], w, h)

        merged_energies = energy_maxes(x, y, c, prev_energies, next_energies)
        merged_lap = merge_laplacian(x, y, c, merged_energies, next_energies,
                                     prev_laplacian, laplacian_layer)

        energy_outputs.append([[w, h, True], merged_energies])
        merged_laps.append(merged_lap)

        # Add estimates.
        # NOTE(review): w, h are this level's size although next_layer comes
        # from reduce_layer; kept as in the original -- confirm intent.
        next_layer.set_estimate(x, 0, w)
        next_layer.set_estimate(y, 0, h)
        next_layer.set_estimate(c, 0, 3)

    # Handle last layer differently
    w, h = layer_sizes[-1]

    # The next_lap is really just the last gaussian layer
    next_lap = gaussian_layers[-1]

    prev_entropy_laplacian = mirror(lap_inputs[-2], w, h)
    prev_entropy = mirror(max_energy_inputs[-2], w, h)
    next_entropy = entropy(x, y, c, next_lap, w, h, hist_index)
    merged_entropy = energy_maxes(x, y, c, prev_entropy, next_entropy)
    merged_lap_on_entropy = merge_laplacian(x, y, c, merged_entropy,
                                            next_entropy,
                                            prev_entropy_laplacian, next_lap)
    merged_laps.append(merged_lap_on_entropy)

    prev_deviation_laplacian = mirror(lap_inputs[-1], w, h)
    prev_deviation = mirror(max_energy_inputs[-1], w, h)
    next_deviation = deviation(x, y, c, next_lap)
    merged_deviation = energy_maxes(x, y, c, prev_deviation, next_deviation)
    merged_lap_on_deviation = merge_laplacian(x, y, c, merged_deviation,
                                              next_deviation,
                                              prev_deviation_laplacian,
                                              next_lap)
    merged_laps.append(merged_lap_on_deviation)

    energy_outputs.append([[w, h, True], merged_entropy])
    energy_outputs.append([[w, h, True], merged_deviation])

    print("NUM LAYERS: ", len(gaussian_layers), len(laplacian_layers),
          layer_sizes)

    # Each output is [size, func] with size = [w, h, gray(, mid)].
    # Add all of the laplacian layers to the output first; every layer except
    # the final two is flagged as "mid".
    outputs = []
    for i, merged_lap in enumerate(merged_laps):
        w, h = layer_sizes[i]
        mid = i < (len(merged_laps) - 2)
        outputs.append([[w, h, False, mid], merged_lap])

    # Then energies
    outputs.extend(energy_outputs)

    # The original branched on the "mid" flag here but both branches were
    # identical, so every output is used unchanged.
    for size, output in outputs:
        w, h = size[0], size[1]
        gray = len(size) > 2 and size[2]
        output.set_estimate(x, 0, w)
        output.set_estimate(y, 0, h)
        if not gray:
            output.set_estimate(c, 0, 3)

    print("OUTPUT LAYERS: ")
    pprint(outputs)

    output_funcs = [output for _, output in outputs]
    pipeline = hl.Pipeline(output_funcs)

    return {
        'pipeline': pipeline,
        'inputs': [input_image] + lap_inputs + max_energy_inputs
    }
def test_target():
    """Check the ``hl.Target`` bindings.

    Covers construction, string round-tripping and validation, feature
    manipulation, vector sizes, GPU / large-buffer / device-API queries and
    rejection of a feature list that cannot be converted.
    """
    target_os = hl.TargetOS
    target_arch = hl.TargetArch
    feature = hl.TargetFeature
    is_valid = hl.Target.validate_target_string

    def sse41_target():
        # Fresh 32-bit Linux/x86 target with only SSE4.1 enabled.
        return hl.Target(target_os.Linux, target_arch.X86, 32,
                         [feature.SSE41])

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty_string = hl.Target("")
    assert host == from_empty_string, "Default ctor failure"
    assert host.supported()

    # to_string roundtripping
    unknown = hl.Target()
    unknown_str = unknown.to_string()
    assert unknown_str == "arch_unknown-0-os_unknown"

    # Note, this should *not* validate, since validate_target_string
    # now returns false if any of arch-bits-os are undefined
    assert not is_valid(unknown_str)

    # Deliberately no round-trip through hl.Target(unknown_str): creating a
    # Target with unknown portions will now assert-fail.

    # repr() and str()
    assert str(unknown) == "arch_unknown-0-os_unknown"
    assert repr(unknown) == "<halide.Target arch_unknown-0-os_unknown>"

    assert unknown.os == target_os.OSUnknown
    assert unknown.arch == target_arch.ArchUnknown
    assert unknown.bits == 0

    # Full specification round-trip:
    spec = sse41_target().to_string()
    assert spec == "x86-32-linux-sse41"
    assert is_valid(spec)

    # Full specification (without features) round-trip:
    spec = hl.Target(target_os.Linux, target_arch.X86, 32).to_string()
    assert spec == "x86-32-linux"
    assert is_valid(spec)

    # Full specification round-trip, crazy features
    many_features = [
        feature.JIT,
        feature.SSE41,
        feature.AVX,
        feature.AVX2,
        feature.CUDA,
        feature.OpenCL,
        feature.OpenGL,
        feature.OpenGLCompute,
        feature.Debug,
    ]
    spec = hl.Target(target_os.Android, target_arch.ARM, 32,
                     many_features).to_string()
    assert spec == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(spec)

    # Expected failures, in order:
    #  - unknown token, bad bit width
    #  - bits == 0 is allowed only if arch_unknown and os_unknown are
    #    specified, and no features are set
    #  - "host" is only supported as the first token
    for rejected in ("host-unknowntoken",
                     "x86-23",
                     "x86-0",
                     "0-arch_unknown-os_unknown-sse41",
                     "opencl-host"):
        assert not is_valid(rejected)

    # set_feature
    tgt = sse41_target()
    assert tgt.has_feature(feature.SSE41)
    assert not tgt.has_feature(feature.AVX)
    tgt.set_feature(feature.AVX)
    tgt.set_feature(feature.SSE41, False)
    assert tgt.has_feature(feature.AVX)
    assert not tgt.has_feature(feature.SSE41)

    # set_features
    tgt = sse41_target()
    assert tgt.has_feature(feature.SSE41)
    assert not tgt.has_feature(feature.AVX)
    tgt.set_features([feature.SSE41], False)
    tgt.set_features([feature.AVX, feature.AVX2], True)
    assert tgt.has_feature(feature.AVX)
    assert tgt.has_feature(feature.AVX2)
    assert not tgt.has_feature(feature.SSE41)

    # with_feature
    extended = sse41_target().with_feature(feature.NoAsserts).with_feature(
        feature.NoBoundsQuery)
    assert extended.to_string() == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature
    tgt = hl.Target(target_os.Linux, target_arch.X86, 32,
                    [feature.SSE41, feature.NoAsserts])
    # Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
    reduced = tgt.without_feature(feature.NoAsserts).without_feature(
        feature.NoBoundsQuery)
    assert reduced.to_string() == "x86-32-linux-sse41"

    # natural_vector_size
    # SSE4.1 is 16 bytes wide
    tgt = sse41_target()
    for element_type, lanes in ((hl.UInt(8), 16),
                                (hl.Int(16), 8),
                                (hl.UInt(32), 4),
                                (hl.Float(32), 4)):
        assert tgt.natural_vector_size(element_type) == lanes

    # has_gpu_feature
    with_gpu = hl.Target(target_os.Linux, target_arch.X86, 32,
                         [feature.OpenCL])
    without_gpu = hl.Target(target_os.Linux, target_arch.X86, 32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = hl.Target(target_os.Linux, target_arch.X86, 64,
                      [feature.LargeBuffers])
    small = hl.Target(target_os.Linux, target_arch.X86, 64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not small.has_large_buffers()
    assert small.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = hl.Target(target_os.Linux, target_arch.X86, 64,
                          [feature.CUDA])
    plain = hl.Target(target_os.Linux, target_arch.X86, 64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not plain.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = hl.Target(target_os.OSX, target_arch.X86, 64,
                           [feature.Metal])
    plain = hl.Target(target_os.OSX, target_arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64))
    assert plain.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = hl.Target(target_os.OSX, target_arch.X86, 64,
                           [feature.Metal])
    plain = hl.Target(target_os.OSX, target_arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not plain.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    assert hl.target_feature_for_device_api(
        hl.DeviceAPI.OpenCL) == feature.OpenCL

    # Constructor given a feature list whose items are not TargetFeatures
    try:
        hl.Target(target_os.Linux, target_arch.X86, 32, ["this is a string"])
    except TypeError as err:
        assert "incompatible constructor arguments" in str(err)
    else:
        assert False, 'Did not see expected exception!'
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build and schedule the local-Laplacian filter as a Halide Func.

    Args:
        input: 3-dimensional image-like object indexed as ``input[x, y, c]``
            and providing ``width()`` / ``height()``.  Values are scaled by
            ``1 / 65535`` on the way in and by ``65535`` on the way out.
        levels: number of intensity levels (an integer expression/param).
        alpha: scales the remapping lookup table.
        beta: scales the distance of the luminance from each level.
        J: number of pyramid levels to build.  Must be at least 1.

    Returns:
        hl.Func: ``'local_laplacian'``, a UInt(16) Func over ``(x, y, c)``.
        Inner stages are already scheduled for the GPU or CPU depending on
        ``hl.get_target_from_environment()``.
    """
    # Counters are one-element lists so the nested helpers can mutate them;
    # they only serve to give every generated Func a unique name.
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')
    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    def downsample(f):
        # Halve a 3-D Func in x then y with a [1 3 3 1] / 8 kernel.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0
        return downy

    def upsample(f):
        # Double a 3-D Func in x then y, blending neighbours 0.25 / 0.75.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 *
                                (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 *
                                  (y % 2), c] + 0.75 * upx[x, y // 2, c]
        return upy

    def downsample2D(f):
        # 2-D variant of downsample(); shares its name counter.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0
        return downy

    def upsample2D(f):
        # 2-D variant of upsample(); shares its name counter.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 *
                             (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 *
                               (y % 2)] + 0.75 * upx[x, y // 2]
        return upy

    # THE ALGORITHM

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1),
                                hl.clamp(y, 0, input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entries per intensity level
    level = k / (levels - 1)
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y, k] = beta * (gray[x, y] - level) + level + remap[
        idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in reversed(range(J - 1)):
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in reversed(range(J - 1)):
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/
    # apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / (
        gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        # NOTE(review): GPU_Default is not defined in this function and the
        # first gpu_tile call omits the xi/yi inner vars that every other
        # call passes -- this branch looks written against an older binding
        # API; confirm before relying on it.
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                      blockh, GPU_Default)
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                   blockh, GPU_Default)
    else:
        # CPU schedule
        print("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        # The four largest levels are vectorized.  Clamp to J so a pyramid
        # with fewer than four levels does not index past the lists.
        for j in range(min(4, J)):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        # Remaining (small) levels are only parallelized.
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
def test_simple(gen):
    """Exercise a generator callable with valid and invalid argument forms.

    ``gen`` is called as ``gen(target, <inputs...>, generator_params=...)``.
    Valid calls are passed to ``_realize_and_check``; invalid calls must
    raise the expected exception type with the expected message fragment.
    """
    x, y = hl.Var(), hl.Var()
    target = hl.get_jit_target_from_environment()

    b_in = hl.Buffer(hl.UInt(8), [2, 2])
    b_in.fill(123)

    f_in = hl.Func("f")
    f_in[x, y] = x + y

    def expect_failure(exc_type, fragment, build):
        # Run the zero-argument callable `build`; it must raise `exc_type`
        # with `fragment` in its message.  Taking a callable keeps argument
        # evaluation inside the try block.
        try:
            build()
        except exc_type as err:
            assert fragment in str(err)
        else:
            assert False, 'Did not see expected exception!'

    # ----------- Inputs by-position
    _realize_and_check(gen(target, b_in, f_in, 3.5))

    # ----------- Inputs by-name
    _realize_and_check(
        gen(target, buffer_input=b_in, func_input=f_in, float_arg=3.5))
    _realize_and_check(
        gen(target, float_arg=3.5, buffer_input=b_in, func_input=f_in))

    # ----------- Above set again, w/ GeneratorParam mixed in
    k = 42
    gp = {"offset": k}

    # (positional)
    _realize_and_check(gen(target, b_in, f_in, 3.5, generator_params=gp), k)

    # (keyword)
    _realize_and_check(
        gen(target,
            generator_params=gp,
            buffer_input=b_in,
            func_input=f_in,
            float_arg=3.5), k)
    _realize_and_check(
        gen(target,
            buffer_input=b_in,
            generator_params=gp,
            func_input=f_in,
            float_arg=3.5), k)
    _realize_and_check(
        gen(target,
            buffer_input=b_in,
            func_input=f_in,
            generator_params=gp,
            float_arg=3.5), k)
    _realize_and_check(
        gen(target,
            buffer_input=b_in,
            float_arg=3.5,
            func_input=f_in,
            generator_params=gp), k)

    # ----------- Test various failure modes

    # Inputs w/ mixed by-position and by-name
    expect_failure(
        RuntimeError,
        'Cannot use both positional and keyword arguments for inputs.',
        lambda: gen(target, b_in, f_in, float_arg=3.5))

    # too many positional args
    expect_failure(
        RuntimeError,
        'Expected exactly 3 positional args for inputs, but saw 4.',
        lambda: gen(target, b_in, f_in, 3.5, 4))

    # too few positional args
    expect_failure(
        RuntimeError,
        'Expected exactly 3 positional args for inputs, but saw 2.',
        lambda: gen(target, b_in, f_in))

    # Inputs that can't be converted to what the receiver needs (positional)
    expect_failure(
        RuntimeError,
        'Unable to cast Python instance',
        lambda: gen(target, hl.f32(3.141592), "happy", k))

    # Inputs that can't be converted to what the receiver needs (named)
    expect_failure(
        RuntimeError,
        'Unable to cast Python instance',
        lambda: gen(target, b_in, f_in, float_arg="bogus"))

    # Input specified by both pos and kwarg
    expect_failure(
        RuntimeError,
        "Cannot use both positional and keyword arguments for inputs.",
        lambda: gen(target, b_in, f_in, 3.5, float_arg=4.5))

    # generator_params is not a dict
    expect_failure(
        TypeError,
        "cannot convert dictionary",
        lambda: gen(target, b_in, f_in, 3.5, generator_params=[1, 2, 3]))

    # Bad gp name
    expect_failure(
        RuntimeError,
        "has no GeneratorParam named: foo",
        lambda: gen(target, b_in, f_in, 3.5, generator_params={"foo": 0}))

    # Bad input name
    expect_failure(
        RuntimeError,
        "Unknown input 'funk_input' specified via keyword argument.",
        lambda: gen(target,
                    buffer_input=b_in,
                    float_arg=3.5,
                    generator_params=gp,
                    funk_input=f_in))

    # Unknown keyword that is neither an input nor generator_params
    expect_failure(
        RuntimeError,
        "Unknown input 'nonexistent_generator_param' specified via keyword argument.",
        lambda: gen(target,
                    buffer_input=b_in,
                    float_arg=3.5,
                    generator_params=gp,
                    func_input=f_in,
                    nonexistent_generator_param="wat"))
def _check_is_u16(e):
    """Assert that ``e.type()`` is ``hl.UInt(16)``; the actual type is the failure message."""
    actual_type = e.type()
    assert actual_type == hl.UInt(16), actual_type
def test_complexstub():
    """Call ``complexstub`` with keyword inputs and verify every output.

    Each returned output is realized and compared element by element with
    the value expected from the constant input image, ``float_arg`` and
    ``int_arg``.
    """
    constant_image = _make_constant_image()
    input = hl.ImageParam(hl.UInt(8), 3, 'input')
    input.set(constant_image)

    target = hl.get_jit_target_from_environment()

    float_arg = 1.25
    int_arg = 33

    r = complexstub(target,
                    typed_buffer_input=constant_image,
                    untyped_buffer_input=constant_image,
                    simple_input=input,
                    array_input=[input, input],
                    float_arg=float_arg,
                    int_arg=[int_arg, int_arg],
                    untyped_buffer_output_type="uint8",
                    vectorize=True)

    # return value is a tuple; unpack separately to avoid
    # making the callsite above unreadable
    (simple_output, tuple_output, array_output, typed_buffer_output,
     untyped_buffer_output, static_compiled_buffer_output) = r

    b = simple_output.realize(32, 32, 3, target)
    assert b.type() == hl.Float(32)
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected = constant_image[x, y, c]
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (
                    expected, actual)

    b = tuple_output.realize(32, 32, 3, target)
    assert b[0].type() == hl.Float(32)
    assert b[1].type() == hl.Float(32)
    assert len(b) == 2
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected1 = constant_image[x, y, c] * float_arg
                expected2 = expected1 + int_arg
                actual1, actual2 = b[0][x, y, c], b[1][x, y, c]
                assert expected1 == actual1, "Expected1 %s Actual1 %s" % (
                    expected1, actual1)
                # The message previously mislabelled this value as "Actual1".
                assert expected2 == actual2, "Expected2 %s Actual2 %s" % (
                    expected2, actual2)

    assert len(array_output) == 2
    for a in array_output:
        b = a.realize(32, 32, target)
        assert b.type() == hl.Int(16)
        for x in range(32):
            for y in range(32):
                expected = constant_image[x, y, 0] + int_arg
                actual = b[x, y]
                assert expected == actual, "Expected %s Actual %s" % (
                    expected, actual)

    # TODO: Output<Buffer<>> has additional behaviors useful when a Stub
    # is used within another Generator; this isn't yet implemented since there
    # isn't yet Python bindings for Generator authoring. This section
    # of the test may need revision at that point.
    b = typed_buffer_output.realize(32, 32, 3, target)
    assert b.type() == hl.Float(32)
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected = constant_image[x, y, c]
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (
                    expected, actual)

    b = untyped_buffer_output.realize(32, 32, 3, target)
    assert b.type() == hl.UInt(8)
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected = constant_image[x, y, c]
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (
                    expected, actual)

    b = static_compiled_buffer_output.realize(4, 4, 1, target)
    assert b.type() == hl.UInt(8)
    for x in range(4):
        for y in range(4):
            for c in range(1):
                expected = constant_image[x, y, c] + 42
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (
                    expected, actual)
def test_schedules(verbose=False, test_random=False):
    """Smoke-test schedule enumeration and random schedule generation.

    Builds a two-stage pipeline (``f`` reads a 3-D input with every
    coordinate clamped, ``g`` adds one to ``f``), iterates the schedules
    produced by ``schedules_func`` and then draws 400 schedules from
    ``random_schedule``, applying and evaluating each one.

    Args:
        verbose: When true, print variable information, the enumerated
            schedules, each random schedule and the elapsed time.
        test_random: When true, skip the ``schedules_func`` phase and call
            ``sys.exit()`` once a random schedule has been evaluated.

    Raises:
        AssertionError: if ``g`` does not expose exactly the variables
            ``x``, ``y`` and ``c``, if an expected schedule fragment never
            shows up among the random schedules, or if the 100 randomized
            ``schedules_func`` calls do not yield 100 schedules in total.
        SystemExit: when ``test_random`` is true.
    """
    # All output goes through single-argument print(...) calls so the text is
    # the same whether print is a statement (Python 2) or a function.
    halide.exit_on_signal()

    f = halide.Func('f')
    x = halide.Var('x')
    y = halide.Var('y')
    c = halide.Var('c')
    g = halide.Func('g')
    input_image = halide.UniformImage(halide.UInt(16), 3)
    int_t = halide.Int(32)

    def clamped(coord, upper):
        """Clamp ``coord`` to the inclusive range [0, upper] as int32."""
        return halide.clamp(coord,
                            halide.cast(int_t, 0),
                            halide.cast(int_t, upper))

    f[x, y, c] = input_image[
        clamped(x, input_image.width() - 1),
        clamped(y, input_image.height() - 1),
        clamped(c, 2)]
    g[x, y, c] = f[x, y, c] + 1
    assert sorted(halide.all_vars(g).keys()) == ['c', 'x', 'y']

    if verbose:
        print(halide.func_varlist(f))
        print('caller_vars(f) = %s' % (caller_vars(g, f),))
        print('caller_vars(g) = %s' % (caller_vars(g, g),))

    start_time = time.time()
    if not test_random:
        # Walk the whole depth-3 enumeration so any error raised while
        # generating a schedule surfaces here.
        for schedule in schedules_func(g, f, 0, 3):
            if verbose:
                print(schedule)

        nvalid_random = 0
        for _ in range(100):
            for _schedule in schedules_func(g, f, 0, DEFAULT_MAX_DEPTH,
                                            random=True):
                nvalid_random += 1

    schedule_strs = []
    for _ in range(400):
        d = random_schedule(g, 0, DEFAULT_MAX_DEPTH)
        si = str(d)
        schedule_strs.append(si)
        if verbose:
            print('Schedule: %s' % si)

        d.apply()
        evaluate = d.test((36, 36, 3), input_image)
        print('evaluate')
        evaluate()
        # NOTE(review): the nesting of this check was reconstructed from a
        # whitespace-mangled source; confirm it belongs inside the loop
        # (exit after the first evaluated schedule) rather than after it.
        if test_random:
            print('Success')
            sys.exit()
    end_time = time.time()

    all_schedules = '\n'.join(schedule_strs)
    assert 'f.chunk(_c0)' in all_schedules
    assert 'f.root().vectorize' in all_schedules
    assert 'f.root().unroll' in all_schedules
    assert 'f.root().split' in all_schedules
    assert 'f.root().tile' in all_schedules
    assert 'f.root().parallel' in all_schedules
    assert 'f.root().transpose' in all_schedules

    # nvalid_random is only bound when test_random is false; the
    # test_random path has already left through sys.exit() above.
    assert nvalid_random == 100
    if verbose:
        print('generated in %.3f secs' % (end_time - start_time))
    print('random_schedule: OK')
def test_simplestub():
    """Exercise ``simplestub.generate`` with each supported argument style.

    The generator is first invoked with its inputs supplied positionally,
    by keyword and in mixed form, with and without the ``offset``
    GeneratorParam, and every result is handed to ``_realize_and_check``.
    A series of malformed invocations follows; each must raise a
    ``RuntimeError`` whose message contains a specific fragment.
    """
    var_x, var_y = hl.Var(), hl.Var()
    jit_target = hl.get_jit_target_from_environment()

    buffer_in = hl.Buffer(hl.UInt(8), [2, 2])
    buffer_in.fill(123)

    func_in = hl.Func("f")
    func_in[var_x, var_y] = var_x + var_y

    def expect_runtime_error(fragment, *args, **kwargs):
        """Call the generator and require a RuntimeError mentioning fragment."""
        try:
            simplestub.generate(jit_target, *args, **kwargs)
        except RuntimeError as err:
            assert fragment in str(err)
            return
        assert False, 'Did not see expected exception!'

    # ----------- Inputs by-position
    result = simplestub.generate(jit_target, buffer_in, func_in, 3.5)
    _realize_and_check(result)

    # ----------- Inputs by-name
    result = simplestub.generate(
        jit_target, buffer_input=buffer_in, func_input=func_in, float_arg=3.5)
    _realize_and_check(result)

    # ----------- Inputs w/ mixed by-position and by-name
    result = simplestub.generate(jit_target, buffer_in, func_in, float_arg=3.5)
    _realize_and_check(result)

    result = simplestub.generate(
        jit_target, buffer_in, float_arg=3.5, func_input=func_in)
    _realize_and_check(result)

    # ----------- Above set again, w/ GeneratorParam mixed in
    offset_value = 42

    result = simplestub.generate(
        jit_target, buffer_in, func_in, 3.5, offset=offset_value)
    _realize_and_check(result, offset_value)

    result = simplestub.generate(
        jit_target, offset=offset_value, buffer_input=buffer_in,
        func_input=func_in, float_arg=3.5)
    _realize_and_check(result, offset_value)

    result = simplestub.generate(
        jit_target, buffer_in, func_in, offset=offset_value, float_arg=3.5)
    _realize_and_check(result, offset_value)

    result = simplestub.generate(
        jit_target, buffer_in, float_arg=3.5, offset=offset_value,
        func_input=func_in)
    _realize_and_check(result, offset_value)

    # ----------- Test various failure modes

    # too many positional args
    expect_runtime_error(
        'Expected at most 3 positional args, but saw 4.',
        buffer_in, func_in, 3.5, 4)

    # Inputs that can't be converted to what the receiver needs (positional)
    expect_runtime_error(
        'Unable to cast Python instance',
        3.141592, "happy")

    # Inputs that can't be converted to what the receiver needs (named)
    expect_runtime_error(
        'Unable to cast Python instance',
        buffer_in, func_in, float_arg="bogus")

    # Missing required inputs
    expect_runtime_error(
        "Generator Input named 'float_arg' was not specified.",
        buffer_in, func_in)

    # Input specified by both pos and kwarg
    expect_runtime_error(
        "Generator Input named 'float_arg' was specified by both position and keyword.",
        buffer_in, func_in, 3.5, float_arg=4.5)

    # Bad input name: 'funk_input' is a deliberate misspelling, so the real
    # 'func_input' is reported as missing.
    expect_runtime_error(
        "Generator Input named 'func_input' was not specified.",
        buffer_in, float_arg=3.5, offset=offset_value, funk_input=func_in)

    # Bad gp name
    expect_runtime_error(
        "Generator simplestub has no GeneratorParam named: nonexistent_generator_param",
        buffer_in, float_arg=3.5, offset=offset_value, func_input=func_in,
        nonexistent_generator_param="wat")