def test_basics2():
    """Smoke-test integer Expr index arithmetic into a clamped Func
    (bilateral-grid style indices); only builds expressions, returns nothing."""
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Add a boundary condition (clamp-to-edge on channel 0 of the input)
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width()-1),
                          hl.clamp(y, 0, input.height()-1), 0]
    # Construct the bilateral grid
    # NOTE(review): positional-argument RDom form; other blocks in this file
    # use the list-of-(min, extent) form -- presumably an older API snapshot.
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    # Plain int*Var index.
    val0 = clamped[x * s_sigma, y * s_sigma]
    # Same index with an explicit Int(32) cast mixed in.
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1), y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    # Floor division keeps the index integral, with and without an explicit cast.
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma//2), y * s_sigma - hl.cast(hl.Int(32), s_sigma//2)]
    val2 = clamped[x * s_sigma - s_sigma//2, y * s_sigma - s_sigma//2]
    # Index that also involves the reduction-domain variables.
    val3 = clamped[x * s_sigma + r.x - s_sigma//2, y * s_sigma + r.y - s_sigma//2]
    return
def test_basics3():
    """Build a bilateral-grid style histogram and print the intermediate
    FuncRef objects while exercising '+=' update definitions."""
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Add a boundary condition (clamp-to-edge on channel 0 of the input)
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width()-1),
                          hl.clamp(y, 0, input.height()-1), 0]
    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    # Sample one pixel of each s_sigma x s_sigma cell and clamp it to [0, 1].
    val = clamped[x * s_sigma + r.x - s_sigma//2, y * s_sigma + r.y - s_sigma//2]
    val = hl.clamp(val, 0.0, 1.0)
    #zi = hl.cast(hl.Int(32), val * (1.0/r_sigma) + 0.5)
    # Quantized intensity bin (rounded by the +0.5 before the int cast).
    zi = hl.cast(hl.Int(32), (val / r_sigma) + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    ss = hl.select(c == 0, val, 1.0)
    print("hl.select(c == 0, val, 1.0)", ss)
    left = histogram[x, y, zi, c]
    print("histogram[x, y, zi, c]", histogram[x, y, zi, c])
    print("histogram[x, y, zi, c]", left)
    # NOTE(review): '+=' on a FuncRef appears to register an update definition
    # on 'histogram' rather than mutate a Python value -- the prints inspect this.
    left += 5
    print("histogram[x, y, zi, c] after += 5", left)
    left += ss
    return
def test_basics3():
    """Build a bilateral-grid style histogram and exercise FuncRef '+='
    update definitions (no output; construction must simply succeed)."""
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    # Value needed if not generating an executable
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # This is passed during code generation in the C++ version
    s_sigma = 8
    x, y, z, c = hl.Var('x'), hl.Var('y'), hl.Var('z'), hl.Var('c')

    # Clamp-to-edge boundary condition on channel 0 of the input.
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width() - 1),
                          hl.clamp(y, 0, input.height() - 1), 0]

    # Reduction domain covering one s_sigma x s_sigma grid cell.
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    sample = clamped[x * s_sigma + r.x - s_sigma // 2,
                     y * s_sigma + r.y - s_sigma // 2]
    sample = hl.clamp(sample, 0.0, 1.0)
    # Quantized intensity bin (+0.5 rounds before the int cast).
    zbin = hl.i32((sample / r_sigma) + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    weight = hl.select(c == 0, sample, 1.0)
    cell = histogram[x, y, zbin, c]
    cell += 5       # update definition: accumulate a constant
    cell += weight  # update definition: accumulate the selected value
def test_basics2():
    """Index a clamped Func with several integer expressions and check that a
    float index (true division) raises the expected implicit-cast error."""
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    s_sigma = 8
    x, y, z, c = hl.Var('x'), hl.Var('y'), hl.Var('z'), hl.Var('c')

    # Add a boundary condition (clamp-to-edge on channel 0).
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width() - 1),
                          hl.clamp(y, 0, input.height() - 1), 0]

    # Construct the bilateral grid.
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    half = s_sigma // 2

    # All of these index forms are integral and must be accepted.
    expr_plain = clamped[x * s_sigma, y * s_sigma]
    expr_cast_mul = clamped[x * s_sigma * hl.i32(1), y * s_sigma * hl.i32(1)]
    expr_cast_off = clamped[x * s_sigma - hl.i32(half), y * s_sigma - hl.i32(half)]
    expr_floor_off = clamped[x * s_sigma - half, y * s_sigma - half]
    expr_rdom = clamped[x * s_sigma + r.x - half, y * s_sigma + r.y - half]

    # True division produces a float index, which Halide must reject.
    try:
        clamped[x * s_sigma - s_sigma / 2, y * s_sigma - s_sigma / 2]
    except RuntimeError as e:
        assert 'Implicit cast from float32 to int' in str(e)
    else:
        assert False, 'Did not see expected exception!'
def __init__(self, input):
    """Build the two-stage sharpen-then-LUT pipeline over *input*.

    input: an hl.Buffer_uint8 (8-bit image buffer). Every stage is stored
    on self so callers can schedule and realize them later.
    """
    # isinstance is the idiomatic Python type check; the original
    # `type(input) == hl.Buffer_uint8` also rejects subclasses and is
    # flagged by linters.
    assert isinstance(input, hl.Buffer_uint8)
    self.lut = hl.Func("lut")
    self.padded = hl.Func("padded")
    self.padded16 = hl.Func("padded16")
    self.sharpen = hl.Func("sharpen")
    self.curved = hl.Func("curved")
    self.input = input

    # For this lesson, we'll use a two-stage pipeline that sharpens
    # and then applies a look-up-table (LUT).

    # First we'll define the LUT. It will be a gamma curve.
    # NOTE(review): i, x, y, c appear to be module-level hl.Var's -- confirm.
    self.lut[i] = hl.cast(hl.UInt(8),
                          hl.clamp(pow(i / 255.0, 1.2) * 255.0, 0, 255))

    # Augment the input with a boundary condition (clamp-to-edge).
    self.padded[x, y, c] = input[hl.clamp(x, 0, input.width() - 1),
                                 hl.clamp(y, 0, input.height() - 1), c]

    # Cast it to 16-bit so the filter arithmetic cannot overflow.
    self.padded16[x, y, c] = hl.cast(hl.UInt(16), self.padded[x, y, c])

    # Next we sharpen it with a five-tap filter:
    # 2*center minus the mean of the 4-neighbourhood.
    self.sharpen[x, y, c] = (self.padded16[x, y, c] * 2 -
                             (self.padded16[x - 1, y, c] +
                              self.padded16[x, y - 1, c] +
                              self.padded16[x + 1, y, c] +
                              self.padded16[x, y + 1, c]) / 4)

    # Then apply the LUT.
    self.curved[x, y, c] = self.lut[self.sharpen[x, y, c]]
def __init__(self, input):
    """Construct the sharpen + gamma-LUT pipeline over an 8-bit input.

    All stages are kept on self so callers can schedule/realize them later.
    """
    assert input.type() == hl.UInt(8)

    # Create every pipeline stage up front.
    for stage in ("lut", "padded", "padded16", "sharpen", "curved"):
        setattr(self, stage, hl.Func(stage))
    self.input = input

    # Two-stage pipeline: sharpen, then map through a look-up table.

    # The LUT is a gamma curve.
    exponent = hl.f32(1.2)
    self.lut[i] = hl.u8(hl.clamp(hl.pow(i / 255.0, exponent) * 255.0, 0, 255))

    # Clamp-to-edge boundary condition on the raw input.
    self.padded[x, y, c] = input[hl.clamp(x, 0, input.width() - 1),
                                 hl.clamp(y, 0, input.height() - 1), c]

    # Widen to 16-bit so the filter arithmetic cannot overflow.
    self.padded16[x, y, c] = hl.u16(self.padded[x, y, c])

    # Five-tap sharpening filter: 2*center minus the mean of the
    # 4-neighbourhood.
    centre = self.padded16[x, y, c]
    neighbours = (self.padded16[x - 1, y, c] +
                  self.padded16[x, y - 1, c] +
                  self.padded16[x + 1, y, c] +
                  self.padded16[x, y + 1, c])
    self.sharpen[x, y, c] = centre * 2 - neighbours / 4

    # Map the sharpened values through the LUT.
    self.curved[x, y, c] = self.lut[self.sharpen[x, y, c]]
def test_basics2():
    """Print and assert the result types of mixed Expr/int arithmetic, then
    build bilateral-grid style index expressions into a clamped Func."""
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    # Value needed if not generating an executable
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # This is passed during code generation in the C++ version
    s_sigma = 8
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width() - 1),
                          hl.clamp(y, 0, input.height() - 1), 0]

    if True:
        print("s_sigma", s_sigma)
        print("s_sigma/2", s_sigma / 2)
        print("s_sigma//2", s_sigma // 2)
        print()
        print("x * s_sigma", x * s_sigma)
        print("x * 8", x * 8)
        print("x * 8 + 4", x * 8 + 4)
        print("x * 8 * 4", x * 8 * 4)
        print()
        print("x", x)
        # BUG FIX: the value expression was missing here, so only the label
        # was being printed.
        print("(x * s_sigma).type()", (x * s_sigma).type())
        print("(x * 8).type()", (x * 8).type())
        print("(x * 8 + 4).type()", (x * 8 + 4).type())
        print("(x * 8 * 4).type()", (x * 8 * 4).type())
        print("(x * 8 / 4).type()", (x * 8 / 4).type())
        print("((x * 8) * 4).type()", ((x * 8) * 4).type())
        print("(x * (8 * 4)).type()", (x * (8 * 4)).type())

        assert (x * 8).type() == hl.Int(32)
        assert (x * 8 * 4).type() == hl.Int(32)  # yes this did fail at some point
        assert ((x * 8) / 4).type() == hl.Int(32)
        assert (x * (8 / 4)).type() == hl.Float(32)  # under python3 division rules
        assert (x * (8 // 4)).type() == hl.Int(32)
        #assert (x * 8 // 4).type() == hl.Int(32)  # not yet implemented

    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1),
                    y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma // 2),
                    y * s_sigma - hl.cast(hl.Int(32), s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]
    return
def prefilterXSobel(image, W, H):
    """Grayscale conversion followed by a clamped 3x3 horizontal Sobel,
    tiled and parallelized; returns the scheduled xSobel Func."""
    x = Var("x")
    y = Var("y")

    # Luminance from RGB.
    gray_f = Func("gray")
    gray_f[x, y] = (0.2989 * image[x, y, 0] +
                    0.5870 * image[x, y, 1] +
                    0.1140 * image[x, y, 2])

    # Clamp-to-edge boundary condition.
    clamped_f = Func("clamped")
    clamped_f[x, y] = gray_f[h.clamp(x, 0, W - 1), h.clamp(y, 0, H - 1)]

    # Horizontal central difference, then the vertical 1-2-1 smoothing pass,
    # clamped into the int8-safe range [-31, 31].
    diff = Func("temp")
    diff[x, y] = clamped_f[x + 1, y] - clamped_f[x - 1, y]

    sobel = Func("xSobel")
    sobel[x, y] = h.cast(
        Int(8),
        h.clamp(diff[x, y - 1] + 2 * diff[x, y] + diff[x, y + 1], -31, 31))

    # Schedule: tile and parallelize the output, vectorize the temporary.
    xi, xo = Var("xi"), Var("xo")
    yi, yo = Var("yi"), Var("yo")
    sobel.compute_root().tile(x, y, xo, yo, xi, yi, 64, 32).parallel(yo).parallel(xo)
    diff.compute_at(sobel, yi).vectorize(x, 8)
    return sobel
def prefilterXSobel(image, W, H):
    """Grayscale + clamped 3x3 horizontal Sobel over a WxH image.

    Returns the scheduled xSobel Func (int8 values clamped to [-31, 31]).
    """
    x, y = Var("x"), Var("y")
    clamped, gray = Func("clamped"), Func("gray")
    # Luminance from RGB.
    gray[x, y] = 0.2989 * image[x, y, 0] + 0.5870 * image[
        x, y, 1] + 0.1140 * image[x, y, 2]
    # Clamp-to-edge boundary condition.
    clamped[x, y] = gray[h.clamp(x, 0, W - 1), h.clamp(y, 0, H - 1)]
    temp, xSobel = Func("temp"), Func("xSobel")
    # Horizontal central difference, then vertical 1-2-1 smoothing,
    # clamped into the int8-safe range [-31, 31].
    temp[x, y] = clamped[x + 1, y] - clamped[x - 1, y]
    xSobel[x, y] = h.cast(
        Int(8),
        h.clamp(temp[x, y - 1] + 2 * temp[x, y] + temp[x, y + 1], -31, 31))
    # Schedule: tile + parallelize the output, vectorize the temporary.
    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    xSobel.compute_root().tile(x, y, xo, yo, xi, yi, 64, 32).parallel(yo).parallel(xo)
    temp.compute_at(xSobel, yi).vectorize(x, 8)
    return xSobel
def get_erode(input):
    """ Erode on 5x5 stencil, first erode x then erode y.

    input: a 3-channel image-like object with .width()/.height() (indexed
    out-of-range reads are clamped to the edge). Returns the scheduled
    erode_y Func.
    """
    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")
    input_clamped = hl.Func("input_clamped")
    erode_x = hl.Func("erode_x")
    erode_y = hl.Func("erode_y")
    # Clamp-to-edge boundary condition.
    input_clamped[x, y, c] = input[
        hl.clamp(x, hl.cast(hl.Int(32), 0), hl.cast(hl.Int(32), input.width() - 1)),
        hl.clamp(y, hl.cast(hl.Int(32), 0), hl.cast(hl.Int(32), input.height() - 1)),
        c]
    # Separable 5-tap minimum: horizontal pass, then vertical pass.
    erode_x[x, y, c] = hl.min(
        hl.min(
            hl.min(
                hl.min(input_clamped[x - 2, y, c], input_clamped[x - 1, y, c]),
                input_clamped[x, y, c]),
            input_clamped[x + 1, y, c]),
        input_clamped[x + 2, y, c])
    erode_y[x, y, c] = hl.min(
        hl.min(
            hl.min(hl.min(erode_x[x, y - 2, c], erode_x[x, y - 1, c]),
                   erode_x[x, y, c]),
            erode_x[x, y + 1, c]),
        erode_x[x, y + 2, c])
    yi = hl.Var("yi")

    # CPU Schedule
    erode_x.compute_root().split(y, y, yi, 8).parallel(y)
    erode_y.compute_root().split(y, y, yi, 8).parallel(y)

    return erode_y
def get_erode(input):
    """ Erode on 5x5 stencil, first erode x then erode y. """
    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")

    def running_min(first, *rest):
        # Left-fold hl.min over the taps, matching the original nesting order.
        acc = first
        for v in rest:
            acc = hl.min(acc, v)
        return acc

    input_clamped = hl.Func("input_clamped")
    erode_x = hl.Func("erode_x")
    erode_y = hl.Func("erode_y")

    # Clamp-to-edge boundary condition.
    input_clamped[x, y, c] = input[
        hl.clamp(x, hl.cast(hl.Int(32), 0), hl.cast(hl.Int(32), input.width() - 1)),
        hl.clamp(y, hl.cast(hl.Int(32), 0), hl.cast(hl.Int(32), input.height() - 1)),
        c]

    # Separable 5-tap minimum: horizontal pass, then vertical pass.
    erode_x[x, y, c] = running_min(
        *[input_clamped[x + dx, y, c] for dx in (-2, -1, 0, 1, 2)])
    erode_y[x, y, c] = running_min(
        *[erode_x[x, y + dy, c] for dy in (-2, -1, 0, 1, 2)])

    yi = hl.Var("yi")
    # CPU Schedule
    erode_x.compute_root().split(y, y, yi, 8).parallel(y)
    erode_y.compute_root().split(y, y, yi, 8).parallel(y)
    return erode_y
def entropy(x, y, c, img, w, h, hist_index):
    """Per-pixel local 'entropy' measure over a 5x5 window of *img*.

    Relies on sibling helpers gray/mkfunc/u8/histogram from this module.
    Returns the 'entropy' Func.
    NOTE(review): the accumulation weights log(p) by the gray *value* rather
    than by the probability p (classic entropy is -sum p*log p) -- confirm
    this is intentional.
    """
    base_gray = gray(x, y, c, img)
    clamped_gray = mkfunc('clamped_gray', base_gray)
    # Keep gray values inside the 8-bit histogram range.
    clamped_gray[x, y] = hl.clamp(base_gray[x, y], 0, 255)
    u8_gray = u8(x, y, c, clamped_gray)
    probabilities = histogram(x, y, c, u8_gray, w, h, hist_index)
    # 5x5 window centered on the pixel.
    r = hl.RDom([(-2, 5), (-2, 5)])
    levels = mkfunc('entropy', img)
    levels[x, y] = 0.0
    # Add in 0.00001 to prevent -Inf's
    levels[x, y] += base_gray[x + r.x, y + r.y] * hl.log(
        probabilities[u8_gray[x + r.x, y + r.y]] + 0.00001)
    # Negate so larger values mean more disorder.
    levels[x, y] = levels[x, y] * -1.0
    return levels
def main():
    """Tutorial: the same blur pipeline realized twice -- first over a
    shrunken domain to avoid out-of-bounds reads, then over the full domain
    using an explicit clamp-to-edge boundary condition."""
    # First we'll declare some Vars to use below.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/rgb.png")

    # Now we'll express a multi-stage pipeline that blurs an image
    # first horizontally, and then vertically.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imread(image_path))
        assert input.type() == hl.UInt(8)

        # Upgrade it to 16-bit, so we can do math without it overflowing.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), input[x, y, c])

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] +
                           2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                           2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # Each hl.Func in this pipeline calls a previous one using
        # familiar function call syntax (we've overloaded operator()
        # on hl.Func objects). A hl.Func may call any other hl.Func that has
        # been given a definition. This restriction prevents
        # pipelines with loops in them. Halide pipelines are always
        # feed-forward graphs of Funcs.

        # Now let's realize it...

        # result = output.realize(input.width(), input.height(), 3)

        # Except that the line above is not going to work. Uncomment
        # it to see what happens.

        # Realizing this pipeline over the same domain as the input
        # image requires reading pixels out of bounds in the input,
        # because the blur_x stage reaches outwards horizontally, and
        # the blur_y stage reaches outwards vertically. Halide
        # detects this by injecting a piece of code at the top of the
        # pipeline that computes the region over which the input will
        # be read. When it starts to run the pipeline it first runs
        # this code, determines that the input will be read out of
        # bounds, and refuses to continue. No actual bounds checks
        # occur in the inner loop that would be slow.
        #
        # So what do we do? There are a few options. If we realize
        # over a domain shifted inwards by one pixel, we won't be
        # asking the Halide routine to read out of bounds. We saw how
        # to do this in the previous lesson:
        result = hl.Buffer(hl.UInt(8),
                           [input.width() - 2, input.height() - 2, 3])
        result.set_min([1, 1])
        output.realize(result)

        # Save the result. It should look like a slightly blurry
        # parrot, and it should be two pixels narrower and two pixels
        # shorter than the input image.
        imsave("blurry_parrot_1.png", result)
        print("Created blurry_parrot_1.png")

        # This is usually the fastest way to deal with boundaries:
        # don't write code that reads out of bounds :) The more
        # general solution is our next example.

    # The same pipeline, with a boundary condition on the input.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imread(image_path))
        assert input.type() == hl.UInt(8)

        # This time, we'll wrap the input in a hl.Func that prevents
        # reading out of bounds:
        clamped = hl.Func("clamped")

        # Define an expression that clamps x to lie within the the
        # range [0, input.width()-1].
        clamped_x = hl.clamp(x, 0, input.width() - 1)
        # Similarly hl.clamp y.
        clamped_y = hl.clamp(y, 0, input.height() - 1)
        # Load from input at the clamped coordinates. This means that
        # no matter how we evaluated the hl.Func 'clamped', we'll never
        # read out of bounds on the input. This is a hl.clamp-to-edge
        # style boundary condition, and is the simplest boundary
        # condition to express in Halide.
        clamped[x, y, c] = input[clamped_x, clamped_y, c]

        # Defining 'clamped' in that way can be done more concisely
        # using a helper function from the BoundaryConditions
        # namespace like so:
        #
        # clamped = hl.BoundaryConditions.repeat_edge(input)
        #
        # These are important to use for other boundary conditions,
        # because they are expressed in the way that Halide can best
        # understand and optimize.

        # Upgrade it to 16-bit, so we can do math without it
        # overflowing. This time we'll refer to our new hl.Func
        # 'clamped', instead of referring to the input image
        # directly.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), clamped[x, y, c])

        # The rest of the pipeline will be the same...

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] +
                           2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                           2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # This time it's safe to evaluate the output over the same
        # domain as the input, because we have a boundary condition.
        result = output.realize(input.width(), input.height(), 3)

        # Save the result. It should look like a slightly blurry
        # parrot, but this time it will be the same size as the
        # input.
        imsave("blurry_parrot_2.png", result)
        print("Created blurry_parrot_2.png")

    print("Success!")
    return 0
def get_interpolate(input, levels):
    """Build the multi-level alpha-weighted interpolation pipeline,
    schedule it (CPU or GPU depending on the environment target) and
    JIT-compile it.

    input: 4-channel (RGBA) image-like object; levels: pyramid depth.
    :return: halide.hl.Func
    """
    # THE ALGORITHM
    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
    downx = [hl.Func('downx%d' % l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]
    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    # Clamp-to-edge boundary condition on the base image.
    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0, input.width() - 1),
                             hl.clamp(y, 0, input.height() - 1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = downsampled[l - 1]
        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings to extend
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            # NOTE(review): '/' here relies on Halide integer division of
            # the width/height Exprs -- confirm input is an ImageParam,
            # otherwise this yields a Python float.
            w = input.width() / (1 << l)
            h = input.height() / (1 << l)
            prev = hl.lambda3D(x, y, c,
                               prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c])
        # Separable 1-2-1 downsampling: horizontal then vertical.
        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] +
                             2.0 * prev[x * 2, y, c] +
                             prev[x * 2 + 1, y, c]) * 0.25
        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] +
                                   2.0 * downx[l][x, y * 2, c] +
                                   downx[l][x, y * 2 + 1, c]) * 0.25

    # Interpolate back up, coarsest level first.
    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
    for l in reversed(range(levels - 1)):
        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] +
                                  interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] +
                                 upsampledx[l][x, (y + 1) / 2, c]) / 2.0
        interpolated[l][x, y, c] = downsampled[l][x, y, c] + \
            (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]

    # Undo the alpha pre-multiplication.
    normalize = hl.Func('normalize')
    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

    final = hl.Func('final')
    final[x, y, c] = normalize[x, y, c]
    print("Finished function setup.")

    # THE SCHEDULE
    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        sched = 4
    else:
        sched = 2

    if sched == 0:
        print("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()
        final.compute_root()
    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)
        final.compute_root()
    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4) \
               .reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            downsampled[l].compute_root().parallel(y) \
                          .reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y) \
                           .reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2)
        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())
    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                yo, yi = hl.Var('yo'), hl.Var('yi')
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()
        final.compute_root()
    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        # BUG FIX: the original unpacked four hl.Var's into five names
        # (hl.Var('xi') was missing), raising ValueError on this path.
        yo, yi, xo, xi, ci = (hl.Var('yo'), hl.Var('yi'), hl.Var('xo'),
                              hl.Var('xi'), hl.Var("ci"))
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        final.tile(x, y, xo, yo, xi, yi,
                   input.width() / 4, input.height() / 4)
        normalize.compute_at(final, xo).reorder(c, x, y) \
                 .gpu_tile(x, y, xi, yi, 16, 16, GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            # Shrink the tile as the pyramid level shrinks, clamped to [1, 16].
            tile_size = min(max(32 >> l, 1), 16)
            downsampled[l].compute_root().gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
            interpolated[l].compute_at(final, xo).gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
def test_schedules(verbose=False, test_random=False):
    """Enumerate deterministic and random schedules for a tiny two-Func
    pipeline and assert that the expected schedule fragments appear.

    FIX: converted Python 2 `print` statements to `print()` calls so the
    block is valid Python 3, matching the rest of this file. All other
    tokens are unchanged (the old halide API calls are kept as-is).
    """
    #random_module.seed(int(sys.argv[1]) if len(sys.argv)>1 else 0)
    halide.exit_on_signal()
    f = halide.Func('f')
    x = halide.Var('x')
    y = halide.Var('y')
    c = halide.Var('c')
    g = halide.Func('g')
    v = halide.Var('v')
    input = halide.UniformImage(halide.UInt(16), 3)
    int_t = halide.Int(32)

    # Clamp-to-edge (and clamp-to-channel) boundary condition.
    f[x, y, c] = input[
        halide.clamp(x, halide.cast(int_t, 0), halide.cast(int_t, input.width() - 1)),
        halide.clamp(y, halide.cast(int_t, 0), halide.cast(int_t, input.height() - 1)),
        halide.clamp(c, halide.cast(int_t, 0), halide.cast(int_t, 2))]
    #g[v] = f[v,v]
    g[x, y, c] = f[x, y, c] + 1
    assert sorted(halide.all_vars(g).keys()) == sorted(['x', 'y', 'c'])  #, 'v'])

    if verbose:
        print(halide.func_varlist(f))
        print('caller_vars(f) =', caller_vars(g, f))
        print('caller_vars(g) =', caller_vars(g, g))

    # validL = list(valid_schedules(g, f, 4))
    # validL = [repr(_x) for _x in validL]
    #
    # for L in sorted(validL):
    #     print repr(L)

    T0 = time.time()

    if not test_random:
        random = True  #False
        nvalid_determ = 0
        for L in schedules_func(g, f, 0, 3):
            nvalid_determ += 1
            if verbose:
                print(L)
        nvalid_random = 0
        for i in range(100):
            for L in schedules_func(g, f, 0, DEFAULT_MAX_DEPTH, random=True):
                #sorted([repr(_x) for _x in valid_schedules(g, f, 3)]):
                if verbose and 0:
                    print(L)  #repr(L)
                nvalid_random += 1

    s = []
    for i in range(400):
        d = random_schedule(g, 0, DEFAULT_MAX_DEPTH)
        si = str(d)
        s.append(si)
        if verbose:
            print('Schedule:', si)
        d.apply()
        evaluate = d.test((36, 36, 3), input)
        print('evaluate')
        evaluate()
        if test_random:
            print('Success')
            sys.exit()

    T1 = time.time()

    s = '\n'.join(s)
    assert 'f.chunk(_c0)' in s
    assert 'f.root().vectorize' in s
    assert 'f.root().unroll' in s
    assert 'f.root().split' in s
    assert 'f.root().tile' in s
    assert 'f.root().parallel' in s
    assert 'f.root().transpose' in s
    assert nvalid_random == 100
    if verbose:
        print('generated in %.3f secs' % (T1 - T0))

    print('random_schedule: OK')
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build and schedule the local-Laplacian tone-mapping pipeline.

    input: 16-bit 3-channel image-like object; levels: number of remap
    levels; alpha/beta: remap-curve parameters; J: pyramid depth.
    Returns the scheduled 'local_laplacian' output Func (UInt(16)).
    NOTE(review): float_t, int_t and GPU_Default are assumed to be
    module-level definitions -- confirm.
    """
    # Counters give each generated down/upsample Func a unique name;
    # lists are used so the nested closures can mutate them.
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        # Separable [1 3 3 1]/8 downsampling of a 3D Func.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func('downy%d' % downsample_counter[0])
        downsample_counter[0] += 1
        downx[x, y, c] = (f[2*x-1, y, c] + 3.0*(f[2*x, y, c] + f[2*x+1, y, c]) + f[2*x+2, y, c])/8.0
        downy[x, y, c] = (downx[x, 2*y-1, c] + 3.0*(downx[x, 2*y, c] + downx[x, 2*y+1, c]) + downx[x, 2*y+2, c])/8.0
        return downy

    def upsample(f):
        # Linear-interpolation upsampling of a 3D Func.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func('upy%d' % upsample_counter[0])
        upsample_counter[0] += 1
        upx[x, y, c] = 0.25 * f[(x//2) - 1 + 2*(x%2), y, c] + 0.75 * f[x//2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y//2) - 1 + 2*(y%2), c] + 0.75 * upx[x, y//2, c]
        return upy

    def downsample2D(f):
        # Same [1 3 3 1]/8 downsampling, for 2D (single-channel) Funcs.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func('downy%d' % downsample_counter[0])
        downsample_counter[0] += 1
        downx[x, y] = (f[2*x-1, y] + 3.0*(f[2*x, y] + f[2*x+1, y]) + f[2*x+2, y])/8.0
        downy[x, y] = (downx[x, 2*y-1] + 3.0*(downx[x, 2*y] + downx[x, 2*y+1]) + downx[x, 2*y+2])/8.0
        return downy

    def upsample2D(f):
        # Linear-interpolation upsampling for 2D Funcs.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func('upy%d' % upsample_counter[0])
        upsample_counter[0] += 1
        upx[x, y] = 0.25 * f[(x//2) - 1 + 2*(x%2), y] + 0.75 * f[x//2, y]
        upy[x, y] = 0.25 * upx[x, (y//2) - 1 + 2*(y%2)] + 0.75 * upx[x, y//2]
        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x/256.0)
    #remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha*fx*hl.exp(-fx*fx/2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width()-1),
                                hl.clamp(y, 0, input.height()-1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = 0.299*clamped[x, y, 0] + 0.587*clamped[x, y, 1] + 0.114*clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entires per intensity level
    level = k / (levels - 1)
    idx = gray[x, y]*hl.cast(float_t, levels-1)*256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels-1)*256)
    gPyramid[0][x, y, k] = beta*(gray[x, y] - level) + level + remap[idx - 256*k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j-1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J-1] = gPyramid[J-1]
    for j in range(J-1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(gPyramid[j+1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j-1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y]*hl.cast(float_t, levels-1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels-2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (1.0-lf)*lPyramid[j][x, y, li] + lf*lPyramid[j][x, y, li+1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J-1] = outLPyramid[J-1]
    for j in range(J-1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(outGPyramid[j+1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16), hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print ("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")

        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            # Wide tiles for the fine levels, tiny tiles for the coarse ones.
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh, GPU_Default)
            if j > 0:
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh, GPU_Default)
    else:
        # CPU schedule
        print ("Compiling for CPU")

        output.parallel(y, 4).vectorize(x, 4);
        gray.compute_root().parallel(y, 4).vectorize(x, 4);
        # Vectorize the four finest levels; coarser levels only parallelize.
        for j in range(4):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
def main():
    """Tutorial walk-through of Halide update (reduction) definitions.

    Demonstrates pure vs. update steps, reduction domains (hl.RDom),
    scheduling of update steps, and how reductions interact with
    compute_at in producer/consumer pairs.  Each Halide result is
    checked against a pure-Python "equivalent C" reference.

    Returns 0 on success, -1 when a Halide/reference mismatch is
    reported via print (mismatches in raise-based checks abort instead).

    NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24.  It
    was merely an alias for the builtin int, so dtype=int below is
    behavior-identical and works on all NumPy versions.
    """
    # Declare some Vars to use below.
    x, y = hl.Var("x"), hl.Var("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/gray.png")
    input_data = imread(image_path)
    if True:
        # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x, y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:

        # x is used, so all uses of f must have x as the first argument.
        f[x, 17] = x + 8
        # y is used, so all uses of f must have y as the second argument.
        f[0, y] = y * 8
        f[x, x + 1] = x + 8
        f[y / 2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0] = f[x + 1, 0]  <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1] = y + 8    <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x] = y - x        <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4] = x + y        <- Free variables appear on the right-hand-side but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y    # Pure definition
        g[2, 1] = 42       # First update definition
        g[x, 0] = g[x, 1]  # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn.
        # The equivalent C is:
        result = np.empty((4, 4), dtype=int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy
        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section

    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:
        #   f[x, 0] = f[x, 0] * f[x, 0]
        #   f[x, 1] = f[x, 1] * f[x, 1]
        #   ...
        #   f[x, 49] = f[x, 49] * f[x, 49]
        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom([(0, 50)])
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent C is:
        c_result = np.empty((100, 100), dtype=int)
        for yy in range(100):
            for xx in range(100):
                c_result[yy][xx] = xx + yy
        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx]

        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                if halide_result[xx, yy] != c_result[yy][xx]:
                    raise Exception("halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], c_result[yy][xx]))

    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:
        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram: iterate
        # over the input image, updating histogram buckets.
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom([(0, input.width()), (0, input.height())])

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1
        halide_result = histogram.realize(256)

        # The equivalent C is:
        c_result = np.empty((256), dtype=int)
        for xx in range(256):
            c_result[xx] = 0
        for r_y in range(input.height()):
            for r_x in range(input.width()):
                c_result[input_data[r_x, r_y]] += 1

        # Check the answers agree:
        for xx in range(256):
            if c_result[xx] != halide_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" % (
                    xx, halide_result[xx], c_result[xx]))

    # Scheduling update steps
    if True:
        # The pure variables in an update step can be parallelized,
        # vectorized, split, etc as usual. Vectorizing, splitting, or
        # parallelizing the variables that are part of the reduction
        # domain is trickier (covered in a later lesson).

        # Consider the definition:
        # NOTE(review): the Func is named "x" in the original source;
        # presumably a typo for "f", kept as-is to preserve behavior.
        f = hl.Func("x")
        f[x, y] = x * y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. The following vectorizes and parallelizes the
        # pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func.update(int) to get a handle to an update step
        # for scheduling. This vectorizes the first update step across
        # x (it doesn't use y, so y can't be scheduled here).
        f.update(0).vectorize(x, 4)

        # Now parallelize the second update step in chunks of size 4 in y.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)
        halide_result = f.realize(16, 16)

        # Here's the equivalent (serial) C:
        c_result = np.empty((16, 16), dtype=int)
        # Pure step. Vectorized in x and parallelized in y.
        for yy in range(16):  # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
                c_result[yy][xx[0]] = xx[0] * yy
                c_result[yy][xx[1]] = xx[1] * yy
                c_result[yy][xx[2]] = xx[2] * yy
                c_result[yy][xx[3]] = xx[3] * yy
        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
            c_result[1][xx[0]] = c_result[0][xx[0]]
            c_result[1][xx[1]] = c_result[0][xx[1]]
            c_result[1][xx[2]] = c_result[0][xx[2]]
            c_result[1][xx[3]] = c_result[0][xx[3]]
        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4):  # Should be a parallel for loop
            for yi in range(4):
                yy = yo * 4 + yi
                c_result[yy][1] = c_result[yy][0] + 2

        # Check the C and Halide results match:
        for yy in range(16):
            for xx in range(16):
                if halide_result[xx, yy] != c_result[yy][xx]:
                    raise Exception("halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], c_result[yy][xx]))

    # A reduction as a producer, in a producer-consumer pair.
    if True:
        # Because an update does multiple passes over a stored array,
        # it's not meaningful to inline it. The default schedule
        # computes it in the innermost loop of its consumer. Consider
        # this trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x * 17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            c_result[xx] = 2 * producer_storage[0]

        # Check the results match
        for xx in range(10):
            if halide_result[xx] != c_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" % (
                    xx, halide_result[xx], c_result[xx]))

        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.

    # Now a reduction as a consumer in a producer-consumer pair.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x * 17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # Valid producer schedules: the default (inlined), or
            # 1) producer.compute_at(consumer, x) — inside the loop over
            #    x in the pure step of the consumer,
            # 2) producer.compute_root() — all ahead of time,
            # 3) producer.store_root().compute_at(consumer, x) —
            #    allocate outside the loop, fill in as needed inside.
            # Let's use option 1.
            producer.compute_at(consumer, x)
            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] = 2 * producer_storage[0]
            # Update step for the consumer
            for xx in range(10):
                c_result[xx] += 1
            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range(10):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                        xx, halide_result[xx], c_result[xx]))

        if True:
            # Case 2: The consumer references the producer in the update step only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Computing the producer per x coordinate of the consumer
            # places producer code inside the update step of the
            # consumer, because that's the only step that uses it.
            producer.compute_at(consumer, x)

            # Note we did NOT say producer.compute_at(consumer.update(0), x):
            # scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.
            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer
            for xx in range(10):
                c_result[xx] = xx
            # Update step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                        xx, halide_result[xx], c_result[xx]))

        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Computing the producer per x of the consumer places
            # producer code inside both the pure and the update step,
            # so there end up being two separate realizations of the
            # producer and redundant work occurs.
            producer.compute_at(consumer, x)
            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] = producer_storage[0] * xx
            # Update step for the consumer
            for xx in range(10):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                        xx, halide_result[xx], c_result[xx]))

        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x * y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x - 1]
            consumer[0, y] = producer[y, y - 1]

            # Neither producer.compute_at(consumer, x) nor
            # producer.compute_at(consumer, y) works here, because
            # either one fails to cover one of the uses of the
            # producer. We'd have to inline producer or use
            # producer.compute_root(). To get compute_at in the inner
            # loops of both update steps, make two wrappers around
            # producer and schedule those instead:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]
            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)
            halide_result = consumer_2.realize(10, 10)

            # The equivalent C is:
            c_result = np.empty((10, 10), dtype=int)
            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    c_result[yy][xx] = xx + yy
            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=int)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                c_result[0][xx] += producer_wrapper_1_storage[0]
            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=int)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                c_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            # (fixed: the message is now %-formatted; previously the
            # format args were passed to print() as extra arguments)
            for yy in range(10):
                for xx in range(10):
                    if halide_result[xx, yy] != c_result[yy][xx]:
                        print("halide_result(%d, %d) = %d instead of %d" % (
                            xx, yy, halide_result[xx, yy], c_result[yy][xx]))
                        return -1

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.
            # We are not restricted to scheduling producers at the
            # loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can schedule it there too.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            r = hl.RDom([(0, 5)])
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]
            producer.compute_at(consumer, r)
            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer.
            for xx in range(10):
                c_result[xx] = xx + 10
            # Update step for the consumer.
            for xx in range(10):
                # The loop over the reduction domain is always the inner loop.
                for rr in range(5):
                    # We've scheduled the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17
                    # Now use it in the update step of the consumer.
                    c_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                        xx, halide_result[xx], c_result[xx]))

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.BoundaryConditions.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom([(-2, 5), (-2, 5)])

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0  # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)
        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum' (it only has a pure definition, so its
        # default schedule is fully-inlined), and compute local_sum
        # per x coordinate of blurry (reductions default to
        # compute-innermost). Here's the equivalent C:
        local_sum = np.empty((1), dtype=np.int32)
        c_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()])
        for yy in range(input.height()):
            for xx in range(input.width()):  # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input[clamped_x, clamped_y]
                # Pure step of blurry
                c_result[xx, yy] = int(local_sum[0] / 25)  # hl.cast done internally

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result[xx, yy] != c_result[xx, yy]:
                    raise Exception("halide_result(%d, %d) = %d instead of %d" %
                                    (xx, yy, halide_result[xx, yy], c_result[xx, yy]))

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is "sum".
        f1 = hl.Func("f1")
        r = hl.RDom([(0, 100)])
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction.
        # It's equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=int)
        for xx in range(10):
            anon = np.empty((1), dtype=int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx
            c_result[xx] = anon[0] * 7

        # Check they all match.
        # (fixed: messages are now %-formatted; previously the format
        # args were passed to print() as extra arguments)
        for xx in range(10):
            if halide_result_1[xx] != c_result[xx]:
                print("halide_result_1(%d) = %d instead of %d" % (
                    xx, halide_result_1[xx], c_result[xx]))
                return -1
            if halide_result_2[xx] != c_result[xx]:
                print("halide_result_2(%d) = %d instead of %d" % (
                    xx, halide_result_2[xx], c_result[xx]))
                return -1

    # A complex example that uses reduction helpers.
    if False:  # non-sense to port SSE code to python, skipping this test
        # Other reduction helpers include "product", "minimum",
        # "maximum", "hl.argmin", and "argmax". Using hl.argmin and
        # argmax requires understanding tuples, which come in a later
        # lesson. Let's use minimum and maximum to compute the local
        # spread of our grayscale image.

        # First, add a boundary condition to the input.
        clamped = hl.Func("clamped")
        x_clamped = hl.clamp(x, 0, input.width() - 1)
        y_clamped = hl.clamp(y, 0, input.height() - 1)
        clamped[x, y] = input[x_clamped, y_clamped]

        box = hl.RDom([(-2, 5), (-2, 5)])
        # Compute the local maximum minus the local minimum:
        spread = hl.Func("spread")
        spread[x, y] = (maximum(clamped(x + box.x, y + box.y)) -
                        minimum(clamped(x + box.x, y + box.y)))

        # Compute the result in strips of 32 scanlines
        yo, yi = hl.Var("yo"), hl.Var("yi")
        spread.split(y, yo, yi, 32).parallel(yo)

        # Vectorize across x within the strips. This implicitly
        # vectorizes stuff that is computed within the loop over x in
        # spread, including our minimum and maximum helpers.
        spread.vectorize(x, 16)

        # Apply the boundary condition by padding each scanline as
        # needed in a circular buffer (see lesson 08).
        clamped.store_at(spread, yo).compute_at(spread, yi)
        halide_result = spread.realize(input.width(), input.height())

        # The hand-ported C/SSE2/OpenMP equivalent that used to live
        # here (a large commented-out block) has been condensed: it is
        # dead pseudo-code inside an `if False:` branch and the timing
        # comparison below keeps its structure intact.
        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())
        t1 = datetime.now()
        # Run this one hundred times so we can average the timing results.
        for iters in range(100):
            pass  # (the commented-out SSE2 inner loops were here)
        t2 = datetime.now()
        # Now run the Halide version again without the jit-compilation
        # overhead. Also run it one hundred times.
        for iters in range(100):
            spread.realize(halide_result)
        t3 = datetime.now()
        # Report the timings. Both versions use the same vectorization
        # and parallelization strategy, but the Halide is easier to
        # read, write, debug, modify, and port.
        print("Halide spread took %f ms. C equivalent took %f ms" % (
            (t3 - t2).total_seconds() * 1000,
            (t2 - t1).total_seconds() * 1000))

        # Check the results match:
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception("halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result(xx, yy), c_result(xx, yy)))
    else:
        print("(Skipped the SSE2 section of the code, "
              "since non-sense in python world.)")

    print("Success!")
    return 0
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build a Halide pipeline computing the bilateral grid of `input`.

    input   -- a float image Func/Buffer indexed as input[x, y];
               values are assumed to lie (after clamping) in [0, 1].
    r_sigma -- range (intensity) sigma; bins the [0, 1] value range.
    s_sigma -- spatial sigma; the grid cell size in pixels.

    Returns the final 'bilateral_grid' Func (unrealized), scheduled for
    GPU if the environment target has a GPU feature, otherwise for CPU.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid: each grid cell accumulates, over an
    # s_sigma x s_sigma window, the (value, count) pairs of the pixels
    # whose intensity falls in bin zi.
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    # NOTE: this rebinds the name `zi` from a Var to an Expr (the
    # intensity bin index for the update definition below).
    zi = hl.i32(val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # c == 0 accumulates values, c == 1 accumulates counts (homogeneous
    # coordinates, normalized at the end).
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    # These three assignments rebind `zi`, `xi`, `yi` from Vars to
    # Exprs: integer grid coordinates plus fractional weights.
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize: divide accumulated values (c=0) by accumulated counts (c=1).
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time, because generated code
        # is the same speed as C++ generated code.
        # NOTE(review): by this point `xi`, `yi`, `zi` have been
        # rebound to Exprs above, yet gpu_tile normally expects Vars
        # for its tile-index arguments — confirm this path works on a
        # GPU target before relying on it.
        print ("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def main():
    """Tutorial: a two-stage separable blur, first without and then
    with a boundary condition on the input.

    Reads ../../tutorial/images/rgb.png relative to this file, writes
    blurry_parrot_1.png (shrunken domain) and blurry_parrot_2.png
    (clamped boundary), and returns 0 on success.
    """
    # First we'll declare some Vars to use below.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/rgb.png")

    # Now we'll express a multi-stage pipeline that blurs an image
    # first horizontally, and then vertically.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imageio.imread(image_path))
        assert input.type() == hl.UInt(8)

        # Upgrade it to 16-bit, so we can do math without it overflowing.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), input[x, y, c])

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] +
                           2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                           2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # Each hl.Func in this pipeline calls a previous one using
        # familiar function call syntax (we've overloaded operator()
        # on hl.Func objects). A hl.Func may call any other hl.Func that
        # has been given a definition. This restriction prevents
        # pipelines with loops in them. Halide pipelines are always
        # feed-forward graphs of Funcs.

        # Now let's realize it...
        # result = output.realize(input.width(), input.height(), 3)

        # Except that the line above is not going to work. Uncomment
        # it to see what happens: realizing this pipeline over the
        # same domain as the input image requires reading pixels out
        # of bounds in the input, because blur_x reaches outwards
        # horizontally and blur_y reaches outwards vertically. Halide
        # detects this by injecting code at the top of the pipeline
        # that computes the region over which the input will be read;
        # when it determines the input would be read out of bounds it
        # refuses to continue. No actual bounds checks occur in the
        # inner loop that would be slow.
        #
        # So what do we do? If we realize over a domain shifted
        # inwards by one pixel, we won't be asking the Halide routine
        # to read out of bounds. We saw how to do this in the previous
        # lesson:
        result = hl.Buffer(
            hl.UInt(8),
            [input.width() - 2, input.height() - 2, 3])
        result.set_min([1, 1])
        output.realize(result)

        # Save the result. It should look like a slightly blurry
        # parrot, and it should be two pixels narrower and two pixels
        # shorter than the input image.
        imageio.imsave("blurry_parrot_1.png", result)
        print("Created blurry_parrot_1.png")

        # This is usually the fastest way to deal with boundaries:
        # don't write code that reads out of bounds :) The more
        # general solution is our next example.

    # The same pipeline, with a boundary condition on the input.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imageio.imread(image_path))
        assert input.type() == hl.UInt(8)

        # This time, we'll wrap the input in a hl.Func that prevents
        # reading out of bounds:
        clamped = hl.Func("clamped")

        # Define an expression that clamps x to lie within the
        # range [0, input.width()-1].
        clamped_x = hl.clamp(x, 0, input.width() - 1)
        # Similarly hl.clamp y.
        clamped_y = hl.clamp(y, 0, input.height() - 1)
        # Load from input at the clamped coordinates. This means that
        # no matter how we evaluate the hl.Func 'clamped', we'll never
        # read out of bounds on the input. This is a clamp-to-edge
        # style boundary condition, and is the simplest boundary
        # condition to express in Halide.
        clamped[x, y, c] = input[clamped_x, clamped_y, c]

        # Defining 'clamped' in that way can be done more concisely
        # using a helper from the BoundaryConditions namespace:
        #
        #   clamped = hl.BoundaryConditions.repeat_edge(input)
        #
        # These are important to use for other boundary conditions,
        # because they are expressed in the way that Halide can best
        # understand and optimize.

        # Upgrade it to 16-bit, so we can do math without it
        # overflowing. This time we'll refer to our new hl.Func
        # 'clamped', instead of referring to the input image directly.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), clamped[x, y, c])

        # The rest of the pipeline will be the same...

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] +
                           2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                           2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # This time it's safe to evaluate the output over the same
        # domain as the input, because we have a boundary condition.
        result = output.realize(input.width(), input.height(), 3)

        # Save the result. It should look like a slightly blurry
        # parrot, but this time it will be the same size as the input.
        imageio.imsave("blurry_parrot_2.png", result)
        print("Created blurry_parrot_2.png")

    print("Success!")
    return 0
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build the bilateral-grid pipeline and return the scheduled output Func.

    Args:
        input: 2D Halide input (wrapped with repeat_edge below); values are
            treated as floats in [0, 1] after clamping.
        r_sigma: range-domain bin width (grid z resolution is 1/r_sigma).
        s_sigma: spatial downsampling factor of the grid (also the box size
            accumulated per grid cell).

    Returns:
        hl.Func 'bilateral_grid' mapping (x, y) to the filtered value,
        scheduled for GPU or CPU depending on the environment target.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid: accumulate an s_sigma x s_sigma window
    # of samples into each grid cell.
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2,
                  y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    # NOTE(review): 'zi' is rebound here from a Var to an Expr (the range
    # bin index); the Var created above is shadowed from this point on.
    zi = hl.i32(val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # c == 0 accumulates the value, c == 1 accumulates the count (homogeneous
    # coordinates, divided out in the normalize step).
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z - 2, c] + histogram[x, y, z - 1, c] * 4 + histogram[x, y, z, c] * 6 + histogram[x, y, z + 1, c] * 4 + histogram[x, y, z + 2, c]
    blurx[x, y, z, c] = blurz[x - 2, y, z, c] + blurz[x - 1, y, z, c] * 4 + blurz[x, y, z, c] * 6 + blurz[x + 1, y, z, c] * 4 + blurz[x + 2, y, z, c]
    blury[x, y, z, c] = blurx[x, y - 2, z, c] + blurx[x, y - 1, z, c] * 4 + blurx[x, y, z, c] * 6 + blurx[x, y + 1, z, c] * 4 + blurx[x, y + 2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    # NOTE(review): 'xi'/'yi' are rebound from Vars to Exprs here; the
    # gpu_tile calls below therefore receive Exprs, not fresh Vars — confirm
    # this matches the intended Halide API usage.
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi + 1, yi, zi, c], xf),
                hl.lerp(blury[xi, yi + 1, zi, c], blury[xi + 1, yi + 1, zi, c], xf), yf),
        hl.lerp(hl.lerp(blury[xi, yi, zi + 1, c], blury[xi + 1, yi, zi + 1, c], xf),
                hl.lerp(blury[xi, yi + 1, zi + 1, c], blury[xi + 1, yi + 1, zi + 1, c], xf), yf),
        zf)

    # Normalize: divide the accumulated values by the accumulated counts.
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def gen_g(self):
    ''' define g() function

    Builds the Halide Func chain for g(i, j, k, l) from the clamped inputs
    stored on self (positions x/y/z, exponents, interpolation table fm,
    normalization rnorm) and the scalar inputs delo2/delta/rdelta.
    All intermediate Funcs are registered via self.add_funcs_by_name.
    Returns None; the result is the side effect of defining 'g' (and,
    when tracing, registering the g_trace input).
    '''
    # vars
    i, j, k, l = [self.vars[c] for c in "ijkl"]
    # clamped inputs
    x, y, z, expnt, fm, rnorm = [
        self.clamps[c] for c in ["x", "y", "z", "expnt", "fm", "rnorm"]
    ]
    # unclamped input (for sizing)
    fm_in = self.inputs["fm_in"]
    # scalar inputs
    delo2, delta, rdelta = [
        self.inputs[c] for c in ["delo2", "delta", "rdelta"]
    ]

    # Pairwise geometry: displacement, squared distance, and exponent sums.
    dx = hl.Func("dx")
    dy = hl.Func("dy")
    dz = hl.Func("dz")
    r2 = hl.Func("g_r2")
    expnt2 = hl.Func("expnt2")
    expnt_inv = hl.Func("expnt_inv")
    self.add_funcs_by_name([dx, dy, dz, r2, expnt2, expnt_inv])

    dx[i, j] = x[i] - x[j]
    dy[i, j] = y[i] - y[j]
    dz[i, j] = z[i] - z[j]
    r2[i, j] = dx[i, j] * dx[i, j] + dy[i, j] * dy[i, j] + dz[i, j] * dz[i, j]
    expnt2[i, j] = expnt[i] + expnt[j]
    expnt_inv[i, j] = hl.f64(1.0) / expnt2[i, j]

    # Exponential factor and denominators of the four-index term.
    fac2 = hl.Func("fac2")
    ex_arg = hl.Func("ex_arg")
    ex = hl.Func("ex")
    denom = hl.Func("denom")
    fac4d = hl.Func("fac4d")
    self.add_funcs_by_name([fac2, ex_arg, ex, denom, fac4d])
    fac2[i, j] = expnt[i] * expnt[j] * expnt_inv[i, j]
    ex_arg[i, j, k, l] = -fac2[i, j] * r2[i, j] - fac2[k, l] * r2[k, l]
    # Flush exp() to zero below -37 to avoid computing a denormal/underflow.
    ex[i, j, k, l] = hl.select(ex_arg[i, j, k, l] < hl.f64(-37.0),
                               hl.f64(0.0),
                               hl.exp(ex_arg[i, j, k, l]))
    denom[i, j, k, l] = expnt2[i, j] * expnt2[k, l] * hl.sqrt(expnt2[i, j] + expnt2[k, l])
    fac4d[i, j, k, l] = expnt2[i, j] * expnt2[k, l] / (expnt2[i, j] + expnt2[k, l])

    # Exponent-weighted pair centers and the squared distance between them.
    x2 = hl.Func("g_x2")
    y2 = hl.Func("g_y2")
    z2 = hl.Func("g_z2")
    rpq2 = hl.Func("rpq2")
    self.add_funcs_by_name([x2, y2, z2, rpq2])
    x2[i, j] = (x[i] * expnt[i] + x[j] * expnt[j]) * expnt_inv[i, j]
    y2[i, j] = (y[i] * expnt[i] + y[j] * expnt[j]) * expnt_inv[i, j]
    z2[i, j] = (z[i] * expnt[i] + z[j] * expnt[j]) * expnt_inv[i, j]
    rpq2[i, j, k, l] = ((x2[i, j] - x2[k, l]) * (x2[i, j] - x2[k, l]) +
                        (y2[i, j] - y2[k, l]) * (y2[i, j] - y2[k, l]) +
                        (z2[i, j] - z2[k, l]) * (z2[i, j] - z2[k, l]))

    # f0 evaluation: table lookup into fm with a 4-term Taylor correction for
    # small arguments, asymptotic sqrt(pi)/2 / sqrt(t) form for t >= 28.
    f0t = hl.Func("f0t")
    f0n = hl.Func("f0n")
    f0x = hl.Func("f0x")
    f0val = hl.Func("f0val")
    self.add_funcs_by_name([f0t, f0n, f0x, f0val])
    f0t[i, j, k, l] = fac4d[i, j, k, l] * rpq2[i, j, k, l]
    # Clamp the table index to the valid extent of the (unclamped) fm input.
    f0n[i, j, k, l] = hl.clamp(hl.i32((f0t[i, j, k, l] + delo2) * rdelta),
                               fm_in.dim(0).min(), fm_in.dim(0).max())
    f0x[i, j, k, l] = delta * f0n[i, j, k, l] - f0t[i, j, k, l]
    f0val[i, j, k, l] = hl.select(
        f0t[i, j, k, l] >= hl.f64(28.0),
        hl.f64(0.88622692545276) / hl.sqrt(f0t[i, j, k, l]),
        fm[f0n[i, j, k, l], 0] + f0x[i, j, k, l] *
        (fm[f0n[i, j, k, l], 1] + f0x[i, j, k, l] * hl.f64(0.5) *
         (fm[f0n[i, j, k, l], 2] + f0x[i, j, k, l] * hl.f64(1. / 3.) *
          (fm[f0n[i, j, k, l], 3] + f0x[i, j, k, l] * hl.f64(0.25) * fm[f0n[i, j, k, l], 4]))))

    g = hl.Func("g")
    self.add_funcs_by_name([g])
    if self.tracing and self.tracing_g:
        # Tracing mode: add a zero-padded external trace input to g so the
        # values can be injected/observed from outside the pipeline.
        g_trace_in = hl.ImageParam(hl.Float(64), 4, "g_trace_in")
        g_trace = hl.BoundaryConditions.constant_exterior(g_trace_in, 0)
        self.inputs["g_trace_in"] = g_trace_in
        self.clamps["g_trace"] = g_trace
        g_trace.compute_root()
        g[i, j, k, l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i, j, k, l]
                         ) * ex[i, j, k, l] * f0val[i, j, k, l] * rnorm[i] * rnorm[
                             j] * rnorm[k] * rnorm[l] + g_trace[i, j, k, l]
    else:
        g_trace = None
        g[i, j, k, l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) /
                         denom[i, j, k, l]) * ex[i, j, k, l] * f0val[
                             i, j, k, l] * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l]
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build the local-Laplacian tone-manipulation pipeline.

    Args:
        input: 16-bit 3-channel Halide input image.
        levels: number of discrete intensity levels in the remap LUT.
        alpha: strength of the detail remapping.
        beta: tone (contrast) interpolation factor.
        J: pyramid depth (default 8).

    Returns:
        hl.Func 'local_laplacian' producing the 16-bit output, scheduled
        for GPU or CPU depending on the environment target.

    NOTE(review): relies on module-level names float_t, int_t and (on the
    GPU path) GPU_Default defined elsewhere in this file.
    """
    # Mutable counters shared by the closures below so each generated
    # down/upsample Func gets a unique name.
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        # 3-channel 2x downsample with a [1 3 3 1]/8 kernel, x then y.
        # NOTE(review): closes over 'c', which is only defined further down;
        # this works because downsample() is first called after that point.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1
        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0
        return downy

    def upsample(f):
        # 3-channel 2x linear upsample, x then y.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1
        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2), c] + 0.75 * upx[x, y // 2, c]
        return upy

    def downsample2D(f):
        # Single-channel variant of downsample().
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1
        downx[x, y] = (f[2 * x - 1, y] + 3.0 * (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 * (downx[x, 2 * y] + downx[x, 2 * y + 1]) + downx[x, 2 * y + 2]) / 8.0
        return downy

    def upsample2D(f):
        # Single-channel variant of upsample().
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1
        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]
        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    #remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1),
                                hl.clamp(y, 0, input.height() - 1), c]

    # Get the luminance channel (Rec.601 weights)
    gray = hl.Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entires per intensity level
    level = k / (levels - 1)
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y,
                k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")

        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                      blockh, GPU_Default)
            if j > 0:
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                   blockh, GPU_Default)
    else:
        # CPU schedule
        print("Compiling for CPU")

        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        # Finer levels get vectorization; coarser levels (small extents)
        # just get parallelism.
        for j in range(4):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
def clamp(self, min_p, max_p):
    """Return a new Point whose coordinates are limited component-wise
    to the box [min_p, max_p]."""
    bounded_x = hl.clamp(self.x, min_p.x, max_p.x)
    bounded_y = hl.clamp(self.y, min_p.y, max_p.y)
    return Point(bounded_x, bounded_y)
def test_schedules(verbose=False, test_random=False):
    """Exercise the schedule enumerator and random-schedule sampler.

    Args:
        verbose: print each enumerated/sampled schedule.
        test_random: when True, apply and evaluate 400 random schedules,
            print 'Success' and exit the process instead of asserting.

    Raises:
        AssertionError: if the enumerated schedules miss an expected form
            or the random enumeration count is off.

    Fix over previous version: the original used Python 2 `print` statements
    (e.g. `print 'Success'`), which are syntax errors under Python 3 — the
    dialect the rest of this file uses. All prints are now function calls.
    """
    #random_module.seed(int(sys.argv[1]) if len(sys.argv)>1 else 0)
    halide.exit_on_signal()
    f = halide.Func('f')
    x = halide.Var('x')
    y = halide.Var('y')
    c = halide.Var('c')
    g = halide.Func('g')
    v = halide.Var('v')
    input = halide.UniformImage(halide.UInt(16), 3)
    int_t = halide.Int(32)

    # f clamps every coordinate into the valid input range.
    f[x, y, c] = input[halide.clamp(x, halide.cast(int_t, 0),
                                    halide.cast(int_t, input.width() - 1)),
                       halide.clamp(y, halide.cast(int_t, 0),
                                    halide.cast(int_t, input.height() - 1)),
                       halide.clamp(c, halide.cast(int_t, 0),
                                    halide.cast(int_t, 2))]
    #g[v] = f[v,v]
    g[x, y, c] = f[x, y, c] + 1

    assert sorted(halide.all_vars(g).keys()) == sorted(['x', 'y', 'c'])  #, 'v'])

    if verbose:
        print(halide.func_varlist(f))
        print('caller_vars(f) =', caller_vars(g, f))
        print('caller_vars(g) =', caller_vars(g, g))

    T0 = time.time()
    if not test_random:
        # Deterministic enumeration up to depth 3.
        nvalid_determ = 0
        for L in schedules_func(g, f, 0, 3):
            nvalid_determ += 1
            if verbose:
                print(L)
        # Random enumeration: 100 draws at the default depth.
        nvalid_random = 0
        for i in range(100):
            for L in schedules_func(g, f, 0, DEFAULT_MAX_DEPTH, random=True):
                if verbose and 0:
                    print(L)  # repr(L)
                nvalid_random += 1
    # Sample, apply, and evaluate 400 random schedules.
    s = []
    for i in range(400):
        d = random_schedule(g, 0, DEFAULT_MAX_DEPTH)
        si = str(d)
        s.append(si)
        if verbose:
            print('Schedule:', si)
        d.apply()
        evaluate = d.test((36, 36, 3), input)
        print('evaluate')
        evaluate()
    if test_random:
        print('Success')
        sys.exit()
    T1 = time.time()

    # The 400 sampled schedules should collectively cover every transform.
    s = '\n'.join(s)
    assert 'f.chunk(_c0)' in s
    assert 'f.root().vectorize' in s
    assert 'f.root().unroll' in s
    assert 'f.root().split' in s
    assert 'f.root().tile' in s
    assert 'f.root().parallel' in s
    assert 'f.root().transpose' in s
    assert nvalid_random == 100
    if verbose:
        print('generated in %.3f secs' % (T1 - T0))
    print('random_schedule: OK')
def get_interpolate(input, levels):
    """ Build function, schedules it, and invokes jit compiler

    Builds a pyramid-based alpha-weighted interpolation: premultiplies by
    alpha, builds `levels` downsampled levels, then upsamples back filling
    in low-alpha regions from coarser levels, and finally un-premultiplies.

    :param input: 4-channel (RGBA) Halide input; channel 3 is alpha.
    :param levels: pyramid depth.
    :return: halide.hl.Func

    Fix over previous version: the GPU schedule unpacked five Vars from a
    four-element tuple (`hl.Var('xi')` was missing), which raised
    ValueError at runtime on any GPU target. The missing Var is restored.
    """
    # THE ALGORITHM
    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
    downx = [hl.Func('downx%d' % l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
    # level_widths = [hl.Param(int_t,'level_widths%d'%i) for i in range(levels)]
    # level_heights = [hl.Param(int_t,'level_heights%d'%i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]
    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0, input.width() - 1),
                             hl.clamp(y, 0, input.height() - 1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = downsampled[l - 1]

        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings to extend
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            # NOTE(review): under Python 3, `/` here yields a float if
            # input.width() is a plain int (Buffer input) — confirm the
            # intended integer semantics (`//`) against the C++ original.
            w = input.width() / (1 << l)
            h = input.height() / (1 << l)
            prev = hl.lambda3D(x, y, c,
                               prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c])

        # [1 2 1]/4 separable downsampling filter.
        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] +
                             2.0 * prev[x * 2, y, c] +
                             prev[x * 2 + 1, y, c]) * 0.25
        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] +
                                   2.0 * downx[l][x, y * 2, c] +
                                   downx[l][x, y * 2 + 1, c]) * 0.25

    # Upsample back, blending in coarser levels where alpha is low.
    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
    for l in range(levels - 1)[::-1]:
        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] +
                                  interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] +
                                 upsampledx[l][x, (y + 1) / 2, c]) / 2.0
        interpolated[l][x, y, c] = downsampled[l][
            x, y, c] + (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]

    # Un-premultiply by the accumulated alpha.
    normalize = hl.Func('normalize')
    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

    final = hl.Func('final')
    final[x, y, c] = normalize[x, y, c]

    print("Finished function setup.")

    # THE SCHEDULE
    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        sched = 4
    else:
        sched = 2

    if sched == 0:
        print("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()
        final.compute_root()
    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)
        final.compute_root()
    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(
            c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            if l > 0:
                downsampled[l].compute_root().parallel(y).reorder(
                    c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2)
        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())
    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()
        final.compute_root()
    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        #
        # FIX: the original tuple was missing hl.Var('xi'), so this
        # five-target unpack raised ValueError on every GPU run.
        yo, yi, xo, xi, ci = (hl.Var('yo'), hl.Var('yi'), hl.Var('xo'),
                              hl.Var('xi'), hl.Var('ci'))
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        # NOTE(review): `/ 4` yields a float tile size under Python 3 if
        # input.width() is a plain int — confirm `// 4` was intended.
        final.tile(x, y, xo, yo, xi, yi,
                   input.width() / 4, input.height() / 4)
        normalize.compute_at(final, xo).reorder(c, x, y).gpu_tile(
            x, y, xi, yi, 16, 16, GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            tile_size = 32 >> l
            if tile_size < 1:
                tile_size = 1
            if tile_size > 16:
                tile_size = 16
            downsampled[l].compute_root().gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
            interpolated[l].compute_at(final, xo).gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
def main(): # Declare some Vars to use below. x, y = hl.Var("x"), hl.Var("y") # Load a grayscale image to use as an input. image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png") input_data = imread(image_path) if True: # making the image smaller to go faster input_data = input_data[:160, :150] assert input_data.dtype == np.uint8 input = hl.Buffer(input_data) # You can define a hl.Func in multiple passes. Let's see a toy # example first. if True: # The first definition must be one like we have seen already # - a mapping from Vars to an hl.Expr: f = hl.Func("f") f[x, y] = x + y # We call this first definition the "pure" definition. # But the later definitions can include computed expressions on # both sides. The simplest example is modifying a single point: f[3, 7] = 42 # We call these extra definitions "update" definitions, or # "reduction" definitions. A reduction definition is an # update definition that recursively refers back to the # function's current value at the same site: if False: e = f[x, y] + 17 print("f[x, y] + 17", e) print("(f[x, y] + 17).type()", e.type()) print("(f[x, y]).type()", f[x, y].type()) f[x, y] = f[x, y] + 17 # If we confine our update to a single row, we can # recursively refer to values in the same column: f[x, 3] = f[x, 0] * f[x, 10] # Similarly, if we confine our update to a single column, we # can recursively refer to other values in the same row. f[0, y] = f[0, y] / f[3, y] # The general rule is: Each hl.Var used in an update definition # must appear unadorned in the same position as in the pure # definition in all references to the function on the left- # and right-hand sides. So the following definitions are # legal updates: f[x, 17] = x + 8 # x is used, so all uses of f must have x as the first argument. f[0, y] = y * 8 # y is used, so all uses of f must have y as the second argument. 
f[x, x + 1] = x + 8 f[y / 2, y] = f[0, y] * 17 # But these ones would cause an error: # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'. # f[y, y + 1) = y + 8 <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'. # f[y, x) = y - x <- Arguments to f on the left-hand-side are in the wrong places. # f[3, 4) = x + y <- Free variables appear on the right-hand-side but not the left-hand-side. # We'll realize this one just to make sure it compiles. The # second-to-last definition forces us to realize over a # domain that is taller than it is wide. f.realize(100, 101) # For each realization of f, each step runs in its entirety # before the next one begins. Let's trace the loads and # stores for a simpler example: g = hl.Func("g") g[x, y] = x + y # Pure definition g[2, 1] = 42 # First update definition g[x, 0] = g[x, 1] # Second update definition g.trace_loads() g.trace_stores() g.realize(4, 4) # Reading the log, we see that each pass is applied in turn. The equivalent C is: result = np.empty((4, 4), dtype=np.int) # Pure definition for yy in range(4): for xx in range(4): result[yy][xx] = xx + yy # First update definition result[1][2] = 42 # Second update definition for xx in range(4): result[0][xx] = result[1][xx] # end of section # Putting update passes inside loops. if True: # Starting with this pure definition: f = hl.Func("f") f[x, y] = x + y # Say we want an update that squares the first fifty rows. We # could do this by adding 50 update definitions: # f[x, 0) = f[x, 0) * f[x, 0) # f[x, 1) = f[x, 1) * f[x, 1) # f[x, 2) = f[x, 2) * f[x, 2) # ... # f[x, 49) = f[x, 49) * f[x, 49) # Or equivalently using a compile-time loop in our C++: # for (int i = 0 i < 50 i++) { # f[x, i) = f[x, i) * f[x, i) # # But it's more manageable and more flexible to put the loop # in the generated code. 
We do this by defining a "reduction # domain" and using it inside an update definition: r = hl.RDom(0, 50) f[x, r] = f[x, r] * f[x, r] halide_result = f.realize(100, 100) # The equivalent C is: c_result = np.empty((100, 100), dtype=np.int) for yy in range(100): for xx in range(100): c_result[yy][xx] = xx + yy for xx in range(100): for rr in range(50): # The loop over the reduction domain occurs inside of # the loop over any pure variables used in the update # step: c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx] # Check the results match: for yy in range(100): for xx in range(100): if halide_result(xx, yy) != c_result[yy][xx]: raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result[yy][xx])) return -1 # Now we'll examine a real-world use for an update definition: # computing a histogram. if True: # Some operations on images can't be cleanly expressed as a pure # function from the output coordinates to the value stored # there. The classic example is computing a histogram. The # natural way to do it is to iterate over the input image, # updating histogram buckets. Here's how you do that in Halide: histogram = hl.Func("histogram") # Histogram buckets start as zero. histogram[x] = 0 # Define a multi-dimensional reduction domain over the input image: r = hl.RDom(0, input.width(), 0, input.height()) # For every point in the reduction domain, increment the # histogram bucket corresponding to the intensity of the # input image at that point. 
histogram[input[r.x, r.y]] += 1 halide_result = histogram.realize(256) # The equivalent C is: c_result = np.empty((256), dtype=np.int) for xx in range(256): c_result[xx] = 0 for r_y in range(input.height()): for r_x in range(input.width()): c_result[input_data[r_x, r_y]] += 1 # Check the answers agree: for xx in range(256): if c_result[xx] != halide_result(xx): raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 # Scheduling update steps if True: # The pure variables in an update step and can be # parallelized, vectorized, split, etc as usual. # Vectorizing, splitting, or parallelize the variables that # are part of the reduction domain is trickier. We'll cover # that in a later lesson. # Consider the definition: f = hl.Func("x") f[x, y] = x * y # Set the second row to equal the first row. f[x, 1] = f[x, 0] # Set the second column to equal the first column plus 2. f[1, y] = f[0, y] + 2 # The pure variables in each stage can be scheduled # independently. To control the pure definition, we schedule # as we have done in the past. The following code vectorizes # and parallelizes the pure definition only. f.vectorize(x, 4).parallel(y) # We use hl.Func::update(int) to get a handle to an update step # for the purposes of scheduling. The following line # vectorizes the first update step across x. We can't do # anything with y for this update step, because it doesn't # use y. f.update(0).vectorize(x, 4) # Now we parallelize the second update step in chunks of size # 4. yo, yi = hl.Var("yo"), hl.Var("yi") f.update(1).split(y, yo, yi, 4).parallel(yo) halide_result = f.realize(16, 16) # Here's the equivalent (serial) C: c_result = np.empty((16, 16), dtype=np.int) # Pure step. Vectorized in x and parallelized in y. 
for yy in range(16): # Should be a parallel for loop for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] c_result[yy][xx[0]] = xx[0] * yy c_result[yy][xx[1]] = xx[1] * yy c_result[yy][xx[2]] = xx[2] * yy c_result[yy][xx[3]] = xx[3] * yy # First update. Vectorized in x. for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] c_result[1][xx[0]] = c_result[0][xx[0]] c_result[1][xx[1]] = c_result[0][xx[1]] c_result[1][xx[2]] = c_result[0][xx[2]] c_result[1][xx[3]] = c_result[0][xx[3]] # Second update. Parallelized in chunks of size 4 in y. for yo in range(4): # Should be a parallel for loop for yi in range(4): yy = yo * 4 + yi c_result[yy][1] = c_result[yy][0] + 2 # Check the C and Halide results match: for yy in range(16): for xx in range(16): if halide_result(xx, yy) != c_result[yy][xx]: raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result[yy][xx])) return -1 # That covers how to schedule the variables within a hl.Func that # uses update steps, but what about producer-consumer # relationships that involve compute_at and store_at? Let's # examine a reduction as a producer, in a producer-consumer pair. if True: # Because an update does multiple passes over a stored array, # it's not meaningful to inline them. So the default schedule # for them does the closest thing possible. It computes them # in the innermost loop of their consumer. 
Consider this # trivial example: producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 producer[x] += 1 consumer[x] = 2 * producer[x] halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) for xx in range(10): producer_storage = np.empty((1), dtype=np.int) # Pure step for producer producer_storage[0] = xx * 17 # Update step for producer producer_storage[0] = producer_storage[0] + 1 # Pure step for consumer c_result[xx] = 2 * producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 # For all other compute_at/store_at options, the reduction # gets placed where you would expect, somewhere in the loop # nest of the consumer. # Now let's consider a reduction as a consumer in a # producer-consumer pair. This is a little more involved. if True: if True: # Case 1: The consumer references the producer in the pure step only. producer, consumer = hl.Func("producer"), hl.Func("consumer") # The producer is pure. producer[x] = x * 17 consumer[x] = 2 * producer[x] consumer[x] += 1 # The valid schedules for the producer in this case are # the default schedule - inlined, and also: # # 1) producer.compute_at(x), which places the computation of # the producer inside the loop over x in the pure step of the # consumer. # # 2) producer.compute_root(), which computes all of the # producer ahead of time. # # 3) producer.store_root().compute_at(x), which allocates # space for the consumer outside the loop over x, but fills # it in as needed inside the loop. # # Let's use option 1. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] = 2 * producer_storage[0] # Update step for the consumer for xx in range(10): c_result[xx] += 1 # All of the pure step is evaluated before any of the # update step, so there are two separate loops over x. # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 if True: # Case 2: The consumer references the producer in the update step only producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside the update # step of the producer, because that's the only step that # uses the producer. producer.compute_at(consumer, x) # Note however, that we didn't say: # # producer.compute_at(consumer.update(0), x). # # Scheduling is done with respect to Vars of a hl.Func, and # the Vars of a hl.Func are shared across the pure and # update steps. 
halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): c_result[xx] = xx # Update step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] += producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 if True: # Case 3: The consumer references the producer in # multiple steps that share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = producer[x] * x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside both the # pure and the update step of the producer. So there ends # up being two separate realizations of the producer, and # redundant work occurs. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] = producer_storage[0] * xx # Update step for the consumer for xx in range(10): # Another copy of the pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 c_result[xx] += producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 if True: # Case 4: The consumer references the producer in # multiple steps that do not share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x, y] = x * y consumer[x, y] = x + y consumer[x, 0] = producer[x, x - 1] consumer[0, y] = producer[y, y - 1] # In this case neither producer.compute_at(consumer, x) # nor producer.compute_at(consumer, y) will work, because # either one fails to cover one of the uses of the # producer. So we'd have to inline producer, or use # producer.compute_root(). # Let's say we really really want producer to be # compute_at the inner loops of both consumer update # steps. Halide doesn't allow multiple different # schedules for a single hl.Func, but we can work around it # by making two wrappers around producer, and scheduling # those instead: # Attempt 2: producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func( ), hl.Func(), hl.Func() producer_wrapper_1[x, y] = producer[x, y] producer_wrapper_2[x, y] = producer[x, y] consumer_2[x, y] = x + y consumer_2[x, 0] += producer_wrapper_1[x, x - 1] consumer_2[0, y] += producer_wrapper_2[y, y - 1] # The wrapper functions give us two separate handles on # the producer, so we can schedule them differently. 
producer_wrapper_1.compute_at(consumer_2, x) producer_wrapper_2.compute_at(consumer_2, y) halide_result = consumer_2.realize(10, 10) # The equivalent C is: c_result = np.empty((10, 10), dtype=np.int) # Pure step for the consumer for yy in range(10): for xx in range(10): c_result[yy][xx] = xx + yy # First update step for consumer for xx in range(10): producer_wrapper_1_storage = np.empty((1), dtype=np.int) producer_wrapper_1_storage[0] = xx * (xx - 1) c_result[0][xx] += producer_wrapper_1_storage[0] # Second update step for consumer for yy in range(10): producer_wrapper_2_storage = np.empty((1), dtype=np.int) producer_wrapper_2_storage[0] = yy * (yy - 1) c_result[yy][0] += producer_wrapper_2_storage[0] # Check the results match for yy in range(10): for xx in range(10): if halide_result(xx, yy) != c_result[yy][xx]: print("halide_result(%d, %d) = %d instead of %d", xx, yy, halide_result(xx, yy), c_result[yy][xx]) return -1 if True: # Case 5: Scheduling a producer under a reduction domain # variable of the consumer. # We are not just restricted to scheduling producers at # the loops over the pure variables of the consumer. If a # producer is only used within a loop over a reduction # domain (hl.RDom) variable, we can also schedule the # producer there. producer, consumer = hl.Func("producer"), hl.Func("consumer") r = hl.RDom(0, 5) producer[x] = x * 17 consumer[x] = x + 10 consumer[x] += r + producer[x + r] producer.compute_at(consumer, r) halide_result = consumer.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) # Pure step for the consumer. for xx in range(10): c_result[xx] = xx + 10 # Update step for the consumer. for xx in range(10): for rr in range( 5 ): # The loop over the reduction domain is always the inner loop. # We've schedule the storage and computation of # the producer here. We just need a single value. producer_storage = np.empty((1), dtype=np.int) # Pure step of the producer. 
producer_storage[0] = (xx + rr) * 17 # Now use it in the update step of the consumer. c_result[xx] += rr + producer_storage[0] # Check the results match for xx in range(10): if halide_result(xx) != c_result[xx]: raise Exception("halide_result(%d) = %d instead of %d" % (xx, halide_result(xx), c_result[xx])) return -1 # A real-world example of a reduction inside a producer-consumer chain. if True: # The default schedule for a reduction is a good one for # convolution-like operations. For example, the following # computes a 5x5 box-blur of our grayscale test image with a # hl.clamp-to-edge boundary condition: # First add the boundary condition. clamped = hl.repeat_edge(input) # Define a 5x5 box that starts at (-2, -2) r = hl.RDom(-2, 5, -2, 5) # Compute the 5x5 sum around each pixel. local_sum = hl.Func("local_sum") local_sum[x, y] = 0 # Compute the sum as a 32-bit integer local_sum[x, y] += clamped[x + r.x, y + r.y] # Divide the sum by 25 to make it an average blurry = hl.Func("blurry") blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25) halide_result = blurry.realize(input.width(), input.height()) # The default schedule will inline 'clamped' into the update # step of 'local_sum', because clamped only has a pure # definition, and so its default schedule is fully-inlined. # We will then compute local_sum per x coordinate of blurry, # because the default schedule for reductions is # compute-innermost. Here's the equivalent C: #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0] local_sum = np.empty((1), dtype=np.int32) c_result = hl.Buffer(hl.UInt(8), input.width(), input.height()) for yy in range(input.height()): for xx in range(input.width()): # FIXME this loop is quite slow # Pure step of local_sum local_sum[0] = 0 # Update step of local_sum for r_y in range(-2, 2 + 1): for r_x in range(-2, 2 + 1): # The clamping has been inlined into the update step. 
clamped_x = min(max(xx + r_x, 0), input.width() - 1) clamped_y = min(max(yy + r_y, 0), input.height() - 1) local_sum[0] += input(clamped_x, clamped_y) # Pure step of blurry #c_result(x, y) = (uint8_t)(local_sum[0] / 25) #c_result[xx, yy] = cast_to_uint8(local_sum[0] / 25) c_result[xx, yy] = int(local_sum[0] / 25) # hl.cast done internally # Check the results match for yy in range(input.height()): for xx in range(input.width()): if halide_result(xx, yy) != c_result(xx, yy): raise Exception( "halide_result(%d, %d) = %d instead of %d" % (xx, yy, halide_result(xx, yy), c_result(xx, yy))) return -1 # Reduction helpers. if True: # There are several reduction helper functions provided in # Halide.h, which compute small reductions and schedule them # innermost into their consumer. The most useful one is # "sum". f1 = hl.Func("f1") r = hl.RDom(0, 100) f1[x] = hl.sum(r + x) * 7 # Sum creates a small anonymous hl.Func to do the reduction. It's equivalent to: f2, anon = hl.Func("f2"), hl.Func("anon") anon[x] = 0 anon[x] += r + x f2[x] = anon[x] * 7 # So even though f1 references a reduction domain, it is a # pure function. The reduction domain has been swallowed to # define the inner anonymous reduction. halide_result_1 = f1.realize(10) halide_result_2 = f2.realize(10) # The equivalent C is: c_result = np.empty((10), dtype=np.int) for xx in range(10): anon = np.empty((1), dtype=np.int) anon[0] = 0 for rr in range(100): anon[0] += rr + xx c_result[xx] = anon[0] * 7 # Check they all match. for xx in range(10): if halide_result_1(xx) != c_result[xx]: print("halide_result_1(%d) = %d instead of %d", x, halide_result_1(x), c_result[x]) return -1 if halide_result_2(xx) != c_result[xx]: print("halide_result_2(%d) = %d instead of %d", x, halide_result_2(x), c_result[x]) return -1 # A complex example that uses reduction helpers. 
# This whole section is intentionally disabled (`if False:`): it mirrors the
# C++ tutorial's SSE2 timing comparison, which makes no sense to port to
# Python. The Python statements inside never execute; the commented-out C
# is kept as reference material for the equivalent hand-vectorized code.
if False:  # non-sense to port SSE code to python, skipping this test
    # Other reduction helpers include "product", "minimum",
    # "maximum", "hl.argmin", and "argmax". Using hl.argmin and argmax
    # requires understanding tuples, which come in a later
    # lesson. Let's use minimum and maximum to compute the local
    # spread of our grayscale image.

    # First, add a boundary condition to the input.
    clamped = hl.Func("clamped")
    x_clamped = hl.clamp(x, 0, input.width() - 1)
    y_clamped = hl.clamp(y, 0, input.height() - 1)
    clamped[x, y] = input[x_clamped, y_clamped]

    box = hl.RDom(-2, 5, -2, 5)
    # Compute the local maximum minus the local minimum:
    # NOTE(review): `maximum`/`minimum` lack the `hl.` prefix and `clamped`
    # is called instead of indexed — this would fail if ever enabled; it is
    # unreachable dead code under `if False:`. Confirm before re-enabling.
    spread = hl.Func("spread")
    spread[x, y] = (maximum(clamped(x + box.x, y + box.y)) -
                    minimum(clamped(x + box.x, y + box.y)))

    # Compute the result in strips of 32 scanlines
    yo, yi = hl.Var("yo"), hl.Var("yi")
    spread.split(y, yo, yi, 32).parallel(yo)

    # Vectorize across x within the strips. This implicitly
    # vectorizes stuff that is computed within the loop over x in
    # spread, which includes our minimum and maximum helpers, so
    # they get vectorized too.
    spread.vectorize(x, 16)

    # We'll apply the boundary condition by padding each scanline
    # as we need it in a circular buffer (see lesson 08).
    clamped.store_at(spread, yo).compute_at(spread, yi)

    halide_result = spread.realize(input.width(), input.height())

    # The C equivalent is almost too horrible to contemplate (and
    # took me a long time to debug). This time I want to time
    # both the Halide version and the C version, so I'll use sse
    # intrinsics for the vectorization, and openmp to do the
    # parallel for loop (you'll need to compile with -fopenmp or
    # similar to get correct timing).
    #ifdef __SSE2__

    # Don't include the time required to allocate the output buffer.
    c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())

    #ifdef _OPENMP
    t1 = datetime.now()
    #endif

    # Run this one hundred times so we can average the timing results.
    # The loop body below is the C/SSE reference kept as comments only,
    # so each iteration is a no-op in Python.
    for iters in range(100):
        pass
        # #pragma omp parallel for
        # for yo in range((input.height() + 31)/32):
        #     y_base = hl.min(yo * 32, input.height() - 32)
        #
        #     # Compute clamped in a circular buffer of size 8
        #     # (smallest power of two greater than 5). Each thread
        #     # needs its own allocation, so it must occur here.
        #
        #     clamped_width = input.width() + 4
        #     clamped_storage = np.empty((clamped_width * 8), dtype=np.uint8)
        #
        #     for yi in range(32):
        #         y = y_base + yi
        #
        #         uint8_t *output_row = &c_result(0, y)
        #
        #         # Compute clamped for this scanline, skipping rows
        #         # already computed within this slice.
        #         int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2)
        #         int max_y_clamped = (y + 2)
        #         for (int cy = min_y_clamped cy <= max_y_clamped cy++) {
        #             # Figure out which row of the circular buffer
        #             # we're filling in using bitmasking:
        #             uint8_t *clamped_row = clamped_storage + (cy & 7) * clamped_width
        #
        #             # Figure out which row of the input we're reading
        #             # from by clamping the y coordinate:
        #             int clamped_y = std::hl.min(std::hl.max(cy, 0), input.height()-1)
        #             uint8_t *input_row = &input(0, clamped_y)
        #
        #             # Fill it in with the padding.
        #             for (int x = -2 x < input.width() + 2 ):
        #                 int clamped_x = std::hl.min(std::hl.max(x, 0), input.width()-1)
        #                 *clamped_row++ = input_row[clamped_x]
        #
        #
        #
        #         # Now iterate over vectors of x for the pure step of the output.
        #         for (int x_vec = 0 x_vec < (input.width() + 15)/16 x_vec++) {
        #             int x_base = std::hl.min(x_vec * 16, input.width() - 16)
        #
        #             # Allocate storage for the minimum and maximum
        #             # helpers. One vector is enough.
        #             __m128i minimum_storage, maximum_storage
        #
        #             # The pure step for the maximum is a vector of zeros
        #             maximum_storage = (__m128i)_mm_setzero_ps()
        #
        #             # The update step for maximum
        #             for (int max_y = y - 2 max_y <= y + 2 max_y++) {
        #                 uint8_t *clamped_row = clamped_storage + (max_y & 7) * clamped_width
        #                 for (int max_x = x_base - 2 max_x <= x_base + 2 max_):
        #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + max_x + 2))
        #                     maximum_storage = _mm_max_epu8(maximum_storage, v)
        #
        #
        #
        #             # The pure step for the minimum is a vector of
        #             # ones. Create it by comparing something to
        #             # itself.
        #             minimum_storage = (__m128i)_mm_cmpeq_ps(_mm_setzero_ps(),
        #                                                     _mm_setzero_ps())
        #
        #             # The update step for minimum.
        #             for (int min_y = y - 2 min_y <= y + 2 min_y++) {
        #                 uint8_t *clamped_row = clamped_storage + (min_y & 7) * clamped_width
        #                 for (int min_x = x_base - 2 min_x <= x_base + 2 min_):
        #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + min_x + 2))
        #                     minimum_storage = _mm_min_epu8(minimum_storage, v)
        #
        #
        #
        #             # Now compute the spread.
        #             __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage)
        #
        #             # Store it.
        #             _mm_storeu_si128((__m128i *)(output_row + x_base), spread)
        #
        #
        #
        #     del clamped_storage
        #
    # end of hundred iterations

    # Skip the timing comparison if we don't have openmp
    # enabled. Otherwise it's unfair to C.
    #ifdef _OPENMP
    t2 = datetime.now()

    # Now run the Halide version again without the
    # jit-compilation overhead. Also run it one hundred times.
    for iters in range(100):
        spread.realize(halide_result)

    t3 = datetime.now()

    # Report the timings. On my machine they both take about 3ms
    # for the 4-megapixel input (fast!), which makes sense,
    # because they're using the same vectorization and
    # parallelization strategy. However I find the Halide easier
    # to read, write, debug, modify, and port.
    print("Halide spread took %f ms. C equivalent took %f ms" %
          ((t3 - t2).total_seconds() * 1000,
           (t2 - t1).total_seconds() * 1000))
    #endif  # _OPENMP

    # Check the results match:
    for yy in range(input.height()):
        for xx in range(input.width()):
            if halide_result(xx, yy) != c_result(xx, yy):
                raise Exception(
                    "halide_result(%d, %d) = %d instead of %d" %
                    (xx, yy, halide_result(xx, yy), c_result(xx, yy)))
                # NOTE(review): unreachable — raise above aborts first.
                return -1
    #endif  # __SSE2__
else:
    print("(Skipped the SSE2 section of the code, "
          "since non-sense in python world.)")

print("Success!")
return 0