def combine2(im1, im2, width, height, dist):
    """Blend two images using per-pixel weights looked up in ``dist``.

    Both inputs are wrapped in a repeat-edge boundary condition over
    ``[0, width) x [0, height)``. Each pixel value is used as an index into
    ``dist`` to obtain a weight; the first mask is the normalised weight of
    ``im1`` and the second mask is its complement.

    Returns the (unscheduled) ``combine_output`` Func holding the blended
    result saturated to 16-bit unsigned.
    """
    init_mask1 = hl.Func("mask1_layer_0")
    init_mask2 = hl.Func("mask2_layer_0")
    accumulator = hl.Func("combine_accumulator")
    output = hl.Func("combine_output")
    x, y = hl.Var("x"), hl.Var("y")

    # Pad both inputs so that reads outside the image repeat the edge pixel.
    padded1 = hl.BoundaryConditions.repeat_edge(im1, [(0, width), (0, height)])
    padded2 = hl.BoundaryConditions.repeat_edge(im2, [(0, width), (0, height)])

    # The pixel value itself selects the weight from the distribution.
    w1 = hl.f32(dist[padded1[x, y]])
    w2 = hl.f32(dist[padded2[x, y]])

    init_mask1[x, y] = w1 / (w1 + w2)
    init_mask2[x, y] = 1 - init_mask1[x, y]

    # Start from zero, then add the two masked contributions in one update.
    accumulator[x, y] = hl.i32(0)
    contribution1 = hl.i32(padded1[x, y] * init_mask1[x, y])
    contribution2 = hl.i32(padded2[x, y] * init_mask2[x, y])
    accumulator[x, y] += contribution1 + contribution2

    output[x, y] = hl.u16_sat(accumulator[x, y])

    # Schedule: only the first mask and the accumulator are materialised.
    init_mask1.compute_root().parallel(y).vectorize(x, 16)
    accumulator.compute_root().parallel(y).vectorize(x, 16)
    accumulator.update(0).parallel(y).vectorize(x, 16)

    return output
def test_minmax():
    """Exercise multi-argument ``hl.min``/``hl.max`` inside a multi-way ``hl.select``."""
    x = hl.Var()
    f = hl.Func()

    # One value expression per select branch; each uses a different
    # arity / operand type for min or max.
    at_zero = hl.min(x, 1)
    at_two_or_four = hl.i32(hl.min(hl.f32(x), hl.f32(3.2), x * hl.f32(2.1)))
    at_three = hl.max(x, x * 3, 1, x * 4)

    f[x] = hl.select(
        x == 0, at_zero,
        (x == 2) | (x == 4), at_two_or_four,
        x == 3, at_three,
        x)

    b = f.realize(5)

    assert b[0] == 0
    assert b[1] == 1, b[1]
    assert b[2] == 2
    assert b[3] == 12
    assert b[4] == 3
def __init__(self, input):
    """Define a two-stage pipeline: a five-tap sharpen followed by a gamma LUT.

    ``input`` must have element type ``UInt(8)``. The pure variables
    ``x``, ``y``, ``c`` and ``i`` are taken from the enclosing scope.
    """
    assert input.type() == hl.UInt(8)

    self.input = input
    self.lut = hl.Func("lut")
    self.padded = hl.Func("padded")
    self.padded16 = hl.Func("padded16")
    self.sharpen = hl.Func("sharpen")
    self.curved = hl.Func("curved")

    # The look-up table is a gamma curve evaluated on i / 255 and scaled
    # back to the 0..255 range.
    gamma = hl.f32(1.2)
    curve = hl.pow(i / 255.0, gamma) * 255.0
    self.lut[i] = hl.u8(hl.clamp(curve, 0, 255))

    # Boundary condition: clamp the coordinates into the input's extent.
    clamped_x = hl.clamp(x, 0, input.width() - 1)
    clamped_y = hl.clamp(y, 0, input.height() - 1)
    self.padded[x, y, c] = input[clamped_x, clamped_y, c]

    # Widen to 16 bits before doing arithmetic.
    self.padded16[x, y, c] = hl.u16(self.padded[x, y, c])

    # Five-tap sharpen: twice the centre minus a quarter of the sum of the
    # four direct neighbours.
    centre = self.padded16[x, y, c]
    neighbours = (self.padded16[x - 1, y, c] +
                  self.padded16[x, y - 1, c] +
                  self.padded16[x + 1, y, c] +
                  self.padded16[x, y + 1, c])
    self.sharpen[x, y, c] = (centre * 2 - neighbours / 4)

    # Finally push the sharpened value through the LUT.
    self.curved[x, y, c] = self.lut[self.sharpen[x, y, c]]
def main():
    """Build a small 1-D pipeline, auto-schedule it with Li2018 and run it."""
    hl.load_plugin("autoschedule_li2018")

    x = hl.Var('x')

    # Source stage: the coordinate cast to a 32-bit float.
    f_in = hl.Func('in')
    f_in[x] = hl.f32(x)

    # Three dependent stages: scale, sine, square.
    f_0 = hl.Func('f_0')
    f_0[x] = 2 * f_in[x]

    f_1 = hl.Func('f_1')
    f_1[x] = hl.sin(f_0[x])

    f_2 = hl.Func('f_2')
    f_2[x] = f_1[x] * f_1[x]

    # The autoscheduler needs an estimate of the output extent.
    f_2.set_estimate(x, 0, 1000)
    pipeline = hl.Pipeline(f_2)
    target = hl.Target()

    # Only the first parameter is used (number of cores on the CPU).
    machine = hl.MachineParams(32, 0, 0)
    schedule = pipeline.auto_schedule('Li2018', target, machine)
    print('Schedule:')
    print(schedule.schedule_source)

    # Compile, then realize to compute the values and obtain the buffer.
    pipeline.compile_jit()
    buf = pipeline.realize(1000)
def tone_map(input, width, height, compression, gain):
    """Build the tone-mapping stage and return its output Func.

    A grayscale image (mean of the first three channels of ``input``) is
    repeatedly brightened, gamma-corrected, blended with its un-brightened
    version via ``combine2`` and converted back, for ``TONE_MAP_PASSES``
    passes. The per-pass compression and gain are linear in the pass index,
    starting at 1. The final output scales each input channel by the ratio of
    the processed grayscale to the original grayscale.
    """
    print(f'Compression: {compression}, gain: {gain}')

    normal_dist = hl.Func("luma_weight_distribution")
    grayscale = hl.Func("grayscale")
    output = hl.Func("tone_map_output")

    x, y, c, v = hl.Var("x"), hl.Var("y"), hl.Var("c"), hl.Var("v")
    rdom = hl.RDom([(0, 3)])

    # Weight curve over v / 65535, centred on 0.5.
    centred = hl.f32(v) / 65535 - 0.5
    normal_dist[v] = hl.f32(hl.exp(-12.5 * hl.pow(centred, 2)))

    # Mean of the three channels, accumulated in 32 bits.
    grayscale[x, y] = hl.u16(hl.sum(hl.u32(input[x, y, rdom])) / 3)

    comp_const = 1
    gain_const = 1
    comp_slope = (compression - comp_const) / (TONE_MAP_PASSES)
    gain_slope = (gain - gain_const) / (TONE_MAP_PASSES)

    dark = grayscale
    for pass_index in range(TONE_MAP_PASSES):
        print(' pass', pass_index)

        norm_comp = pass_index * comp_slope + comp_const
        norm_gain = pass_index * gain_slope + gain_const

        bright = brighten(dark, norm_comp)

        dark_gamma = gamma_correct(dark)
        bright_gamma = gamma_correct(bright)

        blended = combine2(dark_gamma, bright_gamma, width, height, normal_dist)
        dark = brighten(gamma_inverse(blended), norm_gain)

    # max(1, ...) keeps the divisor non-zero where the grayscale is 0.
    numerator = hl.u32(input[x, y, c]) * hl.u32(dark[x, y])
    denominator = hl.u32(hl.max(1, grayscale[x, y]))
    output[x, y, c] = hl.u16_sat(numerator / denominator)

    grayscale.compute_root().parallel(y).vectorize(x, 16)
    normal_dist.compute_root().vectorize(v, 16)

    return output
def test_print_expr():
    """Check that ``hl.print`` writes its arguments, space-separated, to stdout."""
    x = hl.Var('x')
    f = hl.Func('f')

    printed_value = hl.cast(hl.UInt(8), x)
    f[x] = hl.print(printed_value, 'is what', 'the', 1, 'and',
                    hl.f32(3.1415), 'saw')

    # A single-element buffer, so exactly one line is printed.
    buf = hl.Buffer(hl.UInt(8), [1])

    output = StringIO()
    with _redirect_stdout(output):
        f.realize(buf)
    actual = output.getvalue()

    expected = '0 is what the 1 and 3.141500 saw\n'
    assert expected == actual, "Expected: %s, Actual: %s" % (expected, actual)
def gamma_inverse(input):
    """Build the ``gamma_inverse_output`` Func for a 2-D or 3-D input.

    Values below the cutoff are scaled linearly; values at or above it go
    through a power curve. The result is cast to 16-bit unsigned and scheduled
    at root, parallel over ``y`` and vectorised over ``x``.
    """
    output = hl.Func("gamma_inverse_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    cutoff = 2575
    gamma_toe = 0.0774
    gamma_pow = 2.4
    gamma_fac = 57632.49226
    gamma_con = 0.055

    # Same formula for both ranks; only the coordinate tuple differs.
    if input.dimensions() == 2:
        coords = (x, y)
    else:
        coords = (x, y, c)

    linear_part = gamma_toe * input[coords]
    power_part = hl.pow(hl.f32(input[coords]) / 65535 + gamma_con, gamma_pow) * gamma_fac
    output[coords] = hl.u16(hl.select(input[coords] < cutoff, linear_part, power_part))

    output.compute_root().parallel(y).vectorize(x, 16)

    return output
def srgb(input, ccm):
    """Apply the 3x3 colour-correction matrix ``ccm`` to a 3-channel input.

    ``ccm`` is indexed as ``ccm[row][col]``. The matrix is stored in a Func as
    ``srgb_matrix[col, row]`` and each output channel ``c`` is the saturated
    16-bit sum over the three input channels of
    ``srgb_matrix[channel, c] * input[x, y, channel]``.
    """
    srgb_matrix = hl.Func("srgb_matrix")
    output = hl.Func("srgb_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    rdom = hl.RDom([(0, 3)])

    # Zero everywhere, then one update definition per matrix entry, added
    # row by row so the update order matches ccm's row-major layout.
    srgb_matrix[x, y] = hl.f32(0)
    for row in range(3):
        for col in range(3):
            srgb_matrix[col, row] = hl.f32(ccm[row][col])

    output[x, y, c] = hl.u16_sat(hl.sum(srgb_matrix[rdom, c] * input[x, y, rdom]))

    return output
def white_balance(input, width, height, white_balance_r, white_balance_g0, white_balance_g1, white_balance_b):
    """Scale each position of every 2x2 tile of ``input`` by its own gain.

    The reduction domain walks the ``width // 2`` by ``height // 2`` grid of
    2x2 tiles. Within each tile:

    * (even x, even y) is multiplied by ``white_balance_r``
    * (odd x,  even y) is multiplied by ``white_balance_g0``
    * (even x, odd y)  is multiplied by ``white_balance_g1``
    * (odd x,  odd y)  is multiplied by ``white_balance_b``

    (Presumably an RGGB Bayer mosaic, going by the parameter names -- confirm
    against the caller.) Results are saturated to 16-bit unsigned. Pixels not
    covered by a full tile keep the initial value 0.

    Returns the scheduled ``white_balance_output`` Func.
    """
    output = hl.Func("white_balance_output")

    print(width, height, white_balance_r, white_balance_g0, white_balance_g1, white_balance_b)

    x, y = hl.Var("x"), hl.Var("y")

    # Floor division: with plain Python ints, `/` would produce float extents
    # (and a fractional one for odd sizes). `//` yields an integer tile count
    # for ints and is also defined for Halide expressions.
    rdom = hl.RDom([(0, width // 2), (0, height // 2)])

    even_x = rdom.x * 2
    odd_x = rdom.x * 2 + 1
    even_y = rdom.y * 2
    odd_y = rdom.y * 2 + 1

    output[x, y] = hl.u16(0)

    # One update definition per tile position; the order fixes the update
    # indices used by the schedule below.
    output[even_x, even_y] = hl.u16_sat(white_balance_r * hl.f32(input[even_x, even_y]))
    output[odd_x, even_y] = hl.u16_sat(white_balance_g0 * hl.f32(input[odd_x, even_y]))
    output[even_x, odd_y] = hl.u16_sat(white_balance_g1 * hl.f32(input[even_x, odd_y]))
    output[odd_x, odd_y] = hl.u16_sat(white_balance_b * hl.f32(input[odd_x, odd_y]))

    output.compute_root().parallel(y).vectorize(x, 16)

    for update_index in range(4):
        output.update(update_index).parallel(rdom.y)

    return output
def rgb_to_yuv(input):
    """Convert a 3-channel input to three float channels via fixed linear weights.

    Channel 0, 1 and 2 of the output are each a weighted sum of input
    channels 0, 1 and 2; any other channel index keeps the initial value 0.
    Returns the scheduled ``rgb_to_yuv_output`` Func.
    """
    print(' rgb_to_yuv')

    output = hl.Func("rgb_to_yuv_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    red = input[x, y, 0]
    green = input[x, y, 1]
    blue = input[x, y, 2]

    # Pure definition first, then one update per output channel.
    output[x, y, c] = hl.f32(0)

    output[x, y, 0] = 0.2989 * red + 0.587 * green + 0.114 * blue
    output[x, y, 1] = -0.168935 * red - 0.331655 * green + 0.50059 * blue
    output[x, y, 2] = 0.499813 * red - 0.418531 * green + - 0.081282 * blue

    output.compute_root().parallel(y).vectorize(x, 16)

    for channel_update in range(3):
        output.update(channel_update).parallel(y).vectorize(x, 16)

    return output
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build and schedule a bilateral-grid filter over a 2-D ``input``.

    The input is clamped to [0, 1] and accumulated into a 4-D histogram whose
    spatial cells are ``s_sigma`` pixels wide and whose intensity bins are
    ``r_sigma`` wide. Channel 0 of the histogram holds the sum of values and
    the other channel holds the count. The histogram is blurred with a
    five-tap (1, 4, 6, 4, 1) filter along z, x and y, sampled trilinearly at
    each output pixel, and the value channel is divided by the count channel.

    A GPU or CPU schedule is chosen from ``hl.get_target_from_environment()``.

    Returns the scheduled ``bilateral_grid`` Func.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner tile variables for the GPU schedule. They must stay bound to
    # hl.Var objects, so the index expressions below use separate names.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    # Nearest intensity bin for this sample.
    bin_index = hl.i32(val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the value; any other channel accumulates 1.
    histogram[x, y, bin_index, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = (histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 +
                         histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 +
                         histogram[x, y, z+2, c])
    blurx[x, y, z, c] = (blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 +
                         blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 +
                         blurz[x+2, y, z, c])
    blury[x, y, z, c] = (blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 +
                         blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 +
                         blurx[x, y+2, z, c])

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    cell_z = hl.i32(zv)
    zf = zv - cell_z
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    # Integer grid cell containing the pixel.
    cell_x = x // s_sigma
    cell_y = y // s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(hl.lerp(blury[cell_x, cell_y, cell_z, c],
                        blury[cell_x+1, cell_y, cell_z, c], xf),
                hl.lerp(blury[cell_x, cell_y+1, cell_z, c],
                        blury[cell_x+1, cell_y+1, cell_z, c], xf), yf),
        hl.lerp(hl.lerp(blury[cell_x, cell_y, cell_z+1, c],
                        blury[cell_x+1, cell_y, cell_z+1, c], xf),
                hl.lerp(blury[cell_x, cell_y+1, cell_z+1, c],
                        blury[cell_x+1, cell_y+1, cell_z+1, c], xf), yf),
        zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def combine(im1, im2, width, height, dist):
    """Blend two images across blur layers using weights looked up in ``dist``.

    Both inputs are wrapped in a repeat-edge boundary condition. For each
    layer, the difference between the current image and its ``gauss_7x7``
    blur is weighted by the current mask and accumulated; images and masks
    are then blurred for the next layer. The remaining blurred images are
    added last with the final masks.

    Returns the (unscheduled) ``combine_output`` Func, saturated to 16-bit
    unsigned.
    """
    init_mask1 = hl.Func("mask1_layer_0")
    init_mask2 = hl.Func("mask2_layer_0")
    accumulator = hl.Func("combine_accumulator")
    output = hl.Func("combine_output")

    x, y = hl.Var("x"), hl.Var("y")

    padded1 = hl.BoundaryConditions.repeat_edge(im1, [(0, width), (0, height)])
    padded2 = hl.BoundaryConditions.repeat_edge(im2, [(0, width), (0, height)])

    # "sharp" is the current layer's image, "smooth" is its blurred version.
    sharp1, sharp2 = padded1, padded2
    smooth1 = gauss_7x7(padded1, "img1_layer_0")
    smooth2 = gauss_7x7(padded2, "img2_layer_0")

    # The pixel value itself selects the weight from the distribution.
    w1 = hl.f32(dist[padded1[x, y]])
    w2 = hl.f32(dist[padded2[x, y]])

    init_mask1[x, y] = w1 / (w1 + w2)
    init_mask2[x, y] = 1 - init_mask1[x, y]

    mask1, mask2 = init_mask1, init_mask2

    num_layers = 2

    accumulator[x, y] = hl.i32(0)

    for layer in range(1, num_layers):
        print(' layer', layer)

        prev_layer_str = str(layer - 1)
        layer_str = str(layer)

        # NOTE(review): the two Func names use different layer suffixes
        # (previous vs. current); kept as-is since they are only names.
        detail1 = diff(sharp1, smooth1, "laplace1_layer_" + prev_layer_str)
        detail2 = diff(sharp2, smooth2, "laplace2_layer_" + layer_str)

        accumulator[x, y] += hl.i32(detail1[x, y] * mask1[x, y]) + hl.i32(detail2[x, y] * mask2[x, y])

        # Step to the next layer: the blurred image becomes the sharp one.
        sharp1, sharp2 = smooth1, smooth2

        smooth1 = gauss_7x7(smooth1, "img1_layer_" + layer_str)
        smooth2 = gauss_7x7(smooth2, "img2_layer_" + layer_str)

        mask1 = gauss_7x7(mask1, "mask1_layer_" + layer_str)
        mask2 = gauss_7x7(mask2, "mask2_layer_" + layer_str)

    # Add what is left after the last layer.
    accumulator[x, y] += hl.i32(smooth1[x, y] * mask1[x, y]) + hl.i32(smooth2[x, y] * mask2[x, y])

    output[x, y] = hl.u16_sat(accumulator[x, y])

    init_mask1.compute_root().parallel(y).vectorize(x, 16)
    accumulator.compute_root().parallel(y).vectorize(x, 16)

    # One update per loop iteration plus the final one: num_layers in total.
    for update_index in range(num_layers):
        accumulator.update(update_index).parallel(y).vectorize(x, 16)

    return output
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build and schedule a bilateral-grid filter over a 2-D ``input``.

    The input is clamped to [0, 1] and accumulated into a 4-D histogram whose
    spatial cells are ``s_sigma`` pixels wide and whose intensity bins are
    ``r_sigma`` wide. Channel 0 of the histogram holds the sum of values and
    the other channel holds the count. The histogram is blurred with a
    five-tap (1, 4, 6, 4, 1) filter along z, x and y, sampled trilinearly at
    each output pixel, and the value channel is divided by the count channel.

    A GPU or CPU schedule is chosen from ``hl.get_target_from_environment()``.

    Returns the scheduled ``bilateral_grid`` Func.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner tile variables for the GPU schedule. They must stay bound to
    # hl.Var objects, so the index expressions below use separate names.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    # Nearest intensity bin for this sample.
    bin_index = hl.i32(val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the value; any other channel accumulates 1.
    histogram[x, y, bin_index, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = (histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 +
                         histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 +
                         histogram[x, y, z+2, c])
    blurx[x, y, z, c] = (blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 +
                         blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 +
                         blurz[x+2, y, z, c])
    blury[x, y, z, c] = (blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 +
                         blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 +
                         blurx[x, y+2, z, c])

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    cell_z = hl.i32(zv)
    zf = zv - cell_z
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    # Integer grid cell containing the pixel.
    cell_x = x // s_sigma
    cell_y = y // s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(hl.lerp(blury[cell_x, cell_y, cell_z, c],
                        blury[cell_x+1, cell_y, cell_z, c], xf),
                hl.lerp(blury[cell_x, cell_y+1, cell_z, c],
                        blury[cell_x+1, cell_y+1, cell_z, c], xf), yf),
        hl.lerp(hl.lerp(blury[cell_x, cell_y, cell_z+1, c],
                        blury[cell_x+1, cell_y, cell_z+1, c], xf),
                hl.lerp(blury[cell_x, cell_y+1, cell_z+1, c],
                        blury[cell_x+1, cell_y+1, cell_z+1, c], xf), yf),
        zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def test_simplestub():
    """Exercise ``simplestub.generate`` with valid and invalid argument forms.

    Covers inputs passed by position and by keyword (in several orders), the
    ``offset`` GeneratorParam mixed in, and the RuntimeError messages raised
    for malformed calls.
    """
    x, y = hl.Var(), hl.Var()
    target = hl.get_jit_target_from_environment()

    b_in = hl.Buffer(hl.UInt(8), [2, 2])
    b_in.fill(123)

    f_in = hl.Func("f")
    f_in[x, y] = x + y

    def expect_runtime_error(build, expected_fragment):
        # `build` is a thunk so that argument construction also happens
        # inside the try block.
        try:
            build()
        except RuntimeError as e:
            assert expected_fragment in str(e)
        else:
            assert False, 'Did not see expected exception!'

    # ----------- Inputs by-position
    f = simplestub.generate(target, b_in, f_in, 3.5)
    _realize_and_check(f)

    # ----------- Inputs by-name
    f = simplestub.generate(target, buffer_input=b_in, func_input=f_in, float_arg=3.5)
    _realize_and_check(f)

    f = simplestub.generate(target, float_arg=3.5, buffer_input=b_in, func_input=f_in)
    _realize_and_check(f)

    # ----------- Above set again, w/ GeneratorParam mixed in
    k = 42

    # (positional)
    f = simplestub.generate(target, b_in, f_in, 3.5, offset=k)
    _realize_and_check(f, k)

    # (keyword)
    f = simplestub.generate(target, offset=k, buffer_input=b_in, func_input=f_in, float_arg=3.5)
    _realize_and_check(f, k)

    f = simplestub.generate(target, buffer_input=b_in, offset=k, func_input=f_in, float_arg=3.5)
    _realize_and_check(f, k)

    f = simplestub.generate(target, buffer_input=b_in, func_input=f_in, offset=k, float_arg=3.5)
    _realize_and_check(f, k)

    f = simplestub.generate(target, buffer_input=b_in, float_arg=3.5, func_input=f_in, offset=k)
    _realize_and_check(f, k)

    # ----------- Test various failure modes

    # Inputs w/ mixed by-position and by-name
    expect_runtime_error(
        lambda: simplestub.generate(target, b_in, f_in, float_arg=3.5),
        'Cannot use both positional and keyword arguments for inputs.')

    # too many positional args
    expect_runtime_error(
        lambda: simplestub.generate(target, b_in, f_in, 3.5, 4),
        'Expected exactly 3 positional args for inputs, but saw 4.')

    # too few positional args
    expect_runtime_error(
        lambda: simplestub.generate(target, b_in, f_in),
        'Expected exactly 3 positional args for inputs, but saw 2.')

    # Inputs that can't be converted to what the receiver needs (positional)
    expect_runtime_error(
        lambda: simplestub.generate(target, hl.f32(3.141592), "happy", k),
        'Unable to cast Python instance')

    # Inputs that can't be converted to what the receiver needs (named)
    expect_runtime_error(
        lambda: simplestub.generate(target, b_in, f_in, float_arg="bogus"),
        'Unable to cast Python instance')

    # Input specified by both pos and kwarg
    expect_runtime_error(
        lambda: simplestub.generate(target, b_in, f_in, 3.5, float_arg=4.5),
        "Cannot use both positional and keyword arguments for inputs.")

    # Bad input name
    expect_runtime_error(
        lambda: simplestub.generate(target, buffer_input=b_in, float_arg=3.5,
                                    offset=k, funk_input=f_in),
        "Expected exactly 3 keyword args for inputs, but saw 2.")

    # Bad gp name
    expect_runtime_error(
        lambda: simplestub.generate(target, buffer_input=b_in, float_arg=3.5,
                                    offset=k, func_input=f_in,
                                    nonexistent_generator_param="wat"),
        "Generator simplestub has no GeneratorParam named: nonexistent_generator_param")
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build and schedule a local-Laplacian filter over a 3-channel ``input``.

    Args:
        input: image indexed as ``input[x, y, c]``; values are divided by
            65535 after casting to ``float_t``, and ``width()``/``height()``
            are used for the boundary clamp.
        levels: number of intensity levels in the processed pyramid.
        alpha: scale of the remapping curve.
        beta: scale applied to ``gray - level`` at the finest pyramid level.
        J: number of pyramid levels.

    Returns:
        The ``local_laplacian`` Func producing ``UInt(16)`` output, with a GPU
        or CPU schedule chosen from ``hl.get_target_from_environment()``.

    ``float_t`` and ``int_t`` are taken from the enclosing module.
    """
    n_downsamples = 0
    n_upsamples = 0

    x = hl.Var('x')
    y = hl.Var('y')

    # The helpers below read `c` from the enclosing scope; it is defined
    # further down, before any of them is called.

    def downsample(f):
        """Halve a 3-D Func in x then y with a (1, 3, 3, 1) / 8 filter."""
        nonlocal n_downsamples
        downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func(
            'downy%i' % n_downsamples)
        n_downsamples += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] +
                          3.0 * (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] +
                          3.0 * (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0
        return downy

    def upsample(f):
        """Double a 3-D Func in x then y, blending neighbours 0.25 / 0.75."""
        nonlocal n_upsamples
        upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func('upy%i' % n_upsamples)
        n_upsamples += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2), c] + 0.75 * upx[x, y // 2, c]
        return upy

    def downsample2D(f):
        """2-D counterpart of ``downsample``."""
        nonlocal n_downsamples
        downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func(
            'downy%i' % n_downsamples)
        n_downsamples += 1

        downx[x, y] = (f[2 * x - 1, y] +
                       3.0 * (f[2 * x, y] + f[2 * x + 1, y]) +
                       f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] +
                       3.0 * (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0
        return downy

    def upsample2D(f):
        """2-D counterpart of ``upsample``."""
        nonlocal n_upsamples
        upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func('upy%i' % n_upsamples)
        n_upsamples += 1

        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]
        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    # remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1),
                                hl.clamp(y, 0, input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    kR = hl.f32(0.299)
    kG = hl.f32(0.587)
    kB = hl.f32(0.114)
    gray[x, y] = kR * clamped[x, y, 0] + kG * clamped[x, y, 1] + kB * clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%i' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entries per intensity level.
    # `k` is an integer Var, so it is cast to float before dividing;
    # an integer division here would truncate the fractional level.
    level = hl.cast(float_t, k) / hl.cast(float_t, levels - 1)
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y, k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%i' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in reversed(range(J - 1)):
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%i' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%i' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%i' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in reversed(range(J - 1)):
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = hl.f32(0.01)
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        remap.compute_root()
        output.compute_root().gpu_tile(x, y, xi, yi, 16, 8)
        for j in range(J):
            # Coarser levels get smaller tiles.
            blockw = 16
            blockh = 8
            if j > 3:
                blockw = 2
                blockh = 2
            # Level 0 of these two pyramids is left unscheduled here.
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh)
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh)
    else:
        # CPU schedule
        print("Compiling for CPU")
        remap.compute_root()
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        # The first four levels are vectorized; cap at J so that pyramids
        # with fewer than four levels do not index past the end of the lists.
        n_vectorized = min(4, J)
        for j in range(n_vectorized):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(n_vectorized, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output