def filter_test_image(local_laplacian, input):
    """Run the local-laplacian pipeline on the test image and save the result.

    JIT-compiles ``local_laplacian`` for the target taken from the
    environment, binds the test image to ``input``, realizes a 3-channel
    output with the image's first two dimensions, and writes both the input
    and the output as PNG files in the current working directory.

    :param local_laplacian: pipeline object providing ``compile_jit`` and ``realize``
    :param input: image parameter providing ``set``; receives the test image buffer
    """
    local_laplacian.compile_jit(hl.get_target_from_environment())

    # preparing the input memory buffer (numpy ndarray wrapped in an hl.Buffer)
    input_data = get_input_data()
    input_image = hl.Buffer(input_data)
    input.set(input_image)

    # do the actual computation; realize() produces the output buffer, so no
    # output array needs to be pre-allocated here
    input_width, input_height = input_data.shape[:2]
    output_image = local_laplacian.realize(input_width, input_height, 3)
    output_data = np.asanyarray(output_image)

    # convert back to uint8 by dropping the low 8 bits
    # (presumably both arrays hold 16-bit samples -- TODO confirm)
    input_data = (input_data >> 8).astype(np.uint8)
    output_data = (output_data >> 8).astype(np.uint8)

    # save results
    input_path = "local_laplacian_input.png"
    output_path = "local_laplacian.png"
    imageio.imsave(input_path, input_data)
    imageio.imsave(output_path, output_data)

    print()
    print("local_laplacian realized on output_image.")
    print('Result saved at {} (input data copy at {}).'.format(
        output_path, input_path))
def generate_compiled_file(bilateral_grid):
    """Compile ``bilateral_grid`` ahead of time to files named "bilateral_grid".

    The pipeline is compiled for the target taken from the environment with
    two declared arguments: the scalar ``r_sigma`` and the buffer ``input``.

    :param bilateral_grid: pipeline object providing ``compile_to_file``
    """
    build_target = hl.get_target_from_environment()

    # Need to copy the filter executable from the C++ apps/bilateral_grid folder to run this.
    # (after making it of course)
    signature = ArgumentsVector()

    sigma_arg = Argument('r_sigma', InputScalar, hl.Float(32), 0)
    signature.append(sigma_arg)

    image_arg = Argument('input', InputBuffer, hl.UInt(16), 2)
    signature.append(image_arg)

    bilateral_grid.compile_to_file(
        "bilateral_grid", signature, "bilateral_grid", build_target)

    print("Generated compiled file for bilateral_grid function.")
def generate_compiled_file(bilateral_grid):
    """Compile ``bilateral_grid`` ahead of time to files named "bilateral_grid".

    The pipeline is compiled for the target taken from the environment with
    two declared arguments: the scalar ``r_sigma`` and the buffer ``input``.

    :param bilateral_grid: pipeline object providing ``compile_to_file``
    """
    build_target = hl.get_target_from_environment()

    # Need to copy the filter executable from the C++ apps/bilateral_grid folder to run this.
    # (after making it of course)
    signature = ArgumentsVector()

    sigma_arg = Argument('r_sigma', InputScalar, hl.Float(32), 0)
    signature.append(sigma_arg)

    image_arg = Argument('input', InputBuffer, hl.UInt(16), 2)
    signature.append(image_arg)

    bilateral_grid.compile_to_file(
        "bilateral_grid", signature, "bilateral_grid", build_target)

    print("Generated compiled file for bilateral_grid function.")
def generate_compiled_file(local_laplacian):
    """Compile ``local_laplacian`` ahead of time to files named "local_laplacian".

    Four arguments are declared, in order: ``levels``, ``alpha``, ``beta``
    and ``input``. The pipeline is compiled for the target taken from the
    environment.

    :param local_laplacian: pipeline object providing ``compile_to_file``
    """
    # Need to copy the process executable from the C++ apps/local_laplacian folder to run this.
    # (after making it of course)
    signature = ArgumentsVector()

    levels_arg = Argument('levels', False, int_t)
    signature.append(levels_arg)

    alpha_arg = Argument('alpha', False, float_t)
    signature.append(alpha_arg)

    beta_arg = Argument('beta', False, float_t)
    signature.append(beta_arg)

    image_arg = Argument('input', True, hl.UInt(16))
    signature.append(image_arg)

    build_target = hl.get_target_from_environment()
    local_laplacian.compile_to_file(
        "local_laplacian", signature, "local_laplacian", build_target)

    print("Generated compiled file for local_laplacian function.")
def generate_compiled_file(local_laplacian):
    """Compile ``local_laplacian`` ahead of time to files named "local_laplacian".

    Four arguments are declared, in order: ``levels``, ``alpha``, ``beta``
    and ``input``. The pipeline is compiled for the target taken from the
    environment.

    :param local_laplacian: pipeline object providing ``compile_to_file``
    """
    # Need to copy the process executable from the C++ apps/local_laplacian folder to run this.
    # (after making it of course)
    signature = ArgumentsVector()

    levels_arg = Argument('levels', False, int_t)
    signature.append(levels_arg)

    alpha_arg = Argument('alpha', False, float_t)
    signature.append(alpha_arg)

    beta_arg = Argument('beta', False, float_t)
    signature.append(beta_arg)

    image_arg = Argument('input', True, hl.UInt(16))
    signature.append(image_arg)

    build_target = hl.get_target_from_environment()
    local_laplacian.compile_to_file(
        "local_laplacian", signature, "local_laplacian", build_target)

    print("Generated compiled file for local_laplacian function.")
def main():
    """Check that every ``compile_to_*`` entry point writes its output file.

    Defines the one-dimensional Func ``f[x] = 100 * x``, emits each supported
    output kind into a fresh temporary directory, and asserts that the
    expected file exists after each call. The directory is removed afterwards
    whether or not an assertion failed.
    """
    x = hl.Var("x")
    f = hl.Func("f")
    f[x] = 100 * x
    args = []

    tmpdir = tempfile.mkdtemp()

    def in_tmp(filename):
        # Absolute path of `filename` inside the scratch directory.
        return os.path.join(tmpdir, filename)

    try:
        # Outputs that share the (path, args, "f") call shape, in emission order.
        for filename, method_name in (
            ("f.bc", "compile_to_bitcode"),
            ("f.cpp", "compile_to_c"),
            ("f.o", "compile_to_object"),
            ("f.h", "compile_to_header"),
            ("f.s", "compile_to_assembly"),
        ):
            path = in_tmp(filename)
            getattr(f, method_name)(path, args, "f")
            assert os.path.isfile(path)

        # The lowered statement dump takes no function name.
        stmt_path = in_tmp("f.txt")
        f.compile_to_lowered_stmt(stmt_path, args)
        assert os.path.isfile(stmt_path)

        # compile_to_file writes a header plus an object file; the object
        # file extension checked here depends on the target OS.
        f.compile_to_file(in_tmp("f_all"), args)
        assert os.path.isfile(in_tmp("f_all.h"))
        on_windows = hl.get_target_from_environment().os == hl.TargetOS.Windows
        object_name = "f_all.obj" if on_windows else "f_all.o"
        assert os.path.isfile(in_tmp(object_name))

        html_path = in_tmp("f.html")
        f.compile_to({hl.OutputFileType.stmt_html: html_path}, args, "f")
        assert os.path.isfile(html_path)
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
def get_interpolate(input, levels):
    """
    Build the multi-level interpolation pipeline, schedule it, and JIT-compile it.

    A pyramid of ``levels`` progressively downsampled copies of ``input`` is
    built (each value pre-multiplied by channel 3), then collapsed from the
    coarsest level upwards: every level adds the upsampled coarser level
    weighted by ``1 - channel 3``. The result is divided by its own channel 3.

    :param input: image parameter indexed as ``[x, y, c]`` providing ``width()``
        and ``height()``; channel index 3 is read as the weight
    :param levels: number of pyramid levels (used with ``range``, so a Python int)
    :return: halide.hl.Func
    """

    # THE ALGORITHM
    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
    downx = [hl.Func('downx%d' % l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
    # level_widths = [hl.Param(int_t,'level_widths%d'%i) for i in range(levels)]
    # level_heights = [hl.Param(int_t,'level_heights%d'%i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]

    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0, input.width() - 1),
                             hl.clamp(y, 0, input.height() - 1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = downsampled[l - 1]

        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings to extend
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            w = input.width() / (1 << l)
            h = input.height() / (1 << l)
            prev = hl.lambda3D(x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c])

        # Separable [1 2 1] / 4 filter, first along x and then along y.
        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] +
                             2.0 * prev[x * 2, y, c] +
                             prev[x * 2 + 1, y, c]) * 0.25
        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] +
                                   2.0 * downx[l][x, y * 2, c] +
                                   downx[l][x, y * 2 + 1, c]) * 0.25

    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
    for l in range(levels - 1)[::-1]:
        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] +
                                  interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] +
                                 upsampledx[l][x, (y + 1) / 2, c]) / 2.0
        interpolated[l][x, y, c] = downsampled[l][x, y, c] + \
            (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]

    normalize = hl.Func('normalize')
    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

    final = hl.Func('final')
    final[x, y, c] = normalize[x, y, c]

    print("Finished function setup.")

    # THE SCHEDULE
    # Schedules 0, 1 and 3 are never selected automatically; they are kept as
    # alternatives that can be chosen by editing the line below.
    target = hl.get_target_from_environment()
    sched = 4 if target.has_gpu_feature() else 2

    if sched == 0:
        print("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()

        final.compute_root()

    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)

        final.compute_root()

    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(
            c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            downsampled[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2)

        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())

    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()

        final.compute_root()

    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        # One Var per name: the original supplied only four Vars for these
        # five names, which raised ValueError as soon as this branch ran.
        yo, yi, xo, xi, ci = (hl.Var('yo'), hl.Var('yi'), hl.Var('xo'),
                              hl.Var('xi'), hl.Var("ci"))
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        final.tile(x, y, xo, yo, xi, yi, input.width() / 4, input.height() / 4)
        normalize.compute_at(final, xo).reorder(c, x, y).gpu_tile(
            x, y, xi, yi, 16, 16, GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            # 32 >> l clamped to the range [1, 16]
            tile_size = 32 >> l
            if tile_size < 1:
                tile_size = 1
            if tile_size > 16:
                tile_size = 16
            downsampled[l].compute_root().gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
            interpolated[l].compute_at(final, xo).gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)

    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
def get_interpolate(input, levels):
    """
    Build the multi-level interpolation pipeline, schedule it, and JIT-compile it.

    A pyramid of ``levels`` progressively downsampled copies of ``input`` is
    built (each value pre-multiplied by channel 3), then collapsed from the
    coarsest level upwards: every level adds the upsampled coarser level
    weighted by ``1 - channel 3``. The result is divided by its own channel 3.

    :param input: image parameter indexed as ``[x, y, c]`` providing ``width()``
        and ``height()``; channel index 3 is read as the weight
    :param levels: number of pyramid levels (used with ``range``, so a Python int)
    :return: halide.hl.Func
    """

    # THE ALGORITHM
    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
    downx = [hl.Func('downx%d' % l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
    # level_widths = [hl.Param(int_t,'level_widths%d'%i) for i in range(levels)]
    # level_heights = [hl.Param(int_t,'level_heights%d'%i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]

    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0, input.width() - 1),
                             hl.clamp(y, 0, input.height() - 1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = downsampled[l - 1]

        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings to extend
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            w = input.width() / (1 << l)
            h = input.height() / (1 << l)
            prev = hl.lambda3D(x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c])

        # Separable [1 2 1] / 4 filter, first along x and then along y.
        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] +
                             2.0 * prev[x * 2, y, c] +
                             prev[x * 2 + 1, y, c]) * 0.25
        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] +
                                   2.0 * downx[l][x, y * 2, c] +
                                   downx[l][x, y * 2 + 1, c]) * 0.25

    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
    for l in range(levels - 1)[::-1]:
        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] +
                                  interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] +
                                 upsampledx[l][x, (y + 1) / 2, c]) / 2.0
        interpolated[l][x, y, c] = downsampled[l][x, y, c] + \
            (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]

    normalize = hl.Func('normalize')
    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

    final = hl.Func('final')
    final[x, y, c] = normalize[x, y, c]

    print("Finished function setup.")

    # THE SCHEDULE
    # Schedules 0, 1 and 3 are never selected automatically; they are kept as
    # alternatives that can be chosen by editing the line below.
    target = hl.get_target_from_environment()
    sched = 4 if target.has_gpu_feature() else 2

    if sched == 0:
        print("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()

        final.compute_root()

    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)

        final.compute_root()

    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(
            c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            downsampled[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2)

        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())

    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()

        final.compute_root()

    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        # One Var per name: the original supplied only four Vars for these
        # five names, which raised ValueError as soon as this branch ran.
        yo, yi, xo, xi, ci = (hl.Var('yo'), hl.Var('yi'), hl.Var('xo'),
                              hl.Var('xi'), hl.Var("ci"))
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        final.tile(x, y, xo, yo, xi, yi, input.width() / 4, input.height() / 4)
        normalize.compute_at(final, xo).reorder(c, x, y).gpu_tile(
            x, y, xi, yi, 16, 16, GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            # 32 >> l clamped to the range [1, 16]
            tile_size = 32 >> l
            if tile_size < 1:
                tile_size = 1
            if tile_size > 16:
                tile_size = 16
            downsampled[l].compute_root().gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
            interpolated[l].compute_at(final, xo).gpu_tile(
                x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)

    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build and schedule the bilateral-grid filter pipeline.

    The input is accumulated into a coarse 3-D histogram (``s_sigma`` x
    ``s_sigma`` spatial cells, intensity bins of width ``r_sigma``), the
    histogram is blurred with a five-tap [1 4 6 4 1] filter along z, x and y,
    and the output is obtained by trilinear sampling of the blurred grid
    followed by normalization (channel 0 divided by channel 1).

    :param input: 2-D image parameter; values are clamped to [0, 1] before use
    :param r_sigma: intensity bin width (divides the clamped pixel value)
    :param s_sigma: spatial cell size; also used as the GPU tile size of the output
    :return: the scheduled ``bilateral_grid`` hl.Func (not yet compiled)
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner loop variables for the GPU tiling below. They must stay hl.Var
    # objects, so the index expressions further down use their own names
    # instead of rebinding xi / yi / zi as the original did.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    cell_val = clamped[x * s_sigma + r.x - s_sigma // 2,
                       y * s_sigma + r.y - s_sigma // 2]
    cell_val = hl.clamp(cell_val, 0.0, 1.0)
    # Nearest intensity bin for this sample
    cell_z = hl.i32(cell_val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the values, every other channel counts samples
    histogram[x, y, cell_z, c] += hl.select(c == 0, cell_val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = (histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 +
                         histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 +
                         histogram[x, y, z+2, c])
    blurx[x, y, z, c] = (blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 +
                         blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 +
                         blurz[x+2, y, z, c])
    blury[x, y, z, c] = (blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 +
                         blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 +
                         blurx[x, y+2, z, c])

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    gz = hl.i32(zv)      # integer grid coordinate along z
    zf = zv - gz         # fractional position between gz and gz + 1
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    gx = x / s_sigma     # integer grid coordinates along x and y
    gy = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(hl.lerp(blury[gx, gy, gz, c], blury[gx+1, gy, gz, c], xf),
                hl.lerp(blury[gx, gy+1, gz, c], blury[gx+1, gy+1, gz, c], xf),
                yf),
        hl.lerp(hl.lerp(blury[gx, gy, gz+1, c], blury[gx+1, gy, gz+1, c], xf),
                hl.lerp(blury[gx, gy+1, gz+1, c], blury[gx+1, gy+1, gz+1, c], xf),
                yf),
        zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        # Explicit inner vars, matching every other gpu_tile call in this
        # function (the original used the inner-var-less form here only).
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build and schedule the bilateral-grid filter pipeline.

    The input is accumulated into a coarse 3-D histogram (``s_sigma`` x
    ``s_sigma`` spatial cells, intensity bins of width ``r_sigma``), the
    histogram is blurred with a five-tap [1 4 6 4 1] filter along z, x and y,
    and the output is obtained by trilinear sampling of the blurred grid
    followed by normalization (channel 0 divided by channel 1).

    :param input: 2-D image parameter; values are clamped to [0, 1] before use
    :param r_sigma: intensity bin width (divides the clamped pixel value)
    :param s_sigma: spatial cell size; also used as the GPU tile size of the output
    :return: the scheduled ``bilateral_grid`` hl.Func (not yet compiled)
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner loop variables for the GPU tiling below. They must stay hl.Var
    # objects, so the index expressions further down use their own names
    # instead of rebinding xi / yi / zi as the original did.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    cell_val = clamped[x * s_sigma + r.x - s_sigma // 2,
                       y * s_sigma + r.y - s_sigma // 2]
    cell_val = hl.clamp(cell_val, 0.0, 1.0)
    # Nearest intensity bin for this sample
    cell_z = hl.i32(cell_val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the values, every other channel counts samples
    histogram[x, y, cell_z, c] += hl.select(c == 0, cell_val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = (histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 +
                         histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 +
                         histogram[x, y, z+2, c])
    blurx[x, y, z, c] = (blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 +
                         blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 +
                         blurz[x+2, y, z, c])
    blury[x, y, z, c] = (blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 +
                         blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 +
                         blurx[x, y+2, z, c])

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    gz = hl.i32(zv)      # integer grid coordinate along z
    zf = zv - gz         # fractional position between gz and gz + 1
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    gx = x / s_sigma     # integer grid coordinates along x and y
    gy = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(hl.lerp(blury[gx, gy, gz, c], blury[gx+1, gy, gz, c], xf),
                hl.lerp(blury[gx, gy+1, gz, c], blury[gx+1, gy+1, gz, c], xf),
                yf),
        hl.lerp(hl.lerp(blury[gx, gy, gz+1, c], blury[gx+1, gy, gz+1, c], xf),
                hl.lerp(blury[gx, gy+1, gz+1, c], blury[gx+1, gy+1, gz+1, c], xf),
                yf),
        zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        # Explicit inner vars, matching every other gpu_tile call in this
        # function (the original used the inner-var-less form here only).
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build and schedule the local-laplacian filter pipeline.

    The input is converted to floating point and reduced to a gray image. A
    Gaussian pyramid of remapped gray images is built for every intensity
    level ``k``, turned into a laplacian pyramid, and sampled per pixel by
    interpolating between the two intensity levels nearest to the input
    pyramid value. The resulting laplacian pyramid is collapsed, colour is
    reintroduced and the result is converted to 16-bit.

    :param input: image parameter indexed as ``[x, y, c]`` providing
        ``width()`` and ``height()``; values are divided by 65535.0
    :param levels: number of intensity levels (presumably an integer
        parameter or Python int -- TODO confirm against the caller)
    :param alpha: scale factor of the remapping lookup table
    :param beta: scale applied to ``gray - level`` in the processed pyramid
    :param J: number of pyramid levels (Python int)
    :return: the scheduled ``local_laplacian`` hl.Func (not yet compiled)
    """
    # Single-element lists so the nested helpers can bump the counters that
    # give every generated Func a distinct name.
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        # Halve a 3-D Func in x and y with a separable [1 3 3 1] / 8 filter.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0

        return downy

    def upsample(f):
        # Double a 3-D Func in x and y with 0.25 / 0.75 weights.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y, c] + \
            0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2), c] + \
            0.75 * upx[x, y // 2, c]

        return upy

    def downsample2D(f):
        # 2-D counterpart of downsample().
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) +
                       f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample2D(f):
        # 2-D counterpart of upsample().
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + \
            0.75 * f[x // 2, y]
        upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2)] + \
            0.75 * upx[x, y // 2]

        return upy

    # THE ALGORITHM

    # loop variables (the helpers above look up `c` when they are called)
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1),
                                hl.clamp(y, 0, input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = (0.299 * clamped[x, y, 0] + 0.587 * clamped[x, y, 1] +
                  0.114 * clamped[x, y, 2])

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entries per intensity level.
    # The level must be a fraction in [0, 1]: `k / (levels - 1)` on integer
    # operands is an integer division in Halide and collapses to 0 for every
    # k < levels - 1, so the quotient is formed in floating point instead.
    level = k * (1.0 / hl.cast(float_t, levels - 1))
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y, k] = beta * (gray[x, y] - level) + level + \
        remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - \
            upsample(gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        in_level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, in_level), 0, levels - 2)
        lf = in_level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (1.0 - lf) * lPyramid[j][x, y, li] + \
            lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(outGPyramid[j + 1])[x, y] + \
            outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / \
        (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        # Explicit inner vars, matching every other gpu_tile call below.
        output.compute_root().gpu_tile(x, y, xi, yi, 32, 32, GPU_Default)
        for j in range(J):
            # Coarse levels (j > 3) get small blocks.
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                # Level 0 of these two pyramids is left unscheduled here.
                inGPyramid[j].compute_root().gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(
                x, y, xi, yi, blockw, blockh, GPU_Default)
    else:
        # CPU schedule
        print("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        # Fine levels are vectorized; min() keeps the index in range when
        # the pyramid has fewer than four levels.
        for j in range(min(4, J)):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
# histogram bucket corresponding to the intensity of the # input image at that point. histogram[hl.Expr(img[r.x, r.y])] += 1 histogram.set_estimate(hist_index, 0, 255) # Get the sum of all histogram cells r = hl.RDom([(0,255)]) hist_sum = hl.Func('hist_sum') hist_sum[()] = 0.0 # Compute the sum as a 32-bit integer hist_sum[()] += histogram[r.x] # Return each histogram as a % of total color pct_hist = hl.Func('pct_hist') pct_hist[hist_index] = histogram[hist_index] / hist_sum[()] return histogram def autoschedule(pipeline, autoscheduler_name, target, machine): hl.load_plugin('auto_schedule') pipeline.set_default_autoscheduler_name(autoscheduler_name) return pipeline.auto_schedule(target, machine) if __name__ == "__main__": fs = focus_stack_pipeline() print("Autoscheduling with: Adams2019") autoschedule(fs['pipeline'], "Adams2019", hl.get_target_from_environment(), hl.MachineParams(4, 256*1024, 50))
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build and schedule the local-laplacian filter pipeline.

    The input is converted to floating point and reduced to a gray image. A
    Gaussian pyramid of remapped gray images is built for every intensity
    level ``k``, turned into a laplacian pyramid, and sampled per pixel by
    interpolating between the two intensity levels nearest to the input
    pyramid value. The resulting laplacian pyramid is collapsed, colour is
    reintroduced and the result is converted to 16-bit.

    :param input: image parameter indexed as ``[x, y, c]`` providing
        ``width()`` and ``height()``; values are divided by 65535.0
    :param levels: number of intensity levels (presumably an integer
        parameter or Python int -- TODO confirm against the caller)
    :param alpha: scale factor of the remapping lookup table
    :param beta: scale applied to ``gray - level`` in the processed pyramid
    :param J: number of pyramid levels (Python int)
    :return: the scheduled ``local_laplacian`` hl.Func (not yet compiled)
    """
    # Single-element lists so the nested helpers can bump the counters that
    # give every generated Func a distinct name.
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        # Halve a 3-D Func in x and y with a separable [1 3 3 1] / 8 filter.
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0

        return downy

    def upsample(f):
        # Double a 3-D Func in x and y with 0.25 / 0.75 weights.
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y, c] + \
            0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2), c] + \
            0.75 * upx[x, y // 2, c]

        return upy

    def downsample2D(f):
        # 2-D counterpart of downsample().
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) +
                       f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample2D(f):
        # 2-D counterpart of upsample().
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + \
            0.75 * f[x // 2, y]
        upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2)] + \
            0.75 * upx[x, y // 2]

        return upy

    # THE ALGORITHM

    # loop variables (the helpers above look up `c` when they are called)
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1),
                                hl.clamp(y, 0, input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = (0.299 * clamped[x, y, 0] + 0.587 * clamped[x, y, 1] +
                  0.114 * clamped[x, y, 2])

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entries per intensity level.
    # The level must be a fraction in [0, 1]: `k / (levels - 1)` on integer
    # operands is an integer division in Halide and collapses to 0 for every
    # k < levels - 1, so the quotient is formed in floating point instead.
    level = k * (1.0 / hl.cast(float_t, levels - 1))
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y, k] = beta * (gray[x, y] - level) + level + \
        remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - \
            upsample(gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        in_level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, in_level), 0, levels - 2)
        lf = in_level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (1.0 - lf) * lPyramid[j][x, y, li] + \
            lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(outGPyramid[j + 1])[x, y] + \
            outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / \
        (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print ("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        # Explicit inner vars, matching every other gpu_tile call below.
        output.compute_root().gpu_tile(x, y, xi, yi, 32, 32, GPU_Default)
        for j in range(J):
            # Coarse levels (j > 3) get small blocks.
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                # Level 0 of these two pyramids is left unscheduled here.
                inGPyramid[j].compute_root().gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(
                x, y, xi, yi, blockw, blockh, GPU_Default)
    else:
        # CPU schedule
        print ("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        # Fine levels are vectorized; min() keeps the index in range when
        # the pyramid has fewer than four levels.
        for j in range(min(4, J)):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output