Esempio n. 1
def combine2(im1, im2, width, height, dist):
    """Blend two images with per-pixel weights looked up in ``dist``.

    Each input is wrapped in a repeat-edge boundary condition over
    ``[0, width) x [0, height)``. The weight of a pixel is ``dist`` indexed
    by that pixel's value; the first mask is the normalised weight of
    ``im1`` and the second mask is its complement.

    Returns:
        The ``combine_output`` Func holding the saturated 16-bit blend.
    """
    first_mask = hl.Func("mask1_layer_0")
    second_mask = hl.Func("mask2_layer_0")
    blend_sum = hl.Func("combine_accumulator")
    result = hl.Func("combine_output")

    x = hl.Var("x")
    y = hl.Var("y")

    bounds = [(0, width), (0, height)]
    padded1 = hl.BoundaryConditions.repeat_edge(im1, bounds)
    padded2 = hl.BoundaryConditions.repeat_edge(im2, bounds)

    w1 = hl.f32(dist[padded1[x, y]])
    w2 = hl.f32(dist[padded2[x, y]])

    # The two masks sum to one at every pixel.
    first_mask[x, y] = w1 / (w1 + w2)
    second_mask[x, y] = 1 - first_mask[x, y]

    # Zero-initialise, then add both weighted images in one update stage.
    blend_sum[x, y] = hl.i32(0)
    part1 = hl.i32(padded1[x, y] * first_mask[x, y])
    part2 = hl.i32(padded2[x, y] * second_mask[x, y])
    blend_sum[x, y] += part1 + part2

    result[x, y] = hl.u16_sat(blend_sum[x, y])

    # Schedule: only the first mask and the accumulator are computed at root.
    first_mask.compute_root().parallel(y).vectorize(x, 16)
    blend_sum.compute_root().parallel(y).vectorize(x, 16)
    blend_sum.update(0).parallel(y).vectorize(x, 16)

    return result
Esempio n. 2
def test_minmax():
    """Check multi-argument ``hl.min`` / ``hl.max`` inside a chained select.

    The Func picks a different expression for each ``x`` in ``[0, 5)``:
    ``x == 0`` uses a two-argument min, ``x == 2`` or ``x == 4`` uses a
    three-argument float min cast back to int32, ``x == 3`` uses a
    four-argument max, and anything else falls through to ``x`` itself.
    """
    x = hl.Var()
    f = hl.Func()
    # The opening "hl.select(x" was missing, leaving a syntax error.
    f[x] = hl.select(x == 0, hl.min(x, 1), (x == 2) | (x == 4),
                     hl.i32(hl.min(hl.f32(x), hl.f32(3.2), x * hl.f32(2.1))),
                     x == 3, hl.max(x, x * 3, 1, x * 4), x)
    b = f.realize(5)
    assert b[0] == 0
    assert b[1] == 1, b[1]
    assert b[2] == 2
    assert b[3] == 12
    assert b[4] == 3
    def __init__(self, input):
        """Define a two-stage pipeline: sharpen, then apply a gamma LUT.

        ``input`` must have element type UInt(8) and is indexed as
        ``input[x, y, c]``. The Vars ``x``, ``y``, ``c`` and ``i`` are taken
        from the enclosing scope. All stages are stored as attributes; no
        schedule is applied here.
        """
        assert input.type() == hl.UInt(8)

        self.lut = hl.Func("lut")
        self.padded = hl.Func("padded")
        self.padded16 = hl.Func("padded16")
        self.sharpen = hl.Func("sharpen")
        self.curved = hl.Func("curved")
        self.input = input

        # Look-up table: a gamma curve over the 8-bit range.
        gamma = hl.f32(1.2)
        curve = hl.pow(i / 255.0, gamma) * 255.0
        self.lut[i] = hl.u8(hl.clamp(curve, 0, 255))

        # Boundary condition: clamp coordinates to the input's extent.
        clamped_x = hl.clamp(x, 0, input.width() - 1)
        clamped_y = hl.clamp(y, 0, input.height() - 1)
        self.padded[x, y, c] = input[clamped_x, clamped_y, c]

        # Widen to 16 bits before doing arithmetic.
        self.padded16[x, y, c] = hl.u16(self.padded[x, y, c])

        # Five-tap sharpen: twice the centre minus the mean of 4 neighbours.
        centre = self.padded16[x, y, c]
        neighbours = (self.padded16[x - 1, y, c] + self.padded16[x, y - 1, c] +
                      self.padded16[x + 1, y, c] + self.padded16[x, y + 1, c])
        self.sharpen[x, y, c] = centre * 2 - neighbours / 4

        # Finally map the sharpened value through the LUT.
        self.curved[x, y, c] = self.lut[self.sharpen[x, y, c]]
Esempio n. 4
def main():
    """Auto-schedule and run a small four-stage 1-D pipeline.

    The stages are: cast ``x`` to float32, double it, take the sine, and
    square the result. The final stage gets a size estimate of 1000, the
    pipeline is scheduled with the 'Li2018' autoscheduler, JIT-compiled,
    and realized over 1000 elements.
    """
    x = hl.Var('x')

    # Stage definitions, each consuming the previous one.
    source = hl.Func('in')
    source[x] = hl.f32(x)  # Cast to float 32

    doubled = hl.Func('f_0')
    doubled[x] = 2 * source[x]

    sine = hl.Func('f_1')
    sine[x] = hl.sin(doubled[x])

    squared = hl.Func('f_2')
    squared[x] = sine[x] * sine[x]

    # The autoscheduler needs an estimate of the output extent.
    squared.set_estimate(x, 0, 1000)
    pipeline = hl.Pipeline(squared)

    # Only first parameter is used (number of cores on CPU)
    machine = hl.MachineParams(32, 0, 0)
    result = pipeline.auto_schedule('Li2018', hl.Target(), machine)

    pipeline.compile_jit()  # compile
    buf = pipeline.realize(1000)  # compute and get the buffer
Esempio n. 5
def tone_map(input, width, height, compression, gain):
    """Build the iterative tone-mapping stage and return its output Func.

    A grayscale image is formed as the mean of the three channels of
    ``input``. For each of ``TONE_MAP_PASSES`` passes it is brightened,
    both versions are gamma-corrected and blended with ``combine2``, and
    the blend is gamma-inverted and brightened again. The per-pass
    compression and gain ramp linearly from 1 towards the given values.
    Finally each channel of ``input`` is scaled by the ratio of the
    processed grayscale to the original grayscale.
    """
    print(f'Compression: {compression}, gain: {gain}')

    luma_weights = hl.Func("luma_weight_distribution")
    gray = hl.Func("grayscale")
    result = hl.Func("tone_map_output")

    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")
    v = hl.Var("v")

    channels = hl.RDom([(0, 3)])

    # Bell-shaped weight over the 16-bit range, centred at mid-scale.
    luma_weights[v] = hl.f32(hl.exp(-12.5 * hl.pow(hl.f32(v) / 65535 - 0.5, 2)))

    # Mean of the three channels, accumulated in 32 bits.
    gray[x, y] = hl.u16(hl.sum(hl.u32(input[x, y, channels])) / 3)

    # Both ramps start from 1 on the first pass.
    start = 1
    comp_step = (compression - start) / (TONE_MAP_PASSES)
    gain_step = (gain - start) / (TONE_MAP_PASSES)

    current = gray
    for i in range(TONE_MAP_PASSES):
        print('    pass', i)

        pass_comp = i * comp_step + start
        pass_gain = i * gain_step + start

        brighter = brighten(current, pass_comp)

        current_gamma = gamma_correct(current)
        brighter_gamma = gamma_correct(brighter)

        blended = combine2(current_gamma, brighter_gamma, width, height,
                           luma_weights)

        current = brighten(gamma_inverse(blended), pass_gain)

    # max(1, gray) guards the division against a zero denominator.
    numerator = hl.u32(input[x, y, c]) * hl.u32(current[x, y])
    denominator = hl.u32(hl.max(1, gray[x, y]))
    result[x, y, c] = hl.u16_sat(numerator / denominator)

    gray.compute_root().parallel(y).vectorize(x, 16)
    luma_weights.compute_root().vectorize(v, 16)

    return result
Esempio n. 6
def test_print_expr():
    """Check that ``hl.print`` writes its arguments to stdout when run.

    A one-element Func wraps ``hl.print`` around a mix of expression,
    string, int and float arguments. Stdout is captured while the Func is
    realized and the captured text is compared with the expected line.
    """
    x = hl.Var('x')
    f = hl.Func('f')
    f[x] = hl.print(hl.cast(hl.UInt(8), x), 'is what', 'the', 1, 'and',
                    hl.f32(3.1415), 'saw')
    buf = hl.Buffer(hl.UInt(8), [1])
    output = StringIO()
    with _redirect_stdout(output):
        # The print only fires when the pipeline runs, so realize into buf
        # while stdout is redirected. This call was missing.
        f.realize(buf)
        expected = '0 is what the 1 and 3.141500 saw\n'
        actual = output.getvalue()
        # The assert message was cut off after "(expected,".
        assert expected == actual, "Expected: %s, Actual: %s" % (expected,
                                                                 actual)
Esempio n. 7
def gamma_inverse(input):
    """Return a Func applying the piecewise inverse-gamma curve to ``input``.

    Values below ``cutoff`` are scaled linearly by ``gamma_toe``; all other
    values go through ``(v / 65535 + gamma_con) ** gamma_pow * gamma_fac``.
    The result is cast to 16-bit unsigned.

    Args:
        input: a 2-D Func indexed ``[x, y]``; any other dimensionality is
            treated as 3-D and indexed ``[x, y, c]``.

    Returns:
        The ``gamma_inverse_output`` Func, scheduled at root.
    """
    output = hl.Func("gamma_inverse_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    cutoff = 2575
    gamma_toe = 0.0774
    gamma_pow = 2.4
    gamma_fac = 57632.49226
    gamma_con = 0.055

    # The "hl.select(input" prefix and the "else:" line were missing. Without
    # the else, the 3-D definition would also be attempted for 2-D input.
    if input.dimensions() == 2:
        output[x, y] = hl.u16(hl.select(input[x, y] < cutoff,
                                        gamma_toe * input[x, y],
                                        hl.pow(hl.f32(input[x, y]) / 65535 + gamma_con, gamma_pow) * gamma_fac))
    else:
        output[x, y, c] = hl.u16(hl.select(input[x, y, c] < cutoff,
                                           gamma_toe * input[x, y, c],
                                           hl.pow(hl.f32(input[x, y, c]) / 65535 + gamma_con, gamma_pow) * gamma_fac))

    output.compute_root().parallel(y).vectorize(x, 16)

    return output
Esempio n. 8
def srgb(input, ccm):
    """Apply the 3x3 colour matrix ``ccm`` to a three-channel image.

    ``ccm[row][col]`` is stored at ``srgb_matrix[col, row]``. Output
    channel ``c`` is the saturated 16-bit sum over ``k`` in ``[0, 3)`` of
    ``srgb_matrix[k, c] * input[x, y, k]``.

    Returns:
        The unscheduled ``srgb_output`` Func.
    """
    matrix = hl.Func("srgb_matrix")
    result = hl.Func("srgb_output")

    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")

    channel = hl.RDom([(0, 3)])

    # Default every entry to zero, then overwrite the 3x3 block row by row.
    matrix[x, y] = hl.f32(0)
    for row in range(3):
        for col in range(3):
            matrix[col, row] = hl.f32(ccm[row][col])

    weighted = matrix[channel, c] * input[x, y, channel]
    result[x, y, c] = hl.u16_sat(hl.sum(weighted))

    return result
Esempio n. 9
def white_balance(input, width, height, white_balance_r, white_balance_g0, white_balance_g1, white_balance_b):
    """Scale each position of every 2x2 cell of ``input`` by its own gain.

    Within each 2x2 cell the gains are applied as follows: (even x, even y)
    uses ``white_balance_r``, (odd x, even y) uses ``white_balance_g0``,
    (even x, odd y) uses ``white_balance_g1`` and (odd x, odd y) uses
    ``white_balance_b``. Results are saturated to 16-bit unsigned.

    Returns:
        The ``white_balance_output`` Func, scheduled at root.
    """
    output = hl.Func("white_balance_output")

    print(width, height, white_balance_r, white_balance_g0, white_balance_g1, white_balance_b)

    x = hl.Var("x")
    y = hl.Var("y")

    # One reduction step per 2x2 cell.
    cells = hl.RDom([(0, width / 2), (0, height / 2)])

    output[x, y] = hl.u16(0)

    even_x, even_y = cells.x * 2, cells.y * 2
    odd_x, odd_y = cells.x * 2 + 1, cells.y * 2 + 1

    # (column, row, gain) per cell position, in update-stage order.
    sites = (
        (even_x, even_y, white_balance_r),
        (odd_x, even_y, white_balance_g0),
        (even_x, odd_y, white_balance_g1),
        (odd_x, odd_y, white_balance_b),
    )
    for px, py, site_gain in sites:
        output[px, py] = hl.u16_sat(site_gain * hl.f32(input[px, py]))

    output.compute_root().parallel(y).vectorize(x, 16)

    return output
Esempio n. 10
def rgb_to_yuv(input):
    """Return a Func holding three fixed linear combinations of R, G and B.

    Channels 0, 1 and 2 of ``input`` are read as red, green and blue.
    Output channels 0, 1 and 2 are each written by their own update stage;
    every other channel index stays at zero.
    """
    print('    rgb_to_yuv')

    output = hl.Func("rgb_to_yuv_output")

    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")

    red = input[x, y, 0]
    green = input[x, y, 1]
    blue = input[x, y, 2]

    # Pure definition first, then one update per output channel.
    output[x, y, c] = hl.f32(0)

    channel0 = 0.2989 * red + 0.587 * green + 0.114 * blue
    channel1 = -0.168935 * red - 0.331655 * green + 0.50059 * blue
    channel2 = 0.499813 * red - 0.418531 * green + (-0.081282) * blue

    output[x, y, 0] = channel0
    output[x, y, 1] = channel1
    output[x, y, 2] = channel2

    output.compute_root().parallel(y).vectorize(x, 16)

    # All three update stages get the same schedule as the pure stage.
    for stage in range(3):
        output.update(stage).parallel(y).vectorize(x, 16)

    return output
Esempio n. 11
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build a bilateral-grid filter pipeline and return its output Func.

    The pipeline (1) accumulates a histogram over ``s_sigma`` x ``s_sigma``
    tiles with intensity bins of width ``r_sigma``, keeping the summed value
    in channel 0 and the sample count in channel 1, (2) blurs the grid with
    a 1-4-6-4-1 filter along z, x and y, (3) samples the blurred grid with
    trilinear interpolation, and (4) divides channel 0 by channel 1.

    Args:
        input: 2-D image indexed ``[x, y]``; values are clamped to [0, 1].
        r_sigma: intensity bin width.
        s_sigma: spatial tile size in pixels.

    Returns:
        The scheduled ``bilateral_grid`` Func. The GPU or CPU schedule is
        chosen from the target reported by the environment.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner tile variables, used only by the GPU schedule. They must remain
    # Vars, so the index expressions below get their own names.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    # Nearest intensity bin for this sample.
    z_bin = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the value, every other channel counts samples.
    histogram[x, y, z_bin, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    sample = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = sample / r_sigma
    z_lo = hl.i32(zv)
    zf = zv - z_lo
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    grid_x = x / s_sigma
    grid_y = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[grid_x, grid_y, z_lo, c], blury[grid_x+1, grid_y, z_lo, c], xf),
                                            hl.lerp(blury[grid_x, grid_y+1, z_lo, c], blury[grid_x+1, grid_y+1, z_lo, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[grid_x, grid_y, z_lo+1, c], blury[grid_x+1, grid_y, z_lo+1, c], xf),
                                            hl.lerp(blury[grid_x, grid_y+1, z_lo+1, c], blury[grid_x+1, grid_y+1, z_lo+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print("Compiling for CPU.")
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
Esempio n. 12
def combine(im1, im2, width, height, dist):
    """Blend two images layer by layer using blurred weight masks.

    Both inputs get a repeat-edge boundary condition over
    ``[0, width) x [0, height)``. The initial masks are the normalised
    ``dist`` weights of each pixel. For every layer, the detail images
    produced by ``diff`` (unblurred vs. blurred) are weighted by the
    current masks and accumulated, then the images and masks are blurred
    again with ``gauss_7x7``. The remaining blurred images are added last.

    Returns:
        The ``combine_output`` Func holding the saturated 16-bit result.
    """
    base_mask1 = hl.Func("mask1_layer_0")
    base_mask2 = hl.Func("mask2_layer_0")
    total = hl.Func("combine_accumulator")
    blended = hl.Func("combine_output")

    x = hl.Var("x")
    y = hl.Var("y")

    extent = [(0, width), (0, height)]
    padded1 = hl.BoundaryConditions.repeat_edge(im1, extent)
    padded2 = hl.BoundaryConditions.repeat_edge(im2, extent)

    sharp1, sharp2 = padded1, padded2

    smooth1 = gauss_7x7(padded1, "img1_layer_0")
    smooth2 = gauss_7x7(padded2, "img2_layer_0")

    w1 = hl.f32(dist[padded1[x, y]])
    w2 = hl.f32(dist[padded2[x, y]])

    # The two initial masks sum to one at every pixel.
    base_mask1[x, y] = w1 / (w1 + w2)
    base_mask2[x, y] = 1 - base_mask1[x, y]

    cur_mask1, cur_mask2 = base_mask1, base_mask2

    num_layers = 2

    total[x, y] = hl.i32(0)

    for i in range(1, num_layers):
        print('        layer', i)

        below, here = str(i - 1), str(i)

        detail1 = diff(sharp1, smooth1, "laplace1_layer_" + below)
        detail2 = diff(sharp2, smooth2, "laplace2_layer_" + here)

        total[x, y] += (hl.i32(detail1[x, y] * cur_mask1[x, y]) +
                        hl.i32(detail2[x, y] * cur_mask2[x, y]))

        # The current blurred images are the next layer's unblurred ones.
        sharp1, sharp2 = smooth1, smooth2

        smooth1 = gauss_7x7(smooth1, "img1_layer_" + here)
        smooth2 = gauss_7x7(smooth2, "img2_layer_" + here)

        cur_mask1 = gauss_7x7(cur_mask1, "mask1_layer_" + here)
        cur_mask2 = gauss_7x7(cur_mask2, "mask2_layer_" + here)

    # Add the residual blurred layer.
    total[x, y] += (hl.i32(smooth1[x, y] * cur_mask1[x, y]) +
                    hl.i32(smooth2[x, y] * cur_mask2[x, y]))

    blended[x, y] = hl.u16_sat(total[x, y])

    base_mask1.compute_root().parallel(y).vectorize(x, 16)

    total.compute_root().parallel(y).vectorize(x, 16)

    # One update stage per accumulation above.
    for stage in range(num_layers):
        total.update(stage).parallel(y).vectorize(x, 16)

    return blended
Esempio n. 13
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build a bilateral-grid filter pipeline and return its output Func.

    The pipeline (1) accumulates a histogram over ``s_sigma`` x ``s_sigma``
    tiles with intensity bins of width ``r_sigma``, keeping the summed value
    in channel 0 and the sample count in channel 1, (2) blurs the grid with
    a 1-4-6-4-1 filter along z, x and y, (3) samples the blurred grid with
    trilinear interpolation, and (4) divides channel 0 by channel 1.

    Args:
        input: 2-D image indexed ``[x, y]``; values are clamped to [0, 1].
        r_sigma: intensity bin width.
        s_sigma: spatial tile size in pixels.

    Returns:
        The scheduled ``bilateral_grid`` Func. The GPU or CPU schedule is
        chosen from the target reported by the environment.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner tile variables, used only by the GPU schedule. They must remain
    # Vars, so the index expressions below get their own names.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    # Nearest intensity bin for this sample.
    z_bin = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the value, every other channel counts samples.
    histogram[x, y, z_bin, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    sample = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = sample / r_sigma
    z_lo = hl.i32(zv)
    zf = zv - z_lo
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    grid_x = x / s_sigma
    grid_y = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[grid_x, grid_y, z_lo, c], blury[grid_x+1, grid_y, z_lo, c], xf),
                                            hl.lerp(blury[grid_x, grid_y+1, z_lo, c], blury[grid_x+1, grid_y+1, z_lo, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[grid_x, grid_y, z_lo+1, c], blury[grid_x+1, grid_y, z_lo+1, c], xf),
                                            hl.lerp(blury[grid_x, grid_y+1, z_lo+1, c], blury[grid_x+1, grid_y+1, z_lo+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print("Compiling for CPU.")
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
Esempio n. 14
def test_simplestub():
    """Exercise ``simplestub.generate`` with several argument styles.

    Covers inputs passed by position and by name, the same calls with the
    ``offset`` GeneratorParam mixed in, and a series of bad-argument calls
    that are expected to raise ``RuntimeError`` with a specific message.

    NOTE(review): this copy of the function appears to be truncated in many
    places: several ``simplestub.generate(target,`` calls are unterminated,
    the ``except`` clauses have no visible ``try:``, and some ``str(`` calls
    are never closed. Restore it from the original source before running.
    """
    x, y = hl.Var(), hl.Var()
    target = hl.get_jit_target_from_environment()

    b_in = hl.Buffer(hl.UInt(8), [2, 2])

    f_in = hl.Func("f")
    f_in[x, y] = x + y

    # ----------- Inputs by-position
    f = simplestub.generate(target, b_in, f_in, 3.5)

    # ----------- Inputs by-name
    # NOTE(review): the keyword arguments of the next two calls are missing.
    f = simplestub.generate(target,

    f = simplestub.generate(target,

    # ----------- Above set again, w/ GeneratorParam mixed in
    k = 42

    # (positional)
    f = simplestub.generate(target, b_in, f_in, 3.5, offset=k)
    _realize_and_check(f, k)

    # (keyword)
    # NOTE(review): the keyword arguments of the next four calls are missing.
    f = simplestub.generate(target,
    _realize_and_check(f, k)

    f = simplestub.generate(target,
    _realize_and_check(f, k)

    f = simplestub.generate(target,
    _realize_and_check(f, k)

    f = simplestub.generate(target,
    _realize_and_check(f, k)

    # ----------- Test various failure modes
    # NOTE(review): each case below presumably followed a try / except / else
    # pattern; the "try:" and "else:" lines are not present in this copy.
        # Inputs w/ mixed by-position and by-name
        f = simplestub.generate(target, b_in, f_in, float_arg=3.5)
    except RuntimeError as e:
        assert 'Cannot use both positional and keyword arguments for inputs.' in str(
        assert False, 'Did not see expected exception!'

        # too many positional args
        f = simplestub.generate(target, b_in, f_in, 3.5, 4)
    except RuntimeError as e:
        assert 'Expected exactly 3 positional args for inputs, but saw 4.' in str(
        assert False, 'Did not see expected exception!'

        # too few positional args
        f = simplestub.generate(target, b_in, f_in)
    except RuntimeError as e:
        assert 'Expected exactly 3 positional args for inputs, but saw 2.' in str(
        assert False, 'Did not see expected exception!'

        # Inputs that can't be converted to what the receiver needs (positional)
        f = simplestub.generate(target, hl.f32(3.141592), "happy", k)
    except RuntimeError as e:
        assert 'Unable to cast Python instance' in str(e)
        assert False, 'Did not see expected exception!'

        # Inputs that can't be converted to what the receiver needs (named)
        f = simplestub.generate(target, b_in, f_in, float_arg="bogus")
    except RuntimeError as e:
        assert 'Unable to cast Python instance' in str(e)
        assert False, 'Did not see expected exception!'

        # Input specified by both pos and kwarg
        f = simplestub.generate(target, b_in, f_in, 3.5, float_arg=4.5)
    except RuntimeError as e:
        assert "Cannot use both positional and keyword arguments for inputs." in str(
        assert False, 'Did not see expected exception!'

        # Bad input name
        f = simplestub.generate(target,
    except RuntimeError as e:
        assert "Expected exactly 3 keyword args for inputs, but saw 2." in str(
        assert False, 'Did not see expected exception!'

        # Bad gp name
        f = simplestub.generate(target,
    except RuntimeError as e:
        assert "Generator simplestub has no GeneratorParam named: nonexistent_generator_param" in str(
        assert False, 'Did not see expected exception!'
Esempio n. 15
def get_local_laplacian(input, levels, alpha, beta, J=8):
    """Build a local-Laplacian-filter pipeline and return its output Func.

    The luminance of ``input`` is remapped at ``levels`` intensity levels.
    A Gaussian and a Laplacian pyramid with ``J`` levels are built for the
    remapped stack and for the luminance itself. The output Laplacian
    pyramid interpolates between the two nearest remapped levels, is
    collapsed back into an image, and colour is reintroduced by scaling
    each channel with the ratio of new to old luminance.

    Args:
        input: image indexed ``[x, y, c]`` that exposes ``width()`` and
            ``height()``; values are divided by 65535.0 before processing.
        levels: number of intensity levels in the remapped stack.
        alpha: scale applied to the remapping function.
        beta: scale applied to ``gray - level`` before remapping.
        J: number of pyramid levels.

    Returns:
        The scheduled ``local_laplacian`` Func with UInt(16) output.

    ``float_t`` and ``int_t`` are expected to be defined in the enclosing
    module.
    """
    n_downsamples = 0
    n_upsamples = 0

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        """Halve a 3-D Func in x then y with a 1-3-3-1 filter."""
        nonlocal n_downsamples
        downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func(
            'downy%i' % n_downsamples)
        n_downsamples += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0

        return downy

    def upsample(f):
        """Double a 3-D Func in x then y with 0.25 / 0.75 weights."""
        nonlocal n_upsamples
        # The second Func name argument was truncated.
        upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func(
            'upy%i' % n_upsamples)
        n_upsamples += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 *
                                (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 *
                                  (y % 2), c] + 0.75 * upx[x, y // 2, c]

        return upy

    def downsample2D(f):
        """Halve a 2-D Func in x then y with a 1-3-3-1 filter."""
        nonlocal n_downsamples
        downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func(
            'downy%i' % n_downsamples)
        n_downsamples += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample2D(f):
        """Double a 2-D Func in x then y with 0.25 / 0.75 weights."""
        nonlocal n_upsamples
        # The second Func name argument was truncated.
        upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func(
            'upy%i' % n_upsamples)
        n_upsamples += 1

        # The "upx[x," and "upy[x," left-hand sides were missing.
        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x, y] = 0.25 * upx[x,
                               (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]

        return upy

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    # remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0,
                                         input.width() - 1),
                                hl.clamp(y, 0,
                                         input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    kR = hl.f32(0.299)
    kG = hl.f32(0.587)
    kB = hl.f32(0.114)
    # The "gray[x," left-hand side was missing.
    gray[x, y] = (kR * clamped[x, y, 0] + kG * clamped[x, y, 1] +
                  kB * clamped[x, y, 2])

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%i' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entries per intensity level
    level = k / (levels - 1)
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y,
                k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%i' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%i' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%i' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%i' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = hl.f32(0.01)
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] +
                                             eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")

        output.compute_root().gpu_tile(x, y, xi, yi, 16, 8)
        for j in range(J):
            # Use smaller tiles for the coarser pyramid levels.
            blockw = 16
            blockh = 8
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                # Level 0 of inGPyramid is `gray` and gPyramid[0] is left
                # unscheduled, so only levels >= 1 are tiled here.
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                      blockh)
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                   blockh)
    else:
        # CPU schedule
        print("Compiling for CPU")

        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        # Fine levels are parallelised and vectorised. min() keeps the
        # indices valid when fewer than four levels are requested.
        for j in range(min(4, J)):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        # NOTE(review): the body of this loop was missing. The coarse levels
        # are computed at root here, which is believed to match the upstream
        # Halide local_laplacian app; confirm against the original schedule.
        for j in range(4, J):
            inGPyramid[j].compute_root()
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root()

    return output