def histogram(x, y, c, img, w, h, hist_index):
    print("GET HIST ON: ", w, h)
    histogram = hl.Func("histogram")

    # Histogram buckets start as zero.
    histogram[hist_index] = 0

    # Define a multi-dimensional reduction domain over the input image:
    r = hl.RDom([(0, w), (0, h)])

    # For every point in the reduction domain, increment the
    # histogram bucket corresponding to the intensity of the
    # input image at that point.

    histogram[hl.Expr(img[r.x, r.y])] += 1
    histogram.set_estimate(hist_index, 0, 255)

    # Get the sum of all histogram cells
    r = hl.RDom([(0,255)])
    hist_sum = hl.Func('hist_sum')
    hist_sum[()] = 0.0 # Compute the sum as a 32-bit integer
    hist_sum[()] += histogram[r.x]

    # Return each histogram as a % of total color
    pct_hist = hl.Func('pct_hist')
    pct_hist[hist_index] = histogram[hist_index] / hist_sum[()]

    return histogram
Esempio n. 2
0
def merge_temporal(images, alignment):
    weight = hl.Func("merge_temporal_weights")
    total_weight = hl.Func("merge_temporal_total_weights")
    output = hl.Func("merge_temporal_output")

    ix, iy, tx, ty, n = hl.Var('ix'), hl.Var('iy'), hl.Var('tx'), hl.Var('ty'), hl.Var('n')
    rdom0 = hl.RDom([(0, 16), (0, 16)])

    rdom1 = hl.RDom([(1, images.dim(2).extent() - 1)])

    imgs_mirror = hl.BoundaryConditions.mirror_interior(images, [(0, images.width()), (0, images.height())])

    layer = box_down2(imgs_mirror, "merge_layer")

    offset = Point(alignment[tx, ty, n]).clamp(Point(MINIMUM_OFFSET, MINIMUM_OFFSET),
                                               Point(MAXIMUM_OFFSET, MAXIMUM_OFFSET))

    al_x = idx_layer(tx, rdom0.x) + offset.x / 2
    al_y = idx_layer(ty, rdom0.y) + offset.y / 2

    ref_val = layer[idx_layer(tx, rdom0.x), idx_layer(ty, rdom0.y), 0]
    alt_val = layer[al_x, al_y, n]

    factor = 8.0
    min_distance = 10
    max_distance = 300 # max L1 distance, otherwise the value is not used

    distance = hl.sum(hl.abs(hl.cast(hl.Int(32), ref_val) - hl.cast(hl.Int(32), alt_val))) / 256

    normal_distance = hl.max(1, hl.cast(hl.Int(32), distance) / factor - min_distance / factor)

    # Weight for the alternate frame
    weight[tx, ty, n] = hl.select(normal_distance > (max_distance - min_distance), 0.0,
                                  1.0 / normal_distance)

    total_weight[tx, ty] = hl.sum(weight[tx, ty, rdom1]) + 1

    offset = Point(alignment[tx, ty, rdom1])

    al_x = idx_im(tx, ix) + offset.x
    al_y = idx_im(ty, iy) + offset.y

    ref_val = imgs_mirror[idx_im(tx, ix), idx_im(ty, iy), 0]
    alt_val = imgs_mirror[al_x, al_y, rdom1]

    # Sum all values according to their weight, and divide by total weight to obtain average
    output[ix, iy, tx, ty] = hl.sum(weight[tx, ty, rdom1] * alt_val / total_weight[tx, ty]) + ref_val / total_weight[
        tx, ty]

    weight.compute_root().parallel(ty).vectorize(tx, 16)

    total_weight.compute_root().parallel(ty).vectorize(tx, 16)

    output.compute_root().parallel(ty).vectorize(ix, 32)

    return output
Esempio n. 3
0
def gauss_15x15(input, name):
    print('        gauss_15x15')

    k = hl.Buffer(hl.Float(32), [15], "gauss_15x15")
    k.translate([-7])

    rdom = hl.RDom([(-7, 15)])

    k.fill(0)
    k[-7] = 0.004961
    k[-6] = 0.012246
    k[-5] = 0.026304
    k[-4] = 0.049165
    k[-3] = 0.079968
    k[-2] = 0.113193
    k[-1] = 0.139431
    k[0] = 0.149464
    k[7] = 0.004961
    k[6] = 0.012246
    k[5] = 0.026304
    k[4] = 0.049165
    k[3] = 0.079968
    k[2] = 0.113193
    k[1] = 0.139431

    return gauss(input, k, rdom, name)
Esempio n. 4
0
def test_basics3():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma',
                       0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2,
                  y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    zi = hl.i32((val / r_sigma) + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    left = histogram[x, y, zi, c]
    left += 5
    left += ss
Esempio n. 5
0
def test_rdom():
    x = hl.Var("x")
    y = hl.Var("y")

    diagonal = hl.Func("diagonal")
    diagonal[x, y] = 1

    domain_width = 10
    domain_height = 10

    r = hl.RDom(0, domain_width, 0, domain_height)
    r.where(r.x <= r.y)

    diagonal[r.x, r.y] = 2
    output = diagonal.realize(domain_width, domain_height)

    for iy in range(domain_height):
        for ix in range(domain_width):
            if ix <= iy:
                assert output(ix, iy) == 2
            else:
                assert output(ix, iy) == 1

    print("Success!")
    return 0
Esempio n. 6
0
def test_basics2():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    s_sigma = 8

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.i32(1), y * s_sigma * hl.i32(1)]
    val22 = clamped[x * s_sigma - hl.i32(s_sigma // 2),
                    y * s_sigma - hl.i32(s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]

    try:
        val1 = clamped[x * s_sigma - s_sigma / 2, y * s_sigma - s_sigma / 2]
    except RuntimeError as e:
        assert 'Implicit cast from float32 to int' in str(e)
    else:
        assert False, 'Did not see expected exception!'
Esempio n. 7
0
def test_basics2():

    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1) # Value needed if not generating an executable
    s_sigma = 8 # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width()-1),
                          hl.clamp(y, 0, input.height()-1),0]

    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1), y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma//2),
                    y * s_sigma - hl.cast(hl.Int(32), s_sigma//2)]
    val2 = clamped[x * s_sigma - s_sigma//2, y * s_sigma - s_sigma//2]
    val3 = clamped[x * s_sigma + r.x - s_sigma//2, y * s_sigma + r.y - s_sigma//2]

    return
Esempio n. 8
0
def test_basics2():

    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma',
                       0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    if True:
        print("s_sigma", s_sigma)
        print("s_sigma/2", s_sigma / 2)
        print("s_sigma//2", s_sigma // 2)
        print()
        print("x * s_sigma", x * s_sigma)
        print("x * 8", x * 8)
        print("x * 8 + 4", x * 8 + 4)
        print("x * 8 * 4", x * 8 * 4)
        print()
        print("x", x)
        print("(x * s_sigma).type()", )
        print("(x * 8).type()", (x * 8).type())
        print("(x * 8 + 4).type()", (x * 8 + 4).type())
        print("(x * 8 * 4).type()", (x * 8 * 4).type())
        print("(x * 8 / 4).type()", (x * 8 / 4).type())
        print("((x * 8) * 4).type()", ((x * 8) * 4).type())
        print("(x * (8 * 4)).type()", (x * (8 * 4)).type())

    assert (x * 8).type() == hl.Int(32)
    assert (x * 8 * 4).type() == hl.Int(32)  # yes this did fail at some point
    assert ((x * 8) / 4).type() == hl.Int(32)
    assert (x * (8 / 4)).type() == hl.Float(32)  # under python3 division rules
    assert (x * (8 // 4)).type() == hl.Int(32)
    #assert (x * 8 // 4).type() == hl.Int(32) # not yet implemented

    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1),
                    y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma // 2),
                    y * s_sigma - hl.cast(hl.Int(32), s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]

    return
Esempio n. 9
0
def test_basics4():
    # Test for f[g[r]] = ...
    # See https://github.com/halide/Halide/issues/4285
    x = hl.Var('x')
    f = hl.Func('f')
    g = hl.Func('g')
    g[x] = 1
    f[x] = 0.0
    r = hl.RDom([(0, 100)])
    f[g[r]] = 2.3 # This triggers a warning of double-to-float conversion
    f.compute_root()
    f.compile_jit()
Esempio n. 10
0
def test_basics4():
    # Test for f[g[r]] = ...
    # See https://github.com/halide/Halide/issues/4285
    x = hl.Var('x')
    f = hl.Func('f')
    g = hl.Func('g')
    g[x] = 1
    f[x] = 0.0
    r = hl.RDom([(0, 100)])
    f[g[r]] = 2.5
    f.compute_root()
    f.compile_jit()
Esempio n. 11
0
def box_down2(input, name):
    output = hl.Func(name)

    x, y, n = hl.Var("x"), hl.Var("y"), hl.Var('n')
    rdom = hl.RDom([(0, 2), (0, 2)])

    output[x, y, n] = hl.cast(
        hl.UInt(16),
        hl.sum(hl.cast(hl.UInt(32), input[2 * x + rdom.x, 2 * y + rdom.y, n]))
        / 4)

    output.compute_root().parallel(y).vectorize(x, 16)

    return output
Esempio n. 12
0
def align_layer(layer, prev_alignment, prev_min, prev_max):
    scores = hl.Func(layer.name() + "_scores")
    alignment = hl.Func(layer.name() + "_alignment")
    xi, yi, tx, ty, n = hl.Var("xi"), hl.Var("yi"), hl.Var('tx'), hl.Var(
        'ty'), hl.Var('n')
    rdom0 = hl.RDom([(0, 16), (0, 16)])
    rdom1 = hl.RDom([(-4, 8), (-4, 8)])

    # Alignment of the previous (more coarse) layer scaled to this (finer) layer
    prev_offset = DOWNSAMPLE_RATE * Point(
        prev_alignment[prev_tile(tx), prev_tile(ty), n]).clamp(
            prev_min, prev_max)

    x0 = idx_layer(tx, rdom0.x)
    y0 = idx_layer(ty, rdom0.y)
    # (x,y) coordinates in the search region relative to the offset obtained from the alignment of the previous layer
    x = x0 + prev_offset.x + xi
    y = y0 + prev_offset.y + yi

    ref_val = layer[x0, y0, 0]  # Value of reference frame (the first frame)
    alt_val = layer[x, y, n]  # alternate frame value

    # L1 distance between reference frame and alternate frame
    d = hl.abs(hl.cast(hl.Int(32), ref_val) - hl.cast(hl.Int(32), alt_val))

    scores[xi, yi, tx, ty, n] = hl.sum(d)

    # Alignment for each tile, where L1 distances are minimum
    alignment[tx, ty, n] = Point(hl.argmin(scores[rdom1.x, rdom1.y, tx, ty,
                                                  n])) + prev_offset

    scores.compute_at(alignment, tx).vectorize(xi, 8)

    alignment.compute_root().parallel(ty).vectorize(tx, 16)

    return alignment
def deviation(x, y, c, img):
    _gray = gray(x, y, c, img)

    r = hl.RDom([(-2, 5), (-2, 5)])

    avg = mkfunc('avg', _gray)
    avg[x,y] = 0.0
    avg[x,y] += _gray[x + r.x, y + r.y]
    avg[x,y] = avg[x,y] / 25.0

    deviation = mkfunc('deviation', avg)
    deviation[x,y] = 0.0
    deviation[x,y] += (_gray[x + r.x, y + r.y] - avg[x,y]) ** 2
    deviation[x,y] = (deviation[x,y] / 25.0)

    return deviation
Esempio n. 14
0
def gauss_7x7(input, name):
    k = hl.Buffer(hl.Float(32), [7], "gauss_7x7_kernel")
    k.translate([-3])

    rdom = hl.RDom([(-3, 7)])

    k.fill(0)
    k[-3] = 0.026267
    k[-2] = 0.100742
    k[-1] = 0.225511
    k[0] = 0.29496
    k[1] = 0.225511
    k[2] = 0.100742
    k[3] = 0.026267

    return gauss(input, k, rdom, name)
Esempio n. 15
0
def test_atomics():
    x = hl.Var('x')
    im = hl.Func('im')
    f = hl.Func('f')
    im[x] = (x * x) % 5
    r = hl.RDom([(0, 100)])
    f[x] = 0
    f[hl.Expr(im[r])] += 1
    f.compute_root().update().atomic().parallel(r)
    b = f.realize(5)

    ref = [0, 0, 0, 0, 0]
    for i in range(100):
        idx = (i * i) % 5
        ref[idx] += 1
    for i in range(5):
        assert (b[i] == ref[i])
def entropy(x, y, c, img, w, h, hist_index):
    base_gray = gray(x, y, c, img)
    clamped_gray = mkfunc('clamped_gray', base_gray)
    clamped_gray[x,y] = hl.clamp(base_gray[x,y], 0, 255)
    u8_gray = u8(x, y, c, clamped_gray)

    probabilities = histogram(x, y, c, u8_gray, w, h, hist_index)

    r = hl.RDom([(-2, 5), (-2, 5)])

    levels = mkfunc('entropy', img)
    levels[x,y] = 0.0
    # Add in 0.00001 to prevent -Inf's
    levels[x,y] += base_gray[x + r.x, y + r.y] * hl.log(probabilities[u8_gray[x + r.x, y + r.y]]+0.00001)
    levels[x,y] = levels[x,y] * -1.0

    return levels
Esempio n. 17
0
def tone_map(input, width, height, compression, gain):
    print(f'Compression: {compression}, gain: {gain}')

    normal_dist = hl.Func("luma_weight_distribution")
    grayscale = hl.Func("grayscale")
    output = hl.Func("tone_map_output")

    x, y, c, v = hl.Var("x"), hl.Var("y"), hl.Var("c"), hl.Var("v")

    rdom = hl.RDom([(0, 3)])

    normal_dist[v] = hl.f32(hl.exp(-12.5 * hl.pow(hl.f32(v) / 65535 - 0.5, 2)))

    grayscale[x, y] = hl.u16(hl.sum(hl.u32(input[x, y, rdom])) / 3)

    dark = grayscale

    comp_const = 1
    gain_const = 1

    comp_slope = (compression - comp_const) / (TONE_MAP_PASSES)
    gain_slope = (gain - gain_const) / (TONE_MAP_PASSES)

    for i in range(TONE_MAP_PASSES):
        print('    pass', i)

        norm_comp = i * comp_slope + comp_const
        norm_gain = i * gain_slope + gain_const

        bright = brighten(dark, norm_comp)

        dark_gamma = gamma_correct(dark)
        bright_gamma = gamma_correct(bright)

        dark_gamma = combine2(dark_gamma, bright_gamma, width, height, normal_dist)

        dark = brighten(gamma_inverse(dark_gamma), norm_gain)

    output[x, y, c] = hl.u16_sat(hl.u32(input[x, y, c]) * hl.u32(dark[x, y]) / hl.u32(hl.max(1, grayscale[x, y])))

    grayscale.compute_root().parallel(y).vectorize(x, 16)

    normal_dist.compute_root().vectorize(v, 16)

    return output
Esempio n. 18
0
def test_basics5():
    # Test Func.in_()
    x, y = hl.Var('x'), hl.Var('y')
    f = hl.Func('f')
    g = hl.Func('g')
    h = hl.Func('h')
    f[x, y] = y
    r = hl.RDom([(0, 100)])
    g[x] = 0
    g[x] += f[x, r]
    h[x] = 0
    h[x] += f[x, r]
    f.in_(g).compute_at(g, x)
    f.in_(h).compute_at(h, x)
    g.compute_root()
    h.compute_root()
    p = hl.Pipeline([g, h])
    p.compile_jit()
Esempio n. 19
0
def gaussian_down4(input, name):
    output = hl.Func(name)
    k = hl.Func(name + "_filter")
    x, y, n = hl.Var("x"), hl.Var("y"), hl.Var('n')
    rdom = hl.RDom([(-2, 5), (-2, 5)])

    k[x, y] = 0
    k[-2, -2] = 2
    k[-1, -2] = 4
    k[0, -2] = 5
    k[1, -2] = 4
    k[2, -2] = 2
    k[-2, -1] = 4
    k[-1, -1] = 9
    k[0, -1] = 12
    k[1, -1] = 9
    k[2, -1] = 4
    k[-2, 0] = 5
    k[-1, 0] = 12
    k[0, 0] = 15
    k[1, 0] = 12
    k[2, 0] = 5
    k[-2, 1] = 4
    k[-1, 1] = 9
    k[0, 1] = 12
    k[1, 1] = 9
    k[2, 1] = 4
    k[-2, 2] = 2
    k[-1, 2] = 4
    k[0, 2] = 5
    k[1, 2] = 4
    k[2, 2] = 2

    output[x, y, n] = hl.cast(
        hl.UInt(16),
        hl.sum(
            hl.cast(
                hl.UInt(32), input[4 * x + rdom.x, 4 * y + rdom.y, n] *
                k[rdom.x, rdom.y])) / 159)

    k.compute_root().parallel(y).parallel(x)
    output.compute_root().parallel(y).vectorize(x, 16)

    return output
Esempio n. 20
0
def test_rdom():
    x = hl.Var("x")
    y = hl.Var("y")

    diagonal = hl.Func("diagonal")
    diagonal[x, y] = 1

    domain_width = 10
    domain_height = 10

    r = hl.RDom([(0, domain_width), (0, domain_height)])
    r.where(r.x <= r.y)

    diagonal[r.x, r.y] += 2
    output = diagonal.realize(domain_width, domain_height)

    for iy in range(domain_height):
        for ix in range(domain_width):
            if ix <= iy:
                assert output[ix, iy] == 3
            else:
                assert output[ix, iy] == 1

    assert r.x.name() == r[0].name()
    assert r.y.name() == r[1].name()
    try:
        r[-1].name()
        raise Exception("underflowing index should raise KeyError")
    except KeyError:
        pass
    try:
        r[2].name()
        raise Exception("overflowing index should raise KeyError")
    except KeyError:
        pass
    try:
        r["foo"].name()
        raise Exception("bad index type should raise TypeError")
    except TypeError:
        pass

    return 0
Esempio n. 21
0
    def gen_outputs(self):
        ''' define the outputs '''
        nbfn = self.nbfn
        i, j = [self.vars[c] for c in "ij"]
        g_fock = self.funcs["g_fock"]
        g_dens = self.clamps["g_dens"]
        # output scalars
        rv = hl.Func("rv")

        # output matrix
        g_fock_out = hl.Func("g_fock_out")
        self.funcs.update({"rv": rv, "g_fock_out": g_fock_out})
        self.outputs.update({"rv": rv, "g_fock_out": g_fock_out})

        g_fock_out[i, j] = g_fock[i, j]

        rv[i] = hl.f64(0.0)
        r_rv = hl.RDom([(0, nbfn), (0, nbfn)])
        rv[0] += g_fock[r_rv] * g_dens[r_rv]
        rv[0] *= hl.f64(0.5)
Esempio n. 22
0
def srgb(input, ccm):
    srgb_matrix = hl.Func("srgb_matrix")
    output = hl.Func("srgb_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    rdom = hl.RDom([(0, 3)])

    srgb_matrix[x, y] = hl.f32(0)

    srgb_matrix[0, 0] = hl.f32(ccm[0][0])
    srgb_matrix[1, 0] = hl.f32(ccm[0][1])
    srgb_matrix[2, 0] = hl.f32(ccm[0][2])
    srgb_matrix[0, 1] = hl.f32(ccm[1][0])
    srgb_matrix[1, 1] = hl.f32(ccm[1][1])
    srgb_matrix[2, 1] = hl.f32(ccm[1][2])
    srgb_matrix[0, 2] = hl.f32(ccm[2][0])
    srgb_matrix[1, 2] = hl.f32(ccm[2][1])
    srgb_matrix[2, 2] = hl.f32(ccm[2][2])

    output[x, y, c] = hl.u16_sat(hl.sum(srgb_matrix[rdom, c] * input[x, y, rdom]))

    return output
Esempio n. 23
0
def white_balance(input, width, height, white_balance_r, white_balance_g0, white_balance_g1, white_balance_b):
    output = hl.Func("white_balance_output")

    print(width, height, white_balance_r, white_balance_g0, white_balance_g1, white_balance_b)

    x, y = hl.Var("x"), hl.Var("y")

    rdom = hl.RDom([(0, width / 2), (0, height / 2)])

    output[x, y] = hl.u16(0)

    output[rdom.x * 2, rdom.y * 2] = hl.u16_sat(white_balance_r * hl.f32(input[rdom.x * 2, rdom.y * 2]))
    output[rdom.x * 2 + 1, rdom.y * 2] = hl.u16_sat(white_balance_g0 * hl.f32(input[rdom.x * 2 + 1, rdom.y * 2]))
    output[rdom.x * 2, rdom.y * 2 + 1] = hl.u16_sat(white_balance_g1 * hl.f32(input[rdom.x * 2, rdom.y * 2 + 1]))
    output[rdom.x * 2 + 1, rdom.y * 2 + 1] = hl.u16_sat(white_balance_b * hl.f32(input[rdom.x * 2 + 1, rdom.y * 2 + 1]))

    output.compute_root().parallel(y).vectorize(x, 16)

    output.update(0).parallel(rdom.y)
    output.update(1).parallel(rdom.y)
    output.update(2).parallel(rdom.y)
    output.update(3).parallel(rdom.y)

    return output
Esempio n. 24
0
def test_rdom():
    x = hl.Var("x")
    y = hl.Var("y")

    diagonal = hl.Func("diagonal")
    diagonal[x, y] = 1

    domain_width = 10
    domain_height = 10

    r = hl.RDom([(0, domain_width), (0, domain_height)])
    r.where(r.x <= r.y)

    diagonal[r.x, r.y] += 2
    output = diagonal.realize(domain_width, domain_height)

    for iy in range(domain_height):
        for ix in range(domain_width):
            if ix <= iy:
                assert output[ix, iy] == 3
            else:
                assert output[ix, iy] == 1

    return 0
def main():
    # Declare some Vars to use below.
    x, y = hl.Var("x"), hl.Var("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/gray.png")
    input_data = imread(image_path)
    if True:
        # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x, y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:
        f[x,
          17] = x + 8  # x is used, so all uses of f must have x as the first argument.
        f[0,
          y] = y * 8  # y is used, so all uses of f must have y as the second argument.
        f[x, x + 1] = x + 8
        f[y / 2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1) = y + 8   <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x) = y - x      <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4) = x + y      <- Free variables appear on the right-hand-side but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y  # Pure definition
        g[2, 1] = 42  # First update definition
        g[x, 0] = g[x, 1]  # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn. The equivalent C is:
        result = np.empty((4, 4), dtype=np.int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy

        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section

    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:

        # f[x, 0) = f[x, 0) * f[x, 0)
        # f[x, 1) = f[x, 1) * f[x, 1)
        # f[x, 2) = f[x, 2) * f[x, 2)
        # ...
        # f[x, 49) = f[x, 49) * f[x, 49)

        # Or equivalently using a compile-time loop in our C++:
        # for (int i = 0 i < 50 i++) {
        #   f[x, i) = f[x, i) * f[x, i)
        #

        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom(0, 50)
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent C is:
        c_result = np.empty((100, 100), dtype=np.int)
        for yy in range(100):
            for xx in range(100):
                c_result[yy][xx] = xx + yy

        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx]

        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                if halide_result(xx, yy) != c_result[yy][xx]:
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                    return -1

    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:

        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram. The
        # natural way to do it is to iterate over the input image,
        # updating histogram buckets. Here's how you do that in Halide:
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom(0, input.width(), 0, input.height())

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1

        halide_result = histogram.realize(256)

        # The equivalent C is:
        c_result = np.empty((256), dtype=np.int)
        for xx in range(256):
            c_result[xx] = 0

        for r_y in range(input.height()):
            for r_x in range(input.width()):
                c_result[input_data[r_x, r_y]] += 1

        # Check the answers agree:
        for xx in range(256):
            if c_result[xx] != halide_result(xx):
                raise Exception("halide_result(%d) = %d instead of %d" %
                                (xx, halide_result(xx), c_result[xx]))
                return -1

    # Scheduling update steps
    if True:
        # The pure variables in an update step and can be
        # parallelized, vectorized, split, etc as usual.

        # Vectorizing, splitting, or parallelize the variables that
        # are part of the reduction domain is trickier. We'll cover
        # that in a later lesson.

        # Consider the definition:
        f = hl.Func("x")
        f[x, y] = x * y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. To control the pure definition, we schedule
        # as we have done in the past. The following code vectorizes
        # and parallelizes the pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func::update(int) to get a handle to an update step
        # for the purposes of scheduling. The following line
        # vectorizes the first update step across x. We can't do
        # anything with y for this update step, because it doesn't
        # use y.
        f.update(0).vectorize(x, 4)

        # Now we parallelize the second update step in chunks of size
        # 4.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)

        halide_result = f.realize(16, 16)

        # Here's the equivalent (serial) C:
        c_result = np.empty((16, 16), dtype=np.int)

        # Pure step. Vectorized in x and parallelized in y.
        for yy in range(16):  # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
                c_result[yy][xx[0]] = xx[0] * yy
                c_result[yy][xx[1]] = xx[1] * yy
                c_result[yy][xx[2]] = xx[2] * yy
                c_result[yy][xx[3]] = xx[3] * yy

        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
            c_result[1][xx[0]] = c_result[0][xx[0]]
            c_result[1][xx[1]] = c_result[0][xx[1]]
            c_result[1][xx[2]] = c_result[0][xx[2]]
            c_result[1][xx[3]] = c_result[0][xx[3]]

        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4):  # Should be a parallel for loop
            for yi in range(4):
                yy = yo * 4 + yi
                c_result[yy][1] = c_result[yy][0] + 2

        # Check the C and Halide results match:
        for yy in range(16):
            for xx in range(16):
                if halide_result(xx, yy) != c_result[yy][xx]:
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                    return -1

    # That covers how to schedule the variables within a hl.Func that
    # uses update steps, but what about producer-consumer
    # relationships that involve compute_at and store_at? Let's
    # examine a reduction as a producer, in a producer-consumer pair.
    if True:
        # Because an update does multiple passes over a stored array,
        # it's not meaningful to inline them. So the default schedule
        # for them does the closest thing possible. It computes them
        # in the innermost loop of their consumer. Consider this
        # trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x * 17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=np.int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            c_result[xx] = 2 * producer_storage[0]

        # Check the results match
        for xx in range(10):
            if halide_result(xx) != c_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" %
                                (xx, halide_result(xx), c_result[xx]))
                return -1

        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.

    # Now let's consider a reduction as a consumer in a
    # producer-consumer pair. This is a little more involved.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x * 17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # The valid schedules for the producer in this case are
            # the default schedule - inlined, and also:
            #
            # 1) producer.compute_at(x), which places the computation of
            # the producer inside the loop over x in the pure step of the
            # consumer.
            #
            # 2) producer.compute_root(), which computes all of the
            # producer ahead of time.
            #
            # 3) producer.store_root().compute_at(x), which allocates
            # space for the consumer outside the loop over x, but fills
            # it in as needed inside the loop.
            #
            # Let's use option 1.

            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)

            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] = 2 * producer_storage[0]

            # Update step for the consumer
            for xx in range(10):
                c_result[xx] += 1

            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

        if True:
            # Case 2: The consumer references the producer in the update step only
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside the update
            # step of the producer, because that's the only step that
            # uses the producer.
            producer.compute_at(consumer, x)

            # Note however, that we didn't say:
            #
            # producer.compute_at(consumer.update(0), x).
            #
            # Scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                c_result[xx] = xx

            # Update step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside both the
            # pure and the update step of the producer. So there ends
            # up being two separate realizations of the producer, and
            # redundant work occurs.
            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] = producer_storage[0] * xx

            # Update step for the consumer
            for xx in range(10):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x * y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x - 1]
            consumer[0, y] = producer[y, y - 1]

            # In this case neither producer.compute_at(consumer, x)
            # nor producer.compute_at(consumer, y) will work, because
            # either one fails to cover one of the uses of the
            # producer. So we'd have to inline producer, or use
            # producer.compute_root().

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:

            # Attempt 2:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(
            ), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]

            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)

            halide_result = consumer_2.realize(10, 10)

            # The equivalent C is:
            c_result = np.empty((10, 10), dtype=np.int)

            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    c_result[yy][xx] = xx + yy

            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=np.int)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                c_result[0][xx] += producer_wrapper_1_storage[0]

            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=np.int)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                c_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            for yy in range(10):
                for xx in range(10):
                    if halide_result(xx, yy) != c_result[yy][xx]:
                        print("halide_result(%d, %d) = %d instead of %d", xx,
                              yy, halide_result(xx, yy), c_result[yy][xx])
                        return -1

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there.

            producer, consumer = hl.Func("producer"), hl.Func("consumer")

            r = hl.RDom(0, 5)
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]

            producer.compute_at(consumer, r)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer.
            for xx in range(10):
                c_result[xx] = xx + 10

            # Update step for the consumer.
            for xx in range(10):
                for rr in range(
                        5
                ):  # The loop over the reduction domain is always the inner loop.
                    # We've schedule the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=np.int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17

                    # Now use it in the update step of the consumer.
                    c_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # hl.clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom(-2, 5, -2, 5)

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0  # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)

        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent C:

        #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0]
        local_sum = np.empty((1), dtype=np.int32)

        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())
        for yy in range(input.height()):
            for xx in range(input.width()):
                # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input(clamped_x, clamped_y)

                # Pure step of blurry
                #c_result(x, y) = (uint8_t)(local_sum[0] / 25)
                #c_result[xx, yy] = cast_to_uint8(local_sum[0] / 25)
                c_result[xx, yy] = int(local_sum[0] /
                                       25)  # hl.cast done internally

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result(xx, yy)))
                    return -1

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func("f1")
        r = hl.RDom(0, 100)
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction. It's equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            anon = np.empty((1), dtype=np.int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx

            c_result[xx] = anon[0] * 7

        # Check they all match.
        for xx in range(10):
            if halide_result_1(xx) != c_result[xx]:
                print("halide_result_1(%d) = %d instead of %d", x,
                      halide_result_1(x), c_result[x])
                return -1

            if halide_result_2(xx) != c_result[xx]:
                print("halide_result_2(%d) = %d instead of %d", x,
                      halide_result_2(x), c_result[x])
                return -1

    # A complex example that uses reduction helpers.
    if False:  # non-sense to port SSE code to python, skipping this test

        # Other reduction helpers include "product", "minimum",
        # "maximum", "hl.argmin", and "argmax". Using hl.argmin and argmax
        # requires understanding tuples, which come in a later
        # lesson. Let's use minimum and maximum to compute the local
        # spread of our grayscale image.

        # First, add a boundary condition to the input.
        clamped = hl.Func("clamped")
        x_clamped = hl.clamp(x, 0, input.width() - 1)
        y_clamped = hl.clamp(y, 0, input.height() - 1)
        clamped[x, y] = input[x_clamped, y_clamped]

        box = hl.RDom(-2, 5, -2, 5)
        # Compute the local maximum minus the local minimum:
        spread = hl.Func("spread")
        spread[x, y] = (maximum(clamped(x + box.x, y + box.y)) -
                        minimum(clamped(x + box.x, y + box.y)))

        # Compute the result in strips of 32 scanlines
        yo, yi = hl.Var("yo"), hl.Var("yi")
        spread.split(y, yo, yi, 32).parallel(yo)

        # Vectorize across x within the strips. This implicitly
        # vectorizes stuff that is computed within the loop over x in
        # spread, which includes our minimum and maximum helpers, so
        # they get vectorized too.
        spread.vectorize(x, 16)

        # We'll apply the boundary condition by padding each scanline
        # as we need it in a circular buffer (see lesson 08).
        clamped.store_at(spread, yo).compute_at(spread, yi)

        halide_result = spread.realize(input.width(), input.height())

        # The C equivalent is almost too horrible to contemplate (and
        # took me a long time to debug). This time I want to time
        # both the Halide version and the C version, so I'll use sse
        # intrinsics for the vectorization, and openmp to do the
        # parallel for loop (you'll need to compile with -fopenmp or
        # similar to get correct timing).
        #ifdef __SSE2__

        # Don't include the time required to allocate the output buffer.
        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())

        #ifdef _OPENMP
        t1 = datetime.now()
        #endif

        # Run this one hundred times so we can average the timing results.
        for iters in range(100):
            pass
            # #pragma omp parallel for
            # for yo in range((input.height() + 31)/32):
            #     y_base = hl.min(yo * 32, input.height() - 32)
            #
            #     # Compute clamped in a circular buffer of size 8
            #     # (smallest power of two greater than 5). Each thread
            #     # needs its own allocation, so it must occur here.
            #
            #     clamped_width = input.width() + 4
            #     clamped_storage = np.empty((clamped_width * 8), dtype=np.uint8)
            #
            #     for yi in range(32):
            #         y = y_base + yi
            #
            #         uint8_t *output_row = &c_result(0, y)
            #
            #         # Compute clamped for this scanline, skipping rows
            #         # already computed within this slice.
            #         int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2)
            #         int max_y_clamped = (y + 2)
            #         for (int cy = min_y_clamped cy <= max_y_clamped cy++) {
            #             # Figure out which row of the circular buffer
            #             # we're filling in using bitmasking:
            #             uint8_t *clamped_row = clamped_storage + (cy & 7) * clamped_width
            #
            #             # Figure out which row of the input we're reading
            #             # from by clamping the y coordinate:
            #             int clamped_y = std::hl.min(std::hl.max(cy, 0), input.height()-1)
            #             uint8_t *input_row = &input(0, clamped_y)
            #
            #             # Fill it in with the padding.
            #             for (int x = -2 x < input.width() + 2 ):
            #                 int clamped_x = std::hl.min(std::hl.max(x, 0), input.width()-1)
            #                 *clamped_row++ = input_row[clamped_x]
            #
            #
            #
            #         # Now iterate over vectors of x for the pure step of the output.
            #         for (int x_vec = 0 x_vec < (input.width() + 15)/16 x_vec++) {
            #             int x_base = std::hl.min(x_vec * 16, input.width() - 16)
            #
            #             # Allocate storage for the minimum and maximum
            #             # helpers. One vector is enough.
            #             __m128i minimum_storage, maximum_storage
            #
            #             # The pure step for the maximum is a vector of zeros
            #             maximum_storage = (__m128i)_mm_setzero_ps()
            #
            #             # The update step for maximum
            #             for (int max_y = y - 2 max_y <= y + 2 max_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (max_y & 7) * clamped_width
            #                 for (int max_x = x_base - 2 max_x <= x_base + 2 max_):
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + max_x + 2))
            #                     maximum_storage = _mm_max_epu8(maximum_storage, v)
            #
            #
            #
            #             # The pure step for the minimum is a vector of
            #             # ones. Create it by comparing something to
            #             # itself.
            #             minimum_storage = (__m128i)_mm_cmpeq_ps(_mm_setzero_ps(),
            #                                                     _mm_setzero_ps())
            #
            #             # The update step for minimum.
            #             for (int min_y = y - 2 min_y <= y + 2 min_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (min_y & 7) * clamped_width
            #                 for (int min_x = x_base - 2 min_x <= x_base + 2 min_):
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + min_x + 2))
            #                     minimum_storage = _mm_min_epu8(minimum_storage, v)
            #
            #
            #
            #             # Now compute the spread.
            #             __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage)
            #
            #             # Store it.
            #             _mm_storeu_si128((__m128i *)(output_row + x_base), spread)
            #
            #
            #
            #     del clamped_storage
            #
        # end of hundred iterations

        # Skip the timing comparison if we don't have openmp
        # enabled. Otherwise it's unfair to C.
        #ifdef _OPENMP
        t2 = datetime.now()

        # Now run the Halide version again without the
        # jit-compilation overhead. Also run it one hundred times.
        for iters in range(100):
            spread.realize(halide_result)

        t3 = datetime.now()

        # Report the timings. On my machine they both take about 3ms
        # for the 4-megapixel input (fast!), which makes sense,
        # because they're using the same vectorization and
        # parallelization strategy. However I find the Halide easier
        # to read, write, debug, modify, and port.
        print("Halide spread took %f ms. C equivalent took %f ms" %
              ((t3 - t2).total_seconds() * 1000,
               (t2 - t1).total_seconds() * 1000))

        #endif # _OPENMP

        # Check the results match:
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result(xx, yy)))
                    return -1

        #endif # __SSE2__
    else:
        print("(Skipped the SSE2 section of the code, "
              "since non-sense in python world.)")

    print("Success!")
    return 0
def main():
    # Declare some Vars to use below.
    x, y = hl.Var("x"), hl.Var("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png")
    input_data = imageio.imread(image_path)
    if True:
         # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x, y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:
        # x is used, so all uses of f must have x as the first argument.
        f[x, 17] = x + 8
        # y is used, so all uses of f must have y as the second argument.
        f[0, y] = y * 8
        f[x, x + 1] = x + 8
        f[y / 2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1) = y + 8   <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x) = y - x      <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4) = x + y      <- Free variables appear on the right-hand-side
        # but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y   # Pure definition
        g[2, 1] = 42      # First update definition
        g[x, 0] = g[x, 1]  # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn. The
        # equivalent Python is:
        result = np.empty((4, 4), dtype=np.int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy

        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section

    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:

        # f[x, 0) = f[x, 0) * f[x, 0)
        # f[x, 1) = f[x, 1) * f[x, 1)
        # f[x, 2) = f[x, 2) * f[x, 2)
        # ...
        # f[x, 49) = f[x, 49) * f[x, 49)

        # Or equivalently using a compile-time loop in our C++:
        # for (int i = 0 i < 50 i++) {
        #   f[x, i) = f[x, i) * f[x, i)
        #

        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom([(0, 50)])
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent Python is:
        py_result = np.empty((100, 100), dtype=np.int)
        for yy in range(100):
            for xx in range(100):
                py_result[yy][xx] = xx + yy

        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                py_result[rr][xx] = py_result[rr][xx] * py_result[rr][xx]

        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                assert halide_result[xx, yy] == py_result[yy][xx], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[yy][xx])

    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:

        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram. The
        # natural way to do it is to iterate over the input image,
        # updating histogram buckets. Here's how you do that in Halide:
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom([(0, input.width()), (0, input.height())])

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1

        halide_result = histogram.realize(256)

        # The equivalent Python is:
        py_result = np.empty((256), dtype=np.int)
        for xx in range(256):
            py_result[xx] = 0

        for r_y in range(input.height()):
            for r_x in range(input.width()):
                py_result[input_data[r_x, r_y]] += 1

        # Check the answers agree:
        for xx in range(256):
            assert py_result[xx] == halide_result[xx], \
                "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

    # Scheduling update steps
    if True:
        # The pure variables in an update step and can be
        # parallelized, vectorized, split, etc as usual.

        # Vectorizing, splitting, or parallelize the variables that
        # are part of the reduction domain is trickier. We'll cover
        # that in a later lesson.

        # Consider the definition:
        f = hl.Func("x")
        f[x, y] = x * y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. To control the pure definition, we schedule
        # as we have done in the past. The following code vectorizes
        # and parallelizes the pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func::update(int) to get a handle to an update step
        # for the purposes of scheduling. The following line
        # vectorizes the first update step across x. We can't do
        # anything with y for this update step, because it doesn't
        # use y.
        f.update(0).vectorize(x, 4)

        # Now we parallelize the second update step in chunks of size
        # 4.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)

        halide_result = f.realize(16, 16)

        # Here's the equivalent (serial) C:
        py_result = np.empty((16, 16), dtype=np.int)

        # Pure step. Vectorized in x and parallelized in y.
        for yy in range(16):  # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
                py_result[yy][xx[0]] = xx[0] * yy
                py_result[yy][xx[1]] = xx[1] * yy
                py_result[yy][xx[2]] = xx[2] * yy
                py_result[yy][xx[3]] = xx[3] * yy

        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
            py_result[1][xx[0]] = py_result[0][xx[0]]
            py_result[1][xx[1]] = py_result[0][xx[1]]
            py_result[1][xx[2]] = py_result[0][xx[2]]
            py_result[1][xx[3]] = py_result[0][xx[3]]

        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4):  # Should be a parallel for loop
            for yi in range(4):
                yy = yo * 4 + yi
                py_result[yy][1] = py_result[yy][0] + 2

        # Check the C and Halide results match:
        for yy in range(16):
            for xx in range(16):
                assert halide_result[xx, yy] == py_result[yy][xx], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[yy][xx])

    # That covers how to schedule the variables within a hl.Func that
    # uses update steps, but what about producer-consumer
    # relationships that involve compute_at and store_at? Let's
    # examine a reduction as a producer, in a producer-consumer pair.
    if True:
        # Because an update does multiple passes over a stored array,
        # it's not meaningful to inline them. So the default schedule
        # for them does the closest thing possible. It computes them
        # in the innermost loop of their consumer. Consider this
        # trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x * 17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent Python is:
        py_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=np.int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            py_result[xx] = 2 * producer_storage[0]

        # Check the results match
        for xx in range(10):
            assert halide_result[xx] == py_result[xx], \
                "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.

    # Now let's consider a reduction as a consumer in a
    # producer-consumer pair. This is a little more involved.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step
            # only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x * 17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # The valid schedules for the producer in this case are
            # the default schedule - inlined, and also:
            #
            # 1) producer.compute_at(x), which places the computation of
            # the producer inside the loop over x in the pure step of the
            # consumer.
            #
            # 2) producer.compute_root(), which computes all of the
            # producer ahead of time.
            #
            # 3) producer.store_root().compute_at(x), which allocates
            # space for the consumer outside the loop over x, but fills
            # it in as needed inside the loop.
            #
            # Let's use option 1.

            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)

            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] = 2 * producer_storage[0]

            # Update step for the consumer
            for xx in range(10):
                py_result[xx] += 1

            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        if True:
            # Case 2: The consumer references the producer in the update step
            # only
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside the update
            # step of the producer, because that's the only step that
            # uses the producer.
            producer.compute_at(consumer, x)

            # Note however, that we didn't say:
            #
            # producer.compute_at(consumer.update(0), x).
            #
            # Scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                py_result[xx] = xx

            # Update step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside both the
            # pure and the update step of the producer. So there ends
            # up being two separate realizations of the producer, and
            # redundant work occurs.
            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] = producer_storage[0] * xx

            # Update step for the consumer
            for xx in range(10):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x * y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x - 1]
            consumer[0, y] = producer[y, y - 1]

            # In this case neither producer.compute_at(consumer, x)
            # nor producer.compute_at(consumer, y) will work, because
            # either one fails to cover one of the uses of the
            # producer. So we'd have to inline producer, or use
            # producer.compute_root().

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:

            # Attempt 2:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]

            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)

            halide_result = consumer_2.realize(10, 10)

            # The equivalent Python is:
            py_result = np.empty((10, 10), dtype=np.int)

            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    py_result[yy][xx] = xx + yy

            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=np.int)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                py_result[0][xx] += producer_wrapper_1_storage[0]

            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=np.int)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                py_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            for yy in range(10):
                for xx in range(10):
                    assert halide_result[xx, yy] == py_result[yy][xx], \
                        "halide_result(%d, %d) = %d instead of %d" % (
                            xx, yy, halide_result[xx, yy], py_result[yy][xx])

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there.

            producer, consumer = hl.Func("producer"), hl.Func("consumer")

            r = hl.RDom([(0, 5)])
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]

            producer.compute_at(consumer, r)

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer.
            for xx in range(10):
                py_result[xx] = xx + 10

            # Update step for the consumer.
            for xx in range(10):
                # The loop over the reduction domain is always the inner loop.
                for rr in range(5):
                    # We've schedule the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=np.int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17

                    # Now use it in the update step of the consumer.
                    py_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # hl.clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.BoundaryConditions.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom([(-2, 5), (-2, 5)])

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0  # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)

        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent Python:

        #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0]
        local_sum = np.empty((1), dtype=np.int32)

        py_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()])
        for yy in range(input.height()):
            for xx in range(input.width()):
                # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input[clamped_x, clamped_y]

                # Pure step of blurry
                # py_result(x, y) = (uint8_t)(local_sum[0] / 25)
                #py_result[xx, yy] = cast_to_uint8(local_sum[0] / 25)
                # hl.cast done internally
                py_result[xx, yy] = int(local_sum[0] / 25)

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                assert halide_result[xx, yy] == py_result[xx, yy], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[xx, yy])

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func("f1")
        r = hl.RDom([(0, 100)])
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction. It's
        # equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent Python is:
        py_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            anon = np.empty((1), dtype=np.int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx

            py_result[xx] = anon[0] * 7

        # Check they all match.
        for xx in range(10):
            assert halide_result_1[xx] == py_result[xx], \
                "halide_result_1(%d) = %d instead of %d" % (xx, halide_result_1[xx], py_result[xx])
            assert halide_result_2[xx] == py_result[xx], \
                "halide_result_2(%d) = %d instead of %d" % (xx, halide_result_2[xx], py_result[xx])

    print("Success!")
    return 0
Esempio n. 27
0
def bilateral_filter(input, width, height):
    print('    bilateral_filter')

    k = hl.Buffer(hl.Float(32), [7, 7], "gauss_kernel")
    k.translate([-3, -3])

    weights = hl.Func("bilateral_weights")
    total_weights = hl.Func("bilateral_total_weights")
    bilateral = hl.Func("bilateral")
    output = hl.Func("bilateral_filter_output")

    x, y, dx, dy, c = hl.Var("x"), hl.Var("y"), hl.Var("dx"), hl.Var("dy"), hl.Var("c")
    rdom = hl.RDom([(-3, 7), (-3, 7)])

    k.fill(0)
    k[-3, -3] = 0.000690
    k[-2, -3] = 0.002646
    k[-1, -3] = 0.005923
    k[0, -3] = 0.007748
    k[1, -3] = 0.005923
    k[2, -3] = 0.002646
    k[3, -3] = 0.000690
    k[-3, -2] = 0.002646
    k[-2, -2] = 0.010149
    k[-1, -2] = 0.022718
    k[0, -2] = 0.029715
    k[1, -2] = 0.022718
    k[2, -2] = 0.010149
    k[3, -2] = 0.002646
    k[-3, -1] = 0.005923
    k[-2, -1] = 0.022718
    k[-1, -1] = 0.050855
    k[0, -1] = 0.066517
    k[1, -1] = 0.050855
    k[2, -1] = 0.022718
    k[3, -1] = 0.005923
    k[-3, 0] = 0.007748
    k[-2, 0] = 0.029715
    k[-1, 0] = 0.066517
    k[0, 0] = 0.087001
    k[1, 0] = 0.066517
    k[2, 0] = 0.029715
    k[3, 0] = 0.007748
    k[-3, 1] = 0.005923
    k[-2, 1] = 0.022718
    k[-1, 1] = 0.050855
    k[0, 1] = 0.066517
    k[1, 1] = 0.050855
    k[2, 1] = 0.022718
    k[3, 1] = 0.005923
    k[-3, 2] = 0.002646
    k[-2, 2] = 0.010149
    k[-1, 2] = 0.022718
    k[0, 2] = 0.029715
    k[1, 2] = 0.022718
    k[2, 2] = 0.010149
    k[3, 2] = 0.002646
    k[-3, 3] = 0.000690
    k[-2, 3] = 0.002646
    k[-1, 3] = 0.005923
    k[0, 3] = 0.007748
    k[1, 3] = 0.005923
    k[2, 3] = 0.002646
    k[3, 3] = 0.000690

    input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)])

    dist = hl.cast(hl.Float(32),
                   hl.cast(hl.Int(32), input_mirror[x, y, c]) - hl.cast(hl.Int(32), input_mirror[x + dx, y + dy, c]))

    sig2 = 100

    threshold = 25000

    score = hl.select(hl.abs(input_mirror[x + dx, y + dy, c]) > threshold, 0, hl.exp(-dist * dist / sig2))

    weights[dx, dy, x, y, c] = k[dx, dy] * score

    total_weights[x, y, c] = hl.sum(weights[rdom.x, rdom.y, x, y, c])

    bilateral[x, y, c] = hl.sum(input_mirror[x + rdom.x, y + rdom.y, c] * weights[rdom.x, rdom.y, x, y, c]) / \
                         total_weights[x, y, c]

    output[x, y, c] = hl.cast(hl.Float(32), input[x, y, c])

    output[x, y, 1] = bilateral[x, y, 1]
    output[x, y, 2] = bilateral[x, y, 2]

    weights.compute_at(output, y).vectorize(x, 16)

    output.compute_root().parallel(y).vectorize(x, 16)

    output.update(0).parallel(y).vectorize(x, 16)
    output.update(1).parallel(y).vectorize(x, 16)

    return output
Esempio n. 28
0
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    zi = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
Esempio n. 29
0
def main():

    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(
        c == 0,
        245,  # Red value
        c == 1,
        42,  # Green value
        132)  # Blue value

    # Since this pattern appears quite often, Halide provides a
    # syntatic sugar to write the code above as the following,
    # using the "mux" function.
    # color_image[x, y, c] = hl.mux(c, [245, 42, 132]);

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        im1, im2 = multi_valued.realize([80, 60])
        assert im1.type() == hl.Int(32)
        assert im2.type() == hl.Float(32)
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # You can also pass a tuple of pre-allocated buffers to realize()
    # rather than having new ones created. (The Buffers must have the correct
    # types and have identical sizes.)
    if True:
        im1, im2 = hl.Buffer(hl.Int(32),
                             [80, 60]), hl.Buffer(hl.Float(32), [80, 60])
        multi_valued.realize((im1, im2))
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent C++ code to the above is:
    if True:
        multi_valued_0 = np.empty((80 * 60), dtype=np.int32)
        multi_valued_1 = np.empty((80 * 60), dtype=np.int32)

        for yy in range(80):
            for xx in range(60):
                multi_valued_0[xx + 60 * yy] = xx + yy
                multi_valued_1[xx + 60 * yy] = math.sin(xx * yy)

    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output halide_buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., halide_buffer_t
    # *output_1, halide_buffer_t *output_2)

    # You can construct a Tuple by passing multiple Exprs to the
    # Tuple constructor as we did above. Perhaps more elegantly, you
    # can also take advantage of initializer lists and just
    # enclose your Exprs in braces:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is a syntax error:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input = input_func.realize([100])
        assert input.type() == hl.Float(32)

        # Then we defined a 2-valued Tuple which tracks the maximum value
        # its index.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input[0])

        # Update definition.
        r = hl.RDom([(1, 99)])
        old_index = arg_max[()][0]
        old_max = arg_max[()][1]
        new_index = hl.select(old_max > input[r], r, old_index)
        new_max = hl.max(input[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent C++ is:
        arg_max_0 = 0
        arg_max_1 = float(input[0])
        for r in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = r if (old_max > input[r]) else old_index
            new_max = max(input[r], old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max

        # Let's verify that the Halide and C++ found the same maximum
        # value and index.
        if True:
            r0, r1 = arg_max.realize()

            assert r0.type() == hl.Int(32)
            assert r1.type() == hl.Float(32)
            assert arg_max_0 == r0[()]
            assert np.isclose(arg_max_1, r1[()])

        # Halide provides argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.

    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            def __init__(self, r, i=None):
                if type(r) is float and type(i) is float:
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    self.real = r
                    self.imag = i
                else:
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)

            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)

            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Complex magnitude"
                return (self.real * self.real) + (self.imag * self.imag)

            # Other complex operators would go here. The above are
            # sufficient for this example.

        # Let's use the Complex struct to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x / 15.0 - 2.5, y / 6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom([(1, 12)])
        current = Complex(mandelbrot[x, y, r - 1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current * current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4.
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true).  The argmax
        # would return the index of the first time the expression is
        # true.

        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize([61, 25])
        assert result.type() == hl.Int(32)
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result[xx, yy]
                if index < len(code):
                    print("%c" % code[index], end="")
                else:
                    pass  # is lesson 13 cpp version buggy ?
            print("")

    print("Success!")

    return 0
Esempio n. 30
0
def demosaic(input, width, height):
    print(f'width: {width}, height: {height}')

    f0 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f0")
    f1 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f1")
    f2 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f2")
    f3 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f3")

    f0.translate([-2, -2])
    f1.translate([-2, -2])
    f2.translate([-2, -2])
    f3.translate([-2, -2])

    d0 = hl.Func("demosaic_0")
    d1 = hl.Func("demosaic_1")
    d2 = hl.Func("demosaic_2")
    d3 = hl.Func("demosaic_3")

    output = hl.Func("demosaic_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    rdom0 = hl.RDom([(-2, 5), (-2, 5)])
    # rdom1 = hl.RDom([(0, width / 2), (0, height / 2)])

    input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)])

    f0.fill(0)
    f1.fill(0)
    f2.fill(0)
    f3.fill(0)

    f0_sum = 8
    f1_sum = 16
    f2_sum = 16
    f3_sum = 16

    f0[0, -2] = -1
    f0[0, -1] = 2
    f0[-2, 0] = -1
    f0[-1, 0] = 2
    f0[0, 0] = 4
    f0[1, 0] = 2
    f0[2, 0] = -1
    f0[0, 1] = 2
    f0[0, 2] = -1

    f1[0, -2] = 1
    f1[-1, -1] = -2
    f1[1, -1] = -2
    f1[-2, 0] = -2
    f1[-1, 0] = 8
    f1[0, 0] = 10
    f1[1, 0] = 8
    f1[2, 0] = -2
    f1[-1, 1] = -2
    f1[1, 1] = -2
    f1[0, 2] = 1

    f2[0, -2] = -2
    f2[-1, -1] = -2
    f2[0, -1] = 8
    f2[1, -1] = -2
    f2[-2, 0] = 1
    f2[0, 0] = 10
    f2[2, 0] = 1
    f2[-1, 1] = -2
    f2[0, 1] = 8
    f2[1, 1] = -2
    f2[0, 2] = -2

    f3[0, -2] = -3
    f3[-1, -1] = 4
    f3[1, -1] = 4
    f3[-2, 0] = -3
    f3[0, 0] = 12
    f3[2, 0] = -3
    f3[-1, 1] = 4
    f3[1, 1] = 4
    f3[0, 2] = -3

    d0[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f0[rdom0.x, rdom0.y]) / f0_sum)
    d1[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f1[rdom0.x, rdom0.y]) / f1_sum)
    d2[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f2[rdom0.x, rdom0.y]) / f2_sum)
    d3[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f3[rdom0.x, rdom0.y]) / f3_sum)

    R_row = y % 2 == 0
    B_row = y % 2 != 0
    R_col = x % 2 == 0
    B_col = x % 2 != 0
    at_R = c == 0
    at_G = c == 1
    at_B = c == 2

    output[x, y, c] = hl.select(at_R & R_row & B_col, d1[x, y],
                                at_R & B_row & R_col, d2[x, y],
                                at_R & B_row & B_col, d3[x, y],
                                at_G & R_row & R_col, d0[x, y],
                                at_G & B_row & B_col, d0[x, y],
                                at_B & B_row & R_col, d1[x, y],
                                at_B & R_row & B_col, d2[x, y],
                                at_B & R_row & R_col, d3[x, y],
                                input[x, y])

    d0.compute_root().parallel(y).vectorize(x, 16)
    d1.compute_root().parallel(y).vectorize(x, 16)
    d2.compute_root().parallel(y).vectorize(x, 16)
    d3.compute_root().parallel(y).vectorize(x, 16)

    output.compute_root().parallel(y).align_bounds(x, 2).unroll(x, 2).align_bounds(y, 2).unroll(y, 2).vectorize(x, 16)

    return output