Code example #1
def test_basics2():

    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1) # Value needed if not generating an executable
    s_sigma = 8 # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width()-1),
                          hl.clamp(y, 0, input.height()-1),0]

    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1), y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma//2),
                    y * s_sigma - hl.cast(hl.Int(32), s_sigma//2)]
    val2 = clamped[x * s_sigma - s_sigma//2, y * s_sigma - s_sigma//2]
    val3 = clamped[x * s_sigma + r.x - s_sigma//2, y * s_sigma + r.y - s_sigma//2]

    return
Code example #2
File: basics.py Project: darkbuck/Halide
def test_basics3():

    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1) # Value needed if not generating an executable
    s_sigma = 8 # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width()-1),
                          hl.clamp(y, 0, input.height()-1),0]

    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma//2, y * s_sigma + r.y - s_sigma//2]
    val = hl.clamp(val, 0.0, 1.0)
    #zi = hl.cast(hl.Int(32), val * (1.0/r_sigma) + 0.5)
    zi = hl.cast(hl.Int(32), (val / r_sigma) + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    print("hl.select(c == 0, val, 1.0)", ss)
    left = histogram[x, y, zi, c]
    print("histogram[x, y, zi, c]", histogram[x, y, zi, c])
    print("histogram[x, y, zi, c]", left)
    left += 5
    print("histogram[x, y, zi, c] after += 5", left)
    left += ss

    return
Code example #3
def test_basics3():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma',
                       0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2,
                  y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    zi = hl.i32((val / r_sigma) + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    left = histogram[x, y, zi, c]
    left += 5
    left += ss
Code example #4
def test_basics2():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    s_sigma = 8

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.i32(1), y * s_sigma * hl.i32(1)]
    val22 = clamped[x * s_sigma - hl.i32(s_sigma // 2),
                    y * s_sigma - hl.i32(s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]

    try:
        val1 = clamped[x * s_sigma - s_sigma / 2, y * s_sigma - s_sigma / 2]
    except RuntimeError as e:
        assert 'Implicit cast from float32 to int' in str(e)
    else:
        assert False, 'Did not see expected exception!'
Code example #5
    def __init__(self, input):

        assert type(input) == hl.Buffer_uint8

        self.lut = hl.Func("lut")
        self.padded = hl.Func("padded")
        self.padded16 = hl.Func("padded16")
        self.sharpen = hl.Func("sharpen")
        self.curved = hl.Func("curved")
        self.input = input


        # For this lesson, we'll use a two-stage pipeline that sharpens
        # and then applies a look-up-table (LUT).

        # First we'll define the LUT. It will be a gamma curve.
        self.lut[i] = hl.cast(hl.UInt(8), hl.clamp(pow(i / 255.0, 1.2) * 255.0, 0, 255))

        # Augment the input with a boundary condition.
        self.padded[x, y, c] = input[hl.clamp(x, 0, input.width()-1),
                                hl.clamp(y, 0, input.height()-1), c]

        # Cast it to 16-bit to do the math.
        self.padded16[x, y, c] = hl.cast(hl.UInt(16), self.padded[x, y, c])

        # Next we sharpen it with a five-tap filter.
        self.sharpen[x, y, c] = (self.padded16[x, y, c] * 2 -
                            (self.padded16[x - 1, y, c] +
                             self.padded16[x, y - 1, c] +
                             self.padded16[x + 1, y, c] +
                             self.padded16[x, y + 1, c]) / 4)

        # Then apply the LUT.
        self.curved[x, y, c] = self.lut[self.sharpen[x, y, c]]
Code example #6
    def __init__(self, input):

        assert input.type() == hl.UInt(8)

        self.lut = hl.Func("lut")
        self.padded = hl.Func("padded")
        self.padded16 = hl.Func("padded16")
        self.sharpen = hl.Func("sharpen")
        self.curved = hl.Func("curved")
        self.input = input

        # For this lesson, we'll use a two-stage pipeline that sharpens
        # and then applies a look-up-table (LUT).

        # First we'll define the LUT. It will be a gamma curve.
        gamma = hl.f32(1.2)
        self.lut[i] = hl.u8(hl.clamp(hl.pow(i / 255.0, gamma) * 255.0, 0, 255))

        # Augment the input with a boundary condition.
        self.padded[x, y, c] = input[hl.clamp(x, 0,
                                              input.width() - 1),
                                     hl.clamp(y, 0,
                                              input.height() - 1), c]

        # Cast it to 16-bit to do the math.
        self.padded16[x, y, c] = hl.u16(self.padded[x, y, c])

        # Next we sharpen it with a five-tap filter.
        self.sharpen[x, y, c] = (
            self.padded16[x, y, c] * 2 -
            (self.padded16[x - 1, y, c] + self.padded16[x, y - 1, c] +
             self.padded16[x + 1, y, c] + self.padded16[x, y + 1, c]) / 4)

        # Then apply the LUT.
        self.curved[x, y, c] = self.lut[self.sharpen[x, y, c]]
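
The two constructor snippets above only build the Func graph (lut, padded, padded16, sharpen, curved); nothing is computed until the last stage is realized. A minimal driver sketch under stated assumptions: the surrounding class is called SharpenPipeline here (a hypothetical name, since the snippets only show __init__), the module-level Vars x, y, c and i exist as the snippets imply, and the list-style realize() of the newer Python bindings is available.

import halide as hl

# Hypothetical driver for the sharpen + LUT pipeline sketched above.
input_buf = hl.Buffer(hl.UInt(8), [640, 480, 3])   # placeholder 8-bit RGB input
pipeline = SharpenPipeline(input_buf)              # hypothetical wrapper class around __init__
# 'curved' is the last stage and produces uint8, so it can be realized
# directly over the full input extent.
result = pipeline.curved.realize([input_buf.width(), input_buf.height(), 3])
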
Code example #7
def test_basics2():

    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma',
                       0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    if True:
        print("s_sigma", s_sigma)
        print("s_sigma/2", s_sigma / 2)
        print("s_sigma//2", s_sigma // 2)
        print()
        print("x * s_sigma", x * s_sigma)
        print("x * 8", x * 8)
        print("x * 8 + 4", x * 8 + 4)
        print("x * 8 * 4", x * 8 * 4)
        print()
        print("x", x)
        print("(x * s_sigma).type()", (x * s_sigma).type())
        print("(x * 8).type()", (x * 8).type())
        print("(x * 8 + 4).type()", (x * 8 + 4).type())
        print("(x * 8 * 4).type()", (x * 8 * 4).type())
        print("(x * 8 / 4).type()", (x * 8 / 4).type())
        print("((x * 8) * 4).type()", ((x * 8) * 4).type())
        print("(x * (8 * 4)).type()", (x * (8 * 4)).type())

    assert (x * 8).type() == hl.Int(32)
    assert (x * 8 * 4).type() == hl.Int(32)  # yes this did fail at some point
    assert ((x * 8) / 4).type() == hl.Int(32)
    assert (x * (8 / 4)).type() == hl.Float(32)  # under python3 division rules
    assert (x * (8 // 4)).type() == hl.Int(32)
    #assert (x * 8 // 4).type() == hl.Int(32) # not yet implemented

    # Construct the bilateral grid
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1),
                    y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma // 2),
                    y * s_sigma - hl.cast(hl.Int(32), s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]

    return
Code example #8
def prefilterXSobel(image, W, H): 
    x, y = Var("x"), Var("y")
    clamped, gray = Func("clamped"), Func("gray")
    gray[x, y] = 0.2989*image[x, y, 0] + 0.5870*image[x, y, 1] + 0.1140*image[x, y, 2]
    clamped[x, y] = gray[h.clamp(x, 0, W-1), h.clamp(y, 0, H-1)]

    temp, xSobel = Func("temp"), Func("xSobel")
    temp[x, y] = clamped[x+1, y] - clamped[x-1, y]
    xSobel[x, y] = h.cast(Int(8), h.clamp(temp[x, y-1] + 2 * temp[x, y] + temp[x, y+1], -31, 31))

    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    xSobel.compute_root().tile(x, y, xo, yo, xi, yi, 64, 32).parallel(yo).parallel(xo)
    temp.compute_at(xSobel, yi).vectorize(x, 8)
    return xSobel
Code example #9
def prefilterXSobel(image, W, H):
    x, y = Var("x"), Var("y")
    clamped, gray = Func("clamped"), Func("gray")
    gray[x, y] = 0.2989 * image[x, y, 0] + 0.5870 * image[
        x, y, 1] + 0.1140 * image[x, y, 2]
    clamped[x, y] = gray[h.clamp(x, 0, W - 1), h.clamp(y, 0, H - 1)]

    temp, xSobel = Func("temp"), Func("xSobel")
    temp[x, y] = clamped[x + 1, y] - clamped[x - 1, y]
    xSobel[x, y] = h.cast(
        Int(8),
        h.clamp(temp[x, y - 1] + 2 * temp[x, y] + temp[x, y + 1], -31, 31))

    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    xSobel.compute_root().tile(x, y, xo, yo, xi, yi, 64,
                               32).parallel(yo).parallel(xo)
    temp.compute_at(xSobel, yi).vectorize(x, 8)
    return xSobel
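
Both versions of prefilterXSobel only construct and schedule the xSobel Func; realizing it is left to the caller. A minimal sketch under stated assumptions: the old-style bindings these snippets rely on (import halide as h together with bare Var, Func and Int names), an image argument that can be indexed as image[x, y, channel] over a W x H extent, and the varargs realize() used elsewhere on this page. The names below are placeholders.

# Hypothetical driver; image, W and H stand in for a real input.
xSobel = prefilterXSobel(image, W, H)
edges = xSobel.realize(W, H)  # Int(8) output, clamped to [-31, 31] by the filter
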
Code example #10
def get_erode(input):
    """
    Erode on 5x5 stencil, first erode x then erode y.
    """

    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")
    input_clamped = hl.Func("input_clamped")
    erode_x = hl.Func("erode_x")
    erode_y = hl.Func("erode_y")

    input_clamped[x, y, c] = input[
        hl.clamp(x, hl.cast(hl.Int(32), 0
                            ), hl.cast(hl.Int(32),
                                       input.width() - 1)),
        hl.clamp(y, hl.cast(hl.Int(32), 0
                            ), hl.cast(hl.Int(32),
                                       input.height() - 1)), c]
    erode_x[x, y, c] = hl.min(
        hl.min(
            hl.min(
                hl.min(input_clamped[x - 2, y, c], input_clamped[x - 1, y, c]),
                input_clamped[x, y, c]), input_clamped[x + 1, y, c]),
        input_clamped[x + 2, y, c])
    erode_y[x, y, c] = hl.min(
        hl.min(
            hl.min(hl.min(erode_x[x, y - 2, c], erode_x[x, y - 1, c]),
                   erode_x[x, y, c]), erode_x[x, y + 1, c]), erode_x[x, y + 2,
                                                                     c])

    yi = hl.Var("yi")

    # CPU Schedule
    erode_x.compute_root().split(y, y, yi, 8).parallel(y)
    erode_y.compute_root().split(y, y, yi, 8).parallel(y)

    return erode_y
Code example #11
File: erode.py Project: adityaatluri/Halide
def get_erode(input):
    """
    Erode on 5x5 stencil, first erode x then erode y.
    """

    x = hl.Var("x")
    y = hl.Var("y")
    c = hl.Var("c")
    input_clamped = hl.Func("input_clamped")
    erode_x = hl.Func("erode_x")
    erode_y = hl.Func("erode_y")

    input_clamped[x,y,c] = input[hl.clamp(x,hl.cast(hl.Int(32),0),hl.cast(hl.Int(32),input.width()-1)),
                                 hl.clamp(y,hl.cast(hl.Int(32),0),hl.cast(hl.Int(32),input.height()-1)), c]
    erode_x[x,y,c] = hl.min(hl.min(hl.min(hl.min(input_clamped[x-2,y,c],input_clamped[x-1,y,c]),input_clamped[x,y,c]),input_clamped[x+1,y,c]),input_clamped[x+2,y,c])
    erode_y[x,y,c] = hl.min(hl.min(hl.min(hl.min(erode_x[x,y-2,c],erode_x[x,y-1,c]),erode_x[x,y,c]),erode_x[x,y+1,c]),erode_x[x,y+2,c])

    yi = hl.Var("yi")

    # CPU Schedule
    erode_x.compute_root().split(y, y, yi, 8).parallel(y)
    erode_y.compute_root().split(y, y, yi, 8).parallel(y)

    return erode_y
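
Both erode variants return the scheduled erode_y Func without realizing it. A minimal driver sketch under stated assumptions: input is a 3-D uint8 hl.Buffer (an hl.ImageParam works the same way, which is why the snippets cast width() and height() to Int(32)), and the list-style realize() of the newer bindings is available. The extent and contents below are placeholders.

import halide as hl

# Hypothetical driver for get_erode(); the buffer holds placeholder (uninitialized) data.
input = hl.Buffer(hl.UInt(8), [640, 480, 3])
eroded = get_erode(input)
result = eroded.realize([input.width(), input.height(), 3])
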
Code example #12
def entropy(x, y, c, img, w, h, hist_index):
    base_gray = gray(x, y, c, img)
    clamped_gray = mkfunc('clamped_gray', base_gray)
    clamped_gray[x,y] = hl.clamp(base_gray[x,y], 0, 255)
    u8_gray = u8(x, y, c, clamped_gray)

    probabilities = histogram(x, y, c, u8_gray, w, h, hist_index)

    r = hl.RDom([(-2, 5), (-2, 5)])

    levels = mkfunc('entropy', img)
    levels[x,y] = 0.0
    # Add in 0.00001 to prevent -Inf's
    levels[x,y] += base_gray[x + r.x, y + r.y] * hl.log(probabilities[u8_gray[x + r.x, y + r.y]]+0.00001)
    levels[x,y] = levels[x,y] * -1.0

    return levels
Code example #13
def main():
    # First we'll declare some Vars to use below.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/rgb.png")

    # Now we'll express a multi-stage pipeline that blurs an image
    # first horizontally, and then vertically.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imread(image_path))
        assert input.type() == hl.UInt(8)

        # Upgrade it to 16-bit, so we can do math without it overflowing.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), input[x, y, c])

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] + 2 * input_16[x, y, c] + input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] + 2 * blur_x[x, y, c] + blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # Each hl.Func in this pipeline calls a previous one using
        # familiar function call syntax (we've overloaded operator()
        # on hl.Func objects). A hl.Func may call any other hl.Func that has
        # been given a definition. This restriction prevents
        # pipelines with loops in them. Halide pipelines are always
        # feed-forward graphs of Funcs.

        # Now let's realize it...

        # result = output.realize(input.width(), input.height(), 3)

        # Except that the line above is not going to work. Uncomment
        # it to see what happens.

        # Realizing this pipeline over the same domain as the input
        # image requires reading pixels out of bounds in the input,
        # because the blur_x stage reaches outwards horizontally, and
        # the blur_y stage reaches outwards vertically. Halide
        # detects this by injecting a piece of code at the top of the
        # pipeline that computes the region over which the input will
        # be read. When it starts to run the pipeline it first runs
        # this code, determines that the input will be read out of
        # bounds, and refuses to continue. No actual bounds checks
        # occur in the inner loop that would be slow.
        #
        # So what do we do? There are a few options. If we realize
        # over a domain shifted inwards by one pixel, we won't be
        # asking the Halide routine to read out of bounds. We saw how
        # to do this in the previous lesson:
        result = hl.Buffer(hl.UInt(8), [input.width() - 2, input.height() - 2, 3])
        result.set_min([1, 1])
        output.realize(result)

        # Save the result. It should look like a slightly blurry
        # parrot, and it should be two pixels narrower and two pixels
        # shorter than the input image.

        imsave("blurry_parrot_1.png", result)
        print("Created blurry_parrot_1.png")

        # This is usually the fastest way to deal with boundaries:
        # don't write code that reads out of bounds :) The more
        # general solution is our next example.


    # The same pipeline, with a boundary condition on the input.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imread(image_path))
        assert input.type() == hl.UInt(8)

        # This time, we'll wrap the input in a hl.Func that prevents
        # reading out of bounds:
        clamped = hl.Func("clamped")

        # Define an expression that clamps x to lie within the
        # range [0, input.width()-1].
        clamped_x = hl.clamp(x, 0, input.width() - 1)
        # Similarly hl.clamp y.
        clamped_y = hl.clamp(y, 0, input.height() - 1)
        # Load from input at the clamped coordinates. This means that
        # no matter how we evaluated the hl.Func 'clamped', we'll never
        # read out of bounds on the input. This is a hl.clamp-to-edge
        # style boundary condition, and is the simplest boundary
        # condition to express in Halide.
        clamped[x, y, c] = input[clamped_x, clamped_y, c]

        # Defining 'clamped' in that way can be done more concisely
        # using a helper function from the BoundaryConditions
        # namespace like so:
        #
        # clamped = hl.BoundaryConditions.repeat_edge(input)
        #
        # These are important to use for other boundary conditions,
        # because they are expressed in the way that Halide can best
        # understand and optimize.

        # Upgrade it to 16-bit, so we can do math without it
        # overflowing. This time we'll refer to our new hl.Func
        # 'clamped', instead of referring to the input image
        # directly.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), clamped[x, y, c])

        # The rest of the pipeline will be the same...

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] + 2 * input_16[x, y, c] + input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] + 2 * blur_x[x, y, c] + blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # This time it's safe to evaluate the output over the same
        # domain as the input, because we have a boundary condition.
        result = output.realize(input.width(), input.height(), 3)

        # Save the result. It should look like a slightly blurry
        # parrot, but this time it will be the same size as the
        # input.
        imsave("blurry_parrot_2.png", result)
        print("Created blurry_parrot_2.png")

    print("Success!")
    return 0
Code example #14
File: interpolate.py Project: darkbuck/Halide
def get_interpolate(input, levels):
    """
    Builds the function, schedules it, and invokes the JIT compiler.
    :return: hl.Func
    """

    # THE ALGORITHM

    downsampled = [hl.Func('downsampled%d'%i) for i in range(levels)]
    downx = [hl.Func('downx%d'%l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d'%i) for i in range(levels)]
#     level_widths = [hl.Param(int_t,'level_widths%d'%i) for i in range(levels)]
#     level_heights = [hl.Param(int_t,'level_heights%d'%i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d'%l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d'%l) for l in range(levels)]
    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0, input.width()-1), hl.clamp(y, 0, input.height()-1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x,y,c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = hl.Func()
        prev = downsampled[l-1]

        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings from extending
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            w = input.width()/(1 << l)
            h = input.height()/(1 << l)
            prev = hl.lambda3D(x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c])

        downx[l][x,y,c] = (prev[x*2-1,y,c] + 2.0 * prev[x*2,y,c] + prev[x*2+1,y,c]) * 0.25
        downsampled[l][x,y,c] = (downx[l][x,y*2-1,c] + 2.0 * downx[l][x,y*2,c] + downx[l][x,y*2+1,c]) * 0.25


    interpolated[levels-1][x,y,c] = downsampled[levels-1][x,y,c]
    for l in range(levels-1)[::-1]:
        upsampledx[l][x,y,c] = (interpolated[l+1][x/2, y, c] + interpolated[l+1][(x+1)/2, y, c]) / 2.0
        upsampled[l][x,y,c] = (upsampledx[l][x, y/2, c] + upsampledx[l][x, (y+1)/2, c]) / 2.0
        interpolated[l][x,y,c] = downsampled[l][x,y,c] + (1.0 - downsampled[l][x,y,3]) * upsampled[l][x,y,c]

    normalize = hl.Func('normalize')
    normalize[x,y,c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

    final = hl.Func('final')
    final[x,y,c] = normalize[x,y,c]

    print("Finished function setup.")

    # THE SCHEDULE

    sched = 2
    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        sched = 4
    else:
        sched = 2

    if sched == 0:
        print ("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()

        final.compute_root()

    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)

        final.compute_root()

    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            if l > 0:
                downsampled[l].compute_root().parallel(y).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2);

        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())

    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                yo, yi = hl.Var('yo'), hl.Var('yi')
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()

        final.compute_root();

    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        yo, yi, xo, xi, ci = hl.Var('yo'), hl.Var('yi'), hl.Var('xo'), hl.Var('xi'), hl.Var('ci')
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        final.tile(x, y, xo, yo, xi, yi, input.width()/4, input.height()/4)
        normalize.compute_at(final, xo).reorder(c, x, y).gpu_tile(x, y, xi, yi, 16, 16, GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            tile_size = 32 >> l;
            if tile_size < 1: tile_size = 1
            if tile_size > 16: tile_size = 16
            downsampled[l].compute_root().gpu_tile(x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)
            interpolated[l].compute_at(final, xo).gpu_tile(x, y, c, xi, yi, ci, tile_size, tile_size, 4, GPU_Default)

    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
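
get_interpolate() JIT-compiles the pipeline and returns the final Func; binding an input and realizing it is left to the caller. A minimal sketch under stated assumptions: input is a float32 hl.ImageParam whose fourth channel is alpha (the code reads clamped[x, y, 3]), the module-level int_t, float_t and GPU_Default names from the original file are in scope, and ImageParam.set() plus list-style realize() exist in the bindings used. All concrete names and sizes below are placeholders, not taken from the snippet.

import halide as hl

# Hypothetical driver for get_interpolate().
input = hl.ImageParam(hl.Float(32), 3, 'input')   # x, y, channel; channel 3 is premultiplied alpha
final = get_interpolate(input, 10)                # placeholder pyramid depth

rgba = hl.Buffer(hl.Float(32), [640, 480, 4])     # placeholder image; a real driver would load a file
input.set(rgba)                                   # assumption: ImageParam.set() binds the buffer
out = final.realize([640, 480, 3])                # three output channels after alpha normalization
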
Code example #15
def test_schedules(verbose=False, test_random=False):
    #random_module.seed(int(sys.argv[1]) if len(sys.argv)>1 else 0)
    halide.exit_on_signal()
    f = halide.Func('f')
    x = halide.Var('x')
    y = halide.Var('y')
    c = halide.Var('c')
    g = halide.Func('g')
    v = halide.Var('v')
    input = halide.UniformImage(halide.UInt(16), 3)
    int_t = halide.Int(32)
    f[x, y, c] = input[
        halide.clamp(x, halide.cast(int_t, 0
                                    ), halide.cast(int_t,
                                                   input.width() - 1)),
        halide.clamp(y, halide.cast(int_t, 0
                                    ), halide.cast(int_t,
                                                   input.height() - 1)),
        halide.clamp(c, halide.cast(int_t, 0), halide.cast(int_t, 2))]
    #g[v] = f[v,v]
    g[x, y, c] = f[x, y, c] + 1
    assert sorted(halide.all_vars(g).keys()) == sorted(['x', 'y',
                                                        'c'])  #, 'v'])

    if verbose:
        print(halide.func_varlist(f))
        print('caller_vars(f) =', caller_vars(g, f))
        print('caller_vars(g) =', caller_vars(g, g))


#    validL = list(valid_schedules(g, f, 4))
#    validL = [repr(_x) for _x in validL]
#
#    for L in sorted(validL):
#        print repr(L)
    T0 = time.time()
    if not test_random:
        random = True  #False
        nvalid_determ = 0
        for L in schedules_func(g, f, 0, 3):
            nvalid_determ += 1
            if verbose:
                print(L)
        nvalid_random = 0
        for i in range(100):
            for L in schedules_func(
                    g, f, 0, DEFAULT_MAX_DEPTH, random=True
            ):  #sorted([repr(_x) for _x in valid_schedules(g, f, 3)]):
                if verbose and 0:
                    print(L)  # repr(L)
                nvalid_random += 1
    s = []
    for i in range(400):
        d = random_schedule(g, 0, DEFAULT_MAX_DEPTH)
        si = str(d)
        s.append(si)
        if verbose:
            print('Schedule:', si)

        d.apply()
        evaluate = d.test((36, 36, 3), input)
        print('evaluate')
        evaluate()
        if test_random:
            print('Success')
            sys.exit()
    T1 = time.time()

    s = '\n'.join(s)
    assert 'f.chunk(_c0)' in s
    assert 'f.root().vectorize' in s
    assert 'f.root().unroll' in s
    assert 'f.root().split' in s
    assert 'f.root().tile' in s
    assert 'f.root().parallel' in s
    assert 'f.root().transpose' in s

    assert nvalid_random == 100
    if verbose:
        print('generated in %.3f secs' % (T1 - T0))
    print('random_schedule: OK')
Code example #16
File: local_laplacian.py Project: darkbuck/Halide
def get_local_laplacian(input, levels, alpha, beta, J=8):
    downsample_counter=[0]
    upsample_counter=[0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        downx, downy = hl.Func('downx%d'%downsample_counter[0]), hl.Func('downy%d'%downsample_counter[0])
        downsample_counter[0] += 1

        downx[x,y,c] = (f[2*x-1,y,c] + 3.0*(f[2*x,y,c]+f[2*x+1,y,c]) + f[2*x+2,y,c])/8.0
        downy[x,y,c] = (downx[x,2*y-1,c] + 3.0*(downx[x,2*y,c]+downx[x,2*y+1,c]) + downx[x,2*y+2,c])/8.0

        return downy

    def upsample(f):
        upx, upy = hl.Func('upx%d'%upsample_counter[0]), hl.Func('upy%d'%upsample_counter[0])
        upsample_counter[0] += 1

        upx[x,y,c] = 0.25 * f[(x//2) - 1 + 2*(x%2),y,c] + 0.75 * f[x//2,y,c]
        upy[x,y,c] = 0.25 * upx[x, (y//2) - 1 + 2*(y%2),c] + 0.75 * upx[x,y//2,c]

        return upy

    def downsample2D(f):
        downx, downy = hl.Func('downx%d'%downsample_counter[0]), hl.Func('downy%d'%downsample_counter[0])
        downsample_counter[0] += 1

        downx[x,y] = (f[2*x-1,y] + 3.0*(f[2*x,y]+f[2*x+1,y]) + f[2*x+2,y])/8.0
        downy[x,y] = (downx[x,2*y-1] + 3.0*(downx[x,2*y]+downx[x,2*y+1]) + downx[x,2*y+2])/8.0

        return downy

    def upsample2D(f):
        upx, upy = hl.Func('upx%d'%upsample_counter[0]), hl.Func('upy%d'%upsample_counter[0])
        upsample_counter[0] += 1

        upx[x,y] = 0.25 * f[(x//2) - 1 + 2*(x%2),y] + 0.75 * f[x//2,y]
        upy[x,y] = 0.25 * upx[x, (y//2) - 1 + 2*(y%2)] + 0.75 * upx[x,y//2]

        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x/256.0)
    #remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha*fx*hl.exp(-fx*fx/2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x,y,c] = hl.cast(float_t, input[x,y,c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x,y,c] = floating[hl.clamp(x, 0, input.width()-1), hl.clamp(y, 0, input.height()-1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x,y] = 0.299*clamped[x,y,0] + 0.587*clamped[x,y,1] + 0.114*clamped[x,y,2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d'%i) for i in range(J)]
    # Do a lookup into a lut with 256 entries per intensity level
    level = k * (1.0 / (levels - 1))  # keep the level in floating point; integer division would collapse it to 0 or 1
    idx = gray[x,y]*hl.cast(float_t, levels-1)*256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels-1)*256)
    gPyramid[0][x,y,k] = beta*(gray[x, y] - level) + level + remap[idx - 256*k]
    for j in range(1,J):
        gPyramid[j][x,y,k] = downsample(gPyramid[j-1])[x,y,k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d'%i) for i in range(J)]
    lPyramid[J-1] = gPyramid[J-1]
    for j in range(J-1)[::-1]:
        lPyramid[j][x,y,k] = gPyramid[j][x,y,k] - upsample(gPyramid[j+1])[x,y,k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d'%i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1,J):
        inGPyramid[j][x,y] = downsample2D(inGPyramid[j-1])[x,y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d'%i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x,y]*hl.cast(float_t, levels-1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels-2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x,y] = (1.0-lf)*lPyramid[j][x,y,li] + lf*lPyramid[j][x,y,li+1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d'%i) for i in range(J)]
    outGPyramid[J-1] = outLPyramid[J-1]
    for j in range(J-1)[::-1]:
        outGPyramid[j][x,y] = upsample2D(outGPyramid[j+1])[x,y] + outLPyramid[j][x,y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x,y,c] = outGPyramid[0][x,y] * (clamped[x,y,c] + eps) / (gray[x,y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x,y,c] = hl.cast(hl.UInt(16), hl.clamp(color[x,y,c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print ("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh, GPU_Default)
            if j > 0:
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh, GPU_Default)
    else:
        # CPU schedule
        print ("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4);
        gray.compute_root().parallel(y, 4).vectorize(x, 4);
        for j in range(4):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4,J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)


    return output
Code example #17
def main():
    # Declare some Vars to use below.
    x, y = hl.Var ("x"), hl.Var ("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png")
    input_data = imread(image_path)
    if True:
         # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x,y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:
        f[x, 17] = x + 8 # x is used, so all uses of f must have x as the first argument.
        f[0, y] = y * 8  # y is used, so all uses of f must have y as the second argument.
        f[x, x + 1] = x + 8
        f[y/2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0] = f[x + 1, 0] <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1] = y + 8   <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x] = y - x       <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4] = x + y       <- Free variables appear on the right-hand-side but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y   # Pure definition
        g[2, 1] = 42      # First update definition
        g[x, 0] = g[x, 1] # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn. The equivalent C is:
        result = np.empty( (4,4), dtype=np.int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy


        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section


    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:

        # f[x, 0] = f[x, 0] * f[x, 0]
        # f[x, 1] = f[x, 1] * f[x, 1]
        # f[x, 2] = f[x, 2] * f[x, 2]
        # ...
        # f[x, 49] = f[x, 49] * f[x, 49]

        # Or equivalently using a compile-time loop in our C++:
        # for (int i = 0; i < 50; i++) {
        #   f(x, i) = f(x, i) * f(x, i);
        # }

        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom([(0, 50)])
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent C is:
        c_result = np.empty((100, 100), dtype=np.int)
        for yy in range(100):
            for xx in range(100):
                c_result[yy][xx] = xx + yy

        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx]



        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                if halide_result[xx, yy] != c_result[yy][xx]:
                    raise Exception("halide_result(%d, %d) = %d instead of %d" % (
                           xx, yy, halide_result[xx, yy], c_result[yy][xx]))
                    return -1





    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:

        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram. The
        # natural way to do it is to iterate over the input image,
        # updating histogram buckets. Here's how you do that in Halide:
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom([(0, input.width()), (0, input.height())])

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1

        halide_result = histogram.realize(256)

        # The equivalent C is:
        c_result = np.empty((256), dtype=np.int)
        for xx in range(256):
            c_result[xx] = 0

        for r_y in range(input.height()):
            for r_x in range(input.width()):
                c_result[input_data[r_x, r_y]] += 1



        # Check the answers agree:
        for xx in range(256):
            if c_result[xx] != halide_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" % (
                       xx, halide_result[xx], c_result[xx]))
                return -1




    # Scheduling update steps
    if True:
        # The pure variables in an update step can be
        # parallelized, vectorized, split, etc. as usual.

        # Vectorizing, splitting, or parallelizing the variables that
        # are part of the reduction domain is trickier. We'll cover
        # that in a later lesson.

        # Consider the definition:
        f = hl.Func("x")
        f[x, y] = x*y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. To control the pure definition, we schedule
        # as we have done in the past. The following code vectorizes
        # and parallelizes the pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func::update(int) to get a handle to an update step
        # for the purposes of scheduling. The following line
        # vectorizes the first update step across x. We can't do
        # anything with y for this update step, because it doesn't
        # use y.
        f.update(0).vectorize(x, 4)

        # Now we parallelize the second update step in chunks of size
        # 4.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)

        halide_result = f.realize(16, 16)


        # Here's the equivalent (serial) C:
        c_result = np.empty((16, 16), dtype=np.int)


        # Pure step. Vectorized in x and parallelized in y.
        for yy in range( 16): # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec*4, x_vec*4+1, x_vec*4+2, x_vec*4+3]
                c_result[yy][xx[0]] = xx[0] * yy
                c_result[yy][xx[1]] = xx[1] * yy
                c_result[yy][xx[2]] = xx[2] * yy
                c_result[yy][xx[3]] = xx[3] * yy



        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec*4, x_vec*4+1, x_vec*4+2, x_vec*4+3]
            c_result[1][xx[0]] = c_result[0][xx[0]]
            c_result[1][xx[1]] = c_result[0][xx[1]]
            c_result[1][xx[2]] = c_result[0][xx[2]]
            c_result[1][xx[3]] = c_result[0][xx[3]]


        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4): # Should be a parallel for loop
            for yi in range(4):
                yy = yo*4 + yi
                c_result[yy][1] = c_result[yy][0] + 2



        # Check the C and Halide results match:
        for yy in range( 16):
            for xx in range( 16 ):
                if halide_result[xx, yy] != c_result[yy][xx]:
                    raise Exception("halide_result(%d, %d) = %d instead of %d" % (
                           xx, yy, halide_result[xx, yy], c_result[yy][xx]))
                    return -1





    # That covers how to schedule the variables within a hl.Func that
    # uses update steps, but what about producer-consumer
    # relationships that involve compute_at and store_at? Let's
    # examine a reduction as a producer, in a producer-consumer pair.
    if True:
        # Because an update does multiple passes over a stored array,
        # it's not meaningful to inline them. So the default schedule
        # for them does the closest thing possible. It computes them
        # in the innermost loop of their consumer. Consider this
        # trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x*17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=np.int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            c_result[xx] = 2 * producer_storage[0]


        # Check the results match
        for xx in range( 10 ):
            if halide_result[xx] != c_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" % (
                       xx, halide_result[xx], c_result[xx]))
                return -1



        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.


    # Now let's consider a reduction as a consumer in a
    # producer-consumer pair. This is a little more involved.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x*17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # The valid schedules for the producer in this case are
            # the default schedule - inlined, and also:
            #
            # 1) producer.compute_at(x), which places the computation of
            # the producer inside the loop over x in the pure step of the
            # consumer.
            #
            # 2) producer.compute_root(), which computes all of the
            # producer ahead of time.
            #
            # 3) producer.store_root().compute_at(x), which allocates
            # space for the consumer outside the loop over x, but fills
            # it in as needed inside the loop.
            #
            # Let's use option 1.

            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)


            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)

            # Pure step for the consumer
            for xx in range( 10 ):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] = 2 * producer_storage[0]

            # Update step for the consumer
            for xx in range( 10 ):
                c_result[xx] += 1


            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range( 10 ):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                           xx, halide_result[xx], c_result[xx]))
                    return -1




        if True:
            # Case 2: The consumer references the producer in the update step only
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside the update
            # step of the consumer, because that's the only step that
            # uses the producer.
            producer.compute_at(consumer, x)

            # Note however, that we didn't say:
            #
            # producer.compute_at(consumer.update(0), x).
            #
            # Scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.

            halide_result = consumer.realize(10)


            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range( 10 ):
                c_result[xx] = xx

            # Update step for the consumer
            for xx in range( 10 ):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]



            # Check the results match
            for xx in range( 10 ):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                           xx, halide_result[xx], c_result[xx]))
                    return -1




        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside both the
            # pure and the update step of the consumer. So there ends
            # up being two separate realizations of the producer, and
            # redundant work occurs.
            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range( 10 ):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] = producer_storage[0] * xx

            # Update step for the consumer
            for xx in range( 10 ):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]


            # Check the results match
            for xx in range( 10 ):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                           xx, halide_result[xx], c_result[xx]))
                    return -1




        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x*y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x-1]
            consumer[0, y] = producer[y, y-1]

            # In this case neither producer.compute_at(consumer, x)
            # nor producer.compute_at(consumer, y) will work, because
            # either one fails to cover one of the uses of the
            # producer. So we'd have to inline producer, or use
            # producer.compute_root().

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:

            # Attempt 2:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]

            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x-1]
            consumer_2[0, y] += producer_wrapper_2[y, y-1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)

            halide_result = consumer_2.realize(10, 10)

            # The equivalent C is:
            c_result = np.empty((10, 10), dtype=np.int)

            # Pure step for the consumer
            for yy in range( 10):
                for xx in range( 10 ):
                    c_result[yy][xx] = xx + yy


            # First update step for consumer
            for xx in range( 10 ):
                producer_wrapper_1_storage = np.empty((1), dtype=np.int)
                producer_wrapper_1_storage[0] = xx * (xx-1)
                c_result[0][xx] += producer_wrapper_1_storage[0]

            # Second update step for consumer
            for yy in range( 10):
                producer_wrapper_2_storage = np.empty((1), dtype=np.int)
                producer_wrapper_2_storage[0] = yy * (yy-1)
                c_result[yy][0] += producer_wrapper_2_storage[0]


            # Check the results match
            for yy in range( 10):
                for xx in range( 10 ):
                    if halide_result[xx, yy] != c_result[yy][xx]:
                        print("halide_result(%d, %d) = %d instead of %d",
                               xx, yy, halide_result[xx, yy], c_result[yy][xx])
                        return -1





        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there.

            producer, consumer = hl.Func("producer"), hl.Func("consumer")

            r = hl.RDom([(0, 5)])
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]

            producer.compute_at(consumer, r)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer.
            for xx in range(10):
                c_result[xx] = xx + 10

            # Update step for the consumer.
            for xx in range( 10 ):
                for rr in range(5): # The loop over the reduction domain is always the inner loop.
                    # We've scheduled the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=np.int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17

                    # Now use it in the update step of the consumer.
                    c_result[xx] += rr + producer_storage[0]


            # Check the results match
            for xx in range( 10 ):
                if halide_result[xx] != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" % (
                           xx, halide_result[xx], c_result[xx]))
                    return -1


    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # hl.clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.BoundaryConditions.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom([(-2, 5), (-2, 5)])

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0 # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)

        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent C:

        #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0]
        local_sum = np.empty((1), dtype=np.int32)

        c_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()])
        for yy in range(input.height()):
            for xx in range(input.width()):
                # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2+1):
                    for r_x in range(-2, 2+1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width()-1)
                        clamped_y = min(max(yy + r_y, 0), input.height()-1)
                        local_sum[0] += input[clamped_x, clamped_y]

                # Pure step of blurry
                #c_result(x, y) = (uint8_t)(local_sum[0] / 25)
                #c_result[xx, yy] = cast_to_uint8(local_sum[0] / 25)
                c_result[xx, yy] = int(local_sum[0] / 25) # hl.cast done internally

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result[xx, yy] != c_result[xx, yy]:
                    raise Exception("halide_result(%d, %d) = %d instead of %d"
                                    % (xx, yy,
                                       halide_result[xx, yy], c_result[xx, yy]))
                    return -1


    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func ("f1")
        r = hl.RDom([(0, 100)])
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction. It's equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range( 10 ):
            anon = np.empty((1), dtype=np.int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx

            c_result[xx] = anon[0] * 7


        # Check they all match.
        for xx in range( 10 ):
            if halide_result_1[xx] != c_result[xx]:
                print("halide_result_1(%d) = %d instead of %d",
                       xx, halide_result_1[xx], c_result[xx])
                return -1

            if halide_result_2[xx] != c_result[xx]:
                print("halide_result_2(%d) = %d instead of %d",
                       xx, halide_result_2[xx], c_result[xx])
                return -1





    # A complex example that uses reduction helpers.
    if False: # It makes no sense to port the SSE code to Python, so this test is skipped

        # Other reduction helpers include "product", "minimum",
        # "maximum", "hl.argmin", and "argmax". Using hl.argmin and argmax
        # requires understanding tuples, which come in a later
        # lesson. Let's use minimum and maximum to compute the local
        # spread of our grayscale image.
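        # A minimal sketch of product/minimum/maximum (an added illustration,
        # not part of the original lesson; argmin and argmax are omitted
        # because they return Tuples). It assumes a small 1-D reduction
        # domain named 'hr'.
        helpers_demo = hl.Func("helpers_demo")
        hr = hl.RDom([(1, 5)], "hr")
        helpers_demo[x] = (hl.maximum(hr + x) - hl.minimum(hr + x) +
                           hl.product(hr + x))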

        # First, add a boundary condition to the input.
        clamped = hl.Func("clamped")
        x_clamped = hl.clamp(x, 0, input.width()-1)
        y_clamped = hl.clamp(y, 0, input.height()-1)
        clamped[x, y] = input[x_clamped, y_clamped]

        box = hl.RDom([(-2, 5), (-2, 5)])
        # Compute the local maximum minus the local minimum:
        spread = hl.Func("spread")
        spread[x, y] = (hl.maximum(clamped[x + box.x, y + box.y]) -
                        hl.minimum(clamped[x + box.x, y + box.y]))

        # Compute the result in strips of 32 scanlines
        yo, yi = hl.Var("yo"), hl.Var("yi")
        spread.split(y, yo, yi, 32).parallel(yo)

        # Vectorize across x within the strips. This implicitly
        # vectorizes stuff that is computed within the loop over x in
        # spread, which includes our minimum and maximum helpers, so
        # they get vectorized too.
        spread.vectorize(x, 16)

        # We'll apply the boundary condition by padding each scanline
        # as we need it in a circular buffer (see lesson 08).
        clamped.store_at(spread, yo).compute_at(spread, yi)

        halide_result = spread.realize(input.width(), input.height())


        # The C equivalent is almost too horrible to contemplate (and
        # took me a long time to debug). This time I want to time
        # both the Halide version and the C version, so I'll use sse
        # intrinsics for the vectorization, and openmp to do the
        # parallel for loop (you'll need to compile with -fopenmp or
        # similar to get correct timing).
        #ifdef __SSE2__

        # Don't include the time required to allocate the output buffer.
        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())

        #ifdef _OPENMP
        t1 = datetime.now()
        #endif

        # Run this one hundred times so we can average the timing results.
        for iters in range(100):
            pass
            # #pragma omp parallel for
            # for yo in range((input.height() + 31)/32):
            #     y_base = hl.min(yo * 32, input.height() - 32)
            #
            #     # Compute clamped in a circular buffer of size 8
            #     # (smallest power of two greater than 5). Each thread
            #     # needs its own allocation, so it must occur here.
            #
            #     clamped_width = input.width() + 4
            #     clamped_storage = np.empty((clamped_width * 8), dtype=np.uint8)
            #
            #     for yi in range(32):
            #         y = y_base + yi
            #
            #         uint8_t *output_row = &c_result(0, y)
            #
            #         # Compute clamped for this scanline, skipping rows
            #         # already computed within this slice.
            #         int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2)
            #         int max_y_clamped = (y + 2)
            #         for (int cy = min_y_clamped cy <= max_y_clamped cy++) {
            #             # Figure out which row of the circular buffer
            #             # we're filling in using bitmasking:
            #             uint8_t *clamped_row = clamped_storage + (cy & 7) * clamped_width
            #
            #             # Figure out which row of the input we're reading
            #             # from by clamping the y coordinate:
            #             int clamped_y = std::min(std::max(cy, 0), input.height()-1)
            #             uint8_t *input_row = &input(0, clamped_y)
            #
            #             # Fill it in with the padding.
            #             for (int x = -2 x < input.width() + 2 x++) {
            #                 int clamped_x = std::min(std::max(x, 0), input.width()-1)
            #                 *clamped_row++ = input_row[clamped_x]
            #
            #
            #
            #         # Now iterate over vectors of x for the pure step of the output.
            #         for (int x_vec = 0 x_vec < (input.width() + 15)/16 x_vec++) {
            #             int x_base = std::min(x_vec * 16, input.width() - 16)
            #
            #             # Allocate storage for the minimum and maximum
            #             # helpers. One vector is enough.
            #             __m128i minimum_storage, maximum_storage
            #
            #             # The pure step for the maximum is a vector of zeros
            #             maximum_storage = (__m128i)_mm_setzero_ps()
            #
            #             # The update step for maximum
            #             for (int max_y = y - 2 max_y <= y + 2 max_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (max_y & 7) * clamped_width
            #                 for (int max_x = x_base - 2 max_x <= x_base + 2 max_x++) {
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + max_x + 2))
            #                     maximum_storage = _mm_max_epu8(maximum_storage, v)
            #
            #
            #
            #             # The pure step for the minimum is a vector of
            #             # ones. Create it by comparing something to
            #             # itself.
            #             minimum_storage = (__m128i)_mm_cmpeq_ps(_mm_setzero_ps(),
            #                                                     _mm_setzero_ps())
            #
            #             # The update step for minimum.
            #             for (int min_y = y - 2 min_y <= y + 2 min_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (min_y & 7) * clamped_width
            #                 for (int min_x = x_base - 2 min_x <= x_base + 2 min_x++) {
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + min_x + 2))
            #                     minimum_storage = _mm_min_epu8(minimum_storage, v)
            #
            #
            #
            #             # Now compute the spread.
            #             __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage)
            #
            #             # Store it.
            #             _mm_storeu_si128((__m128i *)(output_row + x_base), spread)
            #
            #
            #
            #     del clamped_storage
            #
        # end of hundred iterations

        # Skip the timing comparison if we don't have openmp
        # enabled. Otherwise it's unfair to C.
        #ifdef _OPENMP
        t2 = datetime.now()

        # Now run the Halide version again without the
        # jit-compilation overhead. Also run it one hundred times.
        for iters in range(100):
            spread.realize(halide_result)

        t3 = datetime.now()

        # Report the timings. On my machine they both take about 3ms
        # for the 4-megapixel input (fast!), which makes sense,
        # because they're using the same vectorization and
        # parallelization strategy. However I find the Halide version easier
        # to read, write, debug, modify, and port.
        print("Halide spread took %f ms. C equivalent took %f ms" % (
               (t3 - t2).total_seconds() * 1000,
               (t2 - t1).total_seconds() * 1000))

        #endif # _OPENMP

        # Check the results match:
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception("halide_result(%d, %d) = %d instead of %d" % (
                           xx, yy, halide_result(xx, yy), c_result(xx, yy)))
                    return -1



        #endif # __SSE2__
    else:
        print("(Skipped the SSE2 section of the code, "
              "since non-sense in python world.)")

    print("Success!")
    return 0
コード例 #18
0
ファイル: bilateral_grid.py プロジェクト: wsmoses/Halide-AS
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    zi = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently, running this directly from the Python code is very slow,
        # probably because of dispatch overhead; the generated code itself
        # runs at the same speed as the C++-generated code.
        print ("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
コード例 #19
0
def main():
    # First we'll declare some Vars to use below.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/rgb.png")

    # Now we'll express a multi-stage pipeline that blurs an image
    # first horizontally, and then vertically.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imageio.imread(image_path))
        assert input.type() == hl.UInt(8)

        # Upgrade it to 16-bit, so we can do math without it overflowing.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), input[x, y, c])

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] + 2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] + 2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # Each hl.Func in this pipeline calls a previous one using
        # familiar function call syntax (we've overloaded operator()
        # on hl.Func objects). A hl.Func may call any other hl.Func that has
        # been given a definition. This restriction prevents
        # pipelines with loops in them. Halide pipelines are always
        # feed-forward graphs of Funcs.

        # Now let's realize it...

        # result = output.realize(input.width(), input.height(), 3)

        # Except that the line above is not going to work. Uncomment
        # it to see what happens.

        # Realizing this pipeline over the same domain as the input
        # image requires reading pixels out of bounds in the input,
        # because the blur_x stage reaches outwards horizontally, and
        # the blur_y stage reaches outwards vertically. Halide
        # detects this by injecting a piece of code at the top of the
        # pipeline that computes the region over which the input will
        # be read. When it starts to run the pipeline it first runs
        # this code, determines that the input will be read out of
        # bounds, and refuses to continue. No actual bounds checks
        # occur in the inner loop that would be slow.
        #
        # So what do we do? There are a few options. If we realize
        # over a domain shifted inwards by one pixel, we won't be
        # asking the Halide routine to read out of bounds. We saw how
        # to do this in the previous lesson:
        result = hl.Buffer(
            hl.UInt(8),
            [input.width() - 2, input.height() - 2, 3])
        result.set_min([1, 1])
        output.realize(result)

        # Save the result. It should look like a slightly blurry
        # parrot, and it should be two pixels narrower and two pixels
        # shorter than the input image.

        imageio.imsave("blurry_parrot_1.png", result)
        print("Created blurry_parrot_1.png")

        # This is usually the fastest way to deal with boundaries:
        # don't write code that reads out of bounds :) The more
        # general solution is our next example.

    # The same pipeline, with a boundary condition on the input.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imageio.imread(image_path))
        assert input.type() == hl.UInt(8)

        # This time, we'll wrap the input in a hl.Func that prevents
        # reading out of bounds:
        clamped = hl.Func("clamped")

        # Define an expression that clamps x to lie within the
        # range [0, input.width()-1].
        clamped_x = hl.clamp(x, 0, input.width() - 1)
        # Similarly clamp y.
        clamped_y = hl.clamp(y, 0, input.height() - 1)
        # Load from input at the clamped coordinates. This means that
        # no matter how we evaluate the hl.Func 'clamped', we'll never
        # read out of bounds on the input. This is a clamp-to-edge
        # style boundary condition, and is the simplest boundary
        # condition to express in Halide.
        clamped[x, y, c] = input[clamped_x, clamped_y, c]

        # Defining 'clamped' in that way can be done more concisely
        # using a helper function from the BoundaryConditions
        # namespace like so:
        #
        # clamped = hl.BoundaryConditions.repeat_edge(input)
        #
        # These are important to use for other boundary conditions,
        # because they are expressed in the way that Halide can best
        # understand and optimize.
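        # For example (an added sketch, not part of the original lesson),
        # a constant black border or a mirrored border could be requested
        # instead:
        #
        # clamped = hl.BoundaryConditions.constant_exterior(input, 0)
        # clamped = hl.BoundaryConditions.mirror_image(input)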

        # Upgrade it to 16-bit, so we can do math without it
        # overflowing. This time we'll refer to our new hl.Func
        # 'clamped', instead of referring to the input image
        # directly.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), clamped[x, y, c])

        # The rest of the pipeline will be the same...

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] + 2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] + 2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # This time it's safe to evaluate the output over the same
        # domain as the input, because we have a boundary condition.
        result = output.realize(input.width(), input.height(), 3)

        # Save the result. It should look like a slightly blurry
        # parrot, but this time it will be the same size as the
        # input.
        imageio.imsave("blurry_parrot_2.png", result)
        print("Created blurry_parrot_2.png")

    print("Success!")
    return 0
コード例 #20
0
ファイル: bilateral_grid.py プロジェクト: adityaatluri/Halide
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    zi = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently, running this directly from the Python code is very slow,
        # probably because of dispatch overhead; the generated code itself
        # runs at the same speed as the C++-generated code.
        print ("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
コード例 #21
0
    def gen_g(self):
        ''' define g() function '''
        # vars
        i, j, k, l = [self.vars[c] for c in "ijkl"]
        # clamped inputs
        x, y, z, expnt, fm, rnorm = [
            self.clamps[c] for c in ["x", "y", "z", "expnt", "fm", "rnorm"]
        ]
        # unclamped input (for sizing)
        fm_in = self.inputs["fm_in"]
        # scalar inputs
        delo2, delta, rdelta = [
            self.inputs[c] for c in ["delo2", "delta", "rdelta"]
        ]

        dx = hl.Func("dx")
        dy = hl.Func("dy")
        dz = hl.Func("dz")
        r2 = hl.Func("g_r2")
        expnt2 = hl.Func("expnt2")
        expnt_inv = hl.Func("expnt_inv")
        self.add_funcs_by_name([dx, dy, dz, r2, expnt2, expnt_inv])

        dx[i, j] = x[i] - x[j]
        dy[i, j] = y[i] - y[j]
        dz[i, j] = z[i] - z[j]

        r2[i,
           j] = dx[i, j] * dx[i, j] + dy[i, j] * dy[i, j] + dz[i, j] * dz[i, j]

        expnt2[i, j] = expnt[i] + expnt[j]
        expnt_inv[i, j] = hl.f64(1.0) / expnt2[i, j]

        fac2 = hl.Func("fac2")
        ex_arg = hl.Func("ex_arg")
        ex = hl.Func("ex")
        denom = hl.Func("denom")
        fac4d = hl.Func("fac4d")
        self.add_funcs_by_name([fac2, ex_arg, ex, denom, fac4d])
        fac2[i, j] = expnt[i] * expnt[j] * expnt_inv[i, j]
        ex_arg[i, j, k, l] = -fac2[i, j] * r2[i, j] - fac2[k, l] * r2[k, l]
        ex[i, j, k, l] = hl.select(ex_arg[i, j, k, l] < hl.f64(-37.0),
                                   hl.f64(0.0), hl.exp(ex_arg[i, j, k, l]))
        denom[i, j, k,
              l] = expnt2[i, j] * expnt2[k, l] * hl.sqrt(expnt2[i, j] +
                                                         expnt2[k, l])
        fac4d[i, j, k,
              l] = expnt2[i, j] * expnt2[k, l] / (expnt2[i, j] + expnt2[k, l])

        x2 = hl.Func("g_x2")
        y2 = hl.Func("g_y2")
        z2 = hl.Func("g_z2")
        rpq2 = hl.Func("rpq2")
        self.add_funcs_by_name([x2, y2, z2, rpq2])
        x2[i, j] = (x[i] * expnt[i] + x[j] * expnt[j]) * expnt_inv[i, j]
        y2[i, j] = (y[i] * expnt[i] + y[j] * expnt[j]) * expnt_inv[i, j]
        z2[i, j] = (z[i] * expnt[i] + z[j] * expnt[j]) * expnt_inv[i, j]
        rpq2[i, j, k, l] = ((x2[i, j] - x2[k, l]) * (x2[i, j] - x2[k, l]) +
                            (y2[i, j] - y2[k, l]) * (y2[i, j] - y2[k, l]) +
                            (z2[i, j] - z2[k, l]) * (z2[i, j] - z2[k, l]))

        f0t = hl.Func("f0t")
        f0n = hl.Func("f0n")
        f0x = hl.Func("f0x")
        f0val = hl.Func("f0val")
        self.add_funcs_by_name([f0t, f0n, f0x, f0val])
        f0t[i, j, k, l] = fac4d[i, j, k, l] * rpq2[i, j, k, l]
        f0n[i, j, k, l] = hl.clamp(hl.i32((f0t[i, j, k, l] + delo2) * rdelta),
                                   fm_in.dim(0).min(),
                                   fm_in.dim(0).max())
        f0x[i, j, k, l] = delta * f0n[i, j, k, l] - f0t[i, j, k, l]
        f0val[i, j, k, l] = hl.select(
            f0t[i, j, k, l] >= hl.f64(28.0),
            hl.f64(0.88622692545276) / hl.sqrt(f0t[i, j, k, l]),
            fm[f0n[i, j, k, l], 0] + f0x[i, j, k, l] *
            (fm[f0n[i, j, k, l], 1] + f0x[i, j, k, l] * hl.f64(0.5) *
             (fm[f0n[i, j, k, l], 2] + f0x[i, j, k, l] * hl.f64(1. / 3.) *
              (fm[f0n[i, j, k, l], 3] +
               f0x[i, j, k, l] * hl.f64(0.25) * fm[f0n[i, j, k, l], 4]))))

        g = hl.Func("g")
        self.add_funcs_by_name([g])

        if self.tracing and self.tracing_g:
            g_trace_in = hl.ImageParam(hl.Float(64), 4, "g_trace_in")
            g_trace = hl.BoundaryConditions.constant_exterior(g_trace_in, 0)
            self.inputs["g_trace_in"] = g_trace_in
            self.clamps["g_trace"] = g_trace
            g_trace.compute_root()
            g[i, j, k,
              l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i, j, k, l]
                    ) * ex[i, j, k, l] * f0val[i, j, k, l] * rnorm[i] * rnorm[
                        j] * rnorm[k] * rnorm[l] + g_trace[i, j, k, l]
        else:
            g_trace = None
            g[i, j, k,
              l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) /
                    denom[i, j, k, l]) * ex[i, j, k, l] * f0val[
                        i, j, k, l] * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l]
コード例 #22
0
ファイル: local_laplacian.py プロジェクト: zy20091082/Halide
def get_local_laplacian(input, levels, alpha, beta, J=8):
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0

        return downy

    def upsample(f):
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 *
                                (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 *
                                  (y % 2), c] + 0.75 * upx[x, y // 2, c]

        return upy

    def downsample2D(f):
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample2D(f):
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x,
            y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x,
            y] = 0.25 * upx[x,
                            (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]

        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    #remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0,
                                         input.width() - 1),
                                hl.clamp(y, 0,
                                         input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a LUT with 256 entries per intensity level
    level = k / (levels - 1)
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y,
                k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] +
                                             eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                      blockh, GPU_Default)
            if j > 0:
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                   blockh, GPU_Default)
    else:
        # CPU schedule
        print("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        for j in range(4):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
コード例 #23
0
 def clamp(self, min_p, max_p):
     return Point(hl.clamp(self.x, min_p.x, max_p.x),
                  hl.clamp(self.y, min_p.y, max_p.y))
コード例 #24
0
ファイル: autotune.py プロジェクト: nzinfo/Halide
def test_schedules(verbose=False, test_random=False):
    #random_module.seed(int(sys.argv[1]) if len(sys.argv)>1 else 0)
    halide.exit_on_signal()
    f = halide.Func('f')
    x = halide.Var('x')
    y = halide.Var('y')
    c = halide.Var('c')
    g = halide.Func('g')
    v = halide.Var('v')
    input = halide.UniformImage(halide.UInt(16), 3)
    int_t = halide.Int(32)
    f[x,y,c] = input[halide.clamp(x,halide.cast(int_t,0),halide.cast(int_t,input.width()-1)),
                     halide.clamp(y,halide.cast(int_t,0),halide.cast(int_t,input.height()-1)),
                     halide.clamp(c,halide.cast(int_t,0),halide.cast(int_t,2))]
    #g[v] = f[v,v]
    g[x,y,c] = f[x,y,c]+1
    assert sorted(halide.all_vars(g).keys()) == sorted(['x', 'y', 'c']) #, 'v'])

    if verbose:
        print halide.func_varlist(f)
        print 'caller_vars(f) =', caller_vars(g, f)
        print 'caller_vars(g) =', caller_vars(g, g)
    
#    validL = list(valid_schedules(g, f, 4))
#    validL = [repr(_x) for _x in validL]
#    
#    for L in sorted(validL):
#        print repr(L)
    T0 = time.time()
    if not test_random:
        random = True #False
        nvalid_determ = 0
        for L in schedules_func(g, f, 0, 3):
            nvalid_determ += 1
            if verbose:
                print L
        nvalid_random = 0
        for i in range(100):
            for L in schedules_func(g, f, 0, DEFAULT_MAX_DEPTH, random=True): #sorted([repr(_x) for _x in valid_schedules(g, f, 3)]):
                if verbose and 0:
                    print L#repr(L)
                nvalid_random += 1
    s = []
    for i in range(400):
        d = random_schedule(g, 0, DEFAULT_MAX_DEPTH)
        si = str(d)
        s.append(si)
        if verbose:
            print 'Schedule:', si

        d.apply()
        evaluate = d.test((36, 36, 3), input)
        print 'evaluate'
        evaluate()
        if test_random:
            print 'Success'
            sys.exit()
    T1 = time.time()
    
    s = '\n'.join(s)
    assert 'f.chunk(_c0)' in s
    assert 'f.root().vectorize' in s
    assert 'f.root().unroll' in s
    assert 'f.root().split' in s
    assert 'f.root().tile' in s
    assert 'f.root().parallel' in s
    assert 'f.root().transpose' in s

    assert nvalid_random == 100
    if verbose:
        print 'generated in %.3f secs' % (T1-T0)
    print 'random_schedule: OK'
コード例 #25
0
def get_interpolate(input, levels):
    """
    Builds the pipeline, schedules it, and invokes the JIT compiler.
    :return: hl.Func
    """

    # THE ALGORITHM

    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
    downx = [hl.Func('downx%d' % l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
    #     level_widths = [hl.Param(int_t,'level_widths%d'%i) for i in range(levels)]
    #     level_heights = [hl.Param(int_t,'level_heights%d'%i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]
    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0,
                                      input.width() - 1),
                             hl.clamp(y, 0,
                                      input.height() - 1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = hl.Func()
        prev = downsampled[l - 1]

        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings from extending
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            w = input.width() // (1 << l)
            h = input.height() // (1 << l)
            prev = hl.lambda3D(x, y, c, prev[hl.clamp(x, 0, w),
                                             hl.clamp(y, 0, h), c])

        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] + 2.0 * prev[x * 2, y, c] +
                             prev[x * 2 + 1, y, c]) * 0.25
        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] +
                                   2.0 * downx[l][x, y * 2, c] +
                                   downx[l][x, y * 2 + 1, c]) * 0.25

    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
    for l in range(levels - 1)[::-1]:
        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] +
                                  interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] +
                                 upsampledx[l][x, (y + 1) / 2, c]) / 2.0
        interpolated[l][x, y, c] = downsampled[l][
            x, y, c] + (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]

    normalize = hl.Func('normalize')
    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

    final = hl.Func('final')
    final[x, y, c] = normalize[x, y, c]

    print("Finished function setup.")

    # THE SCHEDULE

    sched = 2
    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        sched = 4
    else:
        sched = 2

    if sched == 0:
        print("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()

        final.compute_root()

    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)

        final.compute_root()

    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(
            c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            if l > 0:
                downsampled[l].compute_root().parallel(y).reorder(
                    c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2)

        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())

    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                yo, yi = hl.Var('yo'), hl.Var('yi')
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()

        final.compute_root()

    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        yo, yi, xo, xi, ci = hl.Var('yo'), hl.Var('yi'), hl.Var('xo'), hl.Var(
            'xi'), hl.Var('ci')
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        final.tile(x, y, xo, yo, xi, yi, input.width() // 4, input.height() // 4)
        normalize.compute_at(final,
                             xo).reorder(c, x,
                                         y).gpu_tile(x, y, xi, yi, 16, 16,
                                                     GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            tile_size = 32 >> l
            if tile_size < 1: tile_size = 1
            if tile_size > 16: tile_size = 16
            downsampled[l].compute_root().gpu_tile(x, y, c, xi, yi, ci,
                                                   tile_size, tile_size, 4,
                                                   GPU_Default)
            interpolated[l].compute_at(final,
                                       xo).gpu_tile(x, y, c, xi, yi, ci,
                                                    tile_size, tile_size, 4,
                                                    GPU_Default)

    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
コード例 #26
0
def main():
    # Declare some Vars to use below.
    x, y = hl.Var("x"), hl.Var("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/gray.png")
    input_data = imread(image_path)
    if True:
        # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x, y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:
        f[x,
          17] = x + 8  # x is used, so all uses of f must have x as the first argument.
        f[0,
          y] = y * 8  # y is used, so all uses of f must have y as the second argument.
        f[x, x + 1] = x + 8
        f[y / 2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0] = f[x + 1, 0]  <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1] = y + 8    <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x] = y - x        <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4] = x + y        <- Free variables appear on the right-hand-side but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y  # Pure definition
        g[2, 1] = 42  # First update definition
        g[x, 0] = g[x, 1]  # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn. The equivalent C is:
        result = np.empty((4, 4), dtype=np.int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy

        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section

    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:

        # f[x, 0] = f[x, 0] * f[x, 0]
        # f[x, 1] = f[x, 1] * f[x, 1]
        # f[x, 2] = f[x, 2] * f[x, 2]
        # ...
        # f[x, 49] = f[x, 49] * f[x, 49]

        # Or equivalently using a compile-time loop in our C++:
        # for (int i = 0; i < 50; i++) {
        #   f(x, i) = f(x, i) * f(x, i);
        # }

        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom(0, 50)
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent C is:
        c_result = np.empty((100, 100), dtype=np.int)
        for yy in range(100):
            for xx in range(100):
                c_result[yy][xx] = xx + yy

        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx]

        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                if halide_result(xx, yy) != c_result[yy][xx]:
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                    return -1

    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:

        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram. The
        # natural way to do it is to iterate over the input image,
        # updating histogram buckets. Here's how you do that in Halide:
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom(0, input.width(), 0, input.height())

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1

        halide_result = histogram.realize(256)

        # The equivalent C is:
        c_result = np.empty((256), dtype=np.int)
        for xx in range(256):
            c_result[xx] = 0

        for r_y in range(input.height()):
            for r_x in range(input.width()):
                c_result[input_data[r_x, r_y]] += 1

        # Check the answers agree:
        for xx in range(256):
            if c_result[xx] != halide_result(xx):
                raise Exception("halide_result(%d) = %d instead of %d" %
                                (xx, halide_result(xx), c_result[xx]))
                return -1

    # Scheduling update steps
    if True:
        # The pure variables in an update step can be
        # parallelized, vectorized, split, etc. as usual.

        # Vectorizing, splitting, or parallelizing the variables that
        # are part of the reduction domain is trickier. We'll cover
        # that in a later lesson.

        # Consider the definition:
        f = hl.Func("x")
        f[x, y] = x * y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. To control the pure definition, we schedule
        # as we have done in the past. The following code vectorizes
        # and parallelizes the pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func::update(int) to get a handle to an update step
        # for the purposes of scheduling. The following line
        # vectorizes the first update step across x. We can't do
        # anything with y for this update step, because it doesn't
        # use y.
        f.update(0).vectorize(x, 4)

        # Now we parallelize the second update step in chunks of size
        # 4.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)

        halide_result = f.realize(16, 16)

        # Here's the equivalent (serial) C:
        c_result = np.empty((16, 16), dtype=np.int)

        # Pure step. Vectorized in x and parallelized in y.
        for yy in range(16):  # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
                c_result[yy][xx[0]] = xx[0] * yy
                c_result[yy][xx[1]] = xx[1] * yy
                c_result[yy][xx[2]] = xx[2] * yy
                c_result[yy][xx[3]] = xx[3] * yy

        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
            c_result[1][xx[0]] = c_result[0][xx[0]]
            c_result[1][xx[1]] = c_result[0][xx[1]]
            c_result[1][xx[2]] = c_result[0][xx[2]]
            c_result[1][xx[3]] = c_result[0][xx[3]]

        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4):  # Should be a parallel for loop
            for yi in range(4):
                yy = yo * 4 + yi
                c_result[yy][1] = c_result[yy][0] + 2

        # Check the C and Halide results match:
        for yy in range(16):
            for xx in range(16):
                if halide_result(xx, yy) != c_result[yy][xx]:
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                    return -1

    # That covers how to schedule the variables within a hl.Func that
    # uses update steps, but what about producer-consumer
    # relationships that involve compute_at and store_at? Let's
    # examine a reduction as a producer, in a producer-consumer pair.
    if True:
        # Because a hl.Func with update steps does multiple passes over a
        # stored array, it's not meaningful to inline it. So the default
        # schedule does the closest thing possible: it computes the hl.Func
        # in the innermost loop of its consumer. Consider this
        # trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x * 17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=np.int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            c_result[xx] = 2 * producer_storage[0]

        # Check the results match
        for xx in range(10):
            if halide_result(xx) != c_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" %
                                (xx, halide_result(xx), c_result[xx]))

        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.

    # Now let's consider a reduction as a consumer in a
    # producer-consumer pair. This is a little more involved.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x * 17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # The valid schedules for the producer in this case are
            # the default schedule - inlined, and also:
            #
            # 1) producer.compute_at(consumer, x), which places the
            # computation of the producer inside the loop over x in the
            # pure step of the consumer.
            #
            # 2) producer.compute_root(), which computes all of the
            # producer ahead of time.
            #
            # 3) producer.store_root().compute_at(consumer, x), which
            # allocates space for the producer outside the loop over x,
            # but fills it in as needed inside the loop.
            #
            # Let's use option 1. (A hedged sketch of option 3 follows
            # this case's check below.)

            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)

            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] = 2 * producer_storage[0]

            # Update step for the consumer
            for xx in range(10):
                c_result[xx] += 1

            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
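            # Hedged aside, not part of the original lesson: option 3 above,
            # store_root() plus compute_at(), keeps the producer's allocation
            # outside the loop over x but still fills it in per x. A minimal
            # sketch using fresh, hypothetical Funcs (producer_o3/consumer_o3):
            producer_o3, consumer_o3 = hl.Func("producer_o3"), hl.Func("consumer_o3")
            producer_o3[x] = x * 17
            consumer_o3[x] = 2 * producer_o3[x]
            consumer_o3[x] += 1
            producer_o3.store_root().compute_at(consumer_o3, x)
            option3_result = consumer_o3.realize(10)
            for xx in range(10):
                assert option3_result(xx) == 2 * (xx * 17) + 1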

        if True:
            # Case 2: The consumer references the producer in the update step only
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside the update
            # step of the consumer, because that's the only step that
            # uses the producer.
            producer.compute_at(consumer, x)

            # Note however, that we didn't say:
            #
            # producer.compute_at(consumer.update(0), x).
            #
            # Scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer
            for xx in range(10):
                c_result[xx] = xx

            # Update step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))

        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside both the
            # pure and the update step of the consumer, so there end
            # up being two separate realizations of the producer and
            # redundant work occurs. (A compute_root() sketch follows
            # the check below.)
            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] = producer_storage[0] * xx

            # Update step for the consumer
            for xx in range(10):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
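            # Hedged aside, not part of the original lesson: if the redundant
            # work matters, producer.compute_root() computes the producer once
            # ahead of both consumer steps. A minimal sketch using fresh,
            # hypothetical Funcs (producer_cr/consumer_cr):
            producer_cr, consumer_cr = hl.Func("producer_cr"), hl.Func("consumer_cr")
            producer_cr[x] = x * 17
            consumer_cr[x] = producer_cr[x] * x
            consumer_cr[x] += producer_cr[x]
            producer_cr.compute_root()
            case3_result = consumer_cr.realize(10)
            for xx in range(10):
                assert case3_result(xx) == (xx * 17) * xx + (xx * 17)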

        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x * y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x - 1]
            consumer[0, y] = producer[y, y - 1]

            # In this case neither producer.compute_at(consumer, x)
            # nor producer.compute_at(consumer, y) will work, because
            # either one fails to cover one of the uses of the
            # producer. So we'd have to inline producer, or use
            # producer.compute_root() (a sketch of which follows this
            # case's check below).

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:

            # Attempt 2:
            producer_wrapper_1, producer_wrapper_2 = hl.Func(), hl.Func()
            consumer_2 = hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]

            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)

            halide_result = consumer_2.realize(10, 10)

            # The equivalent C is:
            c_result = np.empty((10, 10), dtype=int)

            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    c_result[yy][xx] = xx + yy

            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=int)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                c_result[0][xx] += producer_wrapper_1_storage[0]

            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=int)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                c_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            for yy in range(10):
                for xx in range(10):
                    if halide_result(xx, yy) != c_result[yy][xx]:
                        print("halide_result(%d, %d) = %d instead of %d" %
                              (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                        return -1
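            # Hedged aside, not part of the original lesson: the simpler
            # alternative mentioned above, compute_root(), covers both update
            # steps at the cost of computing the whole producer up front. A
            # minimal sketch using fresh, hypothetical Funcs:
            producer_cr4, consumer_cr4 = hl.Func("producer_cr4"), hl.Func("consumer_cr4")
            producer_cr4[x, y] = x * y
            consumer_cr4[x, y] = x + y
            consumer_cr4[x, 0] = producer_cr4[x, x - 1]
            consumer_cr4[0, y] = producer_cr4[y, y - 1]
            producer_cr4.compute_root()
            case4_result = consumer_cr4.realize(10, 10)
            for yy in range(10):
                for xx in range(10):
                    expected = xx + yy
                    if yy == 0:
                        expected = xx * (xx - 1)
                    if xx == 0:
                        expected = yy * (yy - 1)
                    assert case4_result(xx, yy) == expected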

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there. (A contrasting sketch that instead schedules
            # the producer at x follows the check below.)

            producer, consumer = hl.Func("producer"), hl.Func("consumer")

            r = hl.RDom(0, 5)
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]

            producer.compute_at(consumer, r)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=int)
            # Pure step for the consumer.
            for xx in range(10):
                c_result[xx] = xx + 10

            # Update step for the consumer.
            for xx in range(10):
                # The loop over the reduction domain is always the inner loop.
                for rr in range(5):
                    # We've scheduled the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17

                    # Now use it in the update step of the consumer.
                    c_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
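            # Hedged aside, not part of the original lesson: scheduling the
            # producer at the consumer's pure var x instead hoists it out of
            # the r loop, so producer values for [x, x + 4] are computed once
            # per x. A minimal sketch using fresh, hypothetical Funcs:
            producer_x, consumer_x = hl.Func("producer_x"), hl.Func("consumer_x")
            producer_x[x] = x * 17
            consumer_x[x] = x + 10
            consumer_x[x] += r + producer_x[x + r]
            producer_x.compute_at(consumer_x, x)
            case5_result = consumer_x.realize(10)
            for xx in range(10):
                expected = xx + 10
                for rr in range(5):
                    expected += rr + (xx + rr) * 17
                assert case5_result(xx) == expected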

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # hl.clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom(-2, 5, -2, 5)

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0  # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)

        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent C:

        #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0]
        local_sum = np.empty((1), dtype=np.int32)

        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())
        for yy in range(input.height()):
            for xx in range(input.width()):
                # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input(clamped_x, clamped_y)

                # Pure step of blurry
                #c_result(x, y) = (uint8_t)(local_sum[0] / 25)
                #c_result[xx, yy] = cast_to_uint8(local_sum[0] / 25)
                c_result[xx, yy] = int(local_sum[0] / 25)  # hl.cast done internally

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result(xx, yy)))
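        # Hedged aside, not part of the original lesson: a common alternative
        # schedule computes a whole scanline of the reduction per row of the
        # output instead of a single value per pixel. The names below
        # (local_sum2, blurry2, halide_result2) are hypothetical; this is a
        # sketch, not the lesson's schedule.
        local_sum2, blurry2 = hl.Func("local_sum2"), hl.Func("blurry2")
        local_sum2[x, y] = 0
        local_sum2[x, y] += clamped[x + r.x, y + r.y]
        blurry2[x, y] = hl.cast(hl.UInt(8), local_sum2[x, y] / 25)
        blurry2.parallel(y)
        local_sum2.compute_at(blurry2, y)
        halide_result2 = blurry2.realize(input.width(), input.height())
        # Spot-check a small window against the default-schedule result.
        for yy in range(min(8, input.height())):
            for xx in range(min(8, input.width())):
                assert halide_result2(xx, yy) == halide_result(xx, yy)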

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func("f1")
        r = hl.RDom(0, 100)
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction. It's equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=int)
        for xx in range(10):
            anon = np.empty((1), dtype=int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx

            c_result[xx] = anon[0] * 7

        # Check they all match.
        for xx in range(10):
            if halide_result_1(xx) != c_result[xx]:
                print("halide_result_1(%d) = %d instead of %d" %
                      (xx, halide_result_1(xx), c_result[xx]))
                return -1

            if halide_result_2(xx) != c_result[xx]:
                print("halide_result_2(%d) = %d instead of %d" %
                      (xx, halide_result_2(xx), c_result[xx]))
                return -1
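        # Hedged aside, not part of the original lesson: the other inline
        # reduction helpers behave the same way. Assuming hl.maximum and
        # hl.minimum are exposed like hl.sum in these bindings, a minimal
        # sketch (f3 and spread_result are hypothetical names):
        f3 = hl.Func("f3")
        f3[x] = hl.maximum(r + x) - hl.minimum(r + x)
        spread_result = f3.realize(10)
        for xx in range(10):
            # max over r in [0, 100) of (r + x) is x + 99; min is x.
            assert spread_result(xx) == 99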

    # A complex example that uses reduction helpers.
    if False:  # It makes no sense to port SSE intrinsics to Python, so this test is skipped.

        # Other reduction helpers include "product", "minimum",
        # "maximum", "hl.argmin", and "argmax". Using hl.argmin and argmax
        # requires understanding tuples, which come in a later
        # lesson. Let's use minimum and maximum to compute the local
        # spread of our grayscale image.

        # First, add a boundary condition to the input.
        clamped = hl.Func("clamped")
        x_clamped = hl.clamp(x, 0, input.width() - 1)
        y_clamped = hl.clamp(y, 0, input.height() - 1)
        clamped[x, y] = input[x_clamped, y_clamped]

        box = hl.RDom(-2, 5, -2, 5)
        # Compute the local maximum minus the local minimum:
        spread = hl.Func("spread")
        spread[x, y] = (hl.maximum(clamped[x + box.x, y + box.y]) -
                        hl.minimum(clamped[x + box.x, y + box.y]))

        # Compute the result in strips of 32 scanlines
        yo, yi = hl.Var("yo"), hl.Var("yi")
        spread.split(y, yo, yi, 32).parallel(yo)

        # Vectorize across x within the strips. This implicitly
        # vectorizes stuff that is computed within the loop over x in
        # spread, which includes our minimum and maximum helpers, so
        # they get vectorized too.
        spread.vectorize(x, 16)

        # We'll apply the boundary condition by padding each scanline
        # as we need it in a circular buffer (see lesson 08).
        clamped.store_at(spread, yo).compute_at(spread, yi)

        halide_result = spread.realize(input.width(), input.height())

        # The C equivalent is almost too horrible to contemplate (and
        # took me a long time to debug). This time I want to time
        # both the Halide version and the C version, so I'll use sse
        # intrinsics for the vectorization, and openmp to do the
        # parallel for loop (you'll need to compile with -fopenmp or
        # similar to get correct timing).
        #ifdef __SSE2__

        # Don't include the time required to allocate the output buffer.
        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())

        #ifdef _OPENMP
        t1 = datetime.now()
        #endif

        # Run this one hundred times so we can average the timing results.
        for iters in range(100):
            pass
            # #pragma omp parallel for
            # for yo in range((input.height() + 31)/32):
            #     y_base = hl.min(yo * 32, input.height() - 32)
            #
            #     # Compute clamped in a circular buffer of size 8
            #     # (smallest power of two greater than 5). Each thread
            #     # needs its own allocation, so it must occur here.
            #
            #     clamped_width = input.width() + 4
            #     clamped_storage = np.empty((clamped_width * 8), dtype=np.uint8)
            #
            #     for yi in range(32):
            #         y = y_base + yi
            #
            #         uint8_t *output_row = &c_result(0, y)
            #
            #         # Compute clamped for this scanline, skipping rows
            #         # already computed within this slice.
            #         int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2)
            #         int max_y_clamped = (y + 2)
            #         for (int cy = min_y_clamped cy <= max_y_clamped cy++) {
            #             # Figure out which row of the circular buffer
            #             # we're filling in using bitmasking:
            #             uint8_t *clamped_row = clamped_storage + (cy & 7) * clamped_width
            #
            #             # Figure out which row of the input we're reading
            #             # from by clamping the y coordinate:
            #             int clamped_y = std::min(std::max(cy, 0), input.height()-1)
            #             uint8_t *input_row = &input(0, clamped_y)
            #
            #             # Fill it in with the padding.
            #             for (int x = -2 x < input.width() + 2 ):
            #                 int clamped_x = std::min(std::max(x, 0), input.width()-1)
            #                 *clamped_row++ = input_row[clamped_x]
            #
            #
            #
            #         # Now iterate over vectors of x for the pure step of the output.
            #         for (int x_vec = 0 x_vec < (input.width() + 15)/16 x_vec++) {
            #             int x_base = std::min(x_vec * 16, input.width() - 16)
            #
            #             # Allocate storage for the minimum and maximum
            #             # helpers. One vector is enough.
            #             __m128i minimum_storage, maximum_storage
            #
            #             # The pure step for the maximum is a vector of zeros
            #             maximum_storage = (__m128i)_mm_setzero_ps()
            #
            #             # The update step for maximum
            #             for (int max_y = y - 2 max_y <= y + 2 max_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (max_y & 7) * clamped_width
            #                 for (int max_x = x_base - 2 max_x <= x_base + 2 max_):
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + max_x + 2))
            #                     maximum_storage = _mm_max_epu8(maximum_storage, v)
            #
            #
            #
            #             # The pure step for the minimum is a vector of
            #             # ones. Create it by comparing something to
            #             # itself.
            #             minimum_storage = (__m128i)_mm_cmpeq_ps(_mm_setzero_ps(),
            #                                                     _mm_setzero_ps())
            #
            #             # The update step for minimum.
            #             for (int min_y = y - 2 min_y <= y + 2 min_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (min_y & 7) * clamped_width
            #                 for (int min_x = x_base - 2 min_x <= x_base + 2 min_):
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + min_x + 2))
            #                     minimum_storage = _mm_min_epu8(minimum_storage, v)
            #
            #
            #
            #             # Now compute the spread.
            #             __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage)
            #
            #             # Store it.
            #             _mm_storeu_si128((__m128i *)(output_row + x_base), spread)
            #
            #
            #
            #     del clamped_storage
            #
        # end of hundred iterations

        # Skip the timing comparison if we don't have openmp
        # enabled. Otherwise it's unfair to C.
        #ifdef _OPENMP
        t2 = datetime.now()

        # Now run the Halide version again without the
        # jit-compilation overhead. Also run it one hundred times.
        for iters in range(100):
            spread.realize(halide_result)

        t3 = datetime.now()

        # Report the timings. On my machine they both take about 3ms
        # for the 4-megapixel input (fast!), which makes sense,
        # because they're using the same vectorization and
        # parallelization strategy. However I find the Halide easier
        # to read, write, debug, modify, and port.
        print("Halide spread took %f ms. C equivalent took %f ms" %
              ((t3 - t2).total_seconds() * 1000,
               (t2 - t1).total_seconds() * 1000))

        #endif # _OPENMP

        # Check the results match:
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result(xx, yy)))

        #endif # __SSE2__
    else:
        print("(Skipped the SSE2 section of the code, "
              "since non-sense in python world.)")

    print("Success!")
    return 0