Exemple #1
0
def test_compiletime_error():
    x = hl.Var('x')
    y = hl.Var('y')
    f = hl.Func('f')
    f[x, y] = hl.u16(x + y)
    # Deliberate type-mismatch error
    buf = hl.Buffer(hl.UInt(8), [2, 2])
    try:
        f.realize(buf)
    except RuntimeError as e:
        assert 'Output buffer f has type uint16 but type of the buffer passed in is uint8' in str(e)
    else:
        assert False, 'Did not see expected exception!'
Exemple #2
0
def main():
    input_img = hl.ImageParam(hl.UInt(16), 3, 'input')

    # number of intensity levels
    levels = hl.Param(int_t, 'levels', 8)

    # Parameters controlling the filter
    alpha = hl.Param(float_t, 'alpha', 1.0 / 7.0)
    beta = hl.Param(float_t, 'beta', 1.0)

    local_laplacian = get_local_laplacian(input_img, levels, alpha, beta)

    filter_test_image(local_laplacian, input_img)
Exemple #3
0
def test_runtime_error():
    x = hl.Var('x')
    f = hl.Func('f')
    f[x] = hl.u8(x)
    f.bound(x, 0, 1)
    # Deliberate runtime error
    buf = hl.Buffer(hl.UInt(8), [10])
    try:
        f.realize(buf)
    except RuntimeError as e:
        assert 'do not cover required region' in str(e)
    else:
        assert False, 'Did not see expected exception!'
Exemple #4
0
def generate_compiled_file(local_laplacian):

    # Need to copy the process executable from the C++ apps/local_laplacian folder to run this.
    # (after making it of course)
    arguments = ArgumentsVector()
    arguments.append(Argument('levels', False, int_t))
    arguments.append(Argument('alpha', False, float_t))
    arguments.append(Argument('beta', False, float_t))
    arguments.append(Argument('input', True, hl.UInt(16)))
    target = hl.get_target_from_environment()
    local_laplacian.compile_to_file("local_laplacian", arguments, "local_laplacian", target)
    print("Generated compiled file for local_laplacian function.")
    return
Exemple #5
0
def test_all(vector_width, target):
    # print("target is %s " % str(target))

    W = 32
    H = 32
    input = hl.Buffer(hl.UInt(8), [W, H])
    for r in range(H):
        for c in range(W):
            input[c, r] = (c + r * W) & 0xff

    input_f = hl.Func()
    input_f[x, y] = input[x, y]

    tests = [
            (hl.BoundaryConditions.constant_exterior, check_constant_exterior),
            (hl.BoundaryConditions.repeat_edge, check_repeat_edge),
            (hl.BoundaryConditions.repeat_image, check_repeat_image),
            (hl.BoundaryConditions.mirror_image, check_mirror_image),
            (hl.BoundaryConditions.mirror_interior, check_mirror_interior),
    ]

    for bc, checker in tests:
        # print('  Testing %s:%d...' % (bc.__name__, vector_width))
        func_input_args = {'f': input_f, 'bounds': [(0, W), (0, H)]}
        image_input_args = {'f': input, 'bounds': [(0, W), (0, H)]}
        undef_min_args = {'f': input, 'bounds': [(hl.Expr(), hl.Expr()), (0, H)]}
        undef_max_args = {'f': input, 'bounds': [(0, W), (hl.Expr(), hl.Expr())]}
        implicit_bounds_args = {'f': input}

        if bc == hl.BoundaryConditions.constant_exterior:
            func_input_args['exterior'] = test_exterior
            image_input_args['exterior'] = test_exterior
            undef_min_args['exterior'] = test_exterior
            undef_max_args['exterior'] = test_exterior
            implicit_bounds_args['exterior'] = test_exterior

        realize_and_check(
                bc(**func_input_args), checker, input, test_min, test_extent, test_min,
                test_extent, vector_width, target)
        realize_and_check(
                bc(**image_input_args), checker, input, test_min, test_extent, test_min,
                test_extent, vector_width, target)
        realize_and_check(
                bc(**undef_min_args), checker, input, 0, W, test_min, test_extent,
                vector_width, target)
        realize_and_check(
                bc(**undef_max_args), checker, input, test_min, test_extent, 0, H,
                vector_width, target)
        realize_and_check(
                bc(**implicit_bounds_args), checker, input, test_min, test_extent,
                test_min, test_extent, vector_width, target)
Exemple #6
0
def test_basics():

    input = hl.ImageParam(hl.UInt(16), 2, 'input')
    x, y = hl.Var('x'), hl.Var('y')

    blur_x = hl.Func('blur_x')
    blur_xx = hl.Func('blur_xx')
    blur_y = hl.Func('blur_y')

    yy = hl.cast(hl.Int(32), 1)
    assert yy.type() == hl.Int(32)
    print("yy type:", yy.type())

    z = x + 1
    input[x,y]
    input[0,0]
    input[z,y]
    input[x+1,y]
    print("ping 0.2")
    input[x,y]+input[x+1,y]

    if False:
        aa = blur_x[x,y]
        bb = blur_x[x,y+1]
        aa + bb
        blur_x[x,y]+blur_x[x,y+1]

    print("ping 0.3")
    (input[x,y]+input[x+1,y]) / 2
    print("ping 0.4")
    blur_x[x,y]
    print("ping 0.4.1")
    blur_xx[x,y] = input[x,y]



    print("ping 0.5")
    blur_x[x,y] = (input[x,y]+input[x+1,y]+input[x+2,y])/3
    print("ping 1")
    blur_y[x,y] = (blur_x[x,y]+blur_x[x,y+1]+blur_x[x,y+2])/3

    xi, yi = hl.Var('xi'), hl.Var('yi')
    print("ping 2")
    blur_y.tile(x, y, xi, yi, 8, 4).parallel(y).vectorize(xi, 8)
    blur_x.compute_at(blur_y, x).vectorize(x, 8)


    blur_y.compile_jit()
    print("Compiled to jit")

    return
Exemple #7
0
def test_scalar_funcs():
    input = hl.ImageParam(hl.UInt(16), 0, 'input')

    f = hl.Func('f')
    g = hl.Func('g')

    input[()]

    (input[()] + input[()]) / 2
    f[()]
    g[()]

    f[()] = (input[()] + input[()] + input[()]) / 3
    g[()] = (f[()] + f[()] + f[()]) / 3

    g.compile_jit()
Exemple #8
0
def main():

    input = hl.ImageParam(hl.UInt(16), 3, 'input')

    # number of intensity levels
    levels = hl.Param(int_t, 'levels', 8)

    #Parameters controlling the filter
    alpha = hl.Param(float_t, 'alpha', 1.0 / 7.0)
    beta = hl.Param(float_t, 'beta', 1.0)

    local_laplacian = get_local_laplacian(input, levels, alpha, beta)

    generate = False  # Set to False to run the jit immediately and get  instant gratification.
    if generate:
        generate_compiled_file(local_laplacian)
    else:
        filter_test_image(local_laplacian, input)

    return
Exemple #9
0
def test_looplevel():
    x, y = hl.Var('x'), hl.Var('y')
    target = hl.get_jit_target_from_environment()

    buffer_input = hl.Buffer(hl.UInt(8), [4, 4])
    buffer_input.fill(123)

    func_input = hl.Func("func_input")
    func_input[x, y] = x + y

    simple_compute_at = hl.LoopLevel()
    simple = simplestub.generate(target, buffer_input, func_input, 3.5,
        compute_level=simple_compute_at)

    computed_output = hl.Func('computed_output')
    computed_output[x, y] = simple[x, y] + 3

    simple_compute_at.set(hl.LoopLevel(computed_output, x))

    _realize_and_check(computed_output, 3)
Exemple #10
0
def test_make_interleaved():
    w = 7
    h = 13
    c = 3

    b = hl.Buffer.make_interleaved(type = hl.UInt(8), width = w, height = h, channels = c)

    assert b.dim(0).min() == 0
    assert b.dim(0).extent() == w
    assert b.dim(0).stride() == c

    assert b.dim(1).min() == 0
    assert b.dim(1).extent() == h
    assert b.dim(1).stride() == w * c

    assert b.dim(2).min() == 0
    assert b.dim(2).extent() == c
    assert b.dim(2).stride() == 1

    a = np.array(b, copy = False)
    assert a.shape == (w, h, c)
    assert a.strides == (c, w*c, 1)
    assert a.dtype == np.uint8
Exemple #11
0
def test_int_promotion():
    # Verify that (Exprlike op literal) correctly matches the type
    # of the literal to the Exprlike (rather than promoting the result to int32).
    # All types that use add_binary_operators() should be tested here.

    x = hl.Var('x')
    # All the binary ops are handled the same, so + is good enough

    # Exprlike = FuncRef
    f = hl.Func('f')
    f[x] = hl.u16(x)
    _check_is_u16(f[x] + 2)
    _check_is_u16(2 + f[x])

    # Exprlike = Expr
    e = hl.Expr(f[x])
    _check_is_u16(e + 2)
    _check_is_u16(2 + e)

    # Exprlike = Param
    p = hl.Param(hl.UInt(16))
    _check_is_u16(p + 2)
    _check_is_u16(2 + p)
Exemple #12
0
def addone():
    # feed input
    input_data = np.ones((4, 4), dtype=np.uint8)
    A = hl.Buffer(input_data)

    i, j = hl.Var("i"), hl.Var("j")
    B = hl.Func("B")
    B[i, j] = A[i, j] + 1

    # output
    if 0:
        output = B.realize(4, 4)
        print("out: \n", np.asanyarray(output))
    if 0:
        output = hl.Buffer(hl.UInt(8), [4, 4])
        B.realize(output)
        print("out: \n", np.asanyarray(output))
    if 1:
        output_data = np.empty(input_data.shape,
                               dtype=input_data.dtype,
                               order="F")
        output = hl.Buffer(output_data)
        B.realize(output)
        print("out: \n", output_data)
Exemple #13
0
def get_blur(input):
    assert type(input) == hl.ImageParam
    assert input.dimensions() == 2

    x, y = hl.Var("x"), hl.Var("y")

    clamped_input = hl.BoundaryConditions.repeat_edge(input)

    input_uint16 = hl.Func("input_uint16")
    input_uint16[x,y] = hl.u16(clamped_input[x,y])
    ci = input_uint16

    blur_x = hl.Func("blur_x")
    blur_y = hl.Func("blur_y")

    blur_x[x,y] = (ci[x,y]+ci[x+1,y]+ci[x+2,y])/3
    blur_y[x,y] = hl.cast(hl.UInt(8), (blur_x[x,y]+blur_x[x,y+1]+blur_x[x,y+2])/3)

    # schedule
    xi, yi = hl.Var("xi"), hl.Var("yi")
    blur_y.tile(x, y, xi, yi, 8, 4).parallel(y).vectorize(xi, 8)
    blur_x.compute_at(blur_y, x).vectorize(x, 8)

    return blur_y
Exemple #14
0
def yuv_to_rgb(input):
    print('    yuv_to_rgb')

    output = hl.Func("yuv_to_rgb_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    Y = input[x, y, 0]
    U = input[x, y, 1]
    V = input[x, y, 2]

    output[x, y, c] = hl.cast(hl.UInt(16), 0)

    output[x, y, 0] = hl.u16_sat(Y + 1.403 * V)
    output[x, y, 1] = hl.u16_sat(Y - 0.344 * U - 0.714 * V)
    output[x, y, 2] = hl.u16_sat(Y + 1.77 * U)

    output.compute_root().parallel(y).vectorize(x, 16)

    output.update(0).parallel(y).vectorize(x, 16)
    output.update(1).parallel(y).vectorize(x, 16)
    output.update(2).parallel(y).vectorize(x, 16)

    return output
def main():
    # First we'll declare some Vars to use below.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/rgb.png")

    # Now we'll express a multi-stage pipeline that blurs an image
    # first horizontally, and then vertically.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imageio.imread(image_path))
        assert input.type() == hl.UInt(8)

        # Upgrade it to 16-bit, so we can do math without it overflowing.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), input[x, y, c])

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] + 2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] + 2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # Each hl.Func in this pipeline calls a previous one using
        # familiar function call syntax (we've overloaded operator()
        # on hl.Func objects). A hl.Func may call any other hl.Func that has
        # been given a definition. This restriction prevents
        # pipelines with loops in them. Halide pipelines are always
        # feed-forward graphs of Funcs.

        # Now let's realize it...

        # result = output.realize(input.width(), input.height(), 3)

        # Except that the line above is not going to work. Uncomment
        # it to see what happens.

        # Realizing this pipeline over the same domain as the input
        # image requires reading pixels out of bounds in the input,
        # because the blur_x stage reaches outwards horizontally, and
        # the blur_y stage reaches outwards vertically. Halide
        # detects this by injecting a piece of code at the top of the
        # pipeline that computes the region over which the input will
        # be read. When it starts to run the pipeline it first runs
        # this code, determines that the input will be read out of
        # bounds, and refuses to continue. No actual bounds checks
        # occur in the inner loop that would be slow.
        #
        # So what do we do? There are a few options. If we realize
        # over a domain shifted inwards by one pixel, we won't be
        # asking the Halide routine to read out of bounds. We saw how
        # to do this in the previous lesson:
        result = hl.Buffer(
            hl.UInt(8),
            [input.width() - 2, input.height() - 2, 3])
        result.set_min([1, 1])
        output.realize(result)

        # Save the result. It should look like a slightly blurry
        # parrot, and it should be two pixels narrower and two pixels
        # shorter than the input image.

        imageio.imsave("blurry_parrot_1.png", result)
        print("Created blurry_parrot_1.png")

        # This is usually the fastest way to deal with boundaries:
        # don't write code that reads out of bounds :) The more
        # general solution is our next example.

    # The same pipeline, with a boundary condition on the input.
    if True:
        # Take a color 8-bit input
        input = hl.Buffer(imageio.imread(image_path))
        assert input.type() == hl.UInt(8)

        # This time, we'll wrap the input in a hl.Func that prevents
        # reading out of bounds:
        clamped = hl.Func("clamped")

        # Define an expression that clamps x to lie within the the
        # range [0, input.width()-1].
        clamped_x = hl.clamp(x, 0, input.width() - 1)
        # Similarly hl.clamp y.
        clamped_y = hl.clamp(y, 0, input.height() - 1)
        # Load from input at the clamped coordinates. This means that
        # no matter how we evaluated the hl.Func 'clamped', we'll never
        # read out of bounds on the input. This is a hl.clamp-to-edge
        # style boundary condition, and is the simplest boundary
        # condition to express in Halide.
        clamped[x, y, c] = input[clamped_x, clamped_y, c]

        # Defining 'clamped' in that way can be done more concisely
        # using a helper function from the BoundaryConditions
        # namespace like so:
        #
        # clamped = hl.BoundaryConditions.repeat_edge(input)
        #
        # These are important to use for other boundary conditions,
        # because they are expressed in the way that Halide can best
        # understand and optimize.

        # Upgrade it to 16-bit, so we can do math without it
        # overflowing. This time we'll refer to our new hl.Func
        # 'clamped', instead of referring to the input image
        # directly.
        input_16 = hl.Func("input_16")
        input_16[x, y, c] = hl.cast(hl.UInt(16), clamped[x, y, c])

        # The rest of the pipeline will be the same...

        # Blur it horizontally:
        blur_x = hl.Func("blur_x")
        blur_x[x, y, c] = (input_16[x - 1, y, c] + 2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4

        # Blur it vertically:
        blur_y = hl.Func("blur_y")
        blur_y[x, y, c] = (blur_x[x, y - 1, c] + 2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4

        # Convert back to 8-bit.
        output = hl.Func("output")
        output[x, y, c] = hl.cast(hl.UInt(8), blur_y[x, y, c])

        # This time it's safe to evaluate the output over the some
        # domain as the input, because we have a boundary condition.
        result = output.realize(input.width(), input.height(), 3)

        # Save the result. It should look like a slightly blurry
        # parrot, but this time it will be the same size as the
        # input.
        imageio.imsave("blurry_parrot_2.png", result)
        print("Created blurry_parrot_2.png")

    print("Success!")
    return 0
Exemple #16
0
def main():

    # We'll define the simple one-stage pipeline that we used in lesson 10.
    brighter = hl.Func("brighter")
    x, y = hl.Var("x"), hl.Var("y")

    # Declare the arguments.
    offset = hl.Param(hl.UInt(8))
    input = hl.ImageParam(hl.UInt(8), 2)
    args = [input, offset]

    # Define the hl.Func.
    brighter[x, y] = input[x, y] + offset

    # Schedule it.
    brighter.vectorize(x, 16).parallel(y)

    # The following line is what we did in lesson 10. It compiles an
    # object file suitable for the system that you're running this
    # program on.  For example, if you compile and run this file on
    # 64-bit linux on an x86 cpu with sse4.1, then the generated code
    # will be suitable for 64-bit linux on x86 with sse4.1.
    brighter.compile_to_file("lesson_11_host", args, "lesson_11_host")

    # We can also compile object files suitable for other cpus and
    # operating systems. You do this with an optional third argument
    # to compile_to_file which specifies the target to compile for.

    create_android = True
    create_windows = True
    create_ios = True

    if create_android:
        # Let's use this to compile a 32-bit arm android version of this code:
        target = hl.Target()
        target.os = hl.TargetOS.Android  # The operating system
        target.arch = hl.TargetArch.ARM  # The CPU architecture
        target.bits = 32  # The bit-width of the architecture
        arm_features = []  # A list of features to set
        target.set_features(arm_features)
        # Pass the target as the last argument.
        brighter.compile_to_file("lesson_11_arm_32_android", args,
                                 "lesson_11_arm_32_android", target)

    if create_windows:
        # And now a Windows object file for 64-bit x86 with AVX and SSE 4.1:
        target = hl.Target()
        target.os = hl.TargetOS.Windows
        target.arch = hl.TargetArch.X86
        target.bits = 64
        target.set_features([hl.TargetFeature.AVX, hl.TargetFeature.SSE41])
        brighter.compile_to_file("lesson_11_x86_64_windows", args,
                                 "lesson_11_x86_64_windows", target)

    if create_ios:
        # And finally an iOS mach-o object file for one of Apple's 32-bit
        # ARM processors - the A6. It's used in the iPhone 5. The A6 uses
        # a slightly modified ARM architecture called ARMv7s. We specify
        # this using the target features field.  Support for Apple's
        # 64-bit ARM processors is very new in llvm, and still somewhat
        # flaky.
        target = hl.Target()
        target.os = hl.TargetOS.IOS
        target.arch = hl.TargetArch.ARM
        target.bits = 32
        target.set_features([hl.TargetFeature.ARMv7s])
        brighter.compile_to_file("lesson_11_arm_32_ios", args,
                                 "lesson_11_arm_32_ios", target)

    # Now let's check these files are what they claim, by examining
    # their first few bytes.

    if create_android:
        # 32-arm android object files start with the magic bytes:
        # uint8_t []
        arm_32_android_magic = [
            0x7f,
            ord('E'),
            ord('L'),
            ord('F'),  # ELF format
            1,  # 32-bit
            1,  # 2's complement little-endian
            1
        ]  # Current version of elf

        length = len(arm_32_android_magic)
        f = open("lesson_11_arm_32_android.o", "rb")
        try:
            header_bytes = f.read(length)
        except:
            print("Android object file not generated")
            return -1
        f.close()

        header = list(unpack("B" * length, header_bytes))
        if header != arm_32_android_magic:
            print([x == y for x, y in zip(header, arm_32_android_magic)])
            raise Exception(
                "Unexpected header bytes in 32-bit arm object file.")
            return -1

    if create_windows:
        # 64-bit windows object files start with the magic 16-bit value 0x8664
        # (presumably referring to x86-64)
        # uint8_t  []
        win_64_magic = [0x64, 0x86]

        f = open("lesson_11_x86_64_windows.obj", "rb")
        try:
            header_bytes = f.read(2)
        except:
            print("Windows object file not generated")
            return -1
        f.close()

        header = list(unpack("B" * 2, header_bytes))
        if header != win_64_magic:
            raise Exception(
                "Unexpected header bytes in 64-bit windows object file.")
            return -1

    if create_ios:
        # 32-bit arm iOS mach-o files start with the following magic bytes:
        #  uint32_t []
        arm_32_ios_magic = [
            0xfeedface,  # Mach-o magic bytes
            #0xfe, 0xed, 0xfa, 0xce, # Mach-o magic bytes
            12,  # CPU type is ARM
            11,  # CPU subtype is ARMv7s
            1
        ]  # It's a relocatable object file.
        f = open("lesson_11_arm_32_ios.o", "rb")
        try:
            header_bytes = f.read(4 * 4)
        except:
            print("ios object file not generated")
            return -1
        f.close()

        header = list(unpack("I" * 4, header_bytes))
        if header != arm_32_ios_magic:
            raise Exception(
                "Unexpected header bytes in 32-bit arm ios object file.")
            return -1

    # It looks like the object files we produced are plausible for
    # those targets. We'll count that as a success for the purposes
    # of this tutorial. For a real application you'd then need to
    # figure out how to integrate Halide into your cross-compilation
    # toolchain. There are several small examples of this in the
    # Halide repository under the apps folder. See HelloAndroid and
    # HelloiOS here:
    # https:#github.com/halide/Halide/tree/master/apps/
    print("Success!")
    return 0
def main():
    # Declare some Vars to use below.
    x, y = hl.Var("x"), hl.Var("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png")
    input_data = imageio.imread(image_path)
    if True:
         # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x, y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:
        # x is used, so all uses of f must have x as the first argument.
        f[x, 17] = x + 8
        # y is used, so all uses of f must have y as the second argument.
        f[0, y] = y * 8
        f[x, x + 1] = x + 8
        f[y / 2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1) = y + 8   <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x) = y - x      <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4) = x + y      <- Free variables appear on the right-hand-side
        # but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y   # Pure definition
        g[2, 1] = 42      # First update definition
        g[x, 0] = g[x, 1]  # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn. The
        # equivalent Python is:
        result = np.empty((4, 4), dtype=np.int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy

        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section

    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:

        # f[x, 0) = f[x, 0) * f[x, 0)
        # f[x, 1) = f[x, 1) * f[x, 1)
        # f[x, 2) = f[x, 2) * f[x, 2)
        # ...
        # f[x, 49) = f[x, 49) * f[x, 49)

        # Or equivalently using a compile-time loop in our C++:
        # for (int i = 0 i < 50 i++) {
        #   f[x, i) = f[x, i) * f[x, i)
        #

        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom([(0, 50)])
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent Python is:
        py_result = np.empty((100, 100), dtype=np.int)
        for yy in range(100):
            for xx in range(100):
                py_result[yy][xx] = xx + yy

        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                py_result[rr][xx] = py_result[rr][xx] * py_result[rr][xx]

        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                assert halide_result[xx, yy] == py_result[yy][xx], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[yy][xx])

    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:

        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram. The
        # natural way to do it is to iterate over the input image,
        # updating histogram buckets. Here's how you do that in Halide:
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom([(0, input.width()), (0, input.height())])

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1

        halide_result = histogram.realize(256)

        # The equivalent Python is:
        py_result = np.empty((256), dtype=np.int)
        for xx in range(256):
            py_result[xx] = 0

        for r_y in range(input.height()):
            for r_x in range(input.width()):
                py_result[input_data[r_x, r_y]] += 1

        # Check the answers agree:
        for xx in range(256):
            assert py_result[xx] == halide_result[xx], \
                "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

    # Scheduling update steps
    if True:
        # The pure variables in an update step and can be
        # parallelized, vectorized, split, etc as usual.

        # Vectorizing, splitting, or parallelize the variables that
        # are part of the reduction domain is trickier. We'll cover
        # that in a later lesson.

        # Consider the definition:
        f = hl.Func("x")
        f[x, y] = x * y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. To control the pure definition, we schedule
        # as we have done in the past. The following code vectorizes
        # and parallelizes the pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func::update(int) to get a handle to an update step
        # for the purposes of scheduling. The following line
        # vectorizes the first update step across x. We can't do
        # anything with y for this update step, because it doesn't
        # use y.
        f.update(0).vectorize(x, 4)

        # Now we parallelize the second update step in chunks of size
        # 4.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)

        halide_result = f.realize(16, 16)

        # Here's the equivalent (serial) C:
        py_result = np.empty((16, 16), dtype=np.int)

        # Pure step. Vectorized in x and parallelized in y.
        for yy in range(16):  # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
                py_result[yy][xx[0]] = xx[0] * yy
                py_result[yy][xx[1]] = xx[1] * yy
                py_result[yy][xx[2]] = xx[2] * yy
                py_result[yy][xx[3]] = xx[3] * yy

        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
            py_result[1][xx[0]] = py_result[0][xx[0]]
            py_result[1][xx[1]] = py_result[0][xx[1]]
            py_result[1][xx[2]] = py_result[0][xx[2]]
            py_result[1][xx[3]] = py_result[0][xx[3]]

        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4):  # Should be a parallel for loop
            for yi in range(4):
                yy = yo * 4 + yi
                py_result[yy][1] = py_result[yy][0] + 2

        # Check the C and Halide results match:
        for yy in range(16):
            for xx in range(16):
                assert halide_result[xx, yy] == py_result[yy][xx], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[yy][xx])

    # That covers how to schedule the variables within a hl.Func that
    # uses update steps, but what about producer-consumer
    # relationships that involve compute_at and store_at? Let's
    # examine a reduction as a producer, in a producer-consumer pair.
    if True:
        # Because an update does multiple passes over a stored array,
        # it's not meaningful to inline them. So the default schedule
        # for them does the closest thing possible. It computes them
        # in the innermost loop of their consumer. Consider this
        # trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x * 17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent Python is:
        py_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=np.int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            py_result[xx] = 2 * producer_storage[0]

        # Check the results match
        for xx in range(10):
            assert halide_result[xx] == py_result[xx], \
                "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.

    # Now let's consider a reduction as a consumer in a
    # producer-consumer pair. This is a little more involved.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step
            # only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x * 17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # The valid schedules for the producer in this case are
            # the default schedule - inlined, and also:
            #
            # 1) producer.compute_at(x), which places the computation of
            # the producer inside the loop over x in the pure step of the
            # consumer.
            #
            # 2) producer.compute_root(), which computes all of the
            # producer ahead of time.
            #
            # 3) producer.store_root().compute_at(x), which allocates
            # space for the consumer outside the loop over x, but fills
            # it in as needed inside the loop.
            #
            # Let's use option 1.

            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)

            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] = 2 * producer_storage[0]

            # Update step for the consumer
            for xx in range(10):
                py_result[xx] += 1

            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        if True:
            # Case 2: The consumer references the producer in the update step
            # only
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside the update
            # step of the producer, because that's the only step that
            # uses the producer.
            producer.compute_at(consumer, x)

            # Note however, that we didn't say:
            #
            # producer.compute_at(consumer.update(0), x).
            #
            # Scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                py_result[xx] = xx

            # Update step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside both the
            # pure and the update step of the producer. So there ends
            # up being two separate realizations of the producer, and
            # redundant work occurs.
            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] = producer_storage[0] * xx

            # Update step for the consumer
            for xx in range(10):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                py_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x * y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x - 1]
            consumer[0, y] = producer[y, y - 1]

            # In this case neither producer.compute_at(consumer, x)
            # nor producer.compute_at(consumer, y) will work, because
            # either one fails to cover one of the uses of the
            # producer. So we'd have to inline producer, or use
            # producer.compute_root().

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:

            # Attempt 2:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]

            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)

            halide_result = consumer_2.realize(10, 10)

            # The equivalent Python is:
            py_result = np.empty((10, 10), dtype=np.int)

            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    py_result[yy][xx] = xx + yy

            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=np.int)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                py_result[0][xx] += producer_wrapper_1_storage[0]

            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=np.int)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                py_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            for yy in range(10):
                for xx in range(10):
                    assert halide_result[xx, yy] == py_result[yy][xx], \
                        "halide_result(%d, %d) = %d instead of %d" % (
                            xx, yy, halide_result[xx, yy], py_result[yy][xx])

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there.

            producer, consumer = hl.Func("producer"), hl.Func("consumer")

            r = hl.RDom([(0, 5)])
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]

            producer.compute_at(consumer, r)

            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer.
            for xx in range(10):
                py_result[xx] = xx + 10

            # Update step for the consumer.
            for xx in range(10):
                # The loop over the reduction domain is always the inner loop.
                for rr in range(5):
                    # We've schedule the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=np.int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17

                    # Now use it in the update step of the consumer.
                    py_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # hl.clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.BoundaryConditions.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom([(-2, 5), (-2, 5)])

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0  # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)

        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent Python:

        #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0]
        local_sum = np.empty((1), dtype=np.int32)

        py_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()])
        for yy in range(input.height()):
            for xx in range(input.width()):
                # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input[clamped_x, clamped_y]

                # Pure step of blurry
                # py_result(x, y) = (uint8_t)(local_sum[0] / 25)
                #py_result[xx, yy] = cast_to_uint8(local_sum[0] / 25)
                # hl.cast done internally
                py_result[xx, yy] = int(local_sum[0] / 25)

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                assert halide_result[xx, yy] == py_result[xx, yy], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[xx, yy])

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func("f1")
        r = hl.RDom([(0, 100)])
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction. It's
        # equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent Python is:
        py_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            anon = np.empty((1), dtype=np.int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx

            py_result[xx] = anon[0] * 7

        # Check they all match.
        for xx in range(10):
            assert halide_result_1[xx] == py_result[xx], \
                "halide_result_1(%d) = %d instead of %d" % (xx, halide_result_1[xx], py_result[xx])
            assert halide_result_2[xx] == py_result[xx], \
                "halide_result_2(%d) = %d instead of %d" % (xx, halide_result_2[xx], py_result[xx])

    print("Success!")
    return 0
Exemple #18
0
def test_type():
    t1 = hl.Type()
    assert t1.code() == hl.TypeCode.Handle
    assert t1.bits() == 0
    assert t1.lanes() == 0

    t1 = hl.Type(hl.TypeCode.Int, 32, 1)
    t2 = hl.Int(32)
    assert t1 == t2
    assert t2.code() == hl.TypeCode.Int
    assert t2.bits() == 32
    assert t2.lanes() == 1
    assert t2.bytes() == 4

    t1 = t2.with_code(hl.TypeCode.UInt)
    assert t1 == hl.UInt(32)

    t1 = t2.with_bits(16)
    assert t1 == hl.Int(16)
    assert t1 != t2

    t1 = t2.with_lanes(8)
    assert t1 == hl.Int(32, 8)
    assert t1 != t2
    assert t1.element_of() == hl.Int(32)

    b1 = hl.Bool()
    f32 = hl.Float(32)
    h64 = hl.Handle()
    i32 = hl.Int(32)
    u32 = hl.UInt(32)
    vi32x8 = hl.Int(32, 8)
    u64 = hl.UInt(64)
    i64 = hl.Int(64)

    assert b1.is_bool()
    assert f32.is_float()
    assert h64.is_handle()
    assert i32.is_int()
    assert u32.is_uint()
    assert i64.is_int()
    assert u64.is_uint()

    assert not vi32x8.is_scalar()
    assert vi32x8.is_vector()

    h2 = hl.Handle()
    assert h64.same_handle_type(h2)

    assert f32.can_represent(b1)
    assert not b1.can_represent(f32)

    assert b1.max() == 1
    assert b1.min() == 0
    assert i32.max() == 2147483647
    assert i32.min() == -2147483648

    assert not b1.is_max(2)
    assert b1.is_max(1)
    assert b1.is_min(0)
    assert not b1.is_min(-1)

    assert i32.is_max(2147483647)
    assert i32.is_min(-2147483648)

    assert not u32.is_max(4294967296)
    assert u32.is_max(4294967295)
    assert u32.is_min(0)
    assert not u32.is_min(-1)
    assert not u32.is_min(1)

    assert i64.is_max(9223372036854775807)
    assert i64.is_min(-9223372036854775808)

    # Python doesn't have unsigned integers, so we can't really express this value.
    # assert u64.is_max(0xFFFFFFFFFFFFFFFF)
    assert u64.is_min(0)

    # repr() and str()
    assert str(i32) == "int32"
    assert repr(i32) == "<halide.Type int32>"
    assert str(vi32x8) == "int32x8"
    assert repr(vi32x8) == "<halide.Type int32x8>"
Exemple #19
0
def main():

    # This program defines a single-stage imaging pipeline that
    # brightens an image.

    # First we'll load the input image we wish to brighten.
    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/rgb.png")

    # We create a hl.Buffer object to wrap the numpy array
    input = hl.Buffer(imageio.imread(image_path))
    assert input.type() == hl.UInt(8)

    # Next we define our hl.Func object that represents our one pipeline
    # stage.
    brighter = hl.Func("brighter")

    # Our hl.Func will have three arguments, representing the position
    # in the image and the color channel. Halide treats color
    # channels as an extra dimension of the image.
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    # Normally we'd probably write the whole function definition on
    # one line. Here we'll break it apart so we can explain what
    # we're doing at every step.

    # For each pixel of the input image.
    value = input[x, y, c]
    assert type(value) == hl.Expr

    # Cast it to a floating point value.
    value = hl.cast(hl.Float(32), value)

    # Multiply it by 1.5 to brighten it. Halide represents real
    # numbers as floats, not doubles, so we stick an 'f' on the end
    # of our constant.
    value = value * 1.5

    # Clamp it to be less than 255, so we don't get overflow when we
    # hl.cast it back to an 8-bit unsigned int.
    value = hl.min(value, 255.0)

    # Cast it back to an 8-bit unsigned integer.
    value = hl.cast(hl.UInt(8), value)

    # Define the function.
    brighter[x, y, c] = value

    # The equivalent one-liner to all of the above is:
    #
    # brighter(x, y, c) = hl.cast<uint8_t>(hl.min(input(x, y, c) * 1.5f, 255))
    # brighter[x, y, c] = hl.cast(hl.UInt(8), hl.min(input[x, y, c] * 1.5, 255))
    #
    # In the shorter version:
    # - I skipped the hl.cast to float, because multiplying by 1.5f does
    #   that automatically.
    # - I also used integer constants in hl.clamp, because they get hl.cast
    #   to match the type of the first argument.
    # - I left the h. off hl.clamp. It's unnecessary due to Koenig
    #   lookup.

    # Remember. All we've done so far is build a representation of a
    # Halide program in memory. We haven't actually processed any
    # pixels yet. We haven't even compiled that Halide program yet.

    # So now we'll realize the hl.Func. The size of the output image
    # should match the size of the input image. If we just wanted to
    # brighten a portion of the input image we could request a
    # smaller size. If we request a larger size Halide will throw an
    # error at runtime telling us we're trying to read out of bounds
    # on the input image.
    output_image = brighter.realize(
        [input.width(), input.height(),
         input.channels()])
    assert output_image.type() == hl.UInt(8)

    # Save the output for inspection. It should look like a bright parrot.
    # python3-imageio versions <2.5 expect a numpy array
    imageio.imsave("brighter.png", np.asanyarray(output_image))
    print("Created brighter.png result file.")

    print("Success!")
    return 0
Exemple #20
0
def main():

    # All Exprs have a scalar type, and all Funcs evaluate to one or
    # more scalar types. The scalar types in Halide are unsigned
    # integers of various bit widths, signed integers of the same set
    # of bit widths, floating point numbers in single and double
    # precision, and opaque handles (equivalent to void *). The
    # following array contains all the legal types.

    valid_halide_types = [
        hl.UInt(8), hl.UInt(16), hl.UInt(32), hl.UInt(64),
        hl.Int(8), hl.Int(16), hl.Int(32), hl.Int(64),
        hl.Float(32), hl.Float(64), hl.Handle() ]


    # Constructing and inspecting types.
    if True:
        # You can programmatically examine the properties of a Halide
        # type. This is useful when you write a C++ function that has
        # hl.Expr arguments and you wish to check their types:
        assert hl.UInt(8).bits() == 8
        assert hl.Int(8).is_int()


        # You can also programmatically construct Types as a function of other Types.
        t = hl.UInt(8)
        t = t.with_bits(t.bits() * 2)
        assert t == hl.UInt(16)

        # Or construct a Type from a C++ scalar type
        #assert type_of<float>() == hl.Float(32)

        # The Type struct is also capable of representing vector types,
        # but this is reserved for Halide's internal use. You should
        # vectorize code by using hl.Func::vectorize, not by attempting to
        # construct vector expressions directly. You may encounter vector
        # types if you programmatically manipulate lowered Halide code,
        # but this is an advanced topic (see hl.Func::add_custom_lowering_pass).

        # You can query any Halide hl.Expr for its type. An hl.Expr
        # representing a hl.Var has type hl.Int(32):
        x = hl.Var("x")
        assert hl.Expr(x).type() == hl.Int(32)

        # Most transcendental functions in Halide hl.cast their inputs to a
        # hl.Float(32) and return a hl.Float(32):
        assert hl.sin(x).type() == hl.Float(32)

        # You can hl.cast an hl.Expr from one Type to another using the hl.cast operator:
        assert hl.cast(hl.UInt(8), x).type() == hl.UInt(8)

        # This also comes in a template form that takes a C++ type.
        #assert hl.cast<uint8_t>(x).type() == hl.UInt(8)

        # You can also query any defined hl.Func for the types it produces.
        f1 = hl.Func("f1")
        f1[x] = hl.cast(hl.UInt(8), x)
        assert f1.output_types()[0] == hl.UInt(8)

        f2 = hl.Func("f2")
        f2[x] = (x, hl.sin(x))
        assert f2.output_types()[0] == hl.Int(32) and \
               f2.output_types()[1] == hl.Float(32)



    # Type promotion rules.
    if True:
        # When you combine Exprs of different types (e.g. using '+',
        # '*', etc), Halide uses a system of type promotion
        # rules. These differ to C's rules. To demonstrate these
        # we'll make some Exprs of each type.
        x = hl.Var("x")
        u8 = hl.cast(hl.UInt(8), x)
        u16 = hl.cast(hl.UInt(16), x)
        u32 = hl.cast(hl.UInt(32), x)
        u64 = hl.cast(hl.UInt(64), x)
        s8 = hl.cast(hl.Int(8), x)
        s16 = hl.cast(hl.Int(16), x)
        s32 = hl.cast(hl.Int(32), x)
        s64 = hl.cast(hl.Int(64), x)
        f32 = hl.cast(hl.Float(32), x)
        f64 = hl.cast(hl.Float(64), x)

        # The rules are as follows, and are applied in the order they are
        # written below.

        # 1) It is an error to hl.cast or use arithmetic operators on Exprs of type hl.Handle().

        # 2) If the types are the same, then no type conversions occur.
        for t in valid_halide_types:
            # Skip the handle type.
            if t.is_handle():
                continue
            e = hl.cast(t, x)
            assert (e + e).type() == e.type()


        # 3) If one type is a float but the other is not, then the
        # non-float argument is promoted to a float (possibly causing a
        # loss of precision for large integers).
        assert (u8 + f32).type() == hl.Float(32)
        assert (f32 + s64).type() == hl.Float(32)
        assert (u16 + f64).type() == hl.Float(64)
        assert (f64 + s32).type() == hl.Float(64)

        # 4) If both types are float, then the narrower argument is
        # promoted to the wider bit-width.
        assert (f64 + f32).type() == hl.Float(64)

        # The rules above handle all the floating-point cases. The
        # following three rules handle the integer cases.

        # 5) If one of the expressions is an integer constant, then it is
        # coerced to the type of the other expression.
        assert (u32 + 3).type() == hl.UInt(32)
        assert (3 + s16).type() == hl.Int(16)

        # If this rule would cause the integer to overflow, then Halide
        # will trigger an error, e.g. uncommenting the following line
        # will cause this program to terminate with an error.
        # hl.Expr bad = u8 + 257

        # 6) If both types are unsigned integers, or both types are
        # signed integers, then the narrower argument is promoted to
        # wider type.
        assert (u32 + u8).type() == hl.UInt(32)
        assert (s16 + s64).type() == hl.Int(64)

        # 7) If one type is signed and the other is unsigned, both
        # arguments are promoted to a signed integer with the greater of
        # the two bit widths.
        assert (u8 + s32).type() == hl.Int(32)
        assert (u32 + s8).type() == hl.Int(32)

        # Note that this may silently overflow the unsigned type in the
        # case where the bit widths are the same.
        assert (u32 + s32).type() == hl.Int(32)

        if False: # evaluate<X> not yet exposed to python
            # When an unsigned hl.Expr is converted to a wider signed type in
            # this way, it is first widened to a wider unsigned type
            # (zero-extended), and then reinterpreted as a signed
            # integer. I.e. casting the hl.UInt(8) value 255 to an hl.Int(32)
            # produces 255, not -1.
            #int32_t result32 = evaluate<int>(hl.cast<int32_t>(hl.cast<uint8_t>(255)))
            assert result32 == 255

            # When a signed type is explicitly converted to a wider unsigned
            # type with the hl.cast operator (the type promotion rules will
            # never do this automatically), it is first converted to the
            # wider signed type (sign-extended), and then reinterpreted as
            # an unsigned integer. I.e. casting the hl.Int(8) value -1 to a
            # hl.UInt(16) produces 65535, not 255.
            #uint16_t result16 = evaluate<uint16_t>(hl.cast<uint16_t>(hl.cast<int8_t>(-1)))
            assert result16 == 65535


    # The type hl.Handle().
    if True:
        # hl.Handle is used to represent opaque pointers. Applying
        # type_of to any pointer type will return hl.Handle()

        #assert type_of<void *>() == hl.Handle()
        #assert type_of<const char * const **>() == hl.Handle()
        # (not clear what the proper python version would be)

        # Handles are always stored as 64-bit, regardless of the compilation
        # target.
        assert hl.Handle().bits() == 64

        # The main use of an hl.Expr of type hl.Handle is to pass
        # it through Halide to other external code.


    # Generic code.
    if True:
        # The main explicit use of Type in Halide is to write Halide
        # code parameterized by a Type. In C++ you'd do this with
        # templates. In Halide there's no need - you can inspect and
        # modify the types dynamically at C++ runtime instead. The
        # function defined below averages two expressions of any
        # equal numeric type.
        x = hl.Var("x")
        assert average(hl.cast(hl.Float(32), x), 3.0).type() == hl.Float(32)
        assert average(x, 3).type() == hl.Int(32)
        assert average(hl.cast(hl.UInt(8), x), hl.cast(hl.UInt(8), 3)).type() == hl.UInt(8)


    print("Success!")

    return 0
def main():
    # Declare some Vars to use below.
    x, y = hl.Var("x"), hl.Var("y")

    # Load a grayscale image to use as an input.
    image_path = os.path.join(os.path.dirname(__file__),
                              "../../tutorial/images/gray.png")
    input_data = imread(image_path)
    if True:
        # making the image smaller to go faster
        input_data = input_data[:160, :150]
    assert input_data.dtype == np.uint8
    input = hl.Buffer(input_data)

    # You can define a hl.Func in multiple passes. Let's see a toy
    # example first.
    if True:
        # The first definition must be one like we have seen already
        # - a mapping from Vars to an hl.Expr:
        f = hl.Func("f")
        f[x, y] = x + y
        # We call this first definition the "pure" definition.

        # But the later definitions can include computed expressions on
        # both sides. The simplest example is modifying a single point:
        f[3, 7] = 42

        # We call these extra definitions "update" definitions, or
        # "reduction" definitions. A reduction definition is an
        # update definition that recursively refers back to the
        # function's current value at the same site:
        if False:
            e = f[x, y] + 17
            print("f[x, y] + 17", e)
            print("(f[x, y] + 17).type()", e.type())
            print("(f[x, y]).type()", f[x, y].type())

        f[x, y] = f[x, y] + 17

        # If we confine our update to a single row, we can
        # recursively refer to values in the same column:
        f[x, 3] = f[x, 0] * f[x, 10]

        # Similarly, if we confine our update to a single column, we
        # can recursively refer to other values in the same row.
        f[0, y] = f[0, y] / f[3, y]

        # The general rule is: Each hl.Var used in an update definition
        # must appear unadorned in the same position as in the pure
        # definition in all references to the function on the left-
        # and right-hand sides. So the following definitions are
        # legal updates:
        f[x,
          17] = x + 8  # x is used, so all uses of f must have x as the first argument.
        f[0,
          y] = y * 8  # y is used, so all uses of f must have y as the second argument.
        f[x, x + 1] = x + 8
        f[y / 2, y] = f[0, y] * 17

        # But these ones would cause an error:
        # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
        # f[y, y + 1) = y + 8   <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
        # f[y, x) = y - x      <- Arguments to f on the left-hand-side are in the wrong places.
        # f[3, 4) = x + y      <- Free variables appear on the right-hand-side but not the left-hand-side.

        # We'll realize this one just to make sure it compiles. The
        # second-to-last definition forces us to realize over a
        # domain that is taller than it is wide.
        f.realize(100, 101)

        # For each realization of f, each step runs in its entirety
        # before the next one begins. Let's trace the loads and
        # stores for a simpler example:
        g = hl.Func("g")
        g[x, y] = x + y  # Pure definition
        g[2, 1] = 42  # First update definition
        g[x, 0] = g[x, 1]  # Second update definition

        g.trace_loads()
        g.trace_stores()

        g.realize(4, 4)

        # Reading the log, we see that each pass is applied in turn. The equivalent C is:
        result = np.empty((4, 4), dtype=np.int)
        # Pure definition
        for yy in range(4):
            for xx in range(4):
                result[yy][xx] = xx + yy

        # First update definition
        result[1][2] = 42
        # Second update definition
        for xx in range(4):
            result[0][xx] = result[1][xx]
    # end of section

    # Putting update passes inside loops.
    if True:
        # Starting with this pure definition:
        f = hl.Func("f")
        f[x, y] = x + y

        # Say we want an update that squares the first fifty rows. We
        # could do this by adding 50 update definitions:

        # f[x, 0) = f[x, 0) * f[x, 0)
        # f[x, 1) = f[x, 1) * f[x, 1)
        # f[x, 2) = f[x, 2) * f[x, 2)
        # ...
        # f[x, 49) = f[x, 49) * f[x, 49)

        # Or equivalently using a compile-time loop in our C++:
        # for (int i = 0 i < 50 i++) {
        #   f[x, i) = f[x, i) * f[x, i)
        #

        # But it's more manageable and more flexible to put the loop
        # in the generated code. We do this by defining a "reduction
        # domain" and using it inside an update definition:
        r = hl.RDom(0, 50)
        f[x, r] = f[x, r] * f[x, r]
        halide_result = f.realize(100, 100)

        # The equivalent C is:
        c_result = np.empty((100, 100), dtype=np.int)
        for yy in range(100):
            for xx in range(100):
                c_result[yy][xx] = xx + yy

        for xx in range(100):
            for rr in range(50):
                # The loop over the reduction domain occurs inside of
                # the loop over any pure variables used in the update
                # step:
                c_result[rr][xx] = c_result[rr][xx] * c_result[rr][xx]

        # Check the results match:
        for yy in range(100):
            for xx in range(100):
                if halide_result(xx, yy) != c_result[yy][xx]:
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                    return -1

    # Now we'll examine a real-world use for an update definition:
    # computing a histogram.
    if True:

        # Some operations on images can't be cleanly expressed as a pure
        # function from the output coordinates to the value stored
        # there. The classic example is computing a histogram. The
        # natural way to do it is to iterate over the input image,
        # updating histogram buckets. Here's how you do that in Halide:
        histogram = hl.Func("histogram")

        # Histogram buckets start as zero.
        histogram[x] = 0

        # Define a multi-dimensional reduction domain over the input image:
        r = hl.RDom(0, input.width(), 0, input.height())

        # For every point in the reduction domain, increment the
        # histogram bucket corresponding to the intensity of the
        # input image at that point.
        histogram[input[r.x, r.y]] += 1

        halide_result = histogram.realize(256)

        # The equivalent C is:
        c_result = np.empty((256), dtype=np.int)
        for xx in range(256):
            c_result[xx] = 0

        for r_y in range(input.height()):
            for r_x in range(input.width()):
                c_result[input_data[r_x, r_y]] += 1

        # Check the answers agree:
        for xx in range(256):
            if c_result[xx] != halide_result(xx):
                raise Exception("halide_result(%d) = %d instead of %d" %
                                (xx, halide_result(xx), c_result[xx]))
                return -1

    # Scheduling update steps
    if True:
        # The pure variables in an update step and can be
        # parallelized, vectorized, split, etc as usual.

        # Vectorizing, splitting, or parallelize the variables that
        # are part of the reduction domain is trickier. We'll cover
        # that in a later lesson.

        # Consider the definition:
        f = hl.Func("x")
        f[x, y] = x * y
        # Set the second row to equal the first row.
        f[x, 1] = f[x, 0]
        # Set the second column to equal the first column plus 2.
        f[1, y] = f[0, y] + 2

        # The pure variables in each stage can be scheduled
        # independently. To control the pure definition, we schedule
        # as we have done in the past. The following code vectorizes
        # and parallelizes the pure definition only.
        f.vectorize(x, 4).parallel(y)

        # We use hl.Func::update(int) to get a handle to an update step
        # for the purposes of scheduling. The following line
        # vectorizes the first update step across x. We can't do
        # anything with y for this update step, because it doesn't
        # use y.
        f.update(0).vectorize(x, 4)

        # Now we parallelize the second update step in chunks of size
        # 4.
        yo, yi = hl.Var("yo"), hl.Var("yi")
        f.update(1).split(y, yo, yi, 4).parallel(yo)

        halide_result = f.realize(16, 16)

        # Here's the equivalent (serial) C:
        c_result = np.empty((16, 16), dtype=np.int)

        # Pure step. Vectorized in x and parallelized in y.
        for yy in range(16):  # Should be a parallel for loop
            for x_vec in range(4):
                xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
                c_result[yy][xx[0]] = xx[0] * yy
                c_result[yy][xx[1]] = xx[1] * yy
                c_result[yy][xx[2]] = xx[2] * yy
                c_result[yy][xx[3]] = xx[3] * yy

        # First update. Vectorized in x.
        for x_vec in range(4):
            xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
            c_result[1][xx[0]] = c_result[0][xx[0]]
            c_result[1][xx[1]] = c_result[0][xx[1]]
            c_result[1][xx[2]] = c_result[0][xx[2]]
            c_result[1][xx[3]] = c_result[0][xx[3]]

        # Second update. Parallelized in chunks of size 4 in y.
        for yo in range(4):  # Should be a parallel for loop
            for yi in range(4):
                yy = yo * 4 + yi
                c_result[yy][1] = c_result[yy][0] + 2

        # Check the C and Halide results match:
        for yy in range(16):
            for xx in range(16):
                if halide_result(xx, yy) != c_result[yy][xx]:
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result[yy][xx]))
                    return -1

    # That covers how to schedule the variables within a hl.Func that
    # uses update steps, but what about producer-consumer
    # relationships that involve compute_at and store_at? Let's
    # examine a reduction as a producer, in a producer-consumer pair.
    if True:
        # Because an update does multiple passes over a stored array,
        # it's not meaningful to inline them. So the default schedule
        # for them does the closest thing possible. It computes them
        # in the innermost loop of their consumer. Consider this
        # trivial example:
        producer, consumer = hl.Func("producer"), hl.Func("consumer")
        producer[x] = x * 17
        producer[x] += 1
        consumer[x] = 2 * producer[x]
        halide_result = consumer.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            producer_storage = np.empty((1), dtype=np.int)
            # Pure step for producer
            producer_storage[0] = xx * 17
            # Update step for producer
            producer_storage[0] = producer_storage[0] + 1
            # Pure step for consumer
            c_result[xx] = 2 * producer_storage[0]

        # Check the results match
        for xx in range(10):
            if halide_result(xx) != c_result[xx]:
                raise Exception("halide_result(%d) = %d instead of %d" %
                                (xx, halide_result(xx), c_result[xx]))
                return -1

        # For all other compute_at/store_at options, the reduction
        # gets placed where you would expect, somewhere in the loop
        # nest of the consumer.

    # Now let's consider a reduction as a consumer in a
    # producer-consumer pair. This is a little more involved.
    if True:
        if True:
            # Case 1: The consumer references the producer in the pure step only.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            # The producer is pure.
            producer[x] = x * 17
            consumer[x] = 2 * producer[x]
            consumer[x] += 1

            # The valid schedules for the producer in this case are
            # the default schedule - inlined, and also:
            #
            # 1) producer.compute_at(x), which places the computation of
            # the producer inside the loop over x in the pure step of the
            # consumer.
            #
            # 2) producer.compute_root(), which computes all of the
            # producer ahead of time.
            #
            # 3) producer.store_root().compute_at(x), which allocates
            # space for the consumer outside the loop over x, but fills
            # it in as needed inside the loop.
            #
            # Let's use option 1.

            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)

            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] = 2 * producer_storage[0]

            # Update step for the consumer
            for xx in range(10):
                c_result[xx] += 1

            # All of the pure step is evaluated before any of the
            # update step, so there are two separate loops over x.

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

        if True:
            # Case 2: The consumer references the producer in the update step only
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside the update
            # step of the producer, because that's the only step that
            # uses the producer.
            producer.compute_at(consumer, x)

            # Note however, that we didn't say:
            #
            # producer.compute_at(consumer.update(0), x).
            #
            # Scheduling is done with respect to Vars of a hl.Func, and
            # the Vars of a hl.Func are shared across the pure and
            # update steps.

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                c_result[xx] = xx

            # Update step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

        if True:
            # Case 3: The consumer references the producer in
            # multiple steps that share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x] = x * 17
            consumer[x] = producer[x] * x
            consumer[x] += producer[x]

            # Again we compute the producer per x coordinate of the
            # consumer. This places producer code inside both the
            # pure and the update step of the producer. So there ends
            # up being two separate realizations of the producer, and
            # redundant work occurs.
            producer.compute_at(consumer, x)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer
            for xx in range(10):
                # Pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] = producer_storage[0] * xx

            # Update step for the consumer
            for xx in range(10):
                # Another copy of the pure step for producer
                producer_storage = np.empty((1), dtype=np.int)
                producer_storage[0] = xx * 17
                c_result[xx] += producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

        if True:
            # Case 4: The consumer references the producer in
            # multiple steps that do not share common variables
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            producer[x, y] = x * y
            consumer[x, y] = x + y
            consumer[x, 0] = producer[x, x - 1]
            consumer[0, y] = producer[y, y - 1]

            # In this case neither producer.compute_at(consumer, x)
            # nor producer.compute_at(consumer, y) will work, because
            # either one fails to cover one of the uses of the
            # producer. So we'd have to inline producer, or use
            # producer.compute_root().

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:

            # Attempt 2:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(
            ), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]

            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)

            halide_result = consumer_2.realize(10, 10)

            # The equivalent C is:
            c_result = np.empty((10, 10), dtype=np.int)

            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    c_result[yy][xx] = xx + yy

            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=np.int)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                c_result[0][xx] += producer_wrapper_1_storage[0]

            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=np.int)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                c_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            for yy in range(10):
                for xx in range(10):
                    if halide_result(xx, yy) != c_result[yy][xx]:
                        print("halide_result(%d, %d) = %d instead of %d", xx,
                              yy, halide_result(xx, yy), c_result[yy][xx])
                        return -1

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there.

            producer, consumer = hl.Func("producer"), hl.Func("consumer")

            r = hl.RDom(0, 5)
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]

            producer.compute_at(consumer, r)

            halide_result = consumer.realize(10)

            # The equivalent C is:
            c_result = np.empty((10), dtype=np.int)
            # Pure step for the consumer.
            for xx in range(10):
                c_result[xx] = xx + 10

            # Update step for the consumer.
            for xx in range(10):
                for rr in range(
                        5
                ):  # The loop over the reduction domain is always the inner loop.
                    # We've schedule the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=np.int)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17

                    # Now use it in the update step of the consumer.
                    c_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                if halide_result(xx) != c_result[xx]:
                    raise Exception("halide_result(%d) = %d instead of %d" %
                                    (xx, halide_result(xx), c_result[xx]))
                    return -1

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # hl.clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2)
        r = hl.RDom(-2, 5, -2, 5)

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        local_sum[x, y] = 0  # Compute the sum as a 32-bit integer
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)

        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent C:

        #cast_to_uint8 = lambda x_: np.array([x_], dtype=np.uint8)[0]
        local_sum = np.empty((1), dtype=np.int32)

        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())
        for yy in range(input.height()):
            for xx in range(input.width()):
                # FIXME this loop is quite slow
                # Pure step of local_sum
                local_sum[0] = 0
                # Update step of local_sum
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input(clamped_x, clamped_y)

                # Pure step of blurry
                #c_result(x, y) = (uint8_t)(local_sum[0] / 25)
                #c_result[xx, yy] = cast_to_uint8(local_sum[0] / 25)
                c_result[xx, yy] = int(local_sum[0] /
                                       25)  # hl.cast done internally

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result(xx, yy)))
                    return -1

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func("f1")
        r = hl.RDom(0, 100)
        f1[x] = hl.sum(r + x) * 7

        # Sum creates a small anonymous hl.Func to do the reduction. It's equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent C is:
        c_result = np.empty((10), dtype=np.int)
        for xx in range(10):
            anon = np.empty((1), dtype=np.int)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx

            c_result[xx] = anon[0] * 7

        # Check they all match.
        for xx in range(10):
            if halide_result_1(xx) != c_result[xx]:
                print("halide_result_1(%d) = %d instead of %d", x,
                      halide_result_1(x), c_result[x])
                return -1

            if halide_result_2(xx) != c_result[xx]:
                print("halide_result_2(%d) = %d instead of %d", x,
                      halide_result_2(x), c_result[x])
                return -1

    # A complex example that uses reduction helpers.
    if False:  # non-sense to port SSE code to python, skipping this test

        # Other reduction helpers include "product", "minimum",
        # "maximum", "hl.argmin", and "argmax". Using hl.argmin and argmax
        # requires understanding tuples, which come in a later
        # lesson. Let's use minimum and maximum to compute the local
        # spread of our grayscale image.

        # First, add a boundary condition to the input.
        clamped = hl.Func("clamped")
        x_clamped = hl.clamp(x, 0, input.width() - 1)
        y_clamped = hl.clamp(y, 0, input.height() - 1)
        clamped[x, y] = input[x_clamped, y_clamped]

        box = hl.RDom(-2, 5, -2, 5)
        # Compute the local maximum minus the local minimum:
        spread = hl.Func("spread")
        spread[x, y] = (maximum(clamped(x + box.x, y + box.y)) -
                        minimum(clamped(x + box.x, y + box.y)))

        # Compute the result in strips of 32 scanlines
        yo, yi = hl.Var("yo"), hl.Var("yi")
        spread.split(y, yo, yi, 32).parallel(yo)

        # Vectorize across x within the strips. This implicitly
        # vectorizes stuff that is computed within the loop over x in
        # spread, which includes our minimum and maximum helpers, so
        # they get vectorized too.
        spread.vectorize(x, 16)

        # We'll apply the boundary condition by padding each scanline
        # as we need it in a circular buffer (see lesson 08).
        clamped.store_at(spread, yo).compute_at(spread, yi)

        halide_result = spread.realize(input.width(), input.height())

        # The C equivalent is almost too horrible to contemplate (and
        # took me a long time to debug). This time I want to time
        # both the Halide version and the C version, so I'll use sse
        # intrinsics for the vectorization, and openmp to do the
        # parallel for loop (you'll need to compile with -fopenmp or
        # similar to get correct timing).
        #ifdef __SSE2__

        # Don't include the time required to allocate the output buffer.
        c_result = hl.Buffer(hl.UInt(8), input.width(), input.height())

        #ifdef _OPENMP
        t1 = datetime.now()
        #endif

        # Run this one hundred times so we can average the timing results.
        for iters in range(100):
            pass
            # #pragma omp parallel for
            # for yo in range((input.height() + 31)/32):
            #     y_base = hl.min(yo * 32, input.height() - 32)
            #
            #     # Compute clamped in a circular buffer of size 8
            #     # (smallest power of two greater than 5). Each thread
            #     # needs its own allocation, so it must occur here.
            #
            #     clamped_width = input.width() + 4
            #     clamped_storage = np.empty((clamped_width * 8), dtype=np.uint8)
            #
            #     for yi in range(32):
            #         y = y_base + yi
            #
            #         uint8_t *output_row = &c_result(0, y)
            #
            #         # Compute clamped for this scanline, skipping rows
            #         # already computed within this slice.
            #         int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2)
            #         int max_y_clamped = (y + 2)
            #         for (int cy = min_y_clamped cy <= max_y_clamped cy++) {
            #             # Figure out which row of the circular buffer
            #             # we're filling in using bitmasking:
            #             uint8_t *clamped_row = clamped_storage + (cy & 7) * clamped_width
            #
            #             # Figure out which row of the input we're reading
            #             # from by clamping the y coordinate:
            #             int clamped_y = std::hl.min(std::hl.max(cy, 0), input.height()-1)
            #             uint8_t *input_row = &input(0, clamped_y)
            #
            #             # Fill it in with the padding.
            #             for (int x = -2 x < input.width() + 2 ):
            #                 int clamped_x = std::hl.min(std::hl.max(x, 0), input.width()-1)
            #                 *clamped_row++ = input_row[clamped_x]
            #
            #
            #
            #         # Now iterate over vectors of x for the pure step of the output.
            #         for (int x_vec = 0 x_vec < (input.width() + 15)/16 x_vec++) {
            #             int x_base = std::hl.min(x_vec * 16, input.width() - 16)
            #
            #             # Allocate storage for the minimum and maximum
            #             # helpers. One vector is enough.
            #             __m128i minimum_storage, maximum_storage
            #
            #             # The pure step for the maximum is a vector of zeros
            #             maximum_storage = (__m128i)_mm_setzero_ps()
            #
            #             # The update step for maximum
            #             for (int max_y = y - 2 max_y <= y + 2 max_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (max_y & 7) * clamped_width
            #                 for (int max_x = x_base - 2 max_x <= x_base + 2 max_):
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + max_x + 2))
            #                     maximum_storage = _mm_max_epu8(maximum_storage, v)
            #
            #
            #
            #             # The pure step for the minimum is a vector of
            #             # ones. Create it by comparing something to
            #             # itself.
            #             minimum_storage = (__m128i)_mm_cmpeq_ps(_mm_setzero_ps(),
            #                                                     _mm_setzero_ps())
            #
            #             # The update step for minimum.
            #             for (int min_y = y - 2 min_y <= y + 2 min_y++) {
            #                 uint8_t *clamped_row = clamped_storage + (min_y & 7) * clamped_width
            #                 for (int min_x = x_base - 2 min_x <= x_base + 2 min_):
            #                     __m128i v = _mm_loadu_si128((__m128i const *)(clamped_row + min_x + 2))
            #                     minimum_storage = _mm_min_epu8(minimum_storage, v)
            #
            #
            #
            #             # Now compute the spread.
            #             __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage)
            #
            #             # Store it.
            #             _mm_storeu_si128((__m128i *)(output_row + x_base), spread)
            #
            #
            #
            #     del clamped_storage
            #
        # end of hundred iterations

        # Skip the timing comparison if we don't have openmp
        # enabled. Otherwise it's unfair to C.
        #ifdef _OPENMP
        t2 = datetime.now()

        # Now run the Halide version again without the
        # jit-compilation overhead. Also run it one hundred times.
        for iters in range(100):
            spread.realize(halide_result)

        t3 = datetime.now()

        # Report the timings. On my machine they both take about 3ms
        # for the 4-megapixel input (fast!), which makes sense,
        # because they're using the same vectorization and
        # parallelization strategy. However I find the Halide easier
        # to read, write, debug, modify, and port.
        print("Halide spread took %f ms. C equivalent took %f ms" %
              ((t3 - t2).total_seconds() * 1000,
               (t2 - t1).total_seconds() * 1000))

        #endif # _OPENMP

        # Check the results match:
        for yy in range(input.height()):
            for xx in range(input.width()):
                if halide_result(xx, yy) != c_result(xx, yy):
                    raise Exception(
                        "halide_result(%d, %d) = %d instead of %d" %
                        (xx, yy, halide_result(xx, yy), c_result(xx, yy)))
                    return -1

        #endif # __SSE2__
    else:
        print("(Skipped the SSE2 section of the code, "
              "since non-sense in python world.)")

    print("Success!")
    return 0
def focus_stack_pipeline():
    outputs = []
    start_w, start_h = 3000, 2000
    number_of_layers = 5
    layer_sizes = [[start_w, start_h]]    

    for i in range(0, number_of_layers):
            # Grab from prev layer
            w,h = layer_sizes[-1]
            layer_sizes.append([int(math.ceil(w/2.0)),int(math.ceil(h/2.0))])

    # Add last size in once more to get the 2nd top lap layer (gaussian) for
    # the energy/deviation split.
    layer_sizes.append(layer_sizes[-1])

    input = hl.ImageParam(hl.UInt(8), 3)
    input.dim(0).set_estimate(0, start_w)
    input.dim(1).set_estimate(0, start_h)
    input.dim(2).set_estimate(0, 3)

    lap_inputs = []
    max_energy_inputs = []

    for i in range(0,number_of_layers+1):
        lap_layer = hl.ImageParam(hl.Float(32), 3, "lap{}".format(i))
        lap_inputs.append(lap_layer)
        w,h = layer_sizes[i]
        lap_layer.dim(0).set_estimate(0, w)
        lap_layer.dim(1).set_estimate(0, h)
        lap_layer.dim(2).set_estimate(0, 3)

        if i == number_of_layers:
            # last (top - small) layer
            # Add the last laplacian (really direct from gaussian) layer
            # in twice. We output one maxed on entropies and one maxed on
            # deviations.
            lap_layer = hl.ImageParam(hl.Float(32), 3, "lap{}".format(i+1))
            lap_inputs.append(lap_layer)
            lap_layer.dim(0).set_estimate(0, w)
            lap_layer.dim(1).set_estimate(0, h)
            lap_layer.dim(2).set_estimate(0, 3)


            entropy_layer = hl.ImageParam(hl.Float(32), 2, "entroy{}".format(i))
            max_energy_inputs.append(entropy_layer)
            entropy_layer.dim(0).set_estimate(0, w)
            entropy_layer.dim(1).set_estimate(0, h)

            deviation_layer = hl.ImageParam(hl.Float(32), 2, "deviation{}".format(i))
            max_energy_inputs.append(deviation_layer)
            deviation_layer.dim(0).set_estimate(0, w)
            deviation_layer.dim(1).set_estimate(0, h)
        else:
            max_energy_layer = hl.ImageParam(hl.Float(32), 2, "max_energy{}".format(i))
            max_energy_inputs.append(max_energy_layer)
            max_energy_layer.dim(0).set_estimate(0, w)
            max_energy_layer.dim(1).set_estimate(0, h)

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    hist_index = hl.Var('hist_index')
    clamped = f32(x, y, c, mirror(input, 3000, 2000))

    f = hl.Func("input32")
    f[x, y, c] = clamped[x, y, c]

    energy_outputs = []
    gaussian_layers = [f]

    laplacian_layers = []
    merged_laps = []

    for layer_num in range(0, number_of_layers):
        # Add the layer size in also
        w,h = layer_sizes[layer_num]

        start_layer = gaussian_layers[-1]

        # Blur the image
        gaussian_layer = gaussian(x, y, c, start_layer)

        # Grab next layer size
        # w,h = layer_sizes[layer_num+1]

        # Reduce the layer size and add it into the list
        next_layer = reduce_layer(x, y, c, gaussian_layer)
        gaussian_layers.append(next_layer)

        # Expand back up
        expanded = expand_layer(x, y, c, next_layer)

        # Generate the laplacian from the
        # original - blurred/reduced/expanded version
        laplacian_layer = laplacian(x, y, c, start_layer, expanded)
        laplacian_layers.append(laplacian_layer)

        # Calculate energies for the gaussian layer
        prev_energies = mirror(max_energy_inputs[layer_num], w, h)
        next_energies = region_energy(x, y, c, laplacian_layer)

        prev_laplacian = mirror(lap_inputs[layer_num], w, h)
        merged_energies = energy_maxes(x, y, c, prev_energies, next_energies)

        merged_lap = merge_laplacian(x, y, c, merged_energies, next_energies, prev_laplacian, laplacian_layer)

        energy_outputs.append([[w,h,True],merged_energies])
        merged_laps.append(merged_lap)

        # Add estimates
        next_layer.set_estimate(x, 0, w)
        next_layer.set_estimate(y, 0, h)
        next_layer.set_estimate(c, 0, 3)

    # Handle last layer differently
    w,h = layer_sizes[-1]

    # The next_lap is really just the last gaussian layer
    next_lap = gaussian_layers[-1]

    prev_entropy_laplacian = mirror(lap_inputs[-2], w, h)
    prev_entropy = mirror(max_energy_inputs[-2], w, h)
    next_entropy = entropy(x, y, c, next_lap, w, h, hist_index)
    merged_entropy = energy_maxes(x, y, c, prev_entropy, next_entropy)
    merged_lap_on_entropy = merge_laplacian(x, y, c, merged_entropy, next_entropy, prev_entropy_laplacian, next_lap)
    merged_laps.append(merged_lap_on_entropy)

    prev_deviation_laplacian = mirror(lap_inputs[-1], w, h)
    prev_deviation = mirror(max_energy_inputs[-1], w, h)
    next_deviation = deviation(x, y, c, next_lap)
    merged_deviation = energy_maxes(x, y, c, prev_deviation, next_deviation)
    merged_lap_on_deviation = merge_laplacian(x, y, c, merged_deviation, next_deviation, prev_deviation_laplacian, next_lap)
    merged_laps.append(merged_lap_on_deviation)

    energy_outputs.append([[w,h,True],merged_entropy])
    energy_outputs.append([[w,h,True],merged_deviation])


    print("NUM LAYERS: ", len(gaussian_layers), len(laplacian_layers), layer_sizes)
    
    # Add all of the laplacian layers to the output first
    i = 0
    for merged_lap in merged_laps:
        w,h = layer_sizes[i]
        mid = (i < (len(merged_laps) - 2))
        outputs.append([[w,h,False,mid], merged_lap])
        i += 1

    # Then energies
    for energy_output in energy_outputs:
        outputs.append(energy_output)

    new_outputs = []
    for size, output in outputs:
        w = size[0]
        h = size[1]
        gray = len(size) > 2 and size[2]
        mid = len(size) > 3 and size[3]

        if mid:
            uint8_output = output
        else:
            uint8_output = output

        uint8_output.set_estimate(x, 0, w)
        uint8_output.set_estimate(y, 0, h)
        if not gray:
            uint8_output.set_estimate(c, 0, 3)

        new_outputs.append([size, uint8_output])

    outputs = new_outputs

    print("OUTPUT LAYERS: ")
    pprint(outputs)

    output_funcs = [output for _, output in outputs]
    
    pipeline = hl.Pipeline(output_funcs)

    return {
        'pipeline': pipeline,
        'inputs': [input] + lap_inputs + max_energy_inputs
    }
Exemple #23
0
def test_target():
    # Target("") should be exactly like get_host_target().
    t1 = hl.get_host_target()
    t2 = hl.Target("")
    assert t1 == t2, "Default ctor failure"
    assert t1.supported()

    # to_string roundtripping
    t1 = hl.Target()
    ts = t1.to_string()
    assert ts == "arch_unknown-0-os_unknown"

    # Note, this should *not* validate, since validate_target_string
    # now returns false if any of arch-bits-os are undefined
    assert not hl.Target.validate_target_string(ts)

    # Don't attempt to roundtrip this: trying to create
    # a Target with unknown portions will now assert-fail.
    #
    # t2 = hl.Target(ts)
    # assert t2 == t1

    # repr() and str()
    assert str(t1) == "arch_unknown-0-os_unknown"
    assert repr(t1) == "<halide.Target arch_unknown-0-os_unknown>"

    assert t1.os == hl.TargetOS.OSUnknown
    assert t1.arch == hl.TargetArch.ArchUnknown
    assert t1.bits == 0

    # Full specification round-trip:
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.SSE41])
    ts = t1.to_string()
    assert ts == "x86-32-linux-sse41"
    assert hl.Target.validate_target_string(ts)

    # Full specification (without features) round-trip:
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32)
    ts = t1.to_string()
    assert ts == "x86-32-linux"
    assert hl.Target.validate_target_string(ts)

    # Full specification round-trip, crazy features
    t1 = hl.Target(hl.TargetOS.Android, hl.TargetArch.ARM, 32, [
        hl.TargetFeature.JIT, hl.TargetFeature.SSE41, hl.TargetFeature.AVX,
        hl.TargetFeature.AVX2, hl.TargetFeature.CUDA, hl.TargetFeature.OpenCL,
        hl.TargetFeature.OpenGL, hl.TargetFeature.OpenGLCompute,
        hl.TargetFeature.Debug
    ])
    ts = t1.to_string()
    assert ts == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert hl.Target.validate_target_string(ts)

    # Expected failures:
    ts = "host-unknowntoken"
    assert not hl.Target.validate_target_string(ts)

    ts = "x86-23"
    assert not hl.Target.validate_target_string(ts)

    # bits == 0 is allowed only if arch_unknown and os_unknown are specified,
    # and no features are set
    ts = "x86-0"
    assert not hl.Target.validate_target_string(ts)

    ts = "0-arch_unknown-os_unknown-sse41"
    assert not hl.Target.validate_target_string(ts)

    # "host" is only supported as the first token
    ts = "opencl-host"
    assert not hl.Target.validate_target_string(ts)

    # set_feature
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.SSE41])
    assert t1.has_feature(hl.TargetFeature.SSE41)
    assert not t1.has_feature(hl.TargetFeature.AVX)
    t1.set_feature(hl.TargetFeature.AVX)
    t1.set_feature(hl.TargetFeature.SSE41, False)
    assert t1.has_feature(hl.TargetFeature.AVX)
    assert not t1.has_feature(hl.TargetFeature.SSE41)

    # set_features
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.SSE41])
    assert t1.has_feature(hl.TargetFeature.SSE41)
    assert not t1.has_feature(hl.TargetFeature.AVX)
    t1.set_features([hl.TargetFeature.SSE41], False)
    t1.set_features([hl.TargetFeature.AVX, hl.TargetFeature.AVX2], True)
    assert t1.has_feature(hl.TargetFeature.AVX)
    assert t1.has_feature(hl.TargetFeature.AVX2)
    assert not t1.has_feature(hl.TargetFeature.SSE41)

    # with_feature
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.SSE41])
    t2 = t1.with_feature(hl.TargetFeature.NoAsserts).with_feature(
        hl.TargetFeature.NoBoundsQuery)
    ts = t2.to_string()
    assert ts == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.SSE41, hl.TargetFeature.NoAsserts])
    # Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
    t2 = t1.without_feature(hl.TargetFeature.NoAsserts).without_feature(
        hl.TargetFeature.NoBoundsQuery)
    ts = t2.to_string()
    assert ts == "x86-32-linux-sse41"

    # natural_vector_size
    # SSE4.1 is 16 bytes wide
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.SSE41])
    assert t1.natural_vector_size(hl.UInt(8)) == 16
    assert t1.natural_vector_size(hl.Int(16)) == 8
    assert t1.natural_vector_size(hl.UInt(32)) == 4
    assert t1.natural_vector_size(hl.Float(32)) == 4

    # has_gpu_feature
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                   [hl.TargetFeature.OpenCL])
    t2 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32, [])
    assert t1.has_gpu_feature()
    assert not t2.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 64,
                   [hl.TargetFeature.LargeBuffers])
    t2 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 64, [])
    assert t1.has_large_buffers()
    assert t1.maximum_buffer_size() == 9223372036854775807
    assert not t2.has_large_buffers()
    assert t2.maximum_buffer_size() == 2147483647

    # supports_device_api
    t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 64,
                   [hl.TargetFeature.CUDA])
    t2 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 64)
    assert t1.supports_device_api(hl.DeviceAPI.CUDA)
    assert not t2.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    t1 = hl.Target(hl.TargetOS.OSX, hl.TargetArch.X86, 64,
                   [hl.TargetFeature.Metal])
    t2 = hl.Target(hl.TargetOS.OSX, hl.TargetArch.X86, 64)
    assert not t1.supports_type(hl.Float(64))
    assert t2.supports_type(hl.Float(64))

    # supports_type (preferred version)
    t1 = hl.Target(hl.TargetOS.OSX, hl.TargetArch.X86, 64,
                   [hl.TargetFeature.Metal])
    t2 = hl.Target(hl.TargetOS.OSX, hl.TargetArch.X86, 64)
    assert not t1.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not t2.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    assert hl.target_feature_for_device_api(
        hl.DeviceAPI.OpenCL) == hl.TargetFeature.OpenCL

    # with_feature with non-convertible lists
    try:
        t1 = hl.Target(hl.TargetOS.Linux, hl.TargetArch.X86, 32,
                       ["this is a string"])
    except TypeError as e:
        assert "incompatible constructor arguments" in str(e)
    else:
        assert False, 'Did not see expected exception!'
Exemple #24
0
def get_local_laplacian(input, levels, alpha, beta, J=8):
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0

        return downy

    def upsample(f):
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 *
                                (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 *
                                  (y % 2), c] + 0.75 * upx[x, y // 2, c]

        return upy

    def downsample2D(f):
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample2D(f):
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x,
            y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x,
            y] = 0.25 * upx[x,
                            (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]

        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    #remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0,
                                         input.width() - 1),
                                hl.clamp(y, 0,
                                         input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a lut with 256 entires per intensity level
    level = k / (levels - 1)
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
    gPyramid[0][x, y,
                k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] +
                                             eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                      blockh, GPU_Default)
            if j > 0:
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                   blockh, GPU_Default)
    else:
        # CPU schedule
        print("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        for j in range(4):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
Exemple #25
0
def test_simple(gen):
    x, y = hl.Var(), hl.Var()
    target = hl.get_jit_target_from_environment()

    b_in = hl.Buffer(hl.UInt(8), [2, 2])
    b_in.fill(123)

    f_in = hl.Func("f")
    f_in[x, y] = x + y

    # ----------- Inputs by-position
    f = gen(target, b_in, f_in, 3.5)
    _realize_and_check(f)

    # ----------- Inputs by-name
    f = gen(target, buffer_input=b_in, func_input=f_in, float_arg=3.5)
    _realize_and_check(f)

    f = gen(target, float_arg=3.5, buffer_input=b_in, func_input=f_in)
    _realize_and_check(f)

    # ----------- Above set again, w/ GeneratorParam mixed in
    k = 42

    gp = {"offset": k}

    # (positional)
    f = gen(target, b_in, f_in, 3.5, generator_params=gp)
    _realize_and_check(f, k)

    # (keyword)
    f = gen(target,
            generator_params=gp,
            buffer_input=b_in,
            func_input=f_in,
            float_arg=3.5)
    _realize_and_check(f, k)

    f = gen(target,
            buffer_input=b_in,
            generator_params=gp,
            func_input=f_in,
            float_arg=3.5)
    _realize_and_check(f, k)

    f = gen(target,
            buffer_input=b_in,
            func_input=f_in,
            generator_params=gp,
            float_arg=3.5)
    _realize_and_check(f, k)

    f = gen(target,
            buffer_input=b_in,
            float_arg=3.5,
            func_input=f_in,
            generator_params=gp)
    _realize_and_check(f, k)

    # ----------- Test various failure modes
    try:
        # Inputs w/ mixed by-position and by-name
        f = gen(target, b_in, f_in, float_arg=3.5)
    except RuntimeError as e:
        assert 'Cannot use both positional and keyword arguments for inputs.' in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # too many positional args
        f = gen(target, b_in, f_in, 3.5, 4)
    except RuntimeError as e:
        assert 'Expected exactly 3 positional args for inputs, but saw 4.' in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # too few positional args
        f = gen(target, b_in, f_in)
    except RuntimeError as e:
        assert 'Expected exactly 3 positional args for inputs, but saw 2.' in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Inputs that can't be converted to what the receiver needs (positional)
        f = gen(target, hl.f32(3.141592), "happy", k)
    except RuntimeError as e:
        assert 'Unable to cast Python instance' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Inputs that can't be converted to what the receiver needs (named)
        f = gen(target, b_in, f_in, float_arg="bogus")
    except RuntimeError as e:
        assert 'Unable to cast Python instance' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Input specified by both pos and kwarg
        f = gen(target, b_in, f_in, 3.5, float_arg=4.5)
    except RuntimeError as e:
        assert "Cannot use both positional and keyword arguments for inputs." in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # generator_params is not a dict
        f = gen(target, b_in, f_in, 3.5, generator_params=[1, 2, 3])
    except TypeError as e:
        assert "cannot convert dictionary" in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Bad gp name
        f = gen(target, b_in, f_in, 3.5, generator_params={"foo": 0})
    except RuntimeError as e:
        assert "has no GeneratorParam named: foo" in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Bad input name
        f = gen(target,
                buffer_input=b_in,
                float_arg=3.5,
                generator_params=gp,
                funk_input=f_in)
    except RuntimeError as e:
        assert "Unknown input 'funk_input' specified via keyword argument." in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Bad gp name
        f = gen(target,
                buffer_input=b_in,
                float_arg=3.5,
                generator_params=gp,
                func_input=f_in,
                nonexistent_generator_param="wat")
    except RuntimeError as e:
        assert "Unknown input 'nonexistent_generator_param' specified via keyword argument." in str(
            e)
    else:
        assert False, 'Did not see expected exception!'
Exemple #26
0
def _check_is_u16(e):
    assert e.type() == hl.UInt(16), e.type()
Exemple #27
0
def test_complexstub():
    constant_image = _make_constant_image()
    input = hl.ImageParam(hl.UInt(8), 3, 'input')
    input.set(constant_image)

    x, y, c = hl.Var(), hl.Var(), hl.Var()
    target = hl.get_jit_target_from_environment()

    float_arg = 1.25
    int_arg = 33

    r = complexstub(target,
                    typed_buffer_input=constant_image,
                    untyped_buffer_input=constant_image,
                    simple_input=input,
                    array_input=[input, input],
                    float_arg=float_arg,
                    int_arg=[int_arg, int_arg],
                    untyped_buffer_output_type="uint8",
                    vectorize=True)

    # return value is a tuple; unpack separately to avoid
    # making the callsite above unreadable
    (simple_output, tuple_output, array_output, typed_buffer_output,
     untyped_buffer_output, static_compiled_buffer_output) = r

    b = simple_output.realize(32, 32, 3, target)
    assert b.type() == hl.Float(32)
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected = constant_image[x, y, c]
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (expected,
                                                                      actual)

    b = tuple_output.realize(32, 32, 3, target)
    assert b[0].type() == hl.Float(32)
    assert b[1].type() == hl.Float(32)
    assert len(b) == 2
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected1 = constant_image[x, y, c] * float_arg
                expected2 = expected1 + int_arg
                actual1, actual2 = b[0][x, y, c], b[1][x, y, c]
                assert expected1 == actual1, "Expected1 %s Actual1 %s" % (
                    expected1, actual1)
                assert expected2 == actual2, "Expected2 %s Actual1 %s" % (
                    expected2, actual2)

    assert len(array_output) == 2
    for a in array_output:
        b = a.realize(32, 32, target)
        assert b.type() == hl.Int(16)
        for x in range(32):
            for y in range(32):
                expected = constant_image[x, y, 0] + int_arg
                actual = b[x, y]
                assert expected == actual, "Expected %s Actual %s" % (expected,
                                                                      actual)

    # TODO: Output<Buffer<>> has additional behaviors useful when a Stub
    # is used within another Generator; this isn't yet implemented since there
    # isn't yet Python bindings for Generator authoring. This section
    # of the test may need revision at that point.
    b = typed_buffer_output.realize(32, 32, 3, target)
    assert b.type() == hl.Float(32)
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected = constant_image[x, y, c]
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (expected,
                                                                      actual)

    b = untyped_buffer_output.realize(32, 32, 3, target)
    assert b.type() == hl.UInt(8)
    for x in range(32):
        for y in range(32):
            for c in range(3):
                expected = constant_image[x, y, c]
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (expected,
                                                                      actual)

    b = static_compiled_buffer_output.realize(4, 4, 1, target)
    assert b.type() == hl.UInt(8)
    for x in range(4):
        for y in range(4):
            for c in range(1):
                expected = constant_image[x, y, c] + 42
                actual = b[x, y, c]
                assert expected == actual, "Expected %s Actual %s" % (expected,
                                                                      actual)
Exemple #28
0
def test_schedules(verbose=False, test_random=False):
    #random_module.seed(int(sys.argv[1]) if len(sys.argv)>1 else 0)
    halide.exit_on_signal()
    f = halide.Func('f')
    x = halide.Var('x')
    y = halide.Var('y')
    c = halide.Var('c')
    g = halide.Func('g')
    v = halide.Var('v')
    input = halide.UniformImage(halide.UInt(16), 3)
    int_t = halide.Int(32)
    f[x, y, c] = input[
        halide.clamp(x, halide.cast(int_t, 0
                                    ), halide.cast(int_t,
                                                   input.width() - 1)),
        halide.clamp(y, halide.cast(int_t, 0
                                    ), halide.cast(int_t,
                                                   input.height() - 1)),
        halide.clamp(c, halide.cast(int_t, 0), halide.cast(int_t, 2))]
    #g[v] = f[v,v]
    g[x, y, c] = f[x, y, c] + 1
    assert sorted(halide.all_vars(g).keys()) == sorted(['x', 'y',
                                                        'c'])  #, 'v'])

    if verbose:
        print halide.func_varlist(f)
        print 'caller_vars(f) =', caller_vars(g, f)
        print 'caller_vars(g) =', caller_vars(g, g)


#    validL = list(valid_schedules(g, f, 4))
#    validL = [repr(_x) for _x in validL]
#
#    for L in sorted(validL):
#        print repr(L)
    T0 = time.time()
    if not test_random:
        random = True  #False
        nvalid_determ = 0
        for L in schedules_func(g, f, 0, 3):
            nvalid_determ += 1
            if verbose:
                print L
        nvalid_random = 0
        for i in range(100):
            for L in schedules_func(
                    g, f, 0, DEFAULT_MAX_DEPTH, random=True
            ):  #sorted([repr(_x) for _x in valid_schedules(g, f, 3)]):
                if verbose and 0:
                    print L  #repr(L)
                nvalid_random += 1
    s = []
    for i in range(400):
        d = random_schedule(g, 0, DEFAULT_MAX_DEPTH)
        si = str(d)
        s.append(si)
        if verbose:
            print 'Schedule:', si

        d.apply()
        evaluate = d.test((36, 36, 3), input)
        print 'evaluate'
        evaluate()
        if test_random:
            print 'Success'
            sys.exit()
    T1 = time.time()

    s = '\n'.join(s)
    assert 'f.chunk(_c0)' in s
    assert 'f.root().vectorize' in s
    assert 'f.root().unroll' in s
    assert 'f.root().split' in s
    assert 'f.root().tile' in s
    assert 'f.root().parallel' in s
    assert 'f.root().transpose' in s

    assert nvalid_random == 100
    if verbose:
        print 'generated in %.3f secs' % (T1 - T0)
    print 'random_schedule: OK'
Exemple #29
0
def test_simplestub():
    x, y = hl.Var(), hl.Var()
    target = hl.get_jit_target_from_environment()

    b_in = hl.Buffer(hl.UInt(8), [2, 2])
    b_in.fill(123)

    f_in = hl.Func("f")
    f_in[x, y] = x + y

    # ----------- Inputs by-position
    f = simplestub.generate(target, b_in, f_in, 3.5)
    _realize_and_check(f)

    # ----------- Inputs by-name
    f = simplestub.generate(target,
                            buffer_input=b_in,
                            func_input=f_in,
                            float_arg=3.5)
    _realize_and_check(f)

    # ----------- Inputs w/ mixed by-position and by-name
    f = simplestub.generate(target, b_in, f_in, float_arg=3.5)
    _realize_and_check(f)

    f = simplestub.generate(target, b_in, float_arg=3.5, func_input=f_in)
    _realize_and_check(f)

    # ----------- Above set again, w/ GeneratorParam mixed in
    k = 42

    f = simplestub.generate(target, b_in, f_in, 3.5, offset=k)
    _realize_and_check(f, k)

    f = simplestub.generate(target,
                            offset=k,
                            buffer_input=b_in,
                            func_input=f_in,
                            float_arg=3.5)
    _realize_and_check(f, k)

    f = simplestub.generate(target, b_in, f_in, offset=k, float_arg=3.5)
    _realize_and_check(f, k)

    f = simplestub.generate(target,
                            b_in,
                            float_arg=3.5,
                            offset=k,
                            func_input=f_in)
    _realize_and_check(f, k)

    # ----------- Test various failure modes
    try:
        # too many positional args
        f = simplestub.generate(target, b_in, f_in, 3.5, 4)
    except RuntimeError as e:
        assert 'Expected at most 3 positional args, but saw 4.' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Inputs that can't be converted to what the receiver needs (positional)
        f = simplestub.generate(target, 3.141592, "happy")
    except RuntimeError as e:
        assert 'Unable to cast Python instance' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Inputs that can't be converted to what the receiver needs (named)
        f = simplestub.generate(target, b_in, f_in, float_arg="bogus")
    except RuntimeError as e:
        assert 'Unable to cast Python instance' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Missing required inputs
        f = simplestub.generate(target, b_in, f_in)
    except RuntimeError as e:
        assert "Generator Input named 'float_arg' was not specified." in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Input specified by both pos and kwarg
        f = simplestub.generate(target, b_in, f_in, 3.5, float_arg=4.5)
    except RuntimeError as e:
        assert "Generator Input named 'float_arg' was specified by both position and keyword." in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Bad input name
        f = simplestub.generate(target,
                                b_in,
                                float_arg=3.5,
                                offset=k,
                                funk_input=f_in)
    except RuntimeError as e:
        assert "Generator Input named 'func_input' was not specified." in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        # Bad gp name
        f = simplestub.generate(target,
                                b_in,
                                float_arg=3.5,
                                offset=k,
                                func_input=f_in,
                                nonexistent_generator_param="wat")
    except RuntimeError as e:
        assert "Generator simplestub has no GeneratorParam named: nonexistent_generator_param" in str(
            e)
    else:
        assert False, 'Did not see expected exception!'