Ejemplo n.º 1
0
def main():
    """Auto-schedule a four-stage 1-D pipeline with Li2018 and run it once.

    Loads the ``autoschedule_li2018`` plugin, defines the chain
    ``in -> f_0 -> f_1 -> f_2`` over a single variable, asks the
    autoscheduler for a schedule, prints the schedule source, then
    JIT-compiles and realizes 1000 elements.
    """
    hl.load_plugin("autoschedule_li2018")

    x = hl.Var('x')

    # Algorithm: every stage is a pure function of the previous one.
    source = hl.Func('in')
    source[x] = hl.f32(x)  # coordinate converted to float32
    doubled = hl.Func('f_0')
    doubled[x] = 2 * source[x]
    sine = hl.Func('f_1')
    sine[x] = hl.sin(doubled[x])
    squared = hl.Func('f_2')
    squared[x] = sine[x] * sine[x]

    # The autoscheduler needs an extent estimate for the output.
    squared.set_estimate(x, 0, 1000)
    pipeline = hl.Pipeline(squared)
    target = hl.Target()
    # Of the three machine parameters only the first one (the number of
    # CPU cores) is used here.
    machine = hl.MachineParams(32, 0, 0)
    schedule = pipeline.auto_schedule('Li2018', target, machine)
    print('Schedule:')
    print(schedule.schedule_source)

    pipeline.compile_jit()  # JIT-compile with the chosen schedule
    output = pipeline.realize(1000)  # run it and obtain the result buffer
Ejemplo n.º 2
0
 def test_generate_halide(self):
     """Run ``generate_halide`` on the recursively split original zone.

     Populates the attributes this test instance exposes to the
     generator (``vars``, ``inputs``, ``clamps``, ``funcs``,
     ``loopnest_funcs``) and then calls ``generate_halide`` with
     ``[8, 8, 8, 8]``. Nothing is asserted about the result; the test
     passes as long as no exception is raised.
     """
     pieces = self.define_original_twoel().split_recursive()

     # One Halide variable per index name, also published on the instance.
     index_names = "ijkl"
     self.vars = {name: hl.Var(name) for name in index_names}
     i, j, k, l = (self.vars[name] for name in index_names)

     # Synthetic stand-ins for the two Funcs registered below.
     dens_func = hl.Func("g_dens")
     dens_func[i, j] = i * j
     g_func = hl.Func("g")
     g_func[i, j, k, l] = hl.cos(i * j) * hl.sin(k * l)

     # Three separate dicts with identical content, one per role.
     bindings = {"g": g_func, "g_dens": dens_func}
     self.inputs = dict(bindings)
     self.clamps = dict(bindings)
     self.funcs = dict(bindings)
     self.loopnest_funcs = {}

     generated = pieces.generate_halide(self, [8, 8, 8, 8])
Ejemplo n.º 3
0
def contrast(input, strength, black_point):
    """Build a Func that applies a sine-shaped curve and a black-point rescale.

    The pure definition computes
    ``slope * sin(factor * input - pi / (2 * strength)) + constant``
    saturated to uint16; an update definition then subtracts
    ``black_point`` and multiplies by ``65535 / (65535 - black_point)``,
    again saturated to uint16.

    Args:
        input: Object indexable as ``input[x, y, c]``; each sample is cast
            to float32 before the curve is applied.
        strength: Divisor of the curve's phase terms (``pi / (2 * strength)``
            and ``pi / (strength * 65535)``).
        black_point: Value subtracted after the curve and used to derive
            the final scale factor.

    Returns:
        The ``hl.Func`` named "contrast_output", scheduled as
        ``compute_root().parallel(y).vectorize(x, 16)``.
    """
    output = hl.Func("contrast_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    # Constants of the sine curve.
    phase = math.pi / (2 * strength)
    sin_phase = hl.sin(phase)
    slope = 65535 / (2 * sin_phase)
    offset = slope * sin_phase
    to_angle = math.pi / (strength * 65535)

    # Pure definition: map each sample through the sine curve.
    angle = to_angle * hl.cast(hl.Float(32), input[x, y, c])
    output[x, y, c] = hl.u16_sat(slope * hl.sin(angle - phase) + offset)

    # Update definition: shift by the black point and rescale.
    white_scale = 65535 / (65535 - black_point)
    shifted = hl.cast(hl.Int(32), output[x, y, c]) - black_point
    output[x, y, c] = hl.u16_sat(shifted * white_scale)

    output.compute_root().parallel(y).vectorize(x, 16)

    return output
Ejemplo n.º 4
0
def main():
    """Walk through tuple-valued Halide Funcs, checking each step with asserts.

    Covers, in order: emulating a collection of values with an extra
    dimension or with a list of Funcs, Tuple-valued Funcs and how they are
    realized, a tuple reduction (argmax) cross-checked against a
    plain-Python loop, and a user-defined ``Complex`` type built on Tuples
    that drives a Mandelbrot renderer printed as ASCII art.

    Returns:
        int: 0 after printing "Success!". A failed check raises
        ``AssertionError`` before that point.
    """

    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(
        c == 0,
        245,  # Red value
        c == 1,
        42,  # Green value
        132)  # Blue value

    # Since this pattern appears quite often, Halide provides
    # syntactic sugar to write the code above as the following,
    # using the "mux" function.
    # color_image[x, y, c] = hl.mux(c, [245, 42, 132]);

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        im1, im2 = multi_valued.realize([80, 60])
        assert im1.type() == hl.Int(32)
        assert im2.type() == hl.Float(32)
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # You can also pass a tuple of pre-allocated buffers to realize()
    # rather than having new ones created. (The Buffers must have the correct
    # types and have identical sizes.)
    if True:
        im1, im2 = hl.Buffer(hl.Int(32),
                             [80, 60]), hl.Buffer(hl.Float(32), [80, 60])
        multi_valued.realize((im1, im2))
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent plain-loop code to the above is:
    if True:
        multi_valued_0 = np.empty((80 * 60), dtype=np.int32)
        # The second tuple element is floating point; an integer array
        # here would silently truncate every sine value.
        multi_valued_1 = np.empty((80 * 60), dtype=np.float32)

        for yy in range(80):
            for xx in range(60):
                multi_valued_0[xx + 60 * yy] = xx + yy
                multi_valued_1[xx + 60 * yy] = math.sin(xx * yy)

    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output halide_buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., halide_buffer_t
    # *output_1, halide_buffer_t *output_2)

    # In this Python interface a Tuple is written as an ordinary Python
    # tuple of Exprs, exactly as in the definition above:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is an error:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input_buf = input_func.realize([100])
        assert input_buf.type() == hl.Float(32)

        # Then we define a 2-valued Tuple which tracks the maximum value
        # and its index.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input_buf[0])

        # Update definition. The index moves only when a strictly larger
        # value shows up, so on ties the first index is kept.
        r = hl.RDom([(1, 99)])
        old_index = arg_max[()][0]
        old_max = arg_max[()][1]
        new_index = hl.select(old_max < input_buf[r], r, old_index)
        new_max = hl.max(input_buf[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent plain-Python loop is:
        arg_max_0 = 0
        arg_max_1 = float(input_buf[0])
        for i in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = i if (old_max < input_buf[i]) else old_index
            new_max = max(input_buf[i], old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max

        # Let's verify that Halide and the plain loop found the same
        # maximum value and index.
        if True:
            r0, r1 = arg_max.realize()

            assert r0.type() == hl.Int(32)
            assert r1.type() == hl.Float(32)
            assert arg_max_0 == r0[()]
            assert np.isclose(arg_max_1, r1[()])

        # Halide provides argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.

    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            def __init__(self, r, i=None):
                if type(r) is float and type(i) is float:
                    # Two Python floats: wrap them as Halide constants.
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    # Two ready-made expressions.
                    self.real = r
                    self.imag = i
                else:
                    # A single indexable pair (tuple, Func call, Complex).
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)

            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)

            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Squared complex magnitude (no square root is taken)"
                return (self.real * self.real) + (self.imag * self.imag)

            # Other complex operators would go here. The above are
            # sufficient for this example.

        # Let's use the Complex struct to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x / 15.0 - 2.5, y / 6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom([(1, 12)])
        current = Complex(mandelbrot[x, y, r - 1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current * current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4.
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true).  The argmax
        # would return the index of the first time the expression is
        # true.

        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize([61, 25])
        assert result.type() == hl.Int(32)
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result[xx, yy]
                if index < len(code):
                    print("%c" % code[index], end="")
                else:
                    # Indices past the end of the palette print nothing.
                    pass  # is lesson 13 cpp version buggy ?
            print("")

    print("Success!")

    return 0
Ejemplo n.º 5
0
def main():
    """Walk through tuple-valued Halide Funcs, checking each step with asserts.

    Covers, in order: emulating a collection of values with an extra
    dimension or with a list of Funcs, Tuple-valued Funcs and how they are
    realized, a tuple reduction (argmax) cross-checked against a
    plain-Python loop, and a user-defined ``Complex`` type built on Tuples
    that drives a Mandelbrot renderer printed as ASCII art.

    Returns:
        int: 0 after printing "Success!". A failed check raises
        ``AssertionError`` before that point.
    """

    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(c == 0, 245, # Red value
                                  c == 1, 42,  # Green value
                                  132)        # Blue value

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        (im1, im2) = multi_valued.realize(80, 60)
        assert type(im1) is hl.Buffer_int32
        assert type(im2) is hl.Buffer_float32
        assert im1(30, 40) == 30 + 40
        assert numpy.isclose(im2(30, 40), math.sin(30 * 40))


    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent plain-loop code to the above is:
    if True:
        multi_valued_0 = numpy.empty((80*60), dtype=numpy.int32)
        # The second tuple element is floating point; an integer array
        # here would silently truncate every sine value.
        multi_valued_1 = numpy.empty((80*60), dtype=numpy.float32)

        for yy in range(80):
            for xx in range(60):
                multi_valued_0[xx + 60*yy] = xx + yy
                multi_valued_1[xx + 60*yy] = math.sin(xx*yy)


    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., buffer_t *output_1, buffer_t *output_2)

    # In this Python interface a Tuple is written as an ordinary Python
    # tuple of Exprs, exactly as in the definition above:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is an error:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input_buf = input_func.realize(100)
        assert type(input_buf) is hl.Buffer_float32

        # Then we define a 2-valued Tuple which tracks the maximum value
        # and its index.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input_buf(0))

        # Update definition. The index moves only when a strictly larger
        # value shows up, so on ties the first index is kept.
        r = hl.RDom(1, 99)
        old_index = arg_max[()][0]
        old_max   = arg_max[()][1]
        new_index = hl.select(old_max < input_buf[r], r, old_index)
        new_max   = hl.max(input_buf[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent plain-Python loop is:
        arg_max_0 = 0
        arg_max_1 = float(input_buf(0))
        for i in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = i if (old_max < input_buf(i)) else old_index
            new_max = max(input_buf(i), old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max


        # Let's verify that Halide and the plain loop found the same
        # maximum value and index.
        if True:
            (r0, r1) = arg_max.realize()

            assert type(r0) is hl.Buffer_int32
            assert type(r1) is hl.Buffer_float32
            assert arg_max_0 == r0(0)
            assert numpy.isclose(arg_max_1, r1(0))


        # Halide provides argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.


    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:

            def __init__(self, r, i=None):
                if type(r) is float and type(i) is float:
                    # Two Python floats: wrap them as Halide constants.
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    # Two ready-made expressions.
                    self.real = r
                    self.imag = i
                else:
                    # A single indexable pair (tuple, Func call, Complex).
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)


            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)


            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Squared complex magnitude (no square root is taken)"
                return (self.real * self.real) + (self.imag * self.imag)


            # Other complex operators would go here. The above are
            # sufficient for this example.


        # Let's use the Complex struct to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x/15.0 - 2.5, y/6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom(1, 12)
        current = Complex(mandelbrot[x, y, r-1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current*current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4.
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true).  The argmax
        # would return the index of the first time the expression is
        # true.

        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize(61, 25)
        assert type(result) is hl.Buffer_int32
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result(xx, yy)
                if index < len(code):
                    print("%c" % code[index], end="")
                else:
                    # Indices past the end of the palette print nothing.
                    pass # is lesson 13 cpp version buggy ?
            print("")


    print("Success!")

    return 0
Ejemplo n.º 6
0
def main():
    """Exercise Halide's scalar type system with a series of assertions.

    Sections: constructing and inspecting types, the type-promotion rules
    applied when expressions of different types are combined, the opaque
    handle type, and type-generic code via the ``average`` helper (defined
    outside this function).

    Returns:
        int: 0 after printing "Success!". A failed check raises
        ``AssertionError`` before that point.
    """

    # Every Expr has a scalar type and every Func evaluates to one or more
    # scalar types: unsigned and signed integers at four bit widths,
    # single and double precision floats, and opaque handles (the
    # equivalent of void *). This list holds all the legal types.
    widths = (8, 16, 32, 64)
    valid_halide_types = (
        [hl.UInt(bits) for bits in widths]
        + [hl.Int(bits) for bits in widths]
        + [hl.Float(32), hl.Float(64), hl.Handle()]
    )

    # Constructing and inspecting types.
    if True:
        # Type properties can be examined programmatically, which helps
        # when writing a function that takes hl.Expr arguments and needs
        # to check their types.
        assert hl.UInt(8).bits() == 8
        assert hl.Int(8).is_int()

        # A Type can be derived from another Type.
        narrow = hl.UInt(8)
        widened = narrow.with_bits(narrow.bits() * 2)
        assert widened == hl.UInt(16)

        # Or construct a Type from a C++ scalar type
        #assert type_of<float>() == hl.Float(32)

        # Type can also describe vector types, but that is reserved for
        # Halide's internal use: vectorize with hl.Func::vectorize rather
        # than building vector expressions by hand. Vector types may show
        # up when manipulating lowered Halide code, which is an advanced
        # topic (see hl.Func::add_custom_lowering_pass).

        # Any hl.Expr can be asked for its type. An hl.Expr wrapping a
        # hl.Var is an hl.Int(32).
        x = hl.Var("x")
        assert hl.Expr(x).type() == hl.Int(32)

        # Most transcendental functions cast their input to hl.Float(32)
        # and return hl.Float(32).
        assert hl.sin(x).type() == hl.Float(32)

        # hl.cast converts an hl.Expr from one Type to another.
        as_u8 = hl.cast(hl.UInt(8), x)
        assert as_u8.type() == hl.UInt(8)

        # This also comes in a template form that takes a C++ type.
        #assert hl.cast<uint8_t>(x).type() == hl.UInt(8)

        # A defined hl.Func can be asked for the types it produces.
        f1 = hl.Func("f1")
        f1[x] = hl.cast(hl.UInt(8), x)
        assert f1.output_types()[0] == hl.UInt(8)

        f2 = hl.Func("f2")
        f2[x] = (x, hl.sin(x))
        f2_types = f2.output_types()
        assert f2_types[0] == hl.Int(32) and f2_types[1] == hl.Float(32)

    # Type promotion rules.
    if True:
        # Combining Exprs of different types (with '+', '*', ...) applies
        # a set of promotion rules that differ from C's. Start with one
        # Expr of each numeric type.
        x = hl.Var("x")
        u8, u16, u32, u64 = (hl.cast(hl.UInt(bits), x) for bits in widths)
        s8, s16, s32, s64 = (hl.cast(hl.Int(bits), x) for bits in widths)
        f32 = hl.cast(hl.Float(32), x)
        f64 = hl.cast(hl.Float(64), x)

        # The rules below are applied in the order in which they are
        # listed.

        # 1) Casting, or using arithmetic operators on, Exprs of type
        # hl.Handle() is an error.

        # 2) Operands of the same type undergo no conversion.
        for scalar_type in valid_halide_types:
            if not scalar_type.is_handle():  # handles cannot be cast
                same = hl.cast(scalar_type, x)
                assert (same + same).type() == same.type()

        # 3) When exactly one operand is a float, the other operand is
        # promoted to that float type (large integers may lose
        # precision).
        for mixed, expected in (
            (u8 + f32, hl.Float(32)),
            (f32 + s64, hl.Float(32)),
            (u16 + f64, hl.Float(64)),
            (f64 + s32, hl.Float(64)),
        ):
            assert mixed.type() == expected

        # 4) When both operands are floats, the narrower one is promoted
        # to the wider bit width.
        assert (f64 + f32).type() == hl.Float(64)

        # Rules 3 and 4 settle every floating-point case; the remaining
        # three rules deal with integers.

        # 5) An integer constant is coerced to the type of the other
        # operand.
        assert (u32 + 3).type() == hl.UInt(32)
        assert (3 + s16).type() == hl.Int(16)

        # If that coercion would overflow, Halide raises an error, e.g.
        # uncommenting the following line terminates the program with an
        # error.
        # hl.Expr bad = u8 + 257

        # 6) Two unsigned integers, or two signed integers, promote the
        # narrower operand to the wider type.
        assert (u32 + u8).type() == hl.UInt(32)
        assert (s16 + s64).type() == hl.Int(64)

        # 7) A signed operand combined with an unsigned one promotes both
        # to a signed integer of the larger of the two bit widths. With
        # equal bit widths (the last case) the unsigned value may
        # silently overflow.
        for mixed_sign in (u8 + s32, u32 + s8, u32 + s32):
            assert mixed_sign.type() == hl.Int(32)

        if False: # evaluate<X> not yet exposed to python
            # Widening an unsigned hl.Expr to a signed type first
            # zero-extends it to the wider unsigned type and then
            # reinterprets it as signed, so the hl.UInt(8) value 255 cast
            # to hl.Int(32) gives 255, not -1.
            #int32_t result32 = evaluate<int>(hl.cast<int32_t>(hl.cast<uint8_t>(255)))
            assert result32 == 255

            # Explicitly casting a signed type to a wider unsigned type
            # (promotion never does this on its own) first sign-extends
            # it to the wider signed type and then reinterprets it as
            # unsigned, so the hl.Int(8) value -1 cast to hl.UInt(16)
            # gives 65535, not 255.
            #uint16_t result16 = evaluate<uint16_t>(hl.cast<uint16_t>(hl.cast<int8_t>(-1)))
            assert result16 == 65535

    # The type hl.Handle().
    if True:
        # hl.Handle represents opaque pointers; applying type_of to any
        # pointer type returns hl.Handle().

        #assert type_of<void *>() == hl.Handle()
        #assert type_of<const char * const **>() == hl.Handle()
        # (not clear what the proper python version would be)

        # Handles are always 64 bits wide, whatever the compilation
        # target.
        assert hl.Handle().bits() == 64

        # An hl.Expr of type hl.Handle is mainly used to pass a value
        # through Halide to other external code.

    # Generic code.
    if True:
        # The main explicit use of Type is writing Halide code that is
        # parameterized by a Type. C++ would use templates for this; in
        # Halide the types can simply be inspected and modified
        # dynamically at runtime. The `average` helper (defined outside
        # this function) averages two expressions of any equal numeric
        # type.
        x = hl.Var("x")
        float_avg = average(hl.cast(hl.Float(32), x), 3.0)
        assert float_avg.type() == hl.Float(32)
        int_avg = average(x, 3)
        assert int_avg.type() == hl.Int(32)
        u8_avg = average(hl.cast(hl.UInt(8), x), hl.cast(hl.UInt(8), 3))
        assert u8_avg.type() == hl.UInt(8)

    print("Success!")

    return 0