Example #1
0
def combine2(im1, im2, width, height, dist):
    init_mask1 = hl.Func("mask1_layer_0")
    init_mask2 = hl.Func("mask2_layer_0")
    accumulator = hl.Func("combine_accumulator")
    output = hl.Func("combine_output")

    x, y = hl.Var("x"), hl.Var("y")

    im1_mirror = hl.BoundaryConditions.repeat_edge(im1, [(0, width), (0, height)])
    im2_mirror = hl.BoundaryConditions.repeat_edge(im2, [(0, width), (0, height)])

    weight1 = hl.f32(dist[im1_mirror[x, y]])
    weight2 = hl.f32(dist[im2_mirror[x, y]])

    init_mask1[x, y] = weight1 / (weight1 + weight2)
    init_mask2[x, y] = 1 - init_mask1[x, y]

    mask1 = init_mask1
    mask2 = init_mask2

    accumulator[x, y] = hl.i32(0)

    accumulator[x, y] += hl.i32(im1_mirror[x, y] * mask1[x, y]) + hl.i32(im2_mirror[x, y] * mask2[x, y])

    output[x, y] = hl.u16_sat(accumulator[x, y])

    init_mask1.compute_root().parallel(y).vectorize(x, 16)

    accumulator.compute_root().parallel(y).vectorize(x, 16)

    accumulator.update(0).parallel(y).vectorize(x, 16)

    return output
Example #2
0
def test_basics2():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    s_sigma = 8

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.i32(1), y * s_sigma * hl.i32(1)]
    val22 = clamped[x * s_sigma - hl.i32(s_sigma // 2),
                    y * s_sigma - hl.i32(s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]

    try:
        val1 = clamped[x * s_sigma - s_sigma / 2, y * s_sigma - s_sigma / 2]
    except RuntimeError as e:
        assert 'Implicit cast from float32 to int' in str(e)
    else:
        assert False, 'Did not see expected exception!'
Example #3
0
def diff(im1, im2, name):
    output = hl.Func(name)

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    if im1.dimensions() == 2:
        output[x, y] = hl.i32(im1[x, y]) - hl.i32(im2[x, y])
    else:
        output[x, y, c] = hl.i32(im1[x, y, c]) - hl.i32(im2[x, y, c])

    return output
Example #4
0
def test_basics():
    input = hl.ImageParam(hl.UInt(16), 2, 'input')
    x, y = hl.Var('x'), hl.Var('y')

    blur_x = hl.Func('blur_x')
    blur_xx = hl.Func('blur_xx')
    blur_y = hl.Func('blur_y')

    yy = hl.i32(1)
    assert yy.type() == hl.Int(32)

    z = x + 1
    input[x, y]
    input[0, 0]
    input[z, y]
    input[x + 1, y]
    input[x, y] + input[x + 1, y]

    if False:
        aa = blur_x[x, y]
        bb = blur_x[x, y + 1]
        aa + bb
        blur_x[x, y] + blur_x[x, y + 1]

    (input[x, y] + input[x + 1, y]) / 2
    blur_x[x, y]
    blur_xx[x, y] = input[x, y]

    blur_x[x, y] = (input[x, y] + input[x + 1, y] + input[x + 2, y]) / 3
    blur_y[x, y] = (blur_x[x, y] + blur_x[x, y + 1] + blur_x[x, y + 2]) / 3

    xi, yi = hl.Var('xi'), hl.Var('yi')
    blur_y.tile(x, y, xi, yi, 8, 4).parallel(y).vectorize(xi, 8)
    blur_x.compute_at(blur_y, x).vectorize(x, 8)
    blur_y.compile_jit()
Example #5
0
def test_basics3():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma',
                       0.1)  # Value needed if not generating an executable
    s_sigma = 8  # This is passed during code generation in the C++ version

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0,
                                   input.width() - 1),
                          hl.clamp(y, 0,
                                   input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2,
                  y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    zi = hl.i32((val / r_sigma) + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    left = histogram[x, y, zi, c]
    left += 5
    left += ss
Example #6
0
def black_white_level(input, black_point, white_point):
    output = hl.Func("black_white_level_output")

    x, y = hl.Var("x"), hl.Var("y")

    white_factor = 65535 / (white_point - black_point)

    output[x, y] = hl.u16_sat((hl.i32(input[x, y]) - black_point) * white_factor)

    return output
Example #7
0
def test_minmax():
    x = hl.Var()
    f = hl.Func()
    f[x] = hl.select(x == 0, hl.min(x, 1), (x == 2) | (x == 4),
                     hl.i32(hl.min(hl.f32(x), hl.f32(3.2), x * hl.f32(2.1))),
                     x == 3, hl.max(x, x * 3, 1, x * 4), x)
    b = f.realize(5)
    assert b[0] == 0
    assert b[1] == 1, b[1]
    assert b[2] == 2
    assert b[3] == 12
    assert b[4] == 3
Example #8
0
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    zi = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
Example #9
0
def combine(im1, im2, width, height, dist):
    init_mask1 = hl.Func("mask1_layer_0")
    init_mask2 = hl.Func("mask2_layer_0")
    accumulator = hl.Func("combine_accumulator")
    output = hl.Func("combine_output")

    x, y = hl.Var("x"), hl.Var("y")

    im1_mirror = hl.BoundaryConditions.repeat_edge(im1, [(0, width), (0, height)])
    im2_mirror = hl.BoundaryConditions.repeat_edge(im2, [(0, width), (0, height)])

    unblurred1 = im1_mirror
    unblurred2 = im2_mirror

    blurred1 = gauss_7x7(im1_mirror, "img1_layer_0")
    blurred2 = gauss_7x7(im2_mirror, "img2_layer_0")

    weight1 = hl.f32(dist[im1_mirror[x, y]])
    weight2 = hl.f32(dist[im2_mirror[x, y]])

    init_mask1[x, y] = weight1 / (weight1 + weight2)
    init_mask2[x, y] = 1 - init_mask1[x, y]

    mask1 = init_mask1
    mask2 = init_mask2

    num_layers = 2

    accumulator[x, y] = hl.i32(0)

    for i in range(1, num_layers):
        print('        layer', i)

        prev_layer_str = str(i - 1)
        layer_str = str(i)

        laplace1 = diff(unblurred1, blurred1, "laplace1_layer_" + prev_layer_str)
        laplace2 = diff(unblurred2, blurred2, "laplace2_layer_" + layer_str)

        accumulator[x, y] += hl.i32(laplace1[x, y] * mask1[x, y]) + hl.i32(laplace2[x, y] * mask2[x, y])

        unblurred1 = blurred1
        unblurred2 = blurred2

        blurred1 = gauss_7x7(blurred1, "img1_layer_" + layer_str)
        blurred2 = gauss_7x7(blurred2, "img2_layer_" + layer_str)

        mask1 = gauss_7x7(mask1, "mask1_layer_" + layer_str)
        mask2 = gauss_7x7(mask2, "mask2_layer_" + layer_str)

    accumulator[x, y] += hl.i32(blurred1[x, y] * mask1[x, y]) + hl.i32(blurred2[x, y] * mask2[x, y])

    output[x, y] = hl.u16_sat(accumulator[x, y])

    init_mask1.compute_root().parallel(y).vectorize(x, 16)

    accumulator.compute_root().parallel(y).vectorize(x, 16)

    for i in range(num_layers):
        accumulator.update(i).parallel(y).vectorize(x, 16)

    return output
Example #10
0
def demosaic(input, width, height):
    print(f'width: {width}, height: {height}')

    f0 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f0")
    f1 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f1")
    f2 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f2")
    f3 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f3")

    f0.translate([-2, -2])
    f1.translate([-2, -2])
    f2.translate([-2, -2])
    f3.translate([-2, -2])

    d0 = hl.Func("demosaic_0")
    d1 = hl.Func("demosaic_1")
    d2 = hl.Func("demosaic_2")
    d3 = hl.Func("demosaic_3")

    output = hl.Func("demosaic_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    rdom0 = hl.RDom([(-2, 5), (-2, 5)])
    # rdom1 = hl.RDom([(0, width / 2), (0, height / 2)])

    input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)])

    f0.fill(0)
    f1.fill(0)
    f2.fill(0)
    f3.fill(0)

    f0_sum = 8
    f1_sum = 16
    f2_sum = 16
    f3_sum = 16

    f0[0, -2] = -1
    f0[0, -1] = 2
    f0[-2, 0] = -1
    f0[-1, 0] = 2
    f0[0, 0] = 4
    f0[1, 0] = 2
    f0[2, 0] = -1
    f0[0, 1] = 2
    f0[0, 2] = -1

    f1[0, -2] = 1
    f1[-1, -1] = -2
    f1[1, -1] = -2
    f1[-2, 0] = -2
    f1[-1, 0] = 8
    f1[0, 0] = 10
    f1[1, 0] = 8
    f1[2, 0] = -2
    f1[-1, 1] = -2
    f1[1, 1] = -2
    f1[0, 2] = 1

    f2[0, -2] = -2
    f2[-1, -1] = -2
    f2[0, -1] = 8
    f2[1, -1] = -2
    f2[-2, 0] = 1
    f2[0, 0] = 10
    f2[2, 0] = 1
    f2[-1, 1] = -2
    f2[0, 1] = 8
    f2[1, 1] = -2
    f2[0, 2] = -2

    f3[0, -2] = -3
    f3[-1, -1] = 4
    f3[1, -1] = 4
    f3[-2, 0] = -3
    f3[0, 0] = 12
    f3[2, 0] = -3
    f3[-1, 1] = 4
    f3[1, 1] = 4
    f3[0, 2] = -3

    d0[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f0[rdom0.x, rdom0.y]) / f0_sum)
    d1[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f1[rdom0.x, rdom0.y]) / f1_sum)
    d2[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f2[rdom0.x, rdom0.y]) / f2_sum)
    d3[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f3[rdom0.x, rdom0.y]) / f3_sum)

    R_row = y % 2 == 0
    B_row = y % 2 != 0
    R_col = x % 2 == 0
    B_col = x % 2 != 0
    at_R = c == 0
    at_G = c == 1
    at_B = c == 2

    output[x, y, c] = hl.select(at_R & R_row & B_col, d1[x, y],
                                at_R & B_row & R_col, d2[x, y],
                                at_R & B_row & B_col, d3[x, y],
                                at_G & R_row & R_col, d0[x, y],
                                at_G & B_row & B_col, d0[x, y],
                                at_B & B_row & R_col, d1[x, y],
                                at_B & R_row & B_col, d2[x, y],
                                at_B & R_row & R_col, d3[x, y],
                                input[x, y])

    d0.compute_root().parallel(y).vectorize(x, 16)
    d1.compute_root().parallel(y).vectorize(x, 16)
    d2.compute_root().parallel(y).vectorize(x, 16)
    d3.compute_root().parallel(y).vectorize(x, 16)

    output.compute_root().parallel(y).align_bounds(x, 2).unroll(x, 2).align_bounds(y, 2).unroll(y, 2).vectorize(x, 16)

    return output
Example #11
0
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    zi = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8);
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
Example #12
0
def test_typed_funcs():
    x = hl.Var('x')
    y = hl.Var('y')

    f = hl.Func('f')
    assert not f.defined()
    try:
        assert f.output_type() == Int(32)
    except RuntimeError as e:
        assert 'it is undefined' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        assert f.outputs() == 0
    except RuntimeError as e:
        assert 'it is undefined' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    try:
        assert f.dimensions() == 0
    except RuntimeError as e:
        assert 'it is undefined' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    f = hl.Func(hl.Int(32), 2, 'f')
    assert not f.defined()
    assert f.output_type() == hl.Int(32)
    assert f.output_types() == [hl.Int(32)]
    assert f.outputs() == 1
    assert f.dimensions() == 2

    f = hl.Func([hl.Int(32), hl.Float(64)], 3, 'f')
    assert not f.defined()
    try:
        assert f.output_type() == hl.Int(32)
    except RuntimeError as e:
        assert 'it returns a Tuple' in str(e)
    else:
        assert False, 'Did not see expected exception!'

    assert f.output_types() == [hl.Int(32), hl.Float(64)]
    assert f.outputs() == 2
    assert f.dimensions() == 3

    f = hl.Func(hl.Int(32), 1, 'f')
    try:
        f[x, y] = hl.i32(0)
        f.realize([10, 10])
    except RuntimeError as e:
        assert 'is constrained to have exactly 1 dimensions, but is defined with 2 dimensions' in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    f = hl.Func(hl.Int(32), 2, 'f')
    try:
        f[x, y] = hl.i16(0)
        f.realize([10, 10])
    except RuntimeError as e:
        assert 'is constrained to only hold values of type int32 but is defined with values of type int16' in str(
            e)
    else:
        assert False, 'Did not see expected exception!'

    f = hl.Func((hl.Int(32), hl.Float(32)), 2, 'f')
    try:
        f[x, y] = (hl.i16(0), hl.f64(0))
        f.realize([10, 10])
    except RuntimeError as e:
        assert 'is constrained to only hold values of type (int32, float32) but is defined with values of type (int16, float64)' in str(
            e)
    else:
        assert False, 'Did not see expected exception!'
Example #13
0
    def gen_g(self):
        ''' define g() function '''
        # vars
        i, j, k, l = [self.vars[c] for c in "ijkl"]
        # clamped inputs
        x, y, z, expnt, fm, rnorm = [
            self.clamps[c] for c in ["x", "y", "z", "expnt", "fm", "rnorm"]
        ]
        # unclamped input (for sizing)
        fm_in = self.inputs["fm_in"]
        # scalar inputs
        delo2, delta, rdelta = [
            self.inputs[c] for c in ["delo2", "delta", "rdelta"]
        ]

        dx = hl.Func("dx")
        dy = hl.Func("dy")
        dz = hl.Func("dz")
        r2 = hl.Func("g_r2")
        expnt2 = hl.Func("expnt2")
        expnt_inv = hl.Func("expnt_inv")
        self.add_funcs_by_name([dx, dy, dz, r2, expnt2, expnt_inv])

        dx[i, j] = x[i] - x[j]
        dy[i, j] = y[i] - y[j]
        dz[i, j] = z[i] - z[j]

        r2[i,
           j] = dx[i, j] * dx[i, j] + dy[i, j] * dy[i, j] + dz[i, j] * dz[i, j]

        expnt2[i, j] = expnt[i] + expnt[j]
        expnt_inv[i, j] = hl.f64(1.0) / expnt2[i, j]

        fac2 = hl.Func("fac2")
        ex_arg = hl.Func("ex_arg")
        ex = hl.Func("ex")
        denom = hl.Func("denom")
        fac4d = hl.Func("fac4d")
        self.add_funcs_by_name([fac2, ex_arg, ex, denom, fac4d])
        fac2[i, j] = expnt[i] * expnt[j] * expnt_inv[i, j]
        ex_arg[i, j, k, l] = -fac2[i, j] * r2[i, j] - fac2[k, l] * r2[k, l]
        ex[i, j, k, l] = hl.select(ex_arg[i, j, k, l] < hl.f64(-37.0),
                                   hl.f64(0.0), hl.exp(ex_arg[i, j, k, l]))
        denom[i, j, k,
              l] = expnt2[i, j] * expnt2[k, l] * hl.sqrt(expnt2[i, j] +
                                                         expnt2[k, l])
        fac4d[i, j, k,
              l] = expnt2[i, j] * expnt2[k, l] / (expnt2[i, j] + expnt2[k, l])

        x2 = hl.Func("g_x2")
        y2 = hl.Func("g_y2")
        z2 = hl.Func("g_z2")
        rpq2 = hl.Func("rpq2")
        self.add_funcs_by_name([x2, y2, z2, rpq2])
        x2[i, j] = (x[i] * expnt[i] + x[j] * expnt[j]) * expnt_inv[i, j]
        y2[i, j] = (y[i] * expnt[i] + y[j] * expnt[j]) * expnt_inv[i, j]
        z2[i, j] = (z[i] * expnt[i] + z[j] * expnt[j]) * expnt_inv[i, j]
        rpq2[i, j, k, l] = ((x2[i, j] - x2[k, l]) * (x2[i, j] - x2[k, l]) +
                            (y2[i, j] - y2[k, l]) * (y2[i, j] - y2[k, l]) +
                            (z2[i, j] - z2[k, l]) * (z2[i, j] - z2[k, l]))

        f0t = hl.Func("f0t")
        f0n = hl.Func("f0n")
        f0x = hl.Func("f0x")
        f0val = hl.Func("f0val")
        self.add_funcs_by_name([f0t, f0n, f0x, f0val])
        f0t[i, j, k, l] = fac4d[i, j, k, l] * rpq2[i, j, k, l]
        f0n[i, j, k, l] = hl.clamp(hl.i32((f0t[i, j, k, l] + delo2) * rdelta),
                                   fm_in.dim(0).min(),
                                   fm_in.dim(0).max())
        f0x[i, j, k, l] = delta * f0n[i, j, k, l] - f0t[i, j, k, l]
        f0val[i, j, k, l] = hl.select(
            f0t[i, j, k, l] >= hl.f64(28.0),
            hl.f64(0.88622692545276) / hl.sqrt(f0t[i, j, k, l]),
            fm[f0n[i, j, k, l], 0] + f0x[i, j, k, l] *
            (fm[f0n[i, j, k, l], 1] + f0x[i, j, k, l] * hl.f64(0.5) *
             (fm[f0n[i, j, k, l], 2] + f0x[i, j, k, l] * hl.f64(1. / 3.) *
              (fm[f0n[i, j, k, l], 3] +
               f0x[i, j, k, l] * hl.f64(0.25) * fm[f0n[i, j, k, l], 4]))))

        g = hl.Func("g")
        self.add_funcs_by_name([g])

        if self.tracing and self.tracing_g:
            g_trace_in = hl.ImageParam(hl.Float(64), 4, "g_trace_in")
            g_trace = hl.BoundaryConditions.constant_exterior(g_trace_in, 0)
            self.inputs["g_trace_in"] = g_trace_in
            self.clamps["g_trace"] = g_trace
            g_trace.compute_root()
            g[i, j, k,
              l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i, j, k, l]
                    ) * ex[i, j, k, l] * f0val[i, j, k, l] * rnorm[i] * rnorm[
                        j] * rnorm[k] * rnorm[l] + g_trace[i, j, k, l]
        else:
            g_trace = None
            g[i, j, k,
              l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) /
                    denom[i, j, k, l]) * ex[i, j, k, l] * f0val[
                        i, j, k, l] * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l]