Example #1
def filter_test_image(local_laplacian, input):
    local_laplacian.compile_jit(hl.get_target_from_environment())

    # preparing input and output memory buffers (numpy ndarrays)
    input_data = get_input_data()
    input_image = hl.Buffer(input_data)
    input.set(input_image)

    output_data = np.empty_like(input_data)

    # do the actual computation
    input_width, input_height = input_data.shape[:2]
    output_image = local_laplacian.realize(input_width, input_height, 3)
    output_data = np.asanyarray(output_image)

    # convert back to uint8
    input_data = (input_data >> 8).astype(np.uint8)
    output_data = (output_data >> 8).astype(np.uint8)

    # save results
    input_path = "local_laplacian_input.png"
    output_path = "local_laplacian.png"

    imageio.imsave(input_path, input_data)
    imageio.imsave(output_path, output_data)

    print()
    print("local_laplacian realized on output_image.")
    print('Result saved at {} (input data copy at {}).'.format(
        output_path, input_path))
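A minimal driver sketch for the function above; it assumes the get_local_laplacian builder from the later examples and the uint16 RGB layout returned by get_input_data, and the parameter names and default values are illustrative only.

def run_filter_test():
    input = hl.ImageParam(hl.UInt(16), 3, 'input')
    levels = hl.Param(hl.Int(32), 'levels', 8)
    alpha = hl.Param(hl.Float(32), 'alpha', 1.0)
    beta = hl.Param(hl.Float(32), 'beta', 1.0)

    # build the pipeline, then let filter_test_image bind the input and realize it
    local_laplacian = get_local_laplacian(input, levels, alpha, beta)
    filter_test_image(local_laplacian, input)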
Example #2
def generate_compiled_file(bilateral_grid):

    target = hl.get_target_from_environment()
    # To run the compiled result, copy the filter executable from the C++
    # apps/bilateral_grid folder here (build it there first).
    arguments = ArgumentsVector()
    arguments.append(Argument('r_sigma', InputScalar, hl.Float(32), 0))
    arguments.append(Argument('input', InputBuffer, hl.UInt(16), 2))
    bilateral_grid.compile_to_file("bilateral_grid", arguments,
                                   "bilateral_grid", target)
    print("Generated compiled file for bilateral_grid function.")
Example #4
def generate_compiled_file(local_laplacian):

    # To run the compiled result, copy the process executable from the C++
    # apps/local_laplacian folder here (build it there first).
    arguments = ArgumentsVector()
    arguments.append(Argument('levels', False, int_t))
    arguments.append(Argument('alpha', False, float_t))
    arguments.append(Argument('beta', False, float_t))
    arguments.append(Argument('input', True, hl.UInt(16)))
    target = hl.get_target_from_environment()
    local_laplacian.compile_to_file("local_laplacian", arguments, "local_laplacian", target)
    print("Generated compiled file for local_laplacian function.")
    return
Example #6
def main():
    x = hl.Var("x")

    f = hl.Func("f")
    f[x] = 100 * x

    args = []

    tmpdir = tempfile.mkdtemp()
    try:
        p = os.path.join(tmpdir, "f.bc")
        f.compile_to_bitcode(p, args, "f")
        assert os.path.isfile(p)

        p = os.path.join(tmpdir, "f.cpp")
        f.compile_to_c(p, args, "f")
        assert os.path.isfile(p)

        p = os.path.join(tmpdir, "f.o")
        f.compile_to_object(p, args, "f")
        assert os.path.isfile(p)

        p = os.path.join(tmpdir, "f.h")
        f.compile_to_header(p, args, "f")
        assert os.path.isfile(p)

        p = os.path.join(tmpdir, "f.s")
        f.compile_to_assembly(p, args, "f")
        assert os.path.isfile(p)

        p = os.path.join(tmpdir, "f.txt")
        f.compile_to_lowered_stmt(p, args)
        assert os.path.isfile(p)

        f.compile_to_file(os.path.join(tmpdir, "f_all"), args)
        assert os.path.isfile(os.path.join(tmpdir, "f_all.h"))
        if hl.get_target_from_environment().os == hl.TargetOS.Windows:
            assert os.path.isfile(os.path.join(tmpdir, "f_all.obj"))
        else:
            assert os.path.isfile(os.path.join(tmpdir, "f_all.o"))

        p = os.path.join(tmpdir, "f.html")
        f.compile_to({hl.OutputFileType.stmt_html: p}, args, "f")
        assert os.path.isfile(p)
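        # (Hedged sketch) compile_to can also emit several artifacts in one call;
        # the object and c_header OutputFileType members are assumed to be exposed
        # by this binding in the same way as stmt_html above.
        f.compile_to({hl.OutputFileType.object: os.path.join(tmpdir, "f2.o"),
                      hl.OutputFileType.c_header: os.path.join(tmpdir, "f2.h")},
                     args, "f")
        assert os.path.isfile(os.path.join(tmpdir, "f2.o"))
        assert os.path.isfile(os.path.join(tmpdir, "f2.h"))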

    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
Example #7
def get_interpolate(input, levels):
    """
    Build function, schedules it, and invokes jit compiler
    :return: halide.hl.Func
    """

    # THE ALGORITHM

    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
    downx = [hl.Func('downx%d' % l) for l in range(levels)]
    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
    #     level_widths = [hl.Param(int_t,'level_widths%d'%i) for i in range(levels)]
    #     level_heights = [hl.Param(int_t,'level_heights%d'%i) for i in range(levels)]
    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]
    x = hl.Var('x')
    y = hl.Var('y')
    c = hl.Var('c')

    clamped = hl.Func('clamped')
    clamped[x, y, c] = input[hl.clamp(x, 0,
                                      input.width() - 1),
                             hl.clamp(y, 0,
                                      input.height() - 1), c]

    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
    # rewrite it in a way that doesn't trigger the bug. The rewritten
    # form assumes the input alpha is zero or one.
    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]

    for l in range(1, levels):
        prev = downsampled[l - 1]

        if l == 4:
            # Also add a boundary condition at a middle pyramid level
            # to prevent the footprint of the downsamplings to extend
            # too far off the base image. Otherwise we look 512
            # pixels off each edge.
            w = input.width() / (1 << l)
            h = input.height() / (1 << l)
            prev = hl.lambda3D(x, y, c, prev[hl.clamp(x, 0, w),
                                             hl.clamp(y, 0, h), c])

        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] + 2.0 * prev[x * 2, y, c] +
                             prev[x * 2 + 1, y, c]) * 0.25
        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] +
                                   2.0 * downx[l][x, y * 2, c] +
                                   downx[l][x, y * 2 + 1, c]) * 0.25

    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
    for l in range(levels - 1)[::-1]:
        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] +
                                  interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] +
                                 upsampledx[l][x, (y + 1) / 2, c]) / 2.0
        interpolated[l][x, y, c] = downsampled[l][
            x, y, c] + (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]

    normalize = hl.Func('normalize')
    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]
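    # interpolated[0] carries alpha-premultiplied color in channels 0..2 and the
    # accumulated alpha in channel 3 (premultiplied at downsampled[0]), so this
    # division recovers un-premultiplied color.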

    final = hl.Func('final')
    final[x, y, c] = normalize[x, y, c]

    print("Finished function setup.")

    # THE SCHEDULE

    target = hl.get_target_from_environment()
    sched = 4 if target.has_gpu_feature() else 2

    if sched == 0:
        print("Flat schedule.")
        for l in range(levels):
            downsampled[l].compute_root()
            interpolated[l].compute_root()

        final.compute_root()

    elif sched == 1:
        print("Flat schedule with vectorization.")
        for l in range(levels):
            downsampled[l].compute_root().vectorize(x, 4)
            interpolated[l].compute_root().vectorize(x, 4)

        final.compute_root()

    elif sched == 2:
        print("Flat schedule with parallelization + vectorization")
        xi, yi = hl.Var('xi'), hl.Var('yi')
        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(
            c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
        for l in range(1, levels - 1):
            if l > 0:
                downsampled[l].compute_root().parallel(y).reorder(
                    c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].compute_root().parallel(y).reorder(
                c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
            interpolated[l].unroll(x, 2).unroll(y, 2)

        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
        final.bound(x, 0, input.width())
        final.bound(y, 0, input.height())

    elif sched == 3:
        print("Flat schedule with vectorization sometimes.")
        for l in range(levels):
            if l + 4 < levels:
                yo, yi = hl.Var('yo'), hl.Var('yi')
                downsampled[l].compute_root().vectorize(x, 4)
                interpolated[l].compute_root().vectorize(x, 4)
            else:
                downsampled[l].compute_root()
                interpolated[l].compute_root()

        final.compute_root()

    elif sched == 4:
        print("GPU schedule.")

        # Some gpus don't have enough memory to process the entire
        # image, so we process the image in tiles.
        yo, yi, xo, xi, ci = (hl.Var('yo'), hl.Var('yi'), hl.Var('xo'),
                              hl.Var('xi'), hl.Var('ci'))
        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
        final.tile(x, y, xo, yo, xi, yi, input.width() / 4, input.height() / 4)
        normalize.compute_at(final,
                             xo).reorder(c, x,
                                         y).gpu_tile(x, y, xi, yi, 16, 16,
                                                     GPU_Default).unroll(c)

        # Start from level 1 to save memory - level zero will be computed on demand
        for l in range(1, levels):
            tile_size = 32 >> l
            if tile_size < 1: tile_size = 1
            if tile_size > 16: tile_size = 16
            downsampled[l].compute_root().gpu_tile(x, y, c, xi, yi, ci,
                                                   tile_size, tile_size, 4,
                                                   GPU_Default)
            interpolated[l].compute_at(final,
                                       xo).gpu_tile(x, y, c, xi, yi, ci,
                                                    tile_size, tile_size, 4,
                                                    GPU_Default)

    else:
        print("No schedule with this number.")
        exit(1)

    # JIT compile the pipeline eagerly, so we don't interfere with timing
    final.compile_jit(target)

    return final
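A hedged usage sketch for get_interpolate: the float RGBA layout follows the alpha handling above, the image size and level count are assumptions, and the positional realize call copies the driver style of Example #1.

def run_interpolate():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    final = get_interpolate(input, 10)

    rgba = np.random.rand(1536, 2560, 4).astype(np.float32)
    input.set(hl.Buffer(rgba))
    output_image = final.realize(rgba.shape[0], rgba.shape[1], 3)
    return np.asanyarray(output_image)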
Example #9
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    zi = hl.i32(val / r_sigma + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zi, c] += hl.select(c == 0, val, 1.0)
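    # Channel 0 of the grid accumulates pixel values while channel 1 accumulates
    # counts, so each cell holds a homogeneous (sum, weight) pair that gets
    # normalized after the trilinear lookup below.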

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    zi = hl.i32(zv)
    zf = zv - zi
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi+1, yi, zi, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi, c], blury[xi+1, yi+1, zi, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[xi, yi, zi+1, c], blury[xi+1, yi, zi+1, c], xf),
                                            hl.lerp(blury[xi, yi+1, zi+1, c], blury[xi+1, yi+1, zi+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Running this pipeline directly from Python is currently much slower,
        # probably due to dispatch overhead: the generated kernels themselves run
        # at the same speed as the C++-generated code.
        print("Compiling for GPU.")
        # xi, yi and zi were rebound to Exprs above, so declare fresh Vars
        # before using them as gpu_tile block/thread variables.
        xi, yi, zi = hl.Var('xi'), hl.Var('yi'), hl.Var('zi')
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
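A hedged usage sketch for get_bilateral_grid in the same driver style; the grayscale float input, its size, and the sigma values are assumptions.

def run_bilateral_grid():
    input = hl.ImageParam(hl.Float(32), 2, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    bilateral_grid = get_bilateral_grid(input, r_sigma, 8)

    gray = np.random.rand(768, 1280).astype(np.float32)
    input.set(hl.Buffer(gray))
    output_image = bilateral_grid.realize(768, 1280)
    return np.asanyarray(output_image)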
Example #11
def get_local_laplacian(input, levels, alpha, beta, J=8):
    downsample_counter = [0]
    upsample_counter = [0]

    x = hl.Var('x')
    y = hl.Var('y')

    def downsample(f):
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 *
                          (f[2 * x, y, c] + f[2 * x + 1, y, c]) +
                          f[2 * x + 2, y, c]) / 8.0
        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 *
                          (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) +
                          downx[x, 2 * y + 2, c]) / 8.0

        return downy

    def upsample(f):
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 *
                                (x % 2), y, c] + 0.75 * f[x // 2, y, c]
        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 *
                                  (y % 2), c] + 0.75 * upx[x, y // 2, c]

        return upy

    def downsample2D(f):
        downx, downy = hl.Func('downx%d' % downsample_counter[0]), hl.Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample2D(f):
        upx, upy = hl.Func('upx%d' % upsample_counter[0]), hl.Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        upx[x,
            y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
        upy[x,
            y] = 0.25 * upx[x,
                            (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]

        return upy

    # THE ALGORITHM

    # loop variables
    c = hl.Var('c')
    k = hl.Var('k')

    # Make the remapping function as a lookup table.
    remap = hl.Func('remap')
    fx = hl.cast(float_t, x / 256.0)
    #remap[x] = alpha*fx*exp(-fx*fx/2.0)
    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)

    # Convert to floating point
    floating = hl.Func('floating')
    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0

    # Set a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y, c] = floating[hl.clamp(x, 0,
                                         input.width() - 1),
                                hl.clamp(y, 0,
                                         input.height() - 1), c]

    # Get the luminance channel
    gray = hl.Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Make the processed Gaussian pyramid.
    gPyramid = [hl.Func('gPyramid%d' % i) for i in range(J)]
    # Do a lookup into a LUT with 256 entries per intensity level
    level = k * (1.0 / (levels - 1))
    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
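    # remap is indexed by the signed offset (in 256ths of a level) between the
    # pixel's quantized intensity and level k, so each pyramid slice gets its
    # own detail-remapping curve.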
    gPyramid[0][x, y,
                k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Get its laplacian pyramid
    lPyramid = [hl.Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Make the Gaussian pyramid of the input
    inGPyramid = [hl.Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]

    # Make the laplacian pyramid of the output
    outLPyramid = [hl.Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        # Split input pyramid value into integer and floating parts
        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
        lf = level - hl.cast(float_t, li)
        # Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Make the Gaussian pyramid of the output
    outGPyramid = [hl.Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample2D(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    color = hl.Func('color')
    eps = 0.01
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] +
                                             eps) / (gray[x, y] + eps)

    output = hl.Func('local_laplacian')
    # Convert back to 16-bit
    output[x, y, c] = hl.cast(hl.UInt(16),
                              hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)

    # THE SCHEDULE
    remap.compute_root()

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU Schedule
        print("Compiling for GPU")
        xi, yi = hl.Var("xi"), hl.Var("yi")
        output.compute_root().gpu_tile(x, y, 32, 32, GPU_Default)
        for j in range(J):
            blockw = 32
            blockh = 16
            if j > 3:
                blockw = 2
                blockh = 2
            if j > 0:
                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                      blockh, GPU_Default)
            if j > 0:
                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(
                    x, y, xi, yi, blockw, blockh, GPU_Default)
            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw,
                                                   blockh, GPU_Default)
    else:
        # CPU schedule
        print("Compiling for CPU")
        output.parallel(y, 4).vectorize(x, 4)
        gray.compute_root().parallel(y, 4).vectorize(x, 4)
        for j in range(4):
            if j > 0:
                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            if j > 0:
                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
        for j in range(4, J):
            inGPyramid[j].compute_root().parallel(y)
            gPyramid[j].compute_root().parallel(k)
            outGPyramid[j].compute_root().parallel(y)

    return output
Example #12
    # Each pixel of the input contributes one count to the
    # histogram bucket corresponding to the intensity of the
    # input image at that point.

    histogram[hl.Expr(img[r.x, r.y])] += 1
    histogram.set_estimate(hist_index, 0, 255)

    # Get the sum of all histogram cells
    r = hl.RDom([(0,255)])
    hist_sum = hl.Func('hist_sum')
    hist_sum[()] = 0.0  # accumulate the total count (kept as float for the division below)
    hist_sum[()] += histogram[r.x]

    # Express each histogram bucket as a fraction of the total count
    pct_hist = hl.Func('pct_hist')
    pct_hist[hist_index] = histogram[hist_index] / hist_sum[()]

    return histogram


def autoschedule(pipeline, autoscheduler_name, target, machine):
    hl.load_plugin('auto_schedule')
    pipeline.set_default_autoscheduler_name(autoscheduler_name)
    return pipeline.auto_schedule(target, machine)


if __name__ == "__main__":
    fs = focus_stack_pipeline()
    print("Autoscheduling with: Adams2019")
    autoschedule(fs['pipeline'], "Adams2019", hl.get_target_from_environment(), hl.MachineParams(4, 256*1024, 50))
    
    