Python is_cudaの例、autotune.is_cuda Pythonの例

コード例 #1

0

ファイルを表示

ファイル: autotune_template.py プロジェクト: FreeAlex/Halide

def sample(varlist, schedule, name, bounds):
    """Return a randomly chosen, filled-in schedule fragment for ``name``.

    The incoming ``varlist`` is ignored: the variables are taken from
    ``autotune_bounds.get_xy(schedule.d[name].func, bounds)`` instead.

    Raises ``autotune.MutateFailed`` when fewer than two variables are
    available, or when the chunk template is picked and no chunk variable
    is left after the first ``'_c0'`` entry has been removed.
    """
    xy_vars = autotune_bounds.get_xy(schedule.d[name].func, bounds)
    if len(xy_vars) < 2:
        raise autotune.MutateFailed
    x, y = xy_vars[0], xy_vars[1]

    tile_template = ('.root().tile(%(x)s,%(y)s,_c0,_c1,%(n)d,%(n)d)'
                     '.vectorize(_c0,%(n)d).parallel(%(y)s)')
    parallel_template = '.root().parallel(%(y)s).vectorize(%(x)s,%(n)d)'
    if autotune.is_cuda():
        # On CUDA the chunk template drops its vectorize step and three
        # copies of the cudaTile template are added to the candidates.
        templates = ['.chunk(%(chunk_var)s)', tile_template, parallel_template]
        templates += ['.root().cudaTile(%(x)s,%(y)s,%(n)d,%(n)d)'] * 3
    else:
        templates = ['.chunk(%(chunk_var)s).vectorize(%(x)s,%(n)d)',
                     tile_template, parallel_template]

    choice = random.randrange(len(templates))
    fields = {'x': x, 'y': y}
    if choice == 0:
        # Only the chunk template (index 0) refers to chunk_var.
        if CHUNK_X_ALWAYS:
            fields['chunk_var'] = x
        else:
            candidates = list(autotune.chunk_vars(schedule, schedule.d[name].func))
            if '_c0' in candidates:
                pos = candidates.index('_c0')
                candidates = candidates[:pos] + candidates[pos + 1:]
            if not candidates:
                raise autotune.MutateFailed
            fields['chunk_var'] = random.choice(candidates)
    fields['n'] = random.choice([2, 4, 8])
    return templates[choice] % fields

コード例 #2

0

ファイルを表示

ファイル: autotune_template.py プロジェクト: xiaonanzzz/Halide

def sample(varlist, schedule, name, bounds):
    """Choose one schedule template at random and substitute its variables.

    ``varlist`` as passed in is not used; it is superseded by the result of
    ``autotune_bounds.get_xy(schedule.d[name].func, bounds)``.

    Raises ``autotune.MutateFailed`` if that result has fewer than two
    entries, or if the chunk template is selected and the list of chunk
    variables is empty once the first ``'_c0'`` entry has been dropped.
    """
    axes = autotune_bounds.get_xy(schedule.d[name].func, bounds)
    if len(axes) < 2:
        raise autotune.MutateFailed
    x = axes[0]
    y = axes[1]

    cuda = autotune.is_cuda()
    fragments = [
        # The chunk fragment loses its vectorize step when targeting CUDA.
        '.chunk(%(chunk_var)s)' if cuda
        else '.chunk(%(chunk_var)s).vectorize(%(x)s,%(n)d)',
        '.root().tile(%(x)s,%(y)s,_c0,_c1,%(n)d,%(n)d).vectorize(_c0,%(n)d).parallel(%(y)s)',
        '.root().parallel(%(y)s).vectorize(%(x)s,%(n)d)',
    ]
    if cuda:
        for _ in range(3):
            fragments.append('.root().cudaTile(%(x)s,%(y)s,%(n)d,%(n)d)')

    index = random.randrange(len(fragments))
    substitutions = {'x': x, 'y': y}
    if index == 0 and CHUNK_X_ALWAYS:
        substitutions['chunk_var'] = x
    elif index == 0:
        pool = list(autotune.chunk_vars(schedule, schedule.d[name].func))
        if '_c0' in pool:
            # Drop only the first occurrence of '_c0'.
            cut = pool.index('_c0')
            pool = pool[:cut] + pool[cut + 1:]
        if len(pool) == 0:
            raise autotune.MutateFailed
        substitutions['chunk_var'] = random.choice(pool)
    substitutions['n'] = random.choice([2, 4, 8])
    return fragments[index] % substitutions

コード例 #3

0

ファイルを表示

ファイル: autotune_template.py プロジェクト: iitaku/Halide

def sample(varlist):
    """Return a random schedule fragment built from the first two variables.

    Raises ``autotune.MutateFailed`` when ``varlist`` has fewer than two
    entries.
    """
    if len(varlist) < 2:
        raise autotune.MutateFailed
    x, y = varlist[0], varlist[1]

    templates = [
        '.chunk(%(x)s).vectorize(%(x)s,%(n)d)',
        '.root().tile(%(x)s,%(y)s,_c0,_c1,%(n)d,%(n)d).vectorize(_c0,%(n)d).parallel(%(y)s)',
        '.root().parallel(%(y)s).vectorize(%(x)s,%(n)d)',
    ]
    if autotune.is_cuda():
        # Three copies of the cudaTile template are added to the candidates.
        templates += ['.root().cudaTile(%(x)s,%(y)s,%(n)d,%(n)d)'] * 3

    # Keep the draw order: template index first, then the size n.
    picked = random.randrange(len(templates))
    n = random.choice([2, 4, 8])
    return templates[picked] % {'x': x, 'y': y, 'n': n}

コード例 #4

0

ファイルを表示

ファイル: local_laplacian.py プロジェクト: FreeAlex/Halide

def filter_func(J=8, dtype=UInt(16), use_uniforms=False):
    """Build the local Laplacian pipeline and its reference schedules.

    Args:
        J: Number of pyramid levels; every pyramid list below has J entries.
        dtype: Element type passed to UniformImage for the input and used
            for the final cast of the output; dtype.maxval() scales values
            to and from floating point.
        use_uniforms: If true, levels/alpha/beta are created as Uniform
            objects; otherwise they are the plain constants 8, 1.0 and 1.0.

    Returns:
        The tuple (input, output, None, locals()).  Because locals() is
        returned, every local name defined here is part of the result, so
        locals must not be renamed or removed casually.
    """

    # One-element lists so the nested helpers can increment them in place;
    # they give each helper call a fresh suffix for its Func names.
    downsample_counter=[0] 
    upsample_counter=[0]
    
    # NOTE: downsample/upsample read x and y from the enclosing scope; those
    # are assigned further down, before either helper is first called.
    def downsample(f):
        """Return a new Func that samples f at 2*x, 2*y with 1-3-3-1 weights (divided by 8) along x, then y."""
        downx, downy = Func('downx%d'%downsample_counter[0]), Func('downy%d'%downsample_counter[0])
        downsample_counter[0] += 1
        
        downx[x,y] = (f[2*x-1, y] + 3.0*(f[2*x,y]+f[2*x+1,y]) + f[2*x+2,y])/8.0
        downy[x,y] = (downx[x,2*y-1] + 3.0*(downx[x,2*y]+downx[x,2*y+1]) + downx[x,2*y+2])/8.0

        return downy
    
    def upsample(f):
        """Return a new Func that reads f at x/2, y/2, blending each sample (0.75) with a neighbour (0.25) along x, then y."""
        upx, upy = Func('upx%d'%upsample_counter[0]), Func('upy%d'%upsample_counter[0])
        upsample_counter[0] += 1
        
        # (x/2)-1+2*(x%2) selects the neighbour of x/2 on the side given by x%2.
        upx[x,y] = 0.25 * f[(x/2)-1+2*(x%2),y] + 0.75 * f[x/2,y]
        upy[x,y] = 0.25 * upx[x, (y/2) - 1 + 2*(y%2)] + 0.75 * upx[x,y/2]
        
        return upy
    
    # Filter parameters: either Uniform objects or fixed Python constants.
    if use_uniforms:
        levels = Uniform(int_t, 'levels', 8)
        alpha = Uniform(float_t, 'alpha', 1.0) #1.0)
        beta = Uniform(float_t, 'beta', 1.0)
    else:
        levels = 8
        alpha = 1.0
        beta = 1.0
    input = UniformImage(dtype, 3, 'input')
    
    x = Var('x')
    y = Var('y')
    c = Var('c')
    k = Var('k')
    
    # Remapping function, defined over x/256.0.
    fx = cast(float_t, x/256.0)
    remap = Func('remap')
    remap[x] = (alpha/cast(float_t, levels-1))*fx*exp(-fx*fx/2.0)
    
    # Convert the input to floating point, scaled by the maximum of dtype.
    floating = Func('floating')
    floating[x,y,c] = cast(float_t, input[x,y,c])/float(dtype.maxval())
    
    # Clamp x and y to [0, width-1] and [0, height-1] before reading.
    clamped = Func('clamped')
    clamped[x,y,c] = floating[clamp(x,cast(int_t,0),cast(int_t,input.width()-1)),
                              clamp(y,cast(int_t,0),cast(int_t,input.height()-1)), c]
    # Weighted sum of channels 0, 1 and 2.
    gray = Func('gray')
    gray[x,y] = 0.299*clamped[x,y,0]+0.587*clamped[x,y,1]+0.114*clamped[x,y,2]
    
    # Pyramid of the remapped gray image, with k as a third dimension;
    # each level is a downsample of the previous one.
    gPyramid = [Func('gPyramid%d'%i) for i in range(J)]
    idx = gray[x,y]*cast(float_t, levels-1)*256.0
    idx = clamp(cast(int_t, idx), cast(int_t, 0), cast(int_t, (levels-1)*256))
    gPyramid[0][x,y,k] = beta*gray[x,y] + remap[idx-256*k]
    for j in range(1,J):
        gPyramid[j][x,y,k] = downsample(gPyramid[j-1])[x,y,k]

    # Each level is gPyramid[j] minus the upsampled next level.  The last
    # list entry is replaced by the gPyramid[J-1] object itself, so the Func
    # named 'lPyramid<J-1>' created above is discarded.
    lPyramid = [Func('lPyramid%d'%i) for i in range(J)]
    lPyramid[J-1] = gPyramid[J-1]
    for j in range(J-1)[::-1]:
        lPyramid[j][x,y,k] = gPyramid[j][x,y,k] - upsample(gPyramid[j+1])[x,y,k]
    
    # Pyramid of the gray input; entry 0 is the gray Func itself.
    inGPyramid = [Func('inGPyramid%d'%i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1,J):
        inGPyramid[j][x,y] = downsample(inGPyramid[j-1])[x,y]
    
    # Blend the two lPyramid slices li and li+1 using the fractional part lf
    # of the scaled input level.
    outLPyramid = [Func('outLPyramid%d'%i) for i in range(J)]
    for j in range(J):
        level = inGPyramid[j][x,y]*cast(float_t, levels-1)
        li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels-2))
        lf = level - cast(float_t, li)
        outLPyramid[j][x,y] = (1.0-lf)*lPyramid[j][x,y,li] + lf*lPyramid[j][x,y,li+1]
    
    # Output pyramid, built from level J-2 down to 0 by adding the upsampled
    # next level; the last entry is the outLPyramid[J-1] object itself.
    outGPyramid = [Func('outGPyramid%d'%i) for i in range(J)]
    outGPyramid[J-1] = outLPyramid[J-1]
    for j in range(J-1)[::-1]:
        outGPyramid[j][x,y] = upsample(outGPyramid[j+1])[x,y] + outLPyramid[j][x,y]
    
    color = Func('color')
    #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y]
    # The active form adds 0.01 to both numerator and denominator terms
    # (presumably to avoid dividing by a zero gray value -- TODO confirm).
    color[x,y,c] = outGPyramid[0][x,y] * (clamped[x,y,c]+0.01) / (gray[x,y]+0.01)
    
    # Clamp to [0, 1], rescale by dtype.maxval() and cast back to dtype.
    output = Func('output')
    output[x,y,c] = cast(dtype, clamp(color[x,y,c], cast(float_t,0.0), cast(float_t,1.0))*float(dtype.maxval()))
    
    # NOTE(review): root_all is defined elsewhere; its effect on output is
    # not visible here.
    root_all(output)
    #import autotune
    #print autotune.root_all_str(output)
    #autotune.print_root_all(output)
    
    # Default reference schedule, as newline-separated text, one line per Func.
    human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n'
    for j in range(J):
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%inGPyramid[j].name()
        if j > 0:
            human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n'%j
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%outGPyramid[j].name()
    
    # When autotune.is_cuda() is true the schedule is rebuilt from scratch
    # using cudaTile; levels above 3 use 2x2 blocks instead of 32x32.
    if autotune.is_cuda():
        human_schedule = 'remap.root()\n'
        human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n'
        for j in range(J):
            blockw = blockh = 32
            if j > 3:
                blockw = blockh = 2
            # inGPyramid[0] is the gray Func, so level 0 is named 'gray'.
            if j == 0:
                human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n'%(blockw, blockh)
            else:
                human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            # outGPyramid[J-1] is the outLPyramid[J-1] Func, hence the name.
            if j == J-1:
                human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            else:
                human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)

    # Special variables interpreted by autotuner
    # (they are only exposed through the locals() returned below).
    tune_ref_schedules = {'human': human_schedule}
    tune_constraints = autotune.bound_recursive(output, 'c', 0, 3)

    #print '# schedules:'
    #import math
    #print math.log(autotune.lower_bound_schedules(output),10)
    #sys.exit(1)
    
    return (input, output, None, locals())

コード例 #5

0

ファイルを表示

def filter_func(dtype=UInt(16), use_uniforms=False):
    """Build the bilateral grid pipeline and its reference schedules.

    Args:
        dtype: Not referenced when building the pipeline (the input image is
            created with float_t); it only appears in the returned locals().
        use_uniforms: If true, r_sigma is a Uniform object; otherwise it is
            the constant 0.1.

    Returns:
        The tuple (input, smoothed, None, locals()).  Because locals() is
        returned, every local name defined here is part of the result.
    """
    def lerp(a, b, alpha):
        """Linear interpolation: a when alpha is 0, b when alpha is 1."""
        return (1.0 - alpha) * a + alpha * b

    input = UniformImage(float_t, 3, 'input')
    if use_uniforms:
        r_sigma = Uniform(float_t, 0.1)
    else:
        r_sigma = 0.1
    s_sigma = 8

    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Read channel 0 of the input with x and y clamped to the image bounds.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0,
                                input.width() - 1),
                          clamp(y, 0,
                                input.height() - 1), 0]

    # Build the grid: for each (x, y), the reduction over r visits an
    # s_sigma-wide neighbourhood (offset by -s_sigma/2) of the clamped
    # input.  NOTE(review): assumes RDom(0, s_sigma, 0, s_sigma) spans
    # s_sigma values in each of r.x and r.y -- TODO confirm.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma / 2,
                  y * s_sigma + r.y - s_sigma / 2]
    val = clamp(val, 0.0, 1.0)
    # Grid z index: val scaled by 1/r_sigma, plus 0.5 before the int cast.
    zi = cast(int_t, val * (1.0 / r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    # Accumulate val where c == 0 and 1.0 for every other c.
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter
    # (weights 1, 4, 6, 4, 1, applied along x, then y, then z).
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = grid[x - 2, y, z] + grid[x - 1, y, z] * 4 + grid[
        x, y, z] * 6 + grid[x + 1, y, z] * 4 + grid[x + 2, y, z]
    blury[x, y, z] = blurx[x, y - 2, z] + blurx[x, y - 1, z] * 4 + blurx[
        x, y, z] * 6 + blurx[x, y + 1, z] * 4 + blurx[x, y + 2, z]
    blurz[x, y, z] = blury[x, y, z - 2] + blury[x, y, z - 1] * 4 + blury[
        x, y, z] * 6 + blury[x, y, z + 1] * 4 + blury[x, y, z + 2]

    # Take trilinear samples to compute the output
    # val and zi are rebound here; the grid definitions above already
    # captured the earlier expressions.
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0 / r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi  # fractional part along z
    xf = cast(float_t, x % s_sigma) / s_sigma  # fractional position in the cell
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x / s_sigma  # grid cell coordinates
    yi = y / s_sigma
    interpolated = Func('interpolated')
    # Nested lerps over the eight blurz samples at (xi|xi+1, yi|yi+1,
    # zi|zi+1): along x first, then y, then z.
    interpolated[x, y] = lerp(
        lerp(lerp(blurz[xi, yi, zi], blurz[xi + 1, yi, zi], xf),
             lerp(blurz[xi, yi + 1, zi], blurz[xi + 1, yi + 1, zi], xf), yf),
        lerp(
            lerp(blurz[xi, yi, zi + 1], blurz[xi + 1, yi, zi + 1], xf),
            lerp(blurz[xi, yi + 1, zi + 1], blurz[xi + 1, yi + 1, zi + 1], xf),
            yf), zf)

    # Normalize
    # (the entry at trailing index 0 divided by the entry at trailing index 1).
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0] / interpolated[x, y, 1]

    # The schedule selector is hard-coded to 1, so as written only the CPU
    # branch below runs; the other branches are kept as alternatives.
    schedule = 1
    if schedule == 0:
        pass
    elif schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        # gridz is not used in the code visible here.
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    else:
        raise ValueError

    # Reference schedule as text; it lists the same calls as the CPU branch
    # above.  It is only exposed through the locals() returned below.
    tune_ref_schedules = {
        'human':
        'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n' +
        'blurx.root().parallel(z).vectorize(x, 4)\n' +
        'blury.root().parallel(z).vectorize(x, 4)\n' +
        'blurz.root().parallel(z).vectorize(x, 4)\n' +
        'smoothed.root().parallel(y).vectorize(x, 4)\n'
    }
    # GPU
    # This text replaces the 'human' entry when autotune.is_cuda() is true.
    gpu_human = 'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n' + \
                'blurx.root().cudaTile(x, y, 8, 8)\n' + \
                'blury.root().cudaTile(x, y, 8, 8)\n' + \
                'blurz.root().cudaTile(x, y, 8, 8)\n' + \
                'smoothed.root().cudaTile(x, y, 8, 8)\n'
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human

    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)
    #print tune_constraints

    #autotune.print_tunables(smoothed)
    #for i in range(123,10000):
    #    random.seed(i)
    #    print '-'*40
    #    print 'Schedule %d'%i
    #    p = autotune.AutotuneParams()
    #    print valid_schedules.random_schedule(smoothed, p.min_depth, p.max_depth)

    #    std::vector<Func::Arg> args;
    #    args.push_back(r_sigma);
    #    args.push_back(input);
    #    smoothed.compileToFile("bilateral_grid", args);
    return (input, smoothed, None, locals())

コード例 #6

0

ファイルを表示

ファイル: bilateral_grid.py プロジェクト: FreeAlex/Halide

def filter_func(dtype=UInt(16), use_uniforms=False):
    """Build the bilateral grid pipeline and its reference schedules.

    Args:
        dtype: Not referenced when building the pipeline (the input image is
            created with float_t); it only appears in the returned locals().
        use_uniforms: If true, r_sigma is a Uniform object; otherwise it is
            the constant 0.1.

    Returns:
        The tuple (input, smoothed, None, locals()).  Because locals() is
        returned, every local name defined here is part of the result.
    """
    def lerp(a, b, alpha):
        """Linear interpolation: a when alpha is 0, b when alpha is 1."""
        return (1.0-alpha)*a + alpha*b

    input = UniformImage(float_t, 3, 'input')
    if use_uniforms:
        r_sigma = Uniform(float_t, 0.1)
    else:
        r_sigma = 0.1
    s_sigma = 8
    
    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Read channel 0 of the input with x and y clamped to the image bounds.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0, input.width()-1),
                          clamp(y, 0, input.height()-1),0]

    # Build the grid: for each (x, y), the reduction over r visits an
    # s_sigma-wide neighbourhood (offset by -s_sigma/2) of the clamped
    # input.  NOTE(review): assumes RDom(0, s_sigma, 0, s_sigma) spans
    # s_sigma values in each of r.x and r.y -- TODO confirm.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2]
    val = clamp(val, 0.0, 1.0)
    # Grid z index: val scaled by 1/r_sigma, plus 0.5 before the int cast.
    zi = cast(int_t, val * (1.0/r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    # Accumulate val where c == 0 and 1.0 for every other c.
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter
    # (weights 1, 4, 6, 4, 1, applied along x, then y, then z).
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = grid[x-2, y, z] + grid[x-1, y, z]*4 + grid[x, y, z]*6 + grid[x+1, y, z]*4 + grid[x+2, y, z]
    blury[x, y, z] = blurx[x, y-2, z] + blurx[x, y-1, z]*4 + blurx[x, y, z]*6 + blurx[x, y+1, z]*4 + blurx[x, y+2, z]
    blurz[x, y, z] = blury[x, y, z-2] + blury[x, y, z-1]*4 + blury[x, y, z]*6 + blury[x, y, z+1]*4 + blury[x, y, z+2]

    # Take trilinear samples to compute the output
    # val and zi are rebound here; the grid definitions above already
    # captured the earlier expressions.
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0/r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi  # fractional part along z
    xf = cast(float_t, x % s_sigma) / s_sigma  # fractional position in the cell
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x/s_sigma  # grid cell coordinates
    yi = y/s_sigma
    interpolated = Func('interpolated')
    # Nested lerps over the eight blurz samples at (xi|xi+1, yi|yi+1,
    # zi|zi+1): along x first, then y, then z.
    interpolated[x, y] = lerp(lerp(lerp(blurz[xi, yi, zi], blurz[xi+1, yi, zi], xf),
                                   lerp(blurz[xi, yi+1, zi], blurz[xi+1, yi+1, zi], xf), yf),
                              lerp(lerp(blurz[xi, yi, zi+1], blurz[xi+1, yi, zi+1], xf),
                                   lerp(blurz[xi, yi+1, zi+1], blurz[xi+1, yi+1, zi+1], xf), yf), zf)

    # Normalize
    # (the entry at trailing index 0 divided by the entry at trailing index 1).
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0]/interpolated[x, y, 1]

    # The schedule selector is hard-coded to 1, so as written only the CPU
    # branch below runs; the other branches are kept as alternatives.
    schedule = 1
    if schedule == 0:
        pass
    elif schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        # gridz is not used in the code visible here.
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    else:
        raise ValueError
    
    # Reference schedule as text; it lists the same calls as the CPU branch
    # above.  It is only exposed through the locals() returned below.
    tune_ref_schedules = {'human': 'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n' +
                                   'blurx.root().parallel(z).vectorize(x, 4)\n' +
                                   'blury.root().parallel(z).vectorize(x, 4)\n' +
                                   'blurz.root().parallel(z).vectorize(x, 4)\n' +
                                   'smoothed.root().parallel(y).vectorize(x, 4)\n'}
    # GPU
    # This text replaces the 'human' entry when autotune.is_cuda() is true.
    gpu_human = 'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n' + \
                'blurx.root().cudaTile(x, y, 8, 8)\n' + \
                'blury.root().cudaTile(x, y, 8, 8)\n' + \
                'blurz.root().cudaTile(x, y, 8, 8)\n' + \
                'smoothed.root().cudaTile(x, y, 8, 8)\n'
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human


    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)
    #print tune_constraints
    
    #autotune.print_tunables(smoothed)
    #for i in range(123,10000):
    #    random.seed(i)
    #    print '-'*40
    #    print 'Schedule %d'%i
    #    p = autotune.AutotuneParams()
    #    print valid_schedules.random_schedule(smoothed, p.min_depth, p.max_depth)

#    std::vector<Func::Arg> args;
#    args.push_back(r_sigma);
#    args.push_back(input);
#    smoothed.compileToFile("bilateral_grid", args);
    return (input, smoothed, None, locals())

コード例 #7

0

ファイルを表示

ファイル: local_laplacian.py プロジェクト: xiaonanzzz/Halide

def filter_func(J=8, dtype=UInt(16), use_uniforms=False):
    """Build the local Laplacian pipeline and its reference schedules.

    Args:
        J: Number of pyramid levels; every pyramid list below has J entries.
        dtype: Element type passed to UniformImage for the input and used
            for the final cast of the output; dtype.maxval() scales values
            to and from floating point.
        use_uniforms: If true, levels/alpha/beta are created as Uniform
            objects; otherwise they are the plain constants 8, 1.0 and 1.0.

    Returns:
        The tuple (input, output, None, locals()).  Because locals() is
        returned, every local name defined here is part of the result, so
        locals must not be renamed or removed casually.
    """

    # One-element lists so the nested helpers can increment them in place;
    # they give each helper call a fresh suffix for its Func names.
    downsample_counter = [0]
    upsample_counter = [0]

    # NOTE: downsample/upsample read x and y from the enclosing scope; those
    # are assigned further down, before either helper is first called.
    def downsample(f):
        """Return a new Func that samples f at 2*x, 2*y with 1-3-3-1 weights (divided by 8) along x, then y."""
        downx, downy = Func('downx%d' % downsample_counter[0]), Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample(f):
        """Return a new Func that reads f at x/2, y/2, blending each sample (0.75) with a neighbour (0.25) along x, then y."""
        upx, upy = Func('upx%d' % upsample_counter[0]), Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        # (x/2)-1+2*(x%2) selects the neighbour of x/2 on the side given by x%2.
        upx[x, y] = 0.25 * f[(x / 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x / 2, y]
        upy[x,
            y] = 0.25 * upx[x,
                            (y / 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y / 2]

        return upy

    # Filter parameters: either Uniform objects or fixed Python constants.
    if use_uniforms:
        levels = Uniform(int_t, 'levels', 8)
        alpha = Uniform(float_t, 'alpha', 1.0)  #1.0)
        beta = Uniform(float_t, 'beta', 1.0)
    else:
        levels = 8
        alpha = 1.0
        beta = 1.0
    input = UniformImage(dtype, 3, 'input')

    x = Var('x')
    y = Var('y')
    c = Var('c')
    k = Var('k')

    # Remapping function, defined over x/256.0.
    fx = cast(float_t, x / 256.0)
    remap = Func('remap')
    remap[x] = (alpha / cast(float_t, levels - 1)) * fx * exp(-fx * fx / 2.0)

    # Convert the input to floating point, scaled by the maximum of dtype.
    floating = Func('floating')
    floating[x, y, c] = cast(float_t, input[x, y, c]) / float(dtype.maxval())

    # Clamp x and y to [0, width-1] and [0, height-1] before reading.
    clamped = Func('clamped')
    clamped[x, y, c] = floating[
        clamp(x, cast(int_t, 0), cast(int_t,
                                      input.width() - 1)),
        clamp(y, cast(int_t, 0), cast(int_t,
                                      input.height() - 1)), c]
    # Weighted sum of channels 0, 1 and 2.
    gray = Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Pyramid of the remapped gray image, with k as a third dimension;
    # each level is a downsample of the previous one.
    gPyramid = [Func('gPyramid%d' % i) for i in range(J)]
    idx = gray[x, y] * cast(float_t, levels - 1) * 256.0
    idx = clamp(cast(int_t, idx), cast(int_t, 0),
                cast(int_t, (levels - 1) * 256))
    gPyramid[0][x, y, k] = beta * gray[x, y] + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Each level is gPyramid[j] minus the upsampled next level.  The last
    # list entry is replaced by the gPyramid[J-1] object itself, so the Func
    # named 'lPyramid<J-1>' created above is discarded.
    lPyramid = [Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Pyramid of the gray input; entry 0 is the gray Func itself.
    inGPyramid = [Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample(inGPyramid[j - 1])[x, y]

    # Blend the two lPyramid slices li and li+1 using the fractional part lf
    # of the scaled input level.
    outLPyramid = [Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        level = inGPyramid[j][x, y] * cast(float_t, levels - 1)
        li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels - 2))
        lf = level - cast(float_t, li)
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Output pyramid, built from level J-2 down to 0 by adding the upsampled
    # next level; the last entry is the outLPyramid[J-1] object itself.
    outGPyramid = [Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    color = Func('color')
    #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y]
    # The active form adds 0.01 to both numerator and denominator terms
    # (presumably to avoid dividing by a zero gray value -- TODO confirm).
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] +
                                             0.01) / (gray[x, y] + 0.01)

    # Clamp to [0, 1], rescale by dtype.maxval() and cast back to dtype.
    output = Func('output')
    output[x, y, c] = cast(
        dtype,
        clamp(color[x, y, c], cast(float_t, 0.0), cast(float_t, 1.0)) *
        float(dtype.maxval()))

    # NOTE(review): root_all is defined elsewhere; its effect on output is
    # not visible here.
    root_all(output)
    #import autotune
    #print autotune.root_all_str(output)
    #autotune.print_root_all(output)

    # Default reference schedule, as newline-separated text, one line per Func.
    human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n'
    for j in range(J):
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % inGPyramid[
            j].name()
        if j > 0:
            human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n' % j
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % outGPyramid[
            j].name()

    # When autotune.is_cuda() is true the schedule is rebuilt from scratch
    # using cudaTile; levels above 3 use 2x2 blocks instead of 32x32.
    if autotune.is_cuda():
        human_schedule = 'remap.root()\n'
        human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n'
        for j in range(J):
            blockw = blockh = 32
            if j > 3:
                blockw = blockh = 2
            # inGPyramid[0] is the gray Func, so level 0 is named 'gray'.
            if j == 0:
                human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n' % (
                    blockw, blockh)
            else:
                human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)
            human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                j, blockw, blockh)
            # outGPyramid[J-1] is the outLPyramid[J-1] Func, hence the name.
            if j == J - 1:
                human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)
            else:
                human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)

    # Special variables interpreted by autotuner
    # (they are only exposed through the locals() returned below).
    tune_ref_schedules = {'human': human_schedule}
    tune_constraints = autotune.bound_recursive(output, 'c', 0, 3)

    #print '# schedules:'
    #import math
    #print math.log(autotune.lower_bound_schedules(output),10)
    #sys.exit(1)

    return (input, output, None, locals())