Ejemplo n.º 1
0
def filter_func(dtype=Float(32), use_uniforms=False, in_filename=os.path.join(inputs_dir(), 'interpolate_large.png')):
    """Fast interpolation using a pyramid.

    Builds a ``levels``-deep pyramid of the input: every level is the previous
    one filtered with 1-2-1 weights and decimated by two, first along x and
    then along y.  The levels are then recombined from the coarsest to the
    finest, each finer level being filled in with the upsampled coarser
    result weighted by ``1 - downsampled[l][3, x, y]``.

    Args:
        dtype: element type of the 3-dimensional ``input`` image.
        use_uniforms: not read by this function.
        in_filename: image path exposed to the autotuner via ``tune_in_images``.

    Returns:
        The tuple ``(input, final, evaluate, locals())``.  ``evaluate(in_png)``
        realizes ``final`` at the size of ``in_png`` with 3 channels and prints
        the elapsed time.  The ``locals()`` dict carries the ``tune_*``
        variables, so local variable names in this function are part of its
        result and must not be renamed.
    """

    input = UniformImage(dtype, 3, 'input')
    x = Var('x')
    y = Var('y')
    c = Var('c')
    levels = 10

    downsampled = [Func('d%d'%i) for i in range(levels)]
    interpolated = [Func('i%d'%i) for i in range(levels)]

    # Clamp the coordinates to the image so out-of-range reads repeat the edge.
    # Note the index order changes here: input is [x, y, c], clamped is [c, x, y].
    clamped = Func('clamped')
    clamped[c, x, y] = input[clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1), c]

    # Channels below 3 are multiplied by channel 3; channel 3 is passed through.
    # NOTE(review): channel 3 is presumably alpha (premultiplication) - confirm.
    downsampled[0][c,x,y] = select(c<3, clamped[c,x,y] * clamped[3,x,y], clamped[3,x,y])

    # Downsample: 1-2-1 filter (scaled by 0.25) in x, then the same in y.
    # Index 0 of downx is a placeholder because level 0 has no parent level.
    downx = [None] + [Func('dx%d'%l) for l in range(1,levels)]
    for l in range(1, levels):
        downx[l][c,x,y] = (downsampled[l-1][c,x*2-1,y] + 2.0 * downsampled[l-1][c,x*2,y] + downsampled[l-1][c,x*2+1,y]) * 0.25
        downsampled[l][c,x,y] = (downx[l][c,x,y*2-1] + 2.0 * downx[l][c,x,y*2] + downx[l][c,x,y*2+1]) * 0.25

    upsampled = [Func('u%d'%l) for l in range(levels-1)]
    upsampledx = [Func('ux%d'%l) for l in range(levels-1)]

    # Recombine from the coarsest level upwards.
    interpolated[levels-1][c,x,y] = downsampled[levels-1][c,x,y]
    for l in range(levels-1)[::-1]:
        # Average of two neighbouring coarse samples, in x and then in y.
        upsampledx[l][c,x,y] = 0.5 * (interpolated[l+1][c, x/2 + (x%2),y] + interpolated[l+1][c,x/2,y])
        upsampled[l][c,x,y] = 0.5 * (upsampledx[l][c, x, y/2 + (y%2)] + upsampledx[l][c,x,y/2])
        interpolated[l][c,x,y] = downsampled[l][c,x,y] + (1.0 - downsampled[l][3,x,y]) * upsampled[l][c,x,y]

    # Divide by channel 3 and switch back to [x, y, c] index order.
    final = Func('final')
    final[x,y,c] = interpolated[0][c,x,y] / interpolated[0][3,x,y]

    def evaluate(in_png):
        """Realize ``final`` at the size of ``in_png`` and report the time taken."""
        T0 = time.time()

        out = final.realize(in_png.width(), in_png.height(), 3)
        # Parenthesised single-argument form prints the same text on Python 2
        # and is also valid Python 3.
        print('Interpolated in %.5f secs' % (time.time()-T0))

        return out

    # Special tuning variables interpreted by the autotuner
    tune_out_dims = (1408, 1408, 3)
    tune_in_images = [in_filename]
    tune_image_ext = '.ppm'

    # Hand-written reference schedule; covers d1..d(levels-2) and i1..i(levels-2).
    human_schedule = 'final.root().parallel(y).bound(c, 0, 3)\n'
    for i in range(1, levels-1):
        human_schedule += 'd%d.root().vectorize(c, 4).parallel(y)\n'%i
        human_schedule += 'i%d.root().vectorize(c, 4).parallel(y)\n'%i

    tune_ref_schedules = {'human': human_schedule}
    # Bound c to 4 channels everywhere, then patch the entry for `final`, which
    # only has 3 output channels.
    tune_constraints = autotune.bound_recursive(final, 'c', 0, 4).replace('final.bound(c,0,4)','final.bound(c,0,3)')
    print(tune_constraints)

    autotune.Schedule.fromstring(final, human_schedule).apply()

    return (input, final, evaluate, locals())
Ejemplo n.º 2
0
def filter_func(J=8, dtype=UInt(16), use_uniforms=False):
    """Local Laplacian.

    Builds a J-level pyramid of remapped grayscale images (one slice per
    intensity level ``k``), takes level-to-level differences, picks for every
    pixel a linear blend of two adjacent ``k`` slices based on the pixel's own
    gray value, collapses the resulting pyramid, and re-applies colour.

    Args:
        J: number of pyramid levels (length of every pyramid list below).
        dtype: element type of the input image and of the final output cast.
        use_uniforms: when true, ``levels``, ``alpha`` and ``beta`` are
            ``Uniform`` objects instead of Python constants.

    Returns:
        The tuple ``(input, output, None, locals())``.  The ``locals()`` dict
        carries the ``tune_*`` variables read by the autotuner, so local names
        in this function are part of its result.
    """

    # One-element lists so the nested helpers below can increment them in place;
    # the counters give every helper-created Func a distinct name.
    downsample_counter=[0] 
    upsample_counter=[0]
    
    def downsample(f):
        """Return a new Func holding ``f`` filtered with 1-3-3-1 weights (divided by 8) and decimated by two, in x then y."""
        downx, downy = Func('downx%d'%downsample_counter[0]), Func('downy%d'%downsample_counter[0])
        downsample_counter[0] += 1
        
        downx[x,y] = (f[2*x-1, y] + 3.0*(f[2*x,y]+f[2*x+1,y]) + f[2*x+2,y])/8.0
        downy[x,y] = (downx[x,2*y-1] + 3.0*(downx[x,2*y]+downx[x,2*y+1]) + downx[x,2*y+2])/8.0

        return downy
    
    def upsample(f):
        """Return a new Func holding ``f`` upsampled by two in x then y, blending two coarse samples with weights 0.25 and 0.75."""
        upx, upy = Func('upx%d'%upsample_counter[0]), Func('upy%d'%upsample_counter[0])
        upsample_counter[0] += 1
        
        # (x/2)-1+2*(x%2) selects the coarse neighbour at offset -1 for even x and +1 for odd x.
        upx[x,y] = 0.25 * f[(x/2)-1+2*(x%2),y] + 0.75 * f[x/2,y]
        upy[x,y] = 0.25 * upx[x, (y/2) - 1 + 2*(y%2)] + 0.75 * upx[x,y/2]
        
        return upy
    
    # levels: number of intensity slices (k); alpha/beta scale the remap term and the gray term.
    # NOTE(review): the last Uniform argument is presumably the default value - confirm.
    if use_uniforms:
        levels = Uniform(int_t, 'levels', 8)
        alpha = Uniform(float_t, 'alpha', 1.0) #1.0)
        beta = Uniform(float_t, 'beta', 1.0)
    else:
        levels = 8
        alpha = 1.0
        beta = 1.0
    input = UniformImage(dtype, 3, 'input')
    
    x = Var('x')
    y = Var('y')
    c = Var('c')
    k = Var('k')
    
    # Remapping function, tabulated over x with 256 steps per unit of fx.
    fx = cast(float_t, x/256.0)
    remap = Func('remap')
    remap[x] = (alpha/cast(float_t, levels-1))*fx*exp(-fx*fx/2.0)
    
    # Convert the input to floating point, scaled by the maximum value of dtype.
    floating = Func('floating')
    floating[x,y,c] = cast(float_t, input[x,y,c])/float(dtype.maxval())
    
    # Clamp coordinates to the image so out-of-range reads repeat the edge.
    clamped = Func('clamped')
    clamped[x,y,c] = floating[clamp(x,cast(int_t,0),cast(int_t,input.width()-1)),
                              clamp(y,cast(int_t,0),cast(int_t,input.height()-1)), c]
    # Weighted sum of channels 0, 1 and 2.
    gray = Func('gray')
    gray[x,y] = 0.299*clamped[x,y,0]+0.587*clamped[x,y,1]+0.114*clamped[x,y,2]
    
    # Pyramid of the remapped gray image; level 0 looks up remap relative to slice k.
    gPyramid = [Func('gPyramid%d'%i) for i in range(J)]
    idx = gray[x,y]*cast(float_t, levels-1)*256.0
    idx = clamp(cast(int_t, idx), cast(int_t, 0), cast(int_t, (levels-1)*256))
    gPyramid[0][x,y,k] = beta*gray[x,y] + remap[idx-256*k]
    for j in range(1,J):
        gPyramid[j][x,y,k] = downsample(gPyramid[j-1])[x,y,k]

    # Difference pyramid: each level minus the upsampled next-coarser level.
    # The coarsest entry is the gPyramid Func itself (the list slot is replaced).
    lPyramid = [Func('lPyramid%d'%i) for i in range(J)]
    lPyramid[J-1] = gPyramid[J-1]
    for j in range(J-1)[::-1]:
        lPyramid[j][x,y,k] = gPyramid[j][x,y,k] - upsample(gPyramid[j+1])[x,y,k]
    
    # Pyramid of the plain gray image; level 0 is the gray Func itself.
    inGPyramid = [Func('inGPyramid%d'%i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1,J):
        inGPyramid[j][x,y] = downsample(inGPyramid[j-1])[x,y]
    
    # Per pixel, blend linearly between slices li and li+1 of lPyramid;
    # li is clamped to [0, levels-2] so li+1 stays a valid slice.
    outLPyramid = [Func('outLPyramid%d'%i) for i in range(J)]
    for j in range(J):
        level = inGPyramid[j][x,y]*cast(float_t, levels-1)
        li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels-2))
        lf = level - cast(float_t, li)
        outLPyramid[j][x,y] = (1.0-lf)*lPyramid[j][x,y,li] + lf*lPyramid[j][x,y,li+1]
    
    # Collapse: upsample the coarser result and add this level's differences.
    # The coarsest entry is the outLPyramid Func itself.
    outGPyramid = [Func('outGPyramid%d'%i) for i in range(J)]
    outGPyramid[J-1] = outLPyramid[J-1]
    for j in range(J-1)[::-1]:
        outGPyramid[j][x,y] = upsample(outGPyramid[j+1])[x,y] + outLPyramid[j][x,y]
    
    # Re-apply colour as a ratio to gray.
    # NOTE(review): the 0.01 offsets presumably guard against division by zero - confirm.
    color = Func('color')
    #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y]
    color[x,y,c] = outGPyramid[0][x,y] * (clamped[x,y,c]+0.01) / (gray[x,y]+0.01)
    
    # Clamp to [0, 1], scale back to the range of dtype and cast.
    output = Func('output')
    output[x,y,c] = cast(dtype, clamp(color[x,y,c], cast(float_t,0.0), cast(float_t,1.0))*float(dtype.maxval()))
    
    # root_all is defined elsewhere; presumably it schedules every Func reachable from output as root - TODO confirm.
    root_all(output)
    #import autotune
    #print autotune.root_all_str(output)
    #autotune.print_root_all(output)
    
    # Hand-written CPU reference schedule, as text for the autotuner.
    # .name() is used for the pyramids whose end slots were replaced by other Funcs above.
    human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n'
    for j in range(J):
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%inGPyramid[j].name()
        if j > 0:
            human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n'%j
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%outGPyramid[j].name()
    
    # On CUDA the CPU schedule is discarded and rebuilt from cudaTile calls.
    if autotune.is_cuda():
        human_schedule = 'remap.root()\n'
        human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n'
        for j in range(J):
            # 32x32 tiles for levels 0-3, 2x2 tiles for the coarser levels.
            blockw = blockh = 32
            if j > 3:
                blockw = blockh = 2
            # inGPyramid[0] is the gray Func, so it is scheduled under that name.
            if j == 0:
                human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n'%(blockw, blockh)
            else:
                human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            # outGPyramid[J-1] is the outLPyramid Func, so it is scheduled under that name.
            if j == J-1:
                human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            else:
                human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)

    # Special variables interpreted by autotuner
    tune_ref_schedules = {'human': human_schedule}
    tune_constraints = autotune.bound_recursive(output, 'c', 0, 3)

    #print '# schedules:'
    #import math
    #print math.log(autotune.lower_bound_schedules(output),10)
    #sys.exit(1)
    
    return (input, output, None, locals())
Ejemplo n.º 3
0
def filter_func(dtype=UInt(16), use_uniforms=False):
    """Bilateral-grid smoothing pipeline (see the commented-out ``compileToFile("bilateral_grid", ...)`` at the end).

    Accumulates channel 0 of the input into a coarse grid indexed by position
    and intensity, blurs the grid along x, y and z, samples it back
    trilinearly at every pixel, and divides the blurred value sum by the
    blurred sample count.

    Args:
        dtype: not read by this function; the input image is always ``float_t``.
        use_uniforms: when true, ``r_sigma`` is a ``Uniform`` instead of a
            Python float.

    Returns:
        The tuple ``(input, smoothed, None, locals())``.  The ``locals()`` dict
        carries the ``tune_*`` variables read by the autotuner, so local names
        in this function are part of its result.
    """
    def lerp(a, b, alpha):
        """Linear interpolation: ``a`` at alpha == 0, ``b`` at alpha == 1."""
        return (1.0 - alpha) * a + alpha * b

    input = UniformImage(float_t, 3, 'input')
    # r_sigma: width of one intensity bin; s_sigma: grid cell size in pixels.
    if use_uniforms:
        r_sigma = Uniform(float_t, 0.1)
    else:
        r_sigma = 0.1
    s_sigma = 8

    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Channel 0 of the input, with coordinates clamped to the image bounds.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0,
                                input.width() - 1),
                          clamp(y, 0,
                                input.height() - 1), 0]

    # Construct the grid: reduce over an s_sigma x s_sigma window around each cell.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma / 2,
                  y * s_sigma + r.y - s_sigma / 2]
    val = clamp(val, 0.0, 1.0)
    # Intensity bin of the sample; the +0.5 rounds to nearest, assuming the cast truncates.
    zi = cast(int_t, val * (1.0 / r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    # c == 0 accumulates the sample values, any other c accumulates a count of 1.0 per sample.
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter
    # Weights are 1-4-6-4-1 and are not normalised; the same weights reach both
    # the value sum and the count that are divided below.
    # NOTE(review): these Funcs use three indices while grid has four; presumably
    # the DSL supplies the trailing c index implicitly - confirm.
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = grid[x - 2, y, z] + grid[x - 1, y, z] * 4 + grid[
        x, y, z] * 6 + grid[x + 1, y, z] * 4 + grid[x + 2, y, z]
    blury[x, y, z] = blurx[x, y - 2, z] + blurx[x, y - 1, z] * 4 + blurx[
        x, y, z] * 6 + blurx[x, y + 1, z] * 4 + blurx[x, y + 2, z]
    blurz[x, y, z] = blury[x, y, z - 2] + blury[x, y, z - 1] * 4 + blury[
        x, y, z] * 6 + blury[x, y, z + 1] * 4 + blury[x, y, z + 2]

    # Take trilinear samples to compute the output
    # val and zi are rebound here for the per-pixel lookup (no +0.5 this time:
    # zi is the lower bin and zf the fractional position above it).
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0 / r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi
    xf = cast(float_t, x % s_sigma) / s_sigma
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = Func('interpolated')
    interpolated[x, y] = lerp(
        lerp(lerp(blurz[xi, yi, zi], blurz[xi + 1, yi, zi], xf),
             lerp(blurz[xi, yi + 1, zi], blurz[xi + 1, yi + 1, zi], xf), yf),
        lerp(
            lerp(blurz[xi, yi, zi + 1], blurz[xi + 1, yi, zi + 1], xf),
            lerp(blurz[xi, yi + 1, zi + 1], blurz[xi + 1, yi + 1, zi + 1], xf),
            yf), zf)

    # Normalize
    # Index 0 is the accumulated value, index 1 the accumulated count; the
    # result does not depend on c, so every output channel gets the same value.
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0] / interpolated[x, y, 1]

    # Schedule selector, hard-coded: 0 = none, 1 = CPU, 2 = GPU.
    schedule = 1
    if schedule == 0:
        pass
    elif schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    else:
        raise ValueError

    # Special variables interpreted by the autotuner: the CPU schedule above in text form.
    tune_ref_schedules = {
        'human':
        'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n' +
        'blurx.root().parallel(z).vectorize(x, 4)\n' +
        'blury.root().parallel(z).vectorize(x, 4)\n' +
        'blurz.root().parallel(z).vectorize(x, 4)\n' +
        'smoothed.root().parallel(y).vectorize(x, 4)\n'
    }
    # GPU
    # Text form of the schedule == 2 branch; replaces the CPU reference on CUDA.
    gpu_human = 'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n' + \
                'blurx.root().cudaTile(x, y, 8, 8)\n' + \
                'blury.root().cudaTile(x, y, 8, 8)\n' + \
                'blurz.root().cudaTile(x, y, 8, 8)\n' + \
                'smoothed.root().cudaTile(x, y, 8, 8)\n'
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human

    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)
    #print tune_constraints

    #autotune.print_tunables(smoothed)
    #for i in range(123,10000):
    #    random.seed(i)
    #    print '-'*40
    #    print 'Schedule %d'%i
    #    p = autotune.AutotuneParams()
    #    print valid_schedules.random_schedule(smoothed, p.min_depth, p.max_depth)

    #    std::vector<Func::Arg> args;
    #    args.push_back(r_sigma);
    #    args.push_back(input);
    #    smoothed.compileToFile("bilateral_grid", args);
    return (input, smoothed, None, locals())
Ejemplo n.º 4
0
def filter_func(dtype=UInt(16), use_uniforms=False):
    """Bilateral-grid smoothing pipeline (see the commented-out ``compileToFile("bilateral_grid", ...)`` at the end).

    Accumulates channel 0 of the input into a coarse grid indexed by position
    and intensity, blurs the grid along x, y and z, samples it back
    trilinearly at every pixel, and divides the blurred value sum by the
    blurred sample count.

    Args:
        dtype: not read by this function; the input image is always ``float_t``.
        use_uniforms: when true, ``r_sigma`` is a ``Uniform`` instead of a
            Python float.

    Returns:
        The tuple ``(input, smoothed, None, locals())``.  The ``locals()`` dict
        carries the ``tune_*`` variables read by the autotuner, so local names
        in this function are part of its result.
    """
    def lerp(a, b, alpha):
        """Linear interpolation: ``a`` at alpha == 0, ``b`` at alpha == 1."""
        return (1.0-alpha)*a + alpha*b

    input = UniformImage(float_t, 3, 'input')
    # r_sigma: width of one intensity bin; s_sigma: grid cell size in pixels.
    if use_uniforms:
        r_sigma = Uniform(float_t, 0.1)
    else:
        r_sigma = 0.1
    s_sigma = 8
    
    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Channel 0 of the input, with coordinates clamped to the image bounds.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0, input.width()-1),
                          clamp(y, 0, input.height()-1),0]

    # Construct the grid: reduce over an s_sigma x s_sigma window around each cell.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2]
    val = clamp(val, 0.0, 1.0)
    # Intensity bin of the sample; the +0.5 rounds to nearest, assuming the cast truncates.
    zi = cast(int_t, val * (1.0/r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    # c == 0 accumulates the sample values, any other c accumulates a count of 1.0 per sample.
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter
    # Weights are 1-4-6-4-1 and are not normalised; the same weights reach both
    # the value sum and the count that are divided below.
    # NOTE(review): these Funcs use three indices while grid has four; presumably
    # the DSL supplies the trailing c index implicitly - confirm.
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = grid[x-2, y, z] + grid[x-1, y, z]*4 + grid[x, y, z]*6 + grid[x+1, y, z]*4 + grid[x+2, y, z]
    blury[x, y, z] = blurx[x, y-2, z] + blurx[x, y-1, z]*4 + blurx[x, y, z]*6 + blurx[x, y+1, z]*4 + blurx[x, y+2, z]
    blurz[x, y, z] = blury[x, y, z-2] + blury[x, y, z-1]*4 + blury[x, y, z]*6 + blury[x, y, z+1]*4 + blury[x, y, z+2]

    # Take trilinear samples to compute the output
    # val and zi are rebound here for the per-pixel lookup (no +0.5 this time:
    # zi is the lower bin and zf the fractional position above it).
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0/r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi
    xf = cast(float_t, x % s_sigma) / s_sigma
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x/s_sigma
    yi = y/s_sigma
    interpolated = Func('interpolated')
    interpolated[x, y] = lerp(lerp(lerp(blurz[xi, yi, zi], blurz[xi+1, yi, zi], xf),
                                   lerp(blurz[xi, yi+1, zi], blurz[xi+1, yi+1, zi], xf), yf),
                              lerp(lerp(blurz[xi, yi, zi+1], blurz[xi+1, yi, zi+1], xf),
                                   lerp(blurz[xi, yi+1, zi+1], blurz[xi+1, yi+1, zi+1], xf), yf), zf)

    # Normalize
    # Index 0 is the accumulated value, index 1 the accumulated count; the
    # result does not depend on c, so every output channel gets the same value.
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0]/interpolated[x, y, 1]

    # Schedule selector, hard-coded: 0 = none, 1 = CPU, 2 = GPU.
    schedule = 1
    if schedule == 0:
        pass
    elif schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    else:
        raise ValueError
    
    # Special variables interpreted by the autotuner: the CPU schedule above in text form.
    tune_ref_schedules = {'human': 'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n' +
                                   'blurx.root().parallel(z).vectorize(x, 4)\n' +
                                   'blury.root().parallel(z).vectorize(x, 4)\n' +
                                   'blurz.root().parallel(z).vectorize(x, 4)\n' +
                                   'smoothed.root().parallel(y).vectorize(x, 4)\n'}
    # GPU
    # Text form of the schedule == 2 branch; replaces the CPU reference on CUDA.
    gpu_human = 'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n' + \
                'blurx.root().cudaTile(x, y, 8, 8)\n' + \
                'blury.root().cudaTile(x, y, 8, 8)\n' + \
                'blurz.root().cudaTile(x, y, 8, 8)\n' + \
                'smoothed.root().cudaTile(x, y, 8, 8)\n'
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human


    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)
    #print tune_constraints
    
    #autotune.print_tunables(smoothed)
    #for i in range(123,10000):
    #    random.seed(i)
    #    print '-'*40
    #    print 'Schedule %d'%i
    #    p = autotune.AutotuneParams()
    #    print valid_schedules.random_schedule(smoothed, p.min_depth, p.max_depth)

#    std::vector<Func::Arg> args;
#    args.push_back(r_sigma);
#    args.push_back(input);
#    smoothed.compileToFile("bilateral_grid", args);
    return (input, smoothed, None, locals())
Ejemplo n.º 5
0
def filter_func(J=8, dtype=UInt(16), use_uniforms=False):
    """Local Laplacian.

    Builds a J-level pyramid of remapped grayscale images (one slice per
    intensity level ``k``), takes level-to-level differences, picks for every
    pixel a linear blend of two adjacent ``k`` slices based on the pixel's own
    gray value, collapses the resulting pyramid, and re-applies colour.

    Args:
        J: number of pyramid levels (length of every pyramid list below).
        dtype: element type of the input image and of the final output cast.
        use_uniforms: when true, ``levels``, ``alpha`` and ``beta`` are
            ``Uniform`` objects instead of Python constants.

    Returns:
        The tuple ``(input, output, None, locals())``.  The ``locals()`` dict
        carries the ``tune_*`` variables read by the autotuner, so local names
        in this function are part of its result.
    """

    # One-element lists so the nested helpers below can increment them in place;
    # the counters give every helper-created Func a distinct name.
    downsample_counter = [0]
    upsample_counter = [0]

    def downsample(f):
        """Return a new Func holding ``f`` filtered with 1-3-3-1 weights (divided by 8) and decimated by two, in x then y."""
        downx, downy = Func('downx%d' % downsample_counter[0]), Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) +
                       downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample(f):
        """Return a new Func holding ``f`` upsampled by two in x then y, blending two coarse samples with weights 0.25 and 0.75."""
        upx, upy = Func('upx%d' % upsample_counter[0]), Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        # (x / 2) - 1 + 2 * (x % 2) selects the coarse neighbour at offset -1 for even x and +1 for odd x.
        upx[x, y] = 0.25 * f[(x / 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x / 2, y]
        upy[x,
            y] = 0.25 * upx[x,
                            (y / 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y / 2]

        return upy

    # levels: number of intensity slices (k); alpha/beta scale the remap term and the gray term.
    # NOTE(review): the last Uniform argument is presumably the default value - confirm.
    if use_uniforms:
        levels = Uniform(int_t, 'levels', 8)
        alpha = Uniform(float_t, 'alpha', 1.0)  #1.0)
        beta = Uniform(float_t, 'beta', 1.0)
    else:
        levels = 8
        alpha = 1.0
        beta = 1.0
    input = UniformImage(dtype, 3, 'input')

    x = Var('x')
    y = Var('y')
    c = Var('c')
    k = Var('k')

    # Remapping function, tabulated over x with 256 steps per unit of fx.
    fx = cast(float_t, x / 256.0)
    remap = Func('remap')
    remap[x] = (alpha / cast(float_t, levels - 1)) * fx * exp(-fx * fx / 2.0)

    # Convert the input to floating point, scaled by the maximum value of dtype.
    floating = Func('floating')
    floating[x, y, c] = cast(float_t, input[x, y, c]) / float(dtype.maxval())

    # Clamp coordinates to the image so out-of-range reads repeat the edge.
    clamped = Func('clamped')
    clamped[x, y, c] = floating[
        clamp(x, cast(int_t, 0), cast(int_t,
                                      input.width() - 1)),
        clamp(y, cast(int_t, 0), cast(int_t,
                                      input.height() - 1)), c]
    # Weighted sum of channels 0, 1 and 2.
    gray = Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Pyramid of the remapped gray image; level 0 looks up remap relative to slice k.
    gPyramid = [Func('gPyramid%d' % i) for i in range(J)]
    idx = gray[x, y] * cast(float_t, levels - 1) * 256.0
    idx = clamp(cast(int_t, idx), cast(int_t, 0),
                cast(int_t, (levels - 1) * 256))
    gPyramid[0][x, y, k] = beta * gray[x, y] + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Difference pyramid: each level minus the upsampled next-coarser level.
    # The coarsest entry is the gPyramid Func itself (the list slot is replaced).
    lPyramid = [Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Pyramid of the plain gray image; level 0 is the gray Func itself.
    inGPyramid = [Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample(inGPyramid[j - 1])[x, y]

    # Per pixel, blend linearly between slices li and li+1 of lPyramid;
    # li is clamped to [0, levels-2] so li+1 stays a valid slice.
    outLPyramid = [Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        level = inGPyramid[j][x, y] * cast(float_t, levels - 1)
        li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels - 2))
        lf = level - cast(float_t, li)
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Collapse: upsample the coarser result and add this level's differences.
    # The coarsest entry is the outLPyramid Func itself.
    outGPyramid = [Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Re-apply colour as a ratio to gray.
    # NOTE(review): the 0.01 offsets presumably guard against division by zero - confirm.
    color = Func('color')
    #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y]
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] +
                                             0.01) / (gray[x, y] + 0.01)

    # Clamp to [0, 1], scale back to the range of dtype and cast.
    output = Func('output')
    output[x, y, c] = cast(
        dtype,
        clamp(color[x, y, c], cast(float_t, 0.0), cast(float_t, 1.0)) *
        float(dtype.maxval()))

    # root_all is defined elsewhere; presumably it schedules every Func reachable from output as root - TODO confirm.
    root_all(output)
    #import autotune
    #print autotune.root_all_str(output)
    #autotune.print_root_all(output)

    # Hand-written CPU reference schedule, as text for the autotuner.
    # .name() is used for the pyramids whose end slots were replaced by other Funcs above.
    human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n'
    for j in range(J):
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % inGPyramid[
            j].name()
        if j > 0:
            human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n' % j
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % outGPyramid[
            j].name()

    # On CUDA the CPU schedule is discarded and rebuilt from cudaTile calls.
    if autotune.is_cuda():
        human_schedule = 'remap.root()\n'
        human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n'
        for j in range(J):
            # 32x32 tiles for levels 0-3, 2x2 tiles for the coarser levels.
            blockw = blockh = 32
            if j > 3:
                blockw = blockh = 2
            # inGPyramid[0] is the gray Func, so it is scheduled under that name.
            if j == 0:
                human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n' % (
                    blockw, blockh)
            else:
                human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)
            human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                j, blockw, blockh)
            # outGPyramid[J - 1] is the outLPyramid Func, so it is scheduled under that name.
            if j == J - 1:
                human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)
            else:
                human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)

    # Special variables interpreted by autotuner
    tune_ref_schedules = {'human': human_schedule}
    tune_constraints = autotune.bound_recursive(output, 'c', 0, 3)

    #print '# schedules:'
    #import math
    #print math.log(autotune.lower_bound_schedules(output),10)
    #sys.exit(1)

    return (input, output, None, locals())
Ejemplo n.º 6
0
def filter_func(result_type=UInt(8), schedule=0, use_uniforms=False):
    x, y, tx, ty, c = Var('x'), Var('y'), Var('tx'), Var('ty'), Var('c')
    counter_interleave_x = [0]
    counter_interleave_y = [0]
    
    def hot_pixel_suppression(input):
        a = max(max(input[x-2, y], input[x+2, y]),
                max(input[x, y-2], input[x, y+2]))
        b = min(min(input[x-2, y], input[x+2, y]),
                min(input[x, y-2], input[x, y+2]))
        
        denoised = Func('denoised')
        denoised[x, y] = clamp(input[x, y], b, a)

        return denoised

    def interleave_x(a, b):
        counter_interleave_x[0] += 1
        out = Func('interleave_x%d'%counter_interleave_x[0])
        out[x, y] = select((x%2)==0, a[x/2, y], b[x/2, y])
        return out

    def interleave_y(a, b):
        counter_interleave_y[0] += 1
        out = Func('interleave_y%d'%counter_interleave_y[0])
        out[x, y] = select((y%2)==0, a[x, y/2], b[x, y/2])
        return out

    def deinterleave(raw):
        # Deinterleave the color channels
        deinterleaved = Func('deinterleaved')

        deinterleaved[x, y, c] = select(c == 0, raw[2*x, 2*y], 
                                 select(c == 1, raw[2*x+1, 2*y],
                                 select(c == 2, raw[2*x, 2*y+1], 
                                                raw[2*x+1, 2*y+1])))
        return deinterleaved
        
    def absd(a, b):
        return select(a > b, a-b, b-a)

    def demosaic(deinterleaved):
        # These are the values we already know from the input
        # x_y = the value of channel x at a site in the input of channel y
        # gb refers to green sites in the blue rows
        # gr refers to green sites in the red rows

        # Give more convenient names to the four channels we know
        r_r, g_gr, g_gb, b_b = Func('r_r'), Func('g_gr'), Func('g_gb'), Func('b_b')
        g_gr[x, y] = deinterleaved[x, y, 0]
        r_r[x, y]  = deinterleaved[x, y, 1]
        b_b[x, y]  = deinterleaved[x, y, 2]
        g_gb[x, y] = deinterleaved[x, y, 3]

        # These are the ones we need to interpolate
        b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b = Func('b_r'), Func('g_r'), Func('b_gr'), Func('r_gr'), Func('b_gb'), Func('r_gb'), Func('r_b'), Func('g_b')

        # First calculate green at the red and blue sites

        # Try interpolating vertically and horizontally. Also compute
        # differences vertically and horizontally. Use interpolation in
        # whichever direction had the smallest difference.
        gv_r  =     (g_gb[x, y-1] + g_gb[x, y])/2
        gvd_r = absd(g_gb[x, y-1], g_gb[x, y])
        gh_r  =     (g_gr[x+1, y] + g_gr[x, y])/2
        ghd_r = absd(g_gr[x+1, y], g_gr[x, y])
        
        g_r[x, y]  = select(ghd_r < gvd_r, gh_r, gv_r)

        gv_b  =     (g_gr[x, y+1] + g_gr[x, y])/2
        gvd_b = absd(g_gr[x, y+1], g_gr[x, y])
        gh_b  =     (g_gb[x-1, y] + g_gb[x, y])/2
        ghd_b = absd(g_gb[x-1, y], g_gb[x, y])

        g_b[x, y]  = select(ghd_b < gvd_b, gh_b, gv_b)

        # Next interpolate red at gr by first interpolating, then
        # correcting using the error green would have had if we had
        # interpolated it in the same way (i.e. add the second derivative
        # of the green channel at the same place).
        correction = g_gr[x, y] - (g_r[x, y] + g_r[x-1, y])/2
        r_gr[x, y] = correction + (r_r[x-1, y] + r_r[x, y])/2

        # Do the same for other reds and blues at green sites
        correction = g_gr[x, y] - (g_b[x, y] + g_b[x, y-1])/2
        b_gr[x, y] = correction + (b_b[x, y] + b_b[x, y-1])/2

        correction = g_gb[x, y] - (g_r[x, y] + g_r[x, y+1])/2
        r_gb[x, y] = correction + (r_r[x, y] + r_r[x, y+1])/2

        correction = g_gb[x, y] - (g_b[x, y] + g_b[x+1, y])/2
        b_gb[x, y] = correction + (b_b[x, y] + b_b[x+1, y])/2

        # Now interpolate diagonally to get red at blue and blue at
        # red. Hold onto your hats; this gets really fancy. We do the
        # same thing as for interpolating green where we try both
        # directions (in this case the positive and negative diagonals),
        # and use the one with the lowest absolute difference. But we
        # also use the same trick as interpolating red and blue at green
        # sites - we correct our interpolations using the second
        # derivative of green at the same sites.
        
        correction = g_b[x, y]  - (g_r[x, y] + g_r[x-1, y+1])/2
        rp_b       = correction + (r_r[x, y] + r_r[x-1, y+1])/2
        rpd_b      = absd(r_r[x, y], r_r[x-1, y+1])

        correction = g_b[x, y]  - (g_r[x-1, y] + g_r[x, y+1])/2
        rn_b       = correction + (r_r[x-1, y] + r_r[x, y+1])/2
        rnd_b      = absd(r_r[x-1, y], r_r[x, y+1])

        r_b[x, y]  = select(rpd_b < rnd_b, rp_b, rn_b)


        # Same thing for blue at red
        correction = g_r[x, y]  - (g_b[x, y] + g_b[x+1, y-1])/2
        bp_r       = correction + (b_b[x, y] + b_b[x+1, y-1])/2
        bpd_r      = absd(b_b[x, y], b_b[x+1, y-1])

        correction = g_r[x, y]  - (g_b[x+1, y] + g_b[x, y-1])/2
        bn_r       = correction + (b_b[x+1, y] + b_b[x, y-1])/2
        bnd_r      = absd(b_b[x+1, y], b_b[x, y-1])

        b_r[x, y]  =  select(bpd_r < bnd_r, bp_r, bn_r)

        # Interleave the resulting channels
        r = interleave_y(interleave_x(r_gr, r_r),
                         interleave_x(r_b, r_gb))
        g = interleave_y(interleave_x(g_gr, g_r),
                         interleave_x(g_b, g_gb))
        b = interleave_y(interleave_x(b_gr, b_r),
                         interleave_x(b_b, b_gb))


        output = Func('demosaic')
        output[x, y, c] = select(c == 0, r[x, y], 
                          select(c == 1, g[x, y], b[x, y]))


        # THE SCHEDULE
        if schedule == 0:
            # optimized for ARM
            # Compute these in chunks over tiles, vectorized by 8
            g_r.chunk(tx).vectorize(x, 8)
            g_b.chunk(tx).vectorize(x, 8)
            r_gr.chunk(tx).vectorize(x, 8)
            b_gr.chunk(tx).vectorize(x, 8)
            r_gb.chunk(tx).vectorize(x, 8)
            b_gb.chunk(tx).vectorize(x, 8)
            r_b.chunk(tx).vectorize(x, 8)
            b_r.chunk(tx).vectorize(x, 8)
            # These interleave in y, so unrolling them in y helps
            r.chunk(tx).vectorize(x, 8).unroll(y, 2)
            g.chunk(tx).vectorize(x, 8).unroll(y, 2)
            b.chunk(tx).vectorize(x, 8).unroll(y, 2)
        elif schedule == 1:
            # optimized for X86
            # Don't vectorize, because sse is bad at 16-bit interleaving
            g_r.chunk(tx)
            g_b.chunk(tx)
            r_gr.chunk(tx)
            b_gr.chunk(tx)
            r_gb.chunk(tx)
            b_gb.chunk(tx)
            r_b.chunk(tx)
            b_r.chunk(tx)
            # These interleave in x and y, so unrolling them helps
            r.chunk(tx).unroll(x, 2).unroll(y, 2)
            g.chunk(tx).unroll(x, 2).unroll(y, 2)
            b.chunk(tx).unroll(x, 2).unroll(y, 2)
        elif schedule == -1:
            # Basic naive schedule
            g_r.root()
            g_b.root()
            r_gr.root()
            b_gr.root()
            r_gb.root()
            b_gb.root()
            r_b.root()
            b_r.root()
            r.root()
            g.root()
            b.root()

        return output

    def color_correct(input, matrix_3200, matrix_7000, kelvin):
        # Get a color matrix by linearly interpolating between two
        # calibrated matrices using inverse kelvin.

        matrix = Func('matrix')
        alpha = (1.0/kelvin - 1.0/3200) / (1.0/7000 - 1.0/3200)
        val =  (matrix_3200[x, y] * alpha + matrix_7000[x, y] * (1 - alpha))
        matrix[x, y] = cast(int_t, val * 256.0) # Q8.8 fixed point
        matrix.root()

        corrected = Func('corrected')
        ir = cast(int_t, input[x, y, 0])
        ig = cast(int_t, input[x, y, 1])
        ib = cast(int_t, input[x, y, 2])

        r = matrix[3, 0] + matrix[0, 0] * ir + matrix[1, 0] * ig + matrix[2, 0] * ib
        g = matrix[3, 1] + matrix[0, 1] * ir + matrix[1, 1] * ig + matrix[2, 1] * ib
        b = matrix[3, 2] + matrix[0, 2] * ir + matrix[1, 2] * ig + matrix[2, 2] * ib

        r = cast(Int(16), r/256)
        g = cast(Int(16), g/256)
        b = cast(Int(16), b/256)
        corrected[x, y, c] = select(c == 0, r,
                             select(c == 1, g, b))

        return corrected

    def apply_curve(input, gamma, contrast):
        # copied from FCam
        curve = Func('curve')

        xf = clamp(cast(float_t, x)/1024.0, 0.0, 1.0)
        g = pow(xf, 1.0/gamma)
        b = 2.0 - pow(2.0, contrast/100.0)
        a = 2.0 - 2.0*b
        z = select(g > 0.5,
                   1.0 - (a*(1.0-g)*(1.0-g) + b*(1.0-g)),
                   a*g*g + b*g)

        val = cast(result_type, clamp(z*256.0, 0.0, 255.0))
        curve[x] = val
        curve.root() # It's a LUT, compute it once ahead of time.

        curved = Func('curved')
        curved[x, y, c] = curve[input[x, y, c]]

        return curved

    def process(raw, matrix_3200, matrix_7000, color_temp, gamma, contrast):
        """Chain the pipeline stages over `raw` and schedule them per `schedule`.

        The stages run in this order: hot_pixel_suppression, deinterleave,
        demosaic, color_correct, apply_curve. The returned Func 'processed'
        is defined over (tx, ty, c) as a copy of the curved result.

        Only the `schedule` values 0, 1 and -1 apply a schedule here; any
        other value leaves the stages unscheduled by this function.
        """
        processed = Func('processed')
        xi = Var('xi')
        yi = Var('yi')

        denoised = hot_pixel_suppression(raw)
        deinterleaved = deinterleave(denoised)
        demosaiced = demosaic(deinterleaved)
        corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp)
        curved = apply_curve(corrected, gamma, contrast)

        processed[tx, ty, c] = curved[tx, ty, c]
        # Bounding the color loop to 0-3 by splitting c is left disabled:
        #co, ci = Var('co'), Var('ci')
        #processed.split(c, co, ci, 3)

        # The intermediate stages that every schedule below touches, in order.
        stages = (denoised, deinterleaved, corrected)

        if schedule == -1:
            # Naive schedule
            for stage in stages:
                stage.root()
            processed.root()
        elif schedule == 0:
            # Compute in chunks over tiles, vectorized by 8 (4 for corrected)
            for stage, width in zip(stages, (8, 8, 4)):
                stage.chunk(tx).vectorize(x, width)
            processed.tile(tx, ty, xi, yi, 32, 32).reorder(xi, yi, c, tx, ty)
            processed.parallel(ty)
        elif schedule == 1:
            # Same as schedule 0, but don't vectorize (sse is bad at interleaved 16-bit ops)
            for stage in stages:
                stage.chunk(tx)
            processed.tile(tx, ty, xi, yi, 128, 128).reorder(xi, yi, c, tx, ty)
            processed.parallel(ty)

        return processed

    # The camera pipe is specialized on the 2592x1968 images that
    # come in, so we'll just use an image instead of a uniform image.
    #Image<int16_t> input(2592, 1968);
    input = UniformImage(UInt(16), 2, 'input')
    
    if use_uniforms:
        color_temp = Uniform(float_t, "color_temp", 3200.0)
        gamma = Uniform(float_t, "gamma", 1.8)
        contrast = Uniform(float_t, "contrast", 10.0)
    else:
        color_temp = 3700.0 #3200.0
        gamma = 2.0 #1.8
        contrast = 50.0 #10.0
        
    # shift things inwards to give us enough padding on the
    # boundaries so that we don't need to check bounds. We're going
    # to make a 2560x1920 output image, just like the FCam pipe, so
    # shift by 16, 12
    shifted = Func('shifted')
    shifted[x, y] = cast(Int(16), input[x+16, y+12])

    if use_uniforms:
        matrix_3200 = UniformImage(float_t, 2, 'm3200')
        matrix_7000 = UniformImage(float_t, 2, 'm7000')
        matrix_3200_npy = numpy.array([[ 1.6697, -0.2693, -0.4004, -42.4346],
                                       [-0.3576,  1.0615,  1.5949, -37.1158],
                                       [-0.2175, -1.8751,  6.9640, -26.6970]],'float32')
        matrix_7000_npy = numpy.array([[ 2.2997, -0.4478,  0.1706, -39.0923],
                                       [-0.3826,  1.5906, -0.2080, -25.4311],
                                       [-0.0888, -0.7344,  2.2832, -20.0826]],'float32')
        matrix_3200.assign(matrix_3200_npy)
        matrix_7000.assign(matrix_7000_npy)
    else:
        matrix_3200 = Func('matrix_3200')
        matrix_7000 = Func('matrix_7000')
        matrix_3200[x,y] = select(y==0, select(x==0,  1.6697, select(x==1, -0.2693, select(x==2, -0.4004, -42.4346))),
                           select(y==1, select(x==0, -0.3576, select(x==1,  1.0615, select(x==2,  1.5949, -37.1158))),
                                        select(x==0, -0.2175, select(x==1, -1.8751, select(x==2,  6.9640, -26.6970)))))
        matrix_7000[x,y] = select(y==0, select(x==0,  2.2997, select(x==1, -0.4478, select(x==2,  0.1706, -39.0923))),
                           select(y==1, select(x==0, -0.3826, select(x==1,  1.5906, select(x==2, -0.2080, -25.4311))),
                                        select(x==0, -0.0888, select(x==1, -0.7344, select(x==2,  2.2832, -20.0826)))))
        matrix_3200.root()
        matrix_7000.root()

    processed = process(shifted, matrix_3200, matrix_7000, color_temp, gamma, contrast)

    # Special tuning variables interpreted by the autotuner
    tune_out_dims = OUT_DIMS
    tune_in_images = [os.path.join(inputs_dir(), '../apps/camera_pipe/raw_crop.png')]

    if schedule == 2:
        # Autotuned schedule
        asched = autotune.Schedule.fromstring(processed, 'b_b.chunk(x).vectorize(x,2)\nb_gb.chunk(x).vectorize(x,8)\nb_gr.chunk(y).tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\nb_r.chunk(y).tile(x,y,_c0,_c1,8,8).vectorize(_c0,8)\ncorrected.chunk(x).vectorize(x,8)\ncurve.root().vectorize(x,4).split(x,x,_c0,16)\ncurved.root().tile(x,y,_c0,_c1,32,32).parallel(y)\n\n\ndenoised.root().tile(x,y,_c0,_c1,64,64).vectorize(_c0,8).parallel(y)\ng_b.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ng_gb.chunk(x).vectorize(x,4)\ng_gr.chunk(y)\ng_r.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\n\n\ninterleave_x3.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_x4.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_x5.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_x6.root().tile(x,y,_c0,_c1,16,16).vectorize(_c0,16).parallel(y)\ninterleave_y1.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_y2.chunk(x).vectorize(x,8)\ninterleave_y3.chunk(x).vectorize(x,8)\nmatrix.root().tile(x,y,_c0,_c1,4,4).vectorize(_c0,4).parallel(y)\nmatrix_3200.root().tile(x,y,_c0,_c1,4,4).parallel(y)\n\nprocessed.root().vectorize(tx,8)\nr_b.chunk(y).vectorize(x,8)\nr_gb.chunk(y).vectorize(x,8)\nr_gr.chunk(x)\nr_r.chunk(y)\nshifted.chunk(x).vectorize(x,4)')
        print asched
        asched.apply()
    
    # FIXME: This gives inaccurate timing in the tuner, not sure why
    tune_ref_schedules = {'human': """
            g_r.chunk(tx).vectorize(x, 8)
            g_b.chunk(tx).vectorize(x, 8)
            r_gr.chunk(tx).vectorize(x, 8)
            b_gr.chunk(tx).vectorize(x, 8)
            r_gb.chunk(tx).vectorize(x, 8)
            b_gb.chunk(tx).vectorize(x, 8)
            r_b.chunk(tx).vectorize(x, 8)
            b_r.chunk(tx).vectorize(x, 8)
            interleave_y1.chunk(tx).vectorize(x, 8).unroll(y, 2)
            interleave_y2.chunk(tx).vectorize(x, 8).unroll(y, 2)
            interleave_y3.chunk(tx).vectorize(x, 8).unroll(y, 2)
            curve.root()
            matrix.root()
            matrix_3200.root()
            matrix_7000.root()
            denoised.chunk(tx).vectorize(x, 8)
            deinterleaved.chunk(tx).vectorize(x, 8).reorder(c, x, y).unroll(c, 4)
            corrected.chunk(tx).vectorize(x, 4).reorder(c, x, y).unroll(c, 3)
            processed.root().bound(c, 0, 3).tile(tx, ty, _c0, _c1, 32, 32).parallel(ty).reorder(_c0, _c1, c, tx, ty)
            """}

    tune_constraints = autotune.bound_recursive(processed, 'c', 0, 3).replace('deinterleaved.bound(c,0,3)','deinterleaved.bound(c,0,4)')
    #print tune_constraints
    
    #def evaluate(in_png):
    #    output = Image(UInt(8), 2560, 1920, 3); # image size is hard-coded for the N900 raw pipeline
    #autotune.print_tunables(processed)
    #import autotune
    #g_r = all_funcs(processed)['g_r']
    #print 'caller_vars for g_r:', autotune.caller_vars(processed, g_r)
    #root_all(processed)
    #print 'Grouping'
    #import autotune
    #for sub in autotune.default_grouping(processed):
    #    print sub

    # In C++-11, this can be done as a simple initializer_list {color_temp,gamma,etc.} in place.
    #Func::Arg args[] = {color_temp, gamma, contrast, input, matrix_3200, matrix_7000};
    #processed.compileToFile("curved", std::vector<Func::Arg>(args, args+6));
    return (input, processed, None, locals())