def filter_func(dtype=Float(32), use_uniforms=False, in_filename=os.path.join(inputs_dir(), 'interpolate_large.png')):
    """Build the pyramid-based fast interpolation pipeline.

    Defines an edge-clamped copy of a 3-D input image, a 10-level
    downsampling pyramid, the matching upsample/interpolate chain and the
    output Func ``final``.  The hand-written reference schedule is applied
    to ``final`` before returning.

    Args:
        dtype: element type of the ``input`` UniformImage.
        use_uniforms: accepted but not read by this function.
        in_filename: path recorded in ``tune_in_images`` for the autotuner.

    Returns:
        ``(input, final, evaluate, locals())``.  ``evaluate(in_png)``
        realizes ``final`` at the size of ``in_png`` with 3 channels and
        prints the elapsed time.  ``locals()`` is how the ``tune_*``
        variables below are handed to the autotuner, so local names in
        this function must not be renamed.
    """

    input = UniformImage(dtype, 3, 'input')
    x = Var('x')
    y = Var('y')
    c = Var('c')

    levels = 10

    downsampled = [Func('d%d'%i) for i in range(levels)]
    interpolated = [Func('i%d'%i) for i in range(levels)]

    # Clamp coordinates to the image so out-of-range reads repeat the edge.
    # Note the index order switches to (c, x, y) from here on.
    clamped = Func('clamped')
    clamped[c, x, y] = input[clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1), c]

    # Channels c < 3 are multiplied by channel 3; channel 3 passes through.
    # (Channel 3 presumably holds alpha/weight -- confirm against the input.)
    downsampled[0][c,x,y] = select(c<3, clamped[c,x,y] * clamped[3,x,y], clamped[3,x,y])

    # downx[0] is a placeholder so that indices line up with `downsampled`.
    downx = [None] + [Func('dx%d'%l) for l in range(1,levels)]

    # Each level halves the resolution with a separable 1-2-1 filter:
    # first along x (downx), then along y.
    for l in range(1, levels):
        downx[l][c,x,y] = (downsampled[l-1][c,x*2-1,y] + 2.0 * downsampled[l-1][c,x*2,y] + downsampled[l-1][c,x*2+1,y]) * 0.25
        downsampled[l][c,x,y] = (downx[l][c,x,y*2-1] + 2.0 * downx[l][c,x,y*2] + downx[l][c,x,y*2+1]) * 0.25

    upsampled = [Func('u%d'%l) for l in range(levels-1)]
    upsampledx = [Func('ux%d'%l) for l in range(levels-1)]

    # The coarsest level is used as-is; finer levels are built walking back
    # up the pyramid (levels-2 ... 0).
    interpolated[levels-1][c,x,y] = downsampled[levels-1][c,x,y]
    for l in range(levels-1)[::-1]:
        # Upsample by averaging the two nearest coarse samples, x then y.
        upsampledx[l][c,x,y] = 0.5 * (interpolated[l+1][c, x/2 + (x%2),y] + interpolated[l+1][c,x/2,y])
        upsampled[l][c,x,y] = 0.5 * (upsampledx[l][c, x, y/2 + (y%2)] + upsampledx[l][c,x,y/2])
        # Blend: keep this level's value and add the coarser estimate scaled
        # by (1 - channel 3 of this level).
        interpolated[l][c,x,y] = downsampled[l][c,x,y] + (1.0 - downsampled[l][3,x,y]) * upsampled[l][c,x,y]

    # Divide by channel 3 and switch back to (x, y, c) index order.
    final = Func('final')
    final[x,y,c] = interpolated[0][c,x,y] / interpolated[0][3,x,y]

    def evaluate(in_png):
        """Realize ``final`` at the size of ``in_png`` and report the time taken."""
        T0 = time.time()
        out = final.realize(in_png.width(), in_png.height(), 3)
        # Single-argument parenthesized print behaves the same on Python 2
        # and Python 3.
        print('Interpolated in %.5f secs' % (time.time()-T0))
        return out

    # Special tuning variables interpreted by the autotuner
    tune_out_dims = (1408, 1408, 3)
    tune_in_images = [in_filename]
    tune_image_ext = '.ppm'

    # Reference schedule: one directive per line, covering levels
    # 1 .. levels-2 of the d*/i* Funcs.
    human_schedule = 'final.root().parallel(y).bound(c, 0, 3)\n'
    for i in range(1, levels-1):
        human_schedule += 'd%d.root().vectorize(c, 4).parallel(y)\n'%i
        human_schedule += 'i%d.root().vectorize(c, 4).parallel(y)\n'%i
    tune_ref_schedules = {'human': human_schedule}

    # Bound c to [0, 4) everywhere, then narrow the bound on `final` to
    # [0, 3) by rewriting that one directive in the returned string.
    tune_constraints = autotune.bound_recursive(final, 'c', 0, 4).replace('final.bound(c,0,4)','final.bound(c,0,3)')
    print(tune_constraints)
    autotune.Schedule.fromstring(final, human_schedule).apply()

    return (input, final, evaluate, locals())
def filter_func(J=8, dtype=UInt(16), use_uniforms=False):
    """Local Laplacian.

    Builds J-level pyramids over a grayscale version of a 3-D input image
    and recombines them into the Func ``output``.

    Args:
        J: number of pyramid levels (length of every pyramid list).
        dtype: element type of the input image; ``dtype.maxval()`` is used
            to scale to and from floating point, and the output is cast
            back to ``dtype``.
        use_uniforms: when true, ``levels``, ``alpha`` and ``beta`` are
            created as ``Uniform`` objects (8, 1.0, 1.0); otherwise they
            are plain Python numbers with the same values.

    Returns:
        ``(input, output, None, locals())``.  ``locals()`` carries
        ``tune_ref_schedules`` and ``tune_constraints`` to the autotuner,
        so local names in this function are part of its result.
    """
    # One-element lists so the nested helpers can bump the counters
    # (used to give every helper Func a unique name).
    downsample_counter=[0]
    upsample_counter=[0]

    def downsample(f):
        """Return a Func that halves `f` in x then y with a 1-3-3-1 filter."""
        downx, downy = Func('downx%d'%downsample_counter[0]), Func('downy%d'%downsample_counter[0])
        downsample_counter[0] += 1

        # x and y are the Vars assigned further down in filter_func; they
        # exist by the time this helper is first called.
        downx[x,y] = (f[2*x-1, y] + 3.0*(f[2*x,y]+f[2*x+1,y]) + f[2*x+2,y])/8.0
        downy[x,y] = (downx[x,2*y-1] + 3.0*(downx[x,2*y]+downx[x,2*y+1]) + downx[x,2*y+2])/8.0

        return downy

    def upsample(f):
        """Return a Func that doubles `f` in x then y with 0.25/0.75 weights."""
        upx, upy = Func('upx%d'%upsample_counter[0]), Func('upy%d'%upsample_counter[0])
        upsample_counter[0] += 1

        # (x/2) - 1 + 2*(x%2) selects the coarse neighbour on the left for
        # even x and on the right for odd x.
        upx[x,y] = 0.25 * f[(x/2)-1+2*(x%2),y] + 0.75 * f[x/2,y]
        upy[x,y] = 0.25 * upx[x, (y/2) - 1 + 2*(y%2)] + 0.75 * upx[x,y/2]

        return upy

    if use_uniforms:
        levels = Uniform(int_t, 'levels', 8)
        alpha = Uniform(float_t, 'alpha', 1.0) #1.0)
        beta = Uniform(float_t, 'beta', 1.0)
    else:
        levels = 8
        alpha = 1.0
        beta = 1.0
    input = UniformImage(dtype, 3, 'input')

    x = Var('x')
    y = Var('y')
    c = Var('c')
    k = Var('k')

    # Remapping lookup, indexed by an integer offset scaled by 1/256.
    fx = cast(float_t, x/256.0)
    remap = Func('remap')
    remap[x] = (alpha/cast(float_t, levels-1))*fx*exp(-fx*fx/2.0)

    # Convert the input to floating point, divided by dtype's maximum value.
    floating = Func('floating')
    floating[x,y,c] = cast(float_t, input[x,y,c])/float(dtype.maxval())

    # Clamp coordinates to the image bounds.
    clamped = Func('clamped')
    clamped[x,y,c] = floating[clamp(x,cast(int_t,0),cast(int_t,input.width()-1)),
                              clamp(y,cast(int_t,0),cast(int_t,input.height()-1)), c]
    # Weighted sum of channels 0, 1, 2.
    gray = Func('gray')
    gray[x,y] = 0.299*clamped[x,y,0]+0.587*clamped[x,y,1]+0.114*clamped[x,y,2]

    # Pyramid of the remapped image, one slice per intensity level k.
    gPyramid = [Func('gPyramid%d'%i) for i in range(J)]
    idx = gray[x,y]*cast(float_t, levels-1)*256.0
    idx = clamp(cast(int_t, idx), cast(int_t, 0), cast(int_t, (levels-1)*256))
    gPyramid[0][x,y,k] = beta*gray[x,y] + remap[idx-256*k]
    for j in range(1,J):
        gPyramid[j][x,y,k] = downsample(gPyramid[j-1])[x,y,k]

    # Difference pyramid; the last list entry is rebound to the gPyramid
    # Func itself, so the 'lPyramid%d' Func created for it is discarded.
    lPyramid = [Func('lPyramid%d'%i) for i in range(J)]
    lPyramid[J-1] = gPyramid[J-1]
    for j in range(J-1)[::-1]:
        lPyramid[j][x,y,k] = gPyramid[j][x,y,k] - upsample(gPyramid[j+1])[x,y,k]

    # Pyramid of the grayscale input; entry 0 is the `gray` Func itself.
    inGPyramid = [Func('inGPyramid%d'%i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1,J):
        inGPyramid[j][x,y] = downsample(inGPyramid[j-1])[x,y]

    # Linearly interpolate between the two lPyramid slices (li, li+1)
    # bracketing the local intensity.
    outLPyramid = [Func('outLPyramid%d'%i) for i in range(J)]
    for j in range(J):
        level = inGPyramid[j][x,y]*cast(float_t, levels-1)
        li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels-2))
        lf = level - cast(float_t, li)
        outLPyramid[j][x,y] = (1.0-lf)*lPyramid[j][x,y,li] + lf*lPyramid[j][x,y,li+1]

    # Collapse the output pyramid from coarse to fine; the last list entry
    # is rebound to outLPyramid[J-1].
    outGPyramid = [Func('outGPyramid%d'%i) for i in range(J)]
    outGPyramid[J-1] = outLPyramid[J-1]
    for j in range(J-1)[::-1]:
        outGPyramid[j][x,y] = upsample(outGPyramid[j+1])[x,y] + outLPyramid[j][x,y]

    # Reapply colour; the 0.01 offsets keep the divisor away from zero.
    color = Func('color')
    #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y]
    color[x,y,c] = outGPyramid[0][x,y] * (clamped[x,y,c]+0.01) / (gray[x,y]+0.01)

    # Clamp to [0, 1], scale by dtype's maximum and cast back to dtype.
    output = Func('output')
    output[x,y,c] = cast(dtype, clamp(color[x,y,c], cast(float_t,0.0), cast(float_t,1.0))*float(dtype.maxval()))

    root_all(output)
    #import autotune
    #print autotune.root_all_str(output)
    #autotune.print_root_all(output)

    # CPU reference schedule, one directive per line.  Func names come from
    # .name() because some list entries were rebound above.
    human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n'
    for j in range(J):
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%inGPyramid[j].name()
        if j > 0:
            human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n'%j
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%outGPyramid[j].name()

    # On CUDA the reference schedule is rebuilt from scratch.
    if autotune.is_cuda():
        human_schedule = 'remap.root()\n'
        human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n'
        for j in range(J):
            # 32x32 tiles for levels 0-3, 2x2 tiles for levels above 3.
            blockw = blockh = 32
            if j > 3:
                blockw = blockh = 2
            if j == 0:
                human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n'%(blockw, blockh)
            else:
                human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source -- confirm this line is meant to run for every j
            # (including j == 0) rather than only in the else branch.
            human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            if j == J-1:
                human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)
            else:
                human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh)

    # Special variables interpreted by autotuner
    tune_ref_schedules = {'human': human_schedule}

    tune_constraints = autotune.bound_recursive(output, 'c', 0, 3)

    #print '# schedules:'
    #import math
    #print math.log(autotune.lower_bound_schedules(output),10)
    #sys.exit(1)

    return (input, output, None, locals())
def filter_func(dtype=UInt(16), use_uniforms=False):
    """Build the grid-based smoothing pipeline.

    (The commented-out C++ at the end of this function names the pipeline
    "bilateral_grid".)  Values of channel 0 of the input are accumulated
    into a coarse grid, the grid is blurred along x, y and z, and the
    output is read back with trilinear interpolation and normalized.

    Args:
        dtype: accepted but not read by this function; the input image is
            always created with ``float_t``.
        use_uniforms: when true ``r_sigma`` is a ``Uniform``; otherwise it
            is the Python float 0.1.

    Returns:
        ``(input, smoothed, None, locals())``.  ``locals()`` carries
        ``tune_ref_schedules`` and ``tune_constraints`` to the autotuner,
        so local names in this function are part of its result.
    """

    def lerp(a, b, alpha):
        """Linear interpolation: `a` at alpha == 0, `b` at alpha == 1."""
        return (1.0 - alpha) * a + alpha * b

    input = UniformImage(float_t, 3, 'input')
    if use_uniforms:
        r_sigma = Uniform(float_t, 0.1)
    else:
        r_sigma = 0.1
    # Grid cell size in pixels along x and y.
    s_sigma = 8

    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Edge-clamped view of channel 0 of the input.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0, input.width() - 1), clamp(y, 0, input.height() - 1), 0]

    # Reduction over one s_sigma x s_sigma window, centred on the cell.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma / 2, y * s_sigma + r.y - s_sigma / 2]
    val = clamp(val, 0.0, 1.0)
    # Nearest z bin for the value (the + 0.5 rounds before the int cast).
    zi = cast(int_t, val * (1.0 / r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    # c == 0 accumulates the values, every other c accumulates a count of 1.
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter
    # (weights 1-4-6-4-1; the c index is not written explicitly here)
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = grid[x - 2, y, z] + grid[x - 1, y, z] * 4 + grid[x, y, z] * 6 + grid[x + 1, y, z] * 4 + grid[x + 2, y, z]
    blury[x, y, z] = blurx[x, y - 2, z] + blurx[x, y - 1, z] * 4 + blurx[x, y, z] * 6 + blurx[x, y + 1, z] * 4 + blurx[x, y + 2, z]
    blurz[x, y, z] = blury[x, y, z - 2] + blury[x, y, z - 1] * 4 + blury[x, y, z] * 6 + blury[x, y, z + 1] * 4 + blury[x, y, z + 2]

    # Take trilinear samples to compute the output
    # (val and zi are deliberately rebound; the grid definition above has
    # already captured the earlier expressions)
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0 / r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi
    xf = cast(float_t, x % s_sigma) / s_sigma
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = Func('interpolated')
    interpolated[x, y] = lerp(
        lerp(lerp(blurz[xi, yi, zi], blurz[xi + 1, yi, zi], xf),
             lerp(blurz[xi, yi + 1, zi], blurz[xi + 1, yi + 1, zi], xf), yf),
        lerp(
            lerp(blurz[xi, yi, zi + 1], blurz[xi + 1, yi, zi + 1], xf),
            lerp(blurz[xi, yi + 1, zi + 1], blurz[xi + 1, yi + 1, zi + 1], xf), yf), zf)

    # Normalize
    # (accumulated value, index 0, divided by accumulated count, index 1)
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0] / interpolated[x, y, 1]

    # Hard-coded selector: only the CPU branch below runs as written.
    schedule = 1
    if schedule == 0:
        pass
    elif schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        # gridz is assigned but not used by the statements that follow.
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    else:
        raise ValueError

    # Textual form of the CPU schedule above, for the autotuner.
    tune_ref_schedules = {
        'human':
        'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n' +
        'blurx.root().parallel(z).vectorize(x, 4)\n' +
        'blury.root().parallel(z).vectorize(x, 4)\n' +
        'blurz.root().parallel(z).vectorize(x, 4)\n' +
        'smoothed.root().parallel(y).vectorize(x, 4)\n'
    }
    # GPU
    gpu_human = 'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n' + \
                'blurx.root().cudaTile(x, y, 8, 8)\n' + \
                'blury.root().cudaTile(x, y, 8, 8)\n' + \
                'blurz.root().cudaTile(x, y, 8, 8)\n' + \
                'smoothed.root().cudaTile(x, y, 8, 8)\n'
    # The GPU text replaces the CPU reference when running under CUDA.
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human

    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)
    #print tune_constraints
    #autotune.print_tunables(smoothed)
    #for i in range(123,10000):
    #    random.seed(i)
    #    print '-'*40
    #    print 'Schedule %d'%i
    #    p = autotune.AutotuneParams()
    #    print valid_schedules.random_schedule(smoothed, p.min_depth, p.max_depth)

    #    std::vector<Func::Arg> args;
    #    args.push_back(r_sigma);
    #    args.push_back(input);
    #    smoothed.compileToFile("bilateral_grid", args);
    return (input, smoothed, None, locals())
def filter_func(dtype=UInt(16), use_uniforms=False):
    """Build the grid-based smoothing pipeline.

    (The commented-out C++ at the end of this function names the pipeline
    "bilateral_grid".)  Values of channel 0 of the input are accumulated
    into a coarse grid, the grid is blurred along x, y and z, and the
    output is read back with trilinear interpolation and normalized.

    Args:
        dtype: accepted but not read by this function; the input image is
            always created with ``float_t``.
        use_uniforms: when true ``r_sigma`` is a ``Uniform``; otherwise it
            is the Python float 0.1.

    Returns:
        ``(input, smoothed, None, locals())``.  ``locals()`` carries
        ``tune_ref_schedules`` and ``tune_constraints`` to the autotuner,
        so local names in this function are part of its result.
    """

    def lerp(a, b, alpha):
        """Linear interpolation: `a` at alpha == 0, `b` at alpha == 1."""
        return (1.0-alpha)*a + alpha*b

    input = UniformImage(float_t, 3, 'input')
    if use_uniforms:
        r_sigma = Uniform(float_t, 0.1)
    else:
        r_sigma = 0.1
    # Grid cell size in pixels along x and y.
    s_sigma = 8

    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Edge-clamped view of channel 0 of the input.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0, input.width()-1),
                          clamp(y, 0, input.height()-1),0]

    # Reduction over one s_sigma x s_sigma window, centred on the cell.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2]
    val = clamp(val, 0.0, 1.0)
    # Nearest z bin for the value (the + 0.5 rounds before the int cast).
    zi = cast(int_t, val * (1.0/r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    # c == 0 accumulates the values, every other c accumulates a count of 1.
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter
    # (weights 1-4-6-4-1; the c index is not written explicitly here)
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = grid[x-2, y, z] + grid[x-1, y, z]*4 + grid[x, y, z]*6 + grid[x+1, y, z]*4 + grid[x+2, y, z]
    blury[x, y, z] = blurx[x, y-2, z] + blurx[x, y-1, z]*4 + blurx[x, y, z]*6 + blurx[x, y+1, z]*4 + blurx[x, y+2, z]
    blurz[x, y, z] = blury[x, y, z-2] + blury[x, y, z-1]*4 + blury[x, y, z]*6 + blury[x, y, z+1]*4 + blury[x, y, z+2]

    # Take trilinear samples to compute the output
    # (val and zi are deliberately rebound; the grid definition above has
    # already captured the earlier expressions)
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0/r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi
    xf = cast(float_t, x % s_sigma) / s_sigma
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x/s_sigma
    yi = y/s_sigma
    interpolated = Func('interpolated')
    interpolated[x, y] = lerp(lerp(lerp(blurz[xi, yi, zi], blurz[xi+1, yi, zi], xf),
                                   lerp(blurz[xi, yi+1, zi], blurz[xi+1, yi+1, zi], xf), yf),
                              lerp(lerp(blurz[xi, yi, zi+1], blurz[xi+1, yi, zi+1], xf),
                                   lerp(blurz[xi, yi+1, zi+1], blurz[xi+1, yi+1, zi+1], xf), yf), zf)

    # Normalize
    # (accumulated value, index 0, divided by accumulated count, index 1)
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0]/interpolated[x, y, 1]

    # Hard-coded selector: only the CPU branch below runs as written.
    schedule = 1
    if schedule == 0:
        pass
    elif schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        # gridz is assigned but not used by the statements that follow.
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    else:
        raise ValueError

    # Textual form of the CPU schedule above, for the autotuner.
    tune_ref_schedules = {'human': 'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n' +
                                   'blurx.root().parallel(z).vectorize(x, 4)\n' +
                                   'blury.root().parallel(z).vectorize(x, 4)\n' +
                                   'blurz.root().parallel(z).vectorize(x, 4)\n' +
                                   'smoothed.root().parallel(y).vectorize(x, 4)\n'}
    # GPU
    gpu_human = 'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n' + \
                'blurx.root().cudaTile(x, y, 8, 8)\n' + \
                'blury.root().cudaTile(x, y, 8, 8)\n' + \
                'blurz.root().cudaTile(x, y, 8, 8)\n' + \
                'smoothed.root().cudaTile(x, y, 8, 8)\n'
    # The GPU text replaces the CPU reference when running under CUDA.
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human

    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)
    #print tune_constraints
    #autotune.print_tunables(smoothed)
    #for i in range(123,10000):
    #    random.seed(i)
    #    print '-'*40
    #    print 'Schedule %d'%i
    #    p = autotune.AutotuneParams()
    #    print valid_schedules.random_schedule(smoothed, p.min_depth, p.max_depth)

    #    std::vector<Func::Arg> args;
    #    args.push_back(r_sigma);
    #    args.push_back(input);
    #    smoothed.compileToFile("bilateral_grid", args);
    return (input, smoothed, None, locals())
def filter_func(J=8, dtype=UInt(16), use_uniforms=False):
    """Local Laplacian.

    Builds J-level pyramids over a grayscale version of a 3-D input image
    and recombines them into the Func ``output``.

    Args:
        J: number of pyramid levels (length of every pyramid list).
        dtype: element type of the input image; ``dtype.maxval()`` is used
            to scale to and from floating point, and the output is cast
            back to ``dtype``.
        use_uniforms: when true, ``levels``, ``alpha`` and ``beta`` are
            created as ``Uniform`` objects (8, 1.0, 1.0); otherwise they
            are plain Python numbers with the same values.

    Returns:
        ``(input, output, None, locals())``.  ``locals()`` carries
        ``tune_ref_schedules`` and ``tune_constraints`` to the autotuner,
        so local names in this function are part of its result.
    """
    # One-element lists so the nested helpers can bump the counters
    # (used to give every helper Func a unique name).
    downsample_counter = [0]
    upsample_counter = [0]

    def downsample(f):
        """Return a Func that halves `f` in x then y with a 1-3-3-1 filter."""
        downx, downy = Func('downx%d' % downsample_counter[0]), Func(
            'downy%d' % downsample_counter[0])
        downsample_counter[0] += 1

        # x and y are the Vars assigned further down in filter_func; they
        # exist by the time this helper is first called.
        downx[x, y] = (f[2 * x - 1, y] + 3.0 *
                       (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 *
                       (downx[x, 2 * y] + downx[x, 2 * y + 1]) + downx[x, 2 * y + 2]) / 8.0

        return downy

    def upsample(f):
        """Return a Func that doubles `f` in x then y with 0.25/0.75 weights."""
        upx, upy = Func('upx%d' % upsample_counter[0]), Func(
            'upy%d' % upsample_counter[0])
        upsample_counter[0] += 1

        # (x / 2) - 1 + 2 * (x % 2) selects the coarse neighbour on the left
        # for even x and on the right for odd x.
        upx[x, y] = 0.25 * f[(x / 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x / 2, y]
        upy[x, y] = 0.25 * upx[x, (y / 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y / 2]

        return upy

    if use_uniforms:
        levels = Uniform(int_t, 'levels', 8)
        alpha = Uniform(float_t, 'alpha', 1.0)  #1.0)
        beta = Uniform(float_t, 'beta', 1.0)
    else:
        levels = 8
        alpha = 1.0
        beta = 1.0
    input = UniformImage(dtype, 3, 'input')

    x = Var('x')
    y = Var('y')
    c = Var('c')
    k = Var('k')

    # Remapping lookup, indexed by an integer offset scaled by 1/256.
    fx = cast(float_t, x / 256.0)
    remap = Func('remap')
    remap[x] = (alpha / cast(float_t, levels - 1)) * fx * exp(-fx * fx / 2.0)

    # Convert the input to floating point, divided by dtype's maximum value.
    floating = Func('floating')
    floating[x, y, c] = cast(float_t, input[x, y, c]) / float(dtype.maxval())

    # Clamp coordinates to the image bounds.
    clamped = Func('clamped')
    clamped[x, y, c] = floating[
        clamp(x, cast(int_t, 0), cast(int_t, input.width() - 1)),
        clamp(y, cast(int_t, 0), cast(int_t, input.height() - 1)), c]
    # Weighted sum of channels 0, 1, 2.
    gray = Func('gray')
    gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[
        x, y, 1] + 0.114 * clamped[x, y, 2]

    # Pyramid of the remapped image, one slice per intensity level k.
    gPyramid = [Func('gPyramid%d' % i) for i in range(J)]
    idx = gray[x, y] * cast(float_t, levels - 1) * 256.0
    idx = clamp(cast(int_t, idx), cast(int_t, 0), cast(int_t, (levels - 1) * 256))
    gPyramid[0][x, y, k] = beta * gray[x, y] + remap[idx - 256 * k]
    for j in range(1, J):
        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]

    # Difference pyramid; the last list entry is rebound to the gPyramid
    # Func itself, so the 'lPyramid%d' Func created for it is discarded.
    lPyramid = [Func('lPyramid%d' % i) for i in range(J)]
    lPyramid[J - 1] = gPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(
            gPyramid[j + 1])[x, y, k]

    # Pyramid of the grayscale input; entry 0 is the `gray` Func itself.
    inGPyramid = [Func('inGPyramid%d' % i) for i in range(J)]
    inGPyramid[0] = gray
    for j in range(1, J):
        inGPyramid[j][x, y] = downsample(inGPyramid[j - 1])[x, y]

    # Linearly interpolate between the two lPyramid slices (li, li + 1)
    # bracketing the local intensity.
    outLPyramid = [Func('outLPyramid%d' % i) for i in range(J)]
    for j in range(J):
        level = inGPyramid[j][x, y] * cast(float_t, levels - 1)
        li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels - 2))
        lf = level - cast(float_t, li)
        outLPyramid[j][x, y] = (
            1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]

    # Collapse the output pyramid from coarse to fine; the last list entry
    # is rebound to outLPyramid[J - 1].
    outGPyramid = [Func('outGPyramid%d' % i) for i in range(J)]
    outGPyramid[J - 1] = outLPyramid[J - 1]
    for j in range(J - 1)[::-1]:
        outGPyramid[j][x, y] = upsample(
            outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]

    # Reapply colour; the 0.01 offsets keep the divisor away from zero.
    color = Func('color')
    #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y]
    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + 0.01) / (gray[x, y] + 0.01)

    # Clamp to [0, 1], scale by dtype's maximum and cast back to dtype.
    output = Func('output')
    output[x, y, c] = cast(
        dtype,
        clamp(color[x, y, c], cast(float_t, 0.0), cast(float_t, 1.0)) *
        float(dtype.maxval()))

    root_all(output)
    #import autotune
    #print autotune.root_all_str(output)
    #autotune.print_root_all(output)

    # CPU reference schedule, one directive per line.  Func names come from
    # .name() because some list entries were rebound above.
    human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n'
    for j in range(J):
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % inGPyramid[
            j].name()
        if j > 0:
            human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n' % j
        human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % outGPyramid[
            j].name()

    # On CUDA the reference schedule is rebuilt from scratch.
    if autotune.is_cuda():
        human_schedule = 'remap.root()\n'
        human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n'
        for j in range(J):
            # 32x32 tiles for levels 0-3, 2x2 tiles for levels above 3.
            blockw = blockh = 32
            if j > 3:
                blockw = blockh = 2
            if j == 0:
                human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n' % (
                    blockw, blockh)
            else:
                human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source -- confirm this line is meant to run for every j
            # (including j == 0) rather than only in the else branch.
            human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                j, blockw, blockh)
            if j == J - 1:
                human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)
            else:
                human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % (
                    j, blockw, blockh)

    # Special variables interpreted by autotuner
    tune_ref_schedules = {'human': human_schedule}

    tune_constraints = autotune.bound_recursive(output, 'c', 0, 3)

    #print '# schedules:'
    #import math
    #print math.log(autotune.lower_bound_schedules(output),10)
    #sys.exit(1)

    return (input, output, None, locals())
def filter_func(result_type=UInt(8), schedule=0, use_uniforms=False):
    """Build the camera raw-processing pipeline.

    Stages, in order: hot-pixel suppression, deinterleaving of the raw
    mosaic into four channels, demosaicing, colour correction with a
    matrix interpolated between two calibrated matrices, and a tone curve
    applied through a lookup table.

    Args:
        result_type: type the tone-curve lookup values are cast to.
        schedule: 0 (vectorized, commented as optimized for ARM),
            1 (unvectorized, commented as optimized for X86),
            -1 (naive: every stage at root), or 2 (apply the stored
            autotuned schedule string).  Other values leave the stages
            without an explicit schedule.
        use_uniforms: when true the colour temperature, gamma, contrast
            and both colour matrices are created as Uniform /
            UniformImage objects; otherwise they are constants and Funcs.

    Returns:
        ``(input, processed, None, locals())``.  ``locals()`` carries the
        ``tune_*`` variables to the autotuner, so local names in this
        function are part of its result.
    """
    x, y, tx, ty, c = Var('x'), Var('y'), Var('tx'), Var('ty'), Var('c')
    # One-element lists so the nested helpers can bump the counters that
    # give each interleave Func a unique name.
    counter_interleave_x = [0]
    counter_interleave_y = [0]

    def hot_pixel_suppression(input):
        """Clamp each pixel to the min/max of its four neighbours two steps away."""
        a = max(max(input[x-2, y], input[x+2, y]),
                max(input[x, y-2], input[x, y+2]))
        b = min(min(input[x-2, y], input[x+2, y]),
                min(input[x, y-2], input[x, y+2]))

        denoised = Func('denoised')
        denoised[x, y] = clamp(input[x, y], b, a)

        return denoised

    def interleave_x(a, b):
        """Return a Func taking even columns from `a` and odd columns from `b`."""
        counter_interleave_x[0] += 1
        out = Func('interleave_x%d'%counter_interleave_x[0])
        out[x, y] = select((x%2)==0, a[x/2, y], b[x/2, y])
        return out

    def interleave_y(a, b):
        """Return a Func taking even rows from `a` and odd rows from `b`."""
        counter_interleave_y[0] += 1
        out = Func('interleave_y%d'%counter_interleave_y[0])
        out[x, y] = select((y%2)==0, a[x, y/2], b[x, y/2])
        return out

    def deinterleave(raw):
        """Split the 2x2 mosaic of `raw` into a 4-channel half-resolution Func."""
        # Deinterleave the color channels
        deinterleaved = Func('deinterleaved')

        deinterleaved[x, y, c] = select(c == 0, raw[2*x, 2*y],
                                 select(c == 1, raw[2*x+1, 2*y],
                                 select(c == 2, raw[2*x, 2*y+1],
                                                raw[2*x+1, 2*y+1])))
        return deinterleaved

    def absd(a, b):
        """Absolute difference of `a` and `b`, expressed with select."""
        return select(a > b, a-b, b-a)

    def demosaic(deinterleaved):
        """Interpolate the missing colours and return a 3-channel Func."""
        # These are the values we already know from the input
        # x_y = the value of channel x at a site in the input of channel y
        # gb refers to green sites in the blue rows
        # gr refers to green sites in the red rows

        # Give more convenient names to the four channels we know
        r_r, g_gr, g_gb, b_b = Func('r_r'), Func('g_gr'), Func('g_gb'), Func('b_b')
        g_gr[x, y] = deinterleaved[x, y, 0]
        r_r[x, y] = deinterleaved[x, y, 1]
        b_b[x, y] = deinterleaved[x, y, 2]
        g_gb[x, y] = deinterleaved[x, y, 3]

        # These are the ones we need to interpolate
        b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b = Func('b_r'), Func('g_r'), Func('b_gr'), Func('r_gr'), Func('b_gb'), Func('r_gb'), Func('r_b'), Func('g_b')

        # First calculate green at the red and blue sites

        # Try interpolating vertically and horizontally. Also compute
        # differences vertically and horizontally. Use interpolation in
        # whichever direction had the smallest difference.
        gv_r = (g_gb[x, y-1] + g_gb[x, y])/2
        gvd_r = absd(g_gb[x, y-1], g_gb[x, y])
        gh_r = (g_gr[x+1, y] + g_gr[x, y])/2
        ghd_r = absd(g_gr[x+1, y], g_gr[x, y])

        g_r[x, y] = select(ghd_r < gvd_r, gh_r, gv_r)

        gv_b = (g_gr[x, y+1] + g_gr[x, y])/2
        gvd_b = absd(g_gr[x, y+1], g_gr[x, y])
        gh_b = (g_gb[x-1, y] + g_gb[x, y])/2
        ghd_b = absd(g_gb[x-1, y], g_gb[x, y])

        g_b[x, y] = select(ghd_b < gvd_b, gh_b, gv_b)

        # Next interpolate red at gr by first interpolating, then
        # correcting using the error green would have had if we had
        # interpolated it in the same way (i.e. add the second derivative
        # of the green channel at the same place).
        correction = g_gr[x, y] - (g_r[x, y] + g_r[x-1, y])/2
        r_gr[x, y] = correction + (r_r[x-1, y] + r_r[x, y])/2

        # Do the same for other reds and blues at green sites
        correction = g_gr[x, y] - (g_b[x, y] + g_b[x, y-1])/2
        b_gr[x, y] = correction + (b_b[x, y] + b_b[x, y-1])/2

        correction = g_gb[x, y] - (g_r[x, y] + g_r[x, y+1])/2
        r_gb[x, y] = correction + (r_r[x, y] + r_r[x, y+1])/2

        correction = g_gb[x, y] - (g_b[x, y] + g_b[x+1, y])/2
        b_gb[x, y] = correction + (b_b[x, y] + b_b[x+1, y])/2

        # Now interpolate diagonally to get red at blue and blue at
        # red. Hold onto your hats; this gets really fancy. We do the
        # same thing as for interpolating green where we try both
        # directions (in this case the positive and negative diagonals),
        # and use the one with the lowest absolute difference. But we
        # also use the same trick as interpolating red and blue at green
        # sites - we correct our interpolations using the second
        # derivative of green at the same sites.
        correction = g_b[x, y] - (g_r[x, y] + g_r[x-1, y+1])/2
        rp_b = correction + (r_r[x, y] + r_r[x-1, y+1])/2
        rpd_b = absd(r_r[x, y], r_r[x-1, y+1])

        correction = g_b[x, y] - (g_r[x-1, y] + g_r[x, y+1])/2
        rn_b = correction + (r_r[x-1, y] + r_r[x, y+1])/2
        rnd_b = absd(r_r[x-1, y], r_r[x, y+1])

        r_b[x, y] = select(rpd_b < rnd_b, rp_b, rn_b)

        # Same thing for blue at red
        correction = g_r[x, y] - (g_b[x, y] + g_b[x+1, y-1])/2
        bp_r = correction + (b_b[x, y] + b_b[x+1, y-1])/2
        bpd_r = absd(b_b[x, y], b_b[x+1, y-1])

        correction = g_r[x, y] - (g_b[x+1, y] + g_b[x, y-1])/2
        bn_r = correction + (b_b[x+1, y] + b_b[x, y-1])/2
        bnd_r = absd(b_b[x+1, y], b_b[x, y-1])

        b_r[x, y] = select(bpd_r < bnd_r, bp_r, bn_r)

        # Interleave the resulting channels
        r = interleave_y(interleave_x(r_gr, r_r),
                         interleave_x(r_b, r_gb))
        g = interleave_y(interleave_x(g_gr, g_r),
                         interleave_x(g_b, g_gb))
        b = interleave_y(interleave_x(b_gr, b_r),
                         interleave_x(b_b, b_gb))

        output = Func('demosaic')
        output[x, y, c] = select(c == 0, r[x, y],
                          select(c == 1, g[x, y], b[x, y]))

        # THE SCHEDULE
        if schedule == 0:
            # optimized for ARM
            # Compute these in chunks over tiles, vectorized by 8
            g_r.chunk(tx).vectorize(x, 8)
            g_b.chunk(tx).vectorize(x, 8)
            r_gr.chunk(tx).vectorize(x, 8)
            b_gr.chunk(tx).vectorize(x, 8)
            r_gb.chunk(tx).vectorize(x, 8)
            b_gb.chunk(tx).vectorize(x, 8)
            r_b.chunk(tx).vectorize(x, 8)
            b_r.chunk(tx).vectorize(x, 8)
            # These interleave in y, so unrolling them in y helps
            r.chunk(tx).vectorize(x, 8).unroll(y, 2)
            g.chunk(tx).vectorize(x, 8).unroll(y, 2)
            b.chunk(tx).vectorize(x, 8).unroll(y, 2)
        elif schedule == 1:
            # optimized for X86
            # Don't vectorize, because sse is bad at 16-bit interleaving
            g_r.chunk(tx)
            g_b.chunk(tx)
            r_gr.chunk(tx)
            b_gr.chunk(tx)
            r_gb.chunk(tx)
            b_gb.chunk(tx)
            r_b.chunk(tx)
            b_r.chunk(tx)
            # These interleave in x and y, so unrolling them helps
            r.chunk(tx).unroll(x, 2).unroll(y, 2)
            g.chunk(tx).unroll(x, 2).unroll(y, 2)
            b.chunk(tx).unroll(x, 2).unroll(y, 2)
        elif schedule == -1:
            # Basic naive schedule
            g_r.root()
            g_b.root()
            r_gr.root()
            b_gr.root()
            r_gb.root()
            b_gb.root()
            r_b.root()
            b_r.root()
            r.root()
            g.root()
            b.root()
        return output

    def color_correct(input, matrix_3200, matrix_7000, kelvin):
        """Apply a colour matrix interpolated for colour temperature `kelvin`."""
        # Get a color matrix by linearly interpolating between two
        # calibrated matrices using inverse kelvin.

        matrix = Func('matrix')
        alpha = (1.0/kelvin - 1.0/3200) / (1.0/7000 - 1.0/3200)
        val = (matrix_3200[x, y] * alpha + matrix_7000[x, y] * (1 - alpha))
        matrix[x, y] = cast(int_t, val * 256.0) # Q8.8 fixed point
        matrix.root()

        corrected = Func('corrected')
        ir = cast(int_t, input[x, y, 0])
        ig = cast(int_t, input[x, y, 1])
        ib = cast(int_t, input[x, y, 2])

        # Column 3 of the matrix is the constant term; columns 0-2 weight
        # the r, g, b inputs.
        r = matrix[3, 0] + matrix[0, 0] * ir + matrix[1, 0] * ig + matrix[2, 0] * ib
        g = matrix[3, 1] + matrix[0, 1] * ir + matrix[1, 1] * ig + matrix[2, 1] * ib
        b = matrix[3, 2] + matrix[0, 2] * ir + matrix[1, 2] * ig + matrix[2, 2] * ib

        # Undo the x256 fixed-point scaling.
        r = cast(Int(16), r/256)
        g = cast(Int(16), g/256)
        b = cast(Int(16), b/256)
        corrected[x, y, c] = select(c == 0, r,
                             select(c == 1, g, b))

        return corrected

    def apply_curve(input, gamma, contrast):
        """Map `input` through a gamma/contrast lookup table."""
        # copied from FCam
        curve = Func('curve')

        xf = clamp(cast(float_t, x)/1024.0, 0.0, 1.0)
        g = pow(xf, 1.0/gamma)
        b = 2.0 - pow(2.0, contrast/100.0)
        a = 2.0 - 2.0*b
        z = select(g > 0.5,
                   1.0 - (a*(1.0-g)*(1.0-g) + b*(1.0-g)),
                   a*g*g + b*g)

        val = cast(result_type, clamp(z*256.0, 0.0, 255.0))
        curve[x] = val
        curve.root() # It's a LUT, compute it once ahead of time.

        curved = Func('curved')
        curved[x, y, c] = curve[input[x, y, c]]

        return curved

    def process(raw, matrix_3200, matrix_7000, color_temp, gamma, contrast):
        """Chain all stages over `raw` and schedule the outer stages."""
        processed = Func('processed')
        xi, yi = Var('xi'), Var('yi')

        denoised = hot_pixel_suppression(raw)
        deinterleaved = deinterleave(denoised)
        demosaiced = demosaic(deinterleaved)
        corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp)
        curved = apply_curve(corrected, gamma, contrast)

        # Schedule
        #co, ci = Var('co'), Var('ci')
        processed[tx, ty, c] = curved[tx, ty, c]

        #processed.split(c, co, ci, 3) # bound color loop to 0-3
        if schedule == 0:
            # Compute in chunks over tiles, vectorized by 8
            denoised.chunk(tx).vectorize(x, 8)
            deinterleaved.chunk(tx).vectorize(x, 8)
            corrected.chunk(tx).vectorize(x, 4)
            processed.tile(tx, ty, xi, yi, 32, 32).reorder(xi, yi, c, tx, ty)
            processed.parallel(ty)
        elif schedule == 1:
            # Same as above, but don't vectorize (sse is bad at interleaved 16-bit ops)
            denoised.chunk(tx)
            deinterleaved.chunk(tx)
            corrected.chunk(tx)
            processed.tile(tx, ty, xi, yi, 128, 128).reorder(xi, yi, c, tx, ty)
            processed.parallel(ty)
        elif schedule == -1:
            # Naive schedule
            denoised.root()
            deinterleaved.root()
            corrected.root()
            processed.root()
        return processed

    # The camera pipe is specialized on the 2592x1968 images that
    # come in, so we'll just use an image instead of a uniform image.
    #Image<int16_t> input(2592, 1968);
    input = UniformImage(UInt(16), 2, 'input')
    if use_uniforms:
        color_temp = Uniform(float_t, "color_temp", 3200.0)
        gamma = Uniform(float_t, "gamma", 1.8)
        contrast = Uniform(float_t, "contrast", 10.0)
    else:
        color_temp = 3700.0 #3200.0
        gamma = 2.0 #1.8
        contrast = 50.0 #10.0

    # shift things inwards to give us enough padding on the
    # boundaries so that we don't need to check bounds. We're going
    # to make a 2560x1920 output image, just like the FCam pipe, so
    # shift by 16, 12
    shifted = Func('shifted')
    shifted[x, y] = cast(Int(16), input[x+16, y+12])

    # The two calibrated 4x3 colour matrices, either as input images
    # filled from numpy arrays or as Funcs built from nested selects.
    if use_uniforms:
        matrix_3200 = UniformImage(float_t, 2, 'm3200')
        matrix_7000 = UniformImage(float_t, 2, 'm7000')

        matrix_3200_npy = numpy.array([[ 1.6697, -0.2693, -0.4004, -42.4346],
                                       [-0.3576,  1.0615,  1.5949, -37.1158],
                                       [-0.2175, -1.8751,  6.9640, -26.6970]],'float32')

        matrix_7000_npy = numpy.array([[ 2.2997, -0.4478,  0.1706, -39.0923],
                                       [-0.3826,  1.5906, -0.2080, -25.4311],
                                       [-0.0888, -0.7344,  2.2832, -20.0826]],'float32')
        matrix_3200.assign(matrix_3200_npy)
        matrix_7000.assign(matrix_7000_npy)
    else:
        matrix_3200 = Func('matrix_3200')
        matrix_7000 = Func('matrix_7000')
        matrix_3200[x,y] = select(y==0, select(x==0, 1.6697, select(x==1, -0.2693, select(x==2, -0.4004, -42.4346))),
                           select(y==1, select(x==0, -0.3576, select(x==1, 1.0615, select(x==2, 1.5949, -37.1158))),
                                        select(x==0, -0.2175, select(x==1, -1.8751, select(x==2, 6.9640, -26.6970)))))
        matrix_7000[x,y] = select(y==0, select(x==0, 2.2997, select(x==1, -0.4478, select(x==2, 0.1706, -39.0923))),
                           select(y==1, select(x==0, -0.3826, select(x==1, 1.5906, select(x==2, -0.2080, -25.4311))),
                                        select(x==0, -0.0888, select(x==1, -0.7344, select(x==2, 2.2832, -20.0826)))))
        matrix_3200.root()
        matrix_7000.root()

    processed = process(shifted, matrix_3200, matrix_7000, color_temp, gamma, contrast)

    # Special tuning variables interpreted by the autotuner
    tune_out_dims = OUT_DIMS
    tune_in_images = [os.path.join(inputs_dir(), '../apps/camera_pipe/raw_crop.png')]

    if schedule == 2:
        # Autotuned schedule
        asched = autotune.Schedule.fromstring(processed, 'b_b.chunk(x).vectorize(x,2)\nb_gb.chunk(x).vectorize(x,8)\nb_gr.chunk(y).tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\nb_r.chunk(y).tile(x,y,_c0,_c1,8,8).vectorize(_c0,8)\ncorrected.chunk(x).vectorize(x,8)\ncurve.root().vectorize(x,4).split(x,x,_c0,16)\ncurved.root().tile(x,y,_c0,_c1,32,32).parallel(y)\n\n\ndenoised.root().tile(x,y,_c0,_c1,64,64).vectorize(_c0,8).parallel(y)\ng_b.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ng_gb.chunk(x).vectorize(x,4)\ng_gr.chunk(y)\ng_r.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\n\n\ninterleave_x3.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_x4.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_x5.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_x6.root().tile(x,y,_c0,_c1,16,16).vectorize(_c0,16).parallel(y)\ninterleave_y1.root().tile(x,y,_c0,_c1,8,8).vectorize(_c0,8).parallel(y)\ninterleave_y2.chunk(x).vectorize(x,8)\ninterleave_y3.chunk(x).vectorize(x,8)\nmatrix.root().tile(x,y,_c0,_c1,4,4).vectorize(_c0,4).parallel(y)\nmatrix_3200.root().tile(x,y,_c0,_c1,4,4).parallel(y)\n\nprocessed.root().vectorize(tx,8)\nr_b.chunk(y).vectorize(x,8)\nr_gb.chunk(y).vectorize(x,8)\nr_gr.chunk(x)\nr_r.chunk(y)\nshifted.chunk(x).vectorize(x,4)')
        # Single-argument parenthesized print behaves the same on Python 2
        # and Python 3.
        print(asched)
        asched.apply()

    # FIXME: This gives an inaccurate timing in the tuner, not sure why
    # One directive per line, like every other schedule string here; the
    # text is flush-left because the literal's whitespace is part of the
    # string.
    tune_ref_schedules = {'human': """
g_r.chunk(tx).vectorize(x, 8)
g_b.chunk(tx).vectorize(x, 8)
r_gr.chunk(tx).vectorize(x, 8)
b_gr.chunk(tx).vectorize(x, 8)
r_gb.chunk(tx).vectorize(x, 8)
b_gb.chunk(tx).vectorize(x, 8)
r_b.chunk(tx).vectorize(x, 8)
b_r.chunk(tx).vectorize(x, 8)
interleave_y1.chunk(tx).vectorize(x, 8).unroll(y, 2)
interleave_y2.chunk(tx).vectorize(x, 8).unroll(y, 2)
interleave_y3.chunk(tx).vectorize(x, 8).unroll(y, 2)
curve.root()
matrix.root()
matrix_3200.root()
matrix_7000.root()
denoised.chunk(tx).vectorize(x, 8)
deinterleaved.chunk(tx).vectorize(x, 8).reorder(c, x, y).unroll(c, 4)
corrected.chunk(tx).vectorize(x, 4).reorder(c, x, y).unroll(c, 3)
processed.root().bound(c, 0, 3).tile(tx, ty, _c0, _c1, 32, 32).parallel(ty).reorder(_c0, _c1, c, tx, ty)
"""}
    # Bound c to [0, 3) everywhere, then widen the bound on
    # `deinterleaved` to [0, 4) since that Func has four channels.
    tune_constraints = autotune.bound_recursive(processed, 'c', 0, 3).replace('deinterleaved.bound(c,0,3)','deinterleaved.bound(c,0,4)')
    #print tune_constraints

    #def evaluate(in_png):
    #    output = Image(UInt(8), 2560, 1920, 3); # image size is hard-coded for the N900 raw pipeline

    #autotune.print_tunables(processed)
    #import autotune
    #g_r = all_funcs(processed)['g_r']
    #print 'caller_vars for g_r:', autotune.caller_vars(processed, g_r)
    #root_all(processed)
    #print 'Grouping'
    #import autotune
    #for sub in autotune.default_grouping(processed):
    #    print sub

    # In C++-11, this can be done as a simple initializer_list {color_temp,gamma,etc.} in place.
    #Func::Arg args[] = {color_temp, gamma, contrast, input, matrix_3200, matrix_7000};
    #processed.compileToFile("curved", std::vector<Func::Arg>(args, args+6));

    return (input, processed, None, locals())