def sample(varlist, schedule, name, bounds):
    """Pick one schedule-fragment template at random and fill it in.

    The ``varlist`` argument is not used: it is immediately replaced by
    ``autotune_bounds.get_xy(schedule.d[name].func, bounds)``, and the first
    two entries of that result are used as ``x`` and ``y``.

    Returns the chosen template with ``x``, ``y``, ``n`` (random choice from
    2, 4, 8) and, for the chunk template, ``chunk_var`` substituted.

    Raises ``autotune.MutateFailed`` when fewer than two variables are
    available, or when the chunk template is chosen, ``CHUNK_X_ALWAYS`` is
    false, and no candidate chunk variable is left after dropping ``'_c0'``.
    """
    varlist = autotune_bounds.get_xy(schedule.d[name].func, bounds)
    if len(varlist) < 2:
        raise autotune.MutateFailed
    # A disabled experiment used varlist[1] / varlist[2] instead when
    # autotune.SPECULATIVE_INTERPOLATE was set and three variables existed.
    x, y = varlist[0], varlist[1]

    templates = [
        '.chunk(%(chunk_var)s).vectorize(%(x)s,%(n)d)',
        '.root().tile(%(x)s,%(y)s,_c0,_c1,%(n)d,%(n)d).vectorize(_c0,%(n)d).parallel(%(y)s)',
        '.root().parallel(%(y)s).vectorize(%(x)s,%(n)d)',
    ]
    if autotune.is_cuda():
        # On CUDA the chunk template loses its vectorize step, and cudaTile
        # is listed three times so it is picked more often.
        templates[0] = '.chunk(%(chunk_var)s)'
        templates += ['.root().cudaTile(%(x)s,%(y)s,%(n)d,%(n)d)'] * 3

    pick = random.randrange(len(templates))
    fields = {'x': x, 'y': y}

    # Only template 0 refers to chunk_var, so it is resolved only then.
    if pick == 0:
        if CHUNK_X_ALWAYS:
            fields['chunk_var'] = x
        else:
            candidates = list(autotune.chunk_vars(schedule, schedule.d[name].func))
            if '_c0' in candidates:
                # Drops the first '_c0' entry only.
                candidates.remove('_c0')
            if not candidates:
                raise autotune.MutateFailed
            fields['chunk_var'] = random.choice(candidates)

    fields['n'] = random.choice([2, 4, 8])
    return templates[pick] % fields
def sample(varlist, schedule, name, bounds):
    """Return a randomly selected, filled-in schedule template for ``name``.

    ``varlist`` as passed in is discarded; the variables actually used come
    from ``autotune_bounds.get_xy(schedule.d[name].func, bounds)``. Its first
    two entries become ``x`` and ``y`` in the template.

    Raises ``autotune.MutateFailed`` if fewer than two variables come back,
    or if the chunk template is selected (with ``CHUNK_X_ALWAYS`` false) and
    the chunk-variable list is empty once ``'_c0'`` has been removed.
    """
    varlist = autotune_bounds.get_xy(schedule.d[name].func, bounds)
    if len(varlist) < 2:
        raise autotune.MutateFailed
    x = varlist[0]
    y = varlist[1]

    tile_tpl = '.root().tile(%(x)s,%(y)s,_c0,_c1,%(n)d,%(n)d).vectorize(_c0,%(n)d).parallel(%(y)s)'
    parallel_tpl = '.root().parallel(%(y)s).vectorize(%(x)s,%(n)d)'
    if autotune.is_cuda():
        # CUDA: plain chunk (no vectorize) plus three copies of cudaTile.
        options = ['.chunk(%(chunk_var)s)', tile_tpl, parallel_tpl]
        options.extend(['.root().cudaTile(%(x)s,%(y)s,%(n)d,%(n)d)'] * 3)
    else:
        options = ['.chunk(%(chunk_var)s).vectorize(%(x)s,%(n)d)', tile_tpl, parallel_tpl]

    index = random.randrange(len(options))
    values = {'x': x, 'y': y}

    # chunk_var is needed (and computed) only for the first template.
    if index == 0 and CHUNK_X_ALWAYS:
        values['chunk_var'] = x
    elif index == 0:
        pool = list(autotune.chunk_vars(schedule, schedule.d[name].func))
        if '_c0' in pool:
            # Remove the first '_c0' entry only.
            del pool[pool.index('_c0')]
        if len(pool) == 0:
            raise autotune.MutateFailed
        values['chunk_var'] = random.choice(pool)

    values['n'] = random.choice([2, 4, 8])
    return options[index] % values
def sample(varlist):
    """Fill a randomly chosen schedule template from ``varlist``.

    The first two entries of ``varlist`` are substituted as ``x`` and ``y``;
    ``n`` is a random choice from 2, 4, 8. When ``autotune.is_cuda()`` is
    true, three copies of the cudaTile template are added to the choices.

    Raises ``autotune.MutateFailed`` if ``varlist`` has fewer than two
    entries.
    """
    if len(varlist) < 2:
        raise autotune.MutateFailed
    x, y = varlist[0], varlist[1]

    templates = [
        '.chunk(%(x)s).vectorize(%(x)s,%(n)d)',
        '.root().tile(%(x)s,%(y)s,_c0,_c1,%(n)d,%(n)d).vectorize(_c0,%(n)d).parallel(%(y)s)',
        '.root().parallel(%(y)s).vectorize(%(x)s,%(n)d)',
    ]
    if autotune.is_cuda():
        templates += ['.root().cudaTile(%(x)s,%(y)s,%(n)d,%(n)d)'] * 3

    # Template is drawn before n, matching the original random-call order.
    pick = random.randrange(len(templates))
    fields = {'x': x, 'y': y, 'n': random.choice([2, 4, 8])}
    return templates[pick] % fields
# filter_func(J=8, dtype=UInt(16), use_uniforms=False) -- "Local Laplacian" (per its docstring).
#
# Defines a chain of Funcs and returns the tuple (input, output, None, locals()).
#   J            -- number of pyramid levels; gPyramid, lPyramid, inGPyramid,
#                   outLPyramid and outGPyramid are each lists of J Funcs.
#   dtype        -- element type of the 'input' UniformImage and of the final cast.
#                   Input is divided by float(dtype.maxval()) on the way in and the
#                   result is multiplied by it again before the cast on the way out.
#   use_uniforms -- when true, levels/alpha/beta are Uniform objects; otherwise they
#                   are the plain constants 8, 1.0 and 1.0.
#
# Stages visible in the body:
#   floating -> clamped (x/y clamped to [0, width-1] / [0, height-1]) -> gray
#   (0.299/0.587/0.114 weighting) -> gPyramid/lPyramid (built with the local
#   downsample/upsample helpers) -> outLPyramid (blend of lPyramid levels li and
#   li+1 weighted by lf) -> outGPyramid -> color -> output (clamped to [0, 1]).
#   root_all(output) is called after output is defined.
#
# human_schedule is a text schedule; it is rebuilt from cudaTile calls when
# autotune.is_cuda() is true. tune_ref_schedules and tune_constraints are
# "special variables interpreted by autotuner" and reach the caller via locals().
#
# NOTE(review): locals() is part of the return value, so renaming, adding or
# removing any local variable changes what the caller receives.
# NOTE(review): this definition is flattened onto a few physical lines, so
# statement nesting (e.g. which statements belong to the `else:` branches in the
# CUDA schedule loop) cannot be read from this text -- restore the original
# layout from version control before editing the code.
def filter_func(J=8, dtype=UInt(16), use_uniforms=False): "Local Laplacian." downsample_counter=[0] upsample_counter=[0] def downsample(f): downx, downy = Func('downx%d'%downsample_counter[0]), Func('downy%d'%downsample_counter[0]) downsample_counter[0] += 1 downx[x,y] = (f[2*x-1, y] + 3.0*(f[2*x,y]+f[2*x+1,y]) + f[2*x+2,y])/8.0 downy[x,y] = (downx[x,2*y-1] + 3.0*(downx[x,2*y]+downx[x,2*y+1]) + downx[x,2*y+2])/8.0 return downy def upsample(f): upx, upy = Func('upx%d'%upsample_counter[0]), Func('upy%d'%upsample_counter[0]) upsample_counter[0] += 1 upx[x,y] = 0.25 * f[(x/2)-1+2*(x%2),y] + 0.75 * f[x/2,y] upy[x,y] = 0.25 * upx[x, (y/2) - 1 + 2*(y%2)] + 0.75 * upx[x,y/2] return upy if use_uniforms: levels = Uniform(int_t, 'levels', 8) alpha = Uniform(float_t, 'alpha', 1.0) #1.0) beta = Uniform(float_t, 'beta', 1.0) else: levels = 8 alpha = 1.0 beta = 1.0 input = UniformImage(dtype, 3, 'input') x = Var('x') y = Var('y') c = Var('c') k = Var('k') fx = cast(float_t, x/256.0) remap = Func('remap') remap[x] = (alpha/cast(float_t, levels-1))*fx*exp(-fx*fx/2.0) floating = Func('floating') floating[x,y,c] = cast(float_t, input[x,y,c])/float(dtype.maxval()) clamped = Func('clamped') clamped[x,y,c] = floating[clamp(x,cast(int_t,0),cast(int_t,input.width()-1)), clamp(y,cast(int_t,0),cast(int_t,input.height()-1)), c] gray = Func('gray') gray[x,y] = 0.299*clamped[x,y,0]+0.587*clamped[x,y,1]+0.114*clamped[x,y,2] gPyramid = [Func('gPyramid%d'%i) for i in range(J)] idx = gray[x,y]*cast(float_t, levels-1)*256.0 idx = clamp(cast(int_t, idx), cast(int_t, 0), cast(int_t, (levels-1)*256)) gPyramid[0][x,y,k] = beta*gray[x,y] + remap[idx-256*k] for j in range(1,J): gPyramid[j][x,y,k] = downsample(gPyramid[j-1])[x,y,k] lPyramid = [Func('lPyramid%d'%i) for i in range(J)] lPyramid[J-1] = gPyramid[J-1] for j in range(J-1)[::-1]: lPyramid[j][x,y,k] = gPyramid[j][x,y,k] - upsample(gPyramid[j+1])[x,y,k] inGPyramid = [Func('inGPyramid%d'%i) for i in range(J)] inGPyramid[0] = gray for j in 
range(1,J): inGPyramid[j][x,y] = downsample(inGPyramid[j-1])[x,y] outLPyramid = [Func('outLPyramid%d'%i) for i in range(J)] for j in range(J): level = inGPyramid[j][x,y]*cast(float_t, levels-1) li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels-2)) lf = level - cast(float_t, li) outLPyramid[j][x,y] = (1.0-lf)*lPyramid[j][x,y,li] + lf*lPyramid[j][x,y,li+1] outGPyramid = [Func('outGPyramid%d'%i) for i in range(J)] outGPyramid[J-1] = outLPyramid[J-1] for j in range(J-1)[::-1]: outGPyramid[j][x,y] = upsample(outGPyramid[j+1])[x,y] + outLPyramid[j][x,y] color = Func('color') #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y] color[x,y,c] = outGPyramid[0][x,y] * (clamped[x,y,c]+0.01) / (gray[x,y]+0.01) output = Func('output') output[x,y,c] = cast(dtype, clamp(color[x,y,c], cast(float_t,0.0), cast(float_t,1.0))*float(dtype.maxval())) root_all(output) #import autotune #print autotune.root_all_str(output) #autotune.print_root_all(output) human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n' for j in range(J): human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%inGPyramid[j].name() if j > 0: human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n'%j human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n'%outGPyramid[j].name() if autotune.is_cuda(): human_schedule = 'remap.root()\n' human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n' for j in range(J): blockw = blockh = 32 if j > 3: blockw = blockh = 2 if j == 0: human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n'%(blockw, blockh) else: human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh) human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh) if j == J-1: human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, blockh) else: human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n'%(j, blockw, 
blockh) # Special variables interpreted by autotuner tune_ref_schedules = {'human': human_schedule} tune_constraints = autotune.bound_recursive(output, 'c', 0, 3) #print '# schedules:' #import math #print math.log(autotune.lower_bound_schedules(output),10) #sys.exit(1) return (input, output, None, locals())
def filter_func(dtype=UInt(16), use_uniforms=False):
    """Define the bilateral-grid pipeline and return it for autotuning.

    Args:
        dtype: Not referenced anywhere in the body.
        use_uniforms: When true, ``r_sigma`` is ``Uniform(float_t, 0.1)``
            instead of the constant ``0.1``.

    Returns:
        The tuple ``(input, smoothed, None, locals())``.

    NOTE: ``locals()`` is part of the return value, so every local name in
    this function (including ``tune_ref_schedules`` and ``tune_constraints``)
    is visible to the caller. Do not rename, add or remove locals without
    checking how the caller uses that mapping.
    """
    def lerp(a, b, alpha):
        # Linear blend of a and b: weight (1 - alpha) on a, alpha on b.
        return (1.0 - alpha) * a + alpha * b

    input = UniformImage(float_t, 3, 'input')
    if not use_uniforms:
        r_sigma = 0.1
    else:
        r_sigma = Uniform(float_t, 0.1)
    s_sigma = 8

    x = Var('x')
    y = Var('y')
    z = Var('z')
    c = Var('c')

    # Channel 0 of the input, with x/y clamped to the image extent.
    clamped = Func('clamped')
    clamped[x, y] = input[
        clamp(x, 0, input.width() - 1),
        clamp(y, 0, input.height() - 1),
        0,
    ]

    # Accumulate every s_sigma x s_sigma tile of samples into the grid.
    # NOTE(review): `s_sigma / 2` is an integer only under Python 2 division
    # rules; confirm before running on Python 3.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[
        x * s_sigma + r.x - s_sigma / 2,
        y * s_sigma + r.y - s_sigma / 2,
    ]
    val = clamp(val, 0.0, 1.0)
    zi = cast(int_t, val * (1.0 / r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Five-tap (1, 4, 6, 4, 1) blur of the grid along x, then y, then z.
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = (
        grid[x - 2, y, z]
        + grid[x - 1, y, z] * 4
        + grid[x, y, z] * 6
        + grid[x + 1, y, z] * 4
        + grid[x + 2, y, z]
    )
    blury[x, y, z] = (
        blurx[x, y - 2, z]
        + blurx[x, y - 1, z] * 4
        + blurx[x, y, z] * 6
        + blurx[x, y + 1, z] * 4
        + blurx[x, y + 2, z]
    )
    blurz[x, y, z] = (
        blury[x, y, z - 2]
        + blury[x, y, z - 1] * 4
        + blury[x, y, z] * 6
        + blury[x, y, z + 1] * 4
        + blury[x, y, z + 2]
    )

    # Trilinear sample of the blurred grid at each output pixel.
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0 / r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi
    xf = cast(float_t, x % s_sigma) / s_sigma
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = Func('interpolated')
    interpolated[x, y] = lerp(
        lerp(
            lerp(blurz[xi, yi, zi], blurz[xi + 1, yi, zi], xf),
            lerp(blurz[xi, yi + 1, zi], blurz[xi + 1, yi + 1, zi], xf),
            yf,
        ),
        lerp(
            lerp(blurz[xi, yi, zi + 1], blurz[xi + 1, yi, zi + 1], xf),
            lerp(blurz[xi, yi + 1, zi + 1], blurz[xi + 1, yi + 1, zi + 1], xf),
            yf,
        ),
        zf,
    )

    # Normalize: entry 0 divided by entry 1 of the interpolated result.
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0] / interpolated[x, y, 1]

    # `schedule` is hard-coded to 1, so only the CPU branch runs; 0 means
    # "no schedule" and 2 is the GPU variant.
    schedule = 1
    if schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    elif schedule != 0:
        raise ValueError

    # Reference schedules handed to the autotuner through locals().
    tune_ref_schedules = {
        'human': (
            'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n'
            'blurx.root().parallel(z).vectorize(x, 4)\n'
            'blury.root().parallel(z).vectorize(x, 4)\n'
            'blurz.root().parallel(z).vectorize(x, 4)\n'
            'smoothed.root().parallel(y).vectorize(x, 4)\n'
        ),
    }
    # GPU
    gpu_human = (
        'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n'
        'blurx.root().cudaTile(x, y, 8, 8)\n'
        'blury.root().cudaTile(x, y, 8, 8)\n'
        'blurz.root().cudaTile(x, y, 8, 8)\n'
        'smoothed.root().cudaTile(x, y, 8, 8)\n'
    )
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human

    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)

    return (input, smoothed, None, locals())
def filter_func(dtype=UInt(16), use_uniforms=False):
    """Build the bilateral-grid Funcs and return them to the autotuner.

    Args:
        dtype: Not referenced anywhere in the body.
        use_uniforms: Selects ``Uniform(float_t, 0.1)`` rather than the
            plain constant ``0.1`` for ``r_sigma``.

    Returns:
        ``(input, smoothed, None, locals())``.

    NOTE: the returned ``locals()`` mapping exposes every local name of this
    function, ``tune_ref_schedules`` and ``tune_constraints`` included, so
    local names must stay exactly as they are.
    """
    def lerp(a, b, alpha):
        # Weighted mix: (1 - alpha) of a plus alpha of b.
        return (1.0 - alpha) * a + alpha * b

    input = UniformImage(float_t, 3, 'input')
    r_sigma = Uniform(float_t, 0.1) if use_uniforms else 0.1
    s_sigma = 8

    x, y, z, c = Var('x'), Var('y'), Var('z'), Var('c')

    # Read channel 0 of the input with coordinates clamped to the image.
    clamped = Func('clamped')
    clamped[x, y] = input[clamp(x, 0, input.width() - 1),
                          clamp(y, 0, input.height() - 1),
                          0]

    # Grid construction: reduce each s_sigma x s_sigma window into a cell.
    # NOTE(review): `s_sigma/2` yields an int only with Python 2 division;
    # confirm the intended interpreter.
    r = RDom(0, s_sigma, 0, s_sigma, 'r')
    val = clamped[x * s_sigma + r.x - s_sigma / 2,
                  y * s_sigma + r.y - s_sigma / 2]
    val = clamp(val, 0.0, 1.0)
    zi = cast(int_t, val * (1.0 / r_sigma) + 0.5)
    grid = Func('grid')
    grid[x, y, z, c] = 0.0
    grid[x, y, zi, c] += select(c == 0, val, 1.0)

    # Blur the grid using a five-tap filter, one axis at a time.
    blurx, blury, blurz = Func('blurx'), Func('blury'), Func('blurz')
    blurx[x, y, z] = (grid[x - 2, y, z]
                      + grid[x - 1, y, z] * 4
                      + grid[x, y, z] * 6
                      + grid[x + 1, y, z] * 4
                      + grid[x + 2, y, z])
    blury[x, y, z] = (blurx[x, y - 2, z]
                      + blurx[x, y - 1, z] * 4
                      + blurx[x, y, z] * 6
                      + blurx[x, y + 1, z] * 4
                      + blurx[x, y + 2, z])
    blurz[x, y, z] = (blury[x, y, z - 2]
                      + blury[x, y, z - 1] * 4
                      + blury[x, y, z] * 6
                      + blury[x, y, z + 1] * 4
                      + blury[x, y, z + 2])

    # Take trilinear samples to compute the output.
    val = clamp(clamped[x, y], 0.0, 1.0)
    zv = val * (1.0 / r_sigma)
    zi = cast(int_t, zv)
    zf = zv - zi
    xf = cast(float_t, x % s_sigma) / s_sigma
    yf = cast(float_t, y % s_sigma) / s_sigma
    xi = x / s_sigma
    yi = y / s_sigma
    interpolated = Func('interpolated')
    interpolated[x, y] = lerp(
        lerp(lerp(blurz[xi, yi, zi], blurz[xi + 1, yi, zi], xf),
             lerp(blurz[xi, yi + 1, zi], blurz[xi + 1, yi + 1, zi], xf),
             yf),
        lerp(lerp(blurz[xi, yi, zi + 1], blurz[xi + 1, yi, zi + 1], xf),
             lerp(blurz[xi, yi + 1, zi + 1], blurz[xi + 1, yi + 1, zi + 1], xf),
             yf),
        zf)

    # Normalize
    smoothed = Func('smoothed')
    smoothed[x, y, c] = interpolated[x, y, 0] / interpolated[x, y, 1]

    # Built-in schedule selector, fixed at 1 (CPU); 0 applies nothing and
    # 2 is the GPU schedule.
    schedule = 1
    if schedule == 1:
        # Best schedule for CPU
        grid.root().parallel(z)
        grid.update().reorder(c, x, y).parallel(y)
        blurx.root().parallel(z).vectorize(x, 4)
        blury.root().parallel(z).vectorize(x, 4)
        blurz.root().parallel(z).vectorize(x, 4)
        smoothed.root().parallel(y).vectorize(x, 4)
    elif schedule == 2:
        # Best schedule for GPU
        gridz = grid.arg(2)
        grid.root().cudaTile(x, y, 16, 16)
        grid.update().root().cudaTile(x, y, 16, 16)
        blurx.root().cudaTile(x, y, 8, 8)
        blury.root().cudaTile(x, y, 8, 8)
        blurz.root().cudaTile(x, y, 8, 8)
        smoothed.root().cudaTile(x, y, s_sigma, s_sigma)
    elif schedule != 0:
        raise ValueError

    # Hand-written reference schedule (CPU text form) for the autotuner.
    tune_ref_schedules = {}
    tune_ref_schedules['human'] = (
        'grid.root().parallel(z).update().reorder(c, x, y).parallel(y)\n'
        'blurx.root().parallel(z).vectorize(x, 4)\n'
        'blury.root().parallel(z).vectorize(x, 4)\n'
        'blurz.root().parallel(z).vectorize(x, 4)\n'
        'smoothed.root().parallel(y).vectorize(x, 4)\n')

    # GPU reference schedule; replaces the CPU one when targeting CUDA.
    gpu_human = (
        'grid.root().cudaTile(x, y, 16, 16).update().root().cudaTile(x, y, 16, 16)\n'
        'blurx.root().cudaTile(x, y, 8, 8)\n'
        'blury.root().cudaTile(x, y, 8, 8)\n'
        'blurz.root().cudaTile(x, y, 8, 8)\n'
        'smoothed.root().cudaTile(x, y, 8, 8)\n')
    if autotune.is_cuda():
        tune_ref_schedules['human'] = gpu_human

    tune_constraints = autotune.bound_recursive(smoothed, 'c', 0, 3)

    return (input, smoothed, None, locals())
# filter_func(J=8, dtype=UInt(16), use_uniforms=False) -- "Local Laplacian" (per its docstring).
#
# Defines a chain of Funcs and returns the tuple (input, output, None, locals()).
#   J            -- number of pyramid levels; gPyramid, lPyramid, inGPyramid,
#                   outLPyramid and outGPyramid are each lists of J Funcs.
#   dtype        -- element type of the 'input' UniformImage and of the final cast.
#                   Input is divided by float(dtype.maxval()) on the way in and the
#                   result is multiplied by it again before the cast on the way out.
#   use_uniforms -- when true, levels/alpha/beta are Uniform objects; otherwise they
#                   are the plain constants 8, 1.0 and 1.0.
#
# Stages visible in the body:
#   floating -> clamped (x/y clamped to [0, width-1] / [0, height-1]) -> gray
#   (0.299/0.587/0.114 weighting) -> gPyramid/lPyramid (built with the local
#   downsample/upsample helpers) -> outLPyramid (blend of lPyramid levels li and
#   li+1 weighted by lf) -> outGPyramid -> color -> output (clamped to [0, 1]).
#   root_all(output) is called after output is defined.
#
# human_schedule is a text schedule; it is rebuilt from cudaTile calls when
# autotune.is_cuda() is true. tune_ref_schedules and tune_constraints are
# "special variables interpreted by autotuner" and reach the caller via locals().
#
# NOTE(review): locals() is part of the return value, so renaming, adding or
# removing any local variable changes what the caller receives.
# NOTE(review): this definition is flattened onto a few physical lines, so
# statement nesting (e.g. which statements belong to the `else:` branches in the
# CUDA schedule loop) cannot be read from this text -- restore the original
# layout from version control before editing the code.
def filter_func(J=8, dtype=UInt(16), use_uniforms=False): "Local Laplacian." downsample_counter = [0] upsample_counter = [0] def downsample(f): downx, downy = Func('downx%d' % downsample_counter[0]), Func( 'downy%d' % downsample_counter[0]) downsample_counter[0] += 1 downx[x, y] = (f[2 * x - 1, y] + 3.0 * (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0 downy[x, y] = (downx[x, 2 * y - 1] + 3.0 * (downx[x, 2 * y] + downx[x, 2 * y + 1]) + downx[x, 2 * y + 2]) / 8.0 return downy def upsample(f): upx, upy = Func('upx%d' % upsample_counter[0]), Func( 'upy%d' % upsample_counter[0]) upsample_counter[0] += 1 upx[x, y] = 0.25 * f[(x / 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x / 2, y] upy[x, y] = 0.25 * upx[x, (y / 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y / 2] return upy if use_uniforms: levels = Uniform(int_t, 'levels', 8) alpha = Uniform(float_t, 'alpha', 1.0) #1.0) beta = Uniform(float_t, 'beta', 1.0) else: levels = 8 alpha = 1.0 beta = 1.0 input = UniformImage(dtype, 3, 'input') x = Var('x') y = Var('y') c = Var('c') k = Var('k') fx = cast(float_t, x / 256.0) remap = Func('remap') remap[x] = (alpha / cast(float_t, levels - 1)) * fx * exp(-fx * fx / 2.0) floating = Func('floating') floating[x, y, c] = cast(float_t, input[x, y, c]) / float(dtype.maxval()) clamped = Func('clamped') clamped[x, y, c] = floating[ clamp(x, cast(int_t, 0), cast(int_t, input.width() - 1)), clamp(y, cast(int_t, 0), cast(int_t, input.height() - 1)), c] gray = Func('gray') gray[x, y] = 0.299 * clamped[x, y, 0] + 0.587 * clamped[ x, y, 1] + 0.114 * clamped[x, y, 2] gPyramid = [Func('gPyramid%d' % i) for i in range(J)] idx = gray[x, y] * cast(float_t, levels - 1) * 256.0 idx = clamp(cast(int_t, idx), cast(int_t, 0), cast(int_t, (levels - 1) * 256)) gPyramid[0][x, y, k] = beta * gray[x, y] + remap[idx - 256 * k] for j in range(1, J): gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k] lPyramid = [Func('lPyramid%d' % i) for i in range(J)] lPyramid[J - 1] = gPyramid[J - 1] for j in range(J - 
1)[::-1]: lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample( gPyramid[j + 1])[x, y, k] inGPyramid = [Func('inGPyramid%d' % i) for i in range(J)] inGPyramid[0] = gray for j in range(1, J): inGPyramid[j][x, y] = downsample(inGPyramid[j - 1])[x, y] outLPyramid = [Func('outLPyramid%d' % i) for i in range(J)] for j in range(J): level = inGPyramid[j][x, y] * cast(float_t, levels - 1) li = clamp(cast(int_t, level), cast(int_t, 0), cast(int_t, levels - 2)) lf = level - cast(float_t, li) outLPyramid[j][x, y] = ( 1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1] outGPyramid = [Func('outGPyramid%d' % i) for i in range(J)] outGPyramid[J - 1] = outLPyramid[J - 1] for j in range(J - 1)[::-1]: outGPyramid[j][x, y] = upsample( outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y] color = Func('color') #color[x,y,c] = outGPyramid[0][x,y] * clamped[x,y,c] / gray[x,y] color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + 0.01) / (gray[x, y] + 0.01) output = Func('output') output[x, y, c] = cast( dtype, clamp(color[x, y, c], cast(float_t, 0.0), cast(float_t, 1.0)) * float(dtype.maxval())) root_all(output) #import autotune #print autotune.root_all_str(output) #autotune.print_root_all(output) human_schedule = 'remap.root()\noutput.root().split(y, y, _c0, 32).parallel(y).vectorize(x, 4)\n' for j in range(J): human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % inGPyramid[ j].name() if j > 0: human_schedule += 'gPyramid%d.root().parallel(k).vectorize(x, 4)\n' % j human_schedule += '%s.root().split(y, y, _c0, 4).parallel(y).vectorize(x, 4)\n' % outGPyramid[ j].name() if autotune.is_cuda(): human_schedule = 'remap.root()\n' human_schedule += 'output.root().cudaTile(x, y, 32, 32)\n' for j in range(J): blockw = blockh = 32 if j > 3: blockw = blockh = 2 if j == 0: human_schedule += 'gray.root().cudaTile(x, y, %d, %d)\n' % ( blockw, blockh) else: human_schedule += 'inGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % ( j, blockw, blockh) 
human_schedule += 'gPyramid%d.root().cudaTile(x, y, %d, %d)\n' % ( j, blockw, blockh) if j == J - 1: human_schedule += 'outLPyramid%d.root().cudaTile(x, y, %d, %d)\n' % ( j, blockw, blockh) else: human_schedule += 'outGPyramid%d.root().cudaTile(x, y, %d, %d)\n' % ( j, blockw, blockh) # Special variables interpreted by autotuner tune_ref_schedules = {'human': human_schedule} tune_constraints = autotune.bound_recursive(output, 'c', 0, 3) #print '# schedules:' #import math #print math.log(autotune.lower_bound_schedules(output),10) #sys.exit(1) return (input, output, None, locals())