def gen_calc_variance_fullintegral(code, out_reg, position, integral_ptr, sq_integral_ptr, haar_size, block_size):
    ''' Generate the variance term of a detection window from full integral images.

    The yielded instructions leave in out_reg
        sqrt(area*S2 - S1*S1)   when area*S2 - S1*S1 is greater than 0
        1                       otherwise
    with area = haar_width*haar_height, S1 the window sum read from the
    buffer at integral_ptr and S2 the window sum read from the buffer at
    sq_integral_ptr. Both sums are generated by gen_fullintegral_sum.
    '''
    width, height = block_size  # unpacked only; the values are not used below
    haar_w, haar_h = haar_size
    # the window is one larger in each direction so that results match the
    # python VJ implementation, where x,y: [0, w], [0, h]
    window = (0, 0, haar_w + 1, haar_h + 1)

    with scoped_alloc(code, 1) as sq_sum:
        # S1 -> out_reg
        for instr in gen_fullintegral_sum(code, out_reg, position, window, integral_ptr, block_size):
            yield instr
        # S2 -> sq_sum
        for instr in gen_fullintegral_sum(code, sq_sum, position, window, sq_integral_ptr, block_size):
            yield instr

        # sq_sum = S2*area
        with scoped_alloc(code, 1) as area_reg:
            yield Imm(area_reg, haar_w*haar_h)
            yield Mul(sq_sum, sq_sum, area_reg)

        # out_reg = S2*area - S1^2
        yield Mul(out_reg, out_reg, out_reg)
        yield Sub(out_reg, sq_sum, out_reg)

        # compare the radicand with zero, then take the root conditionally
        with scoped_alloc(code, 1) as zero:
            yield Imm(zero, 0.)
            yield Cmp(out_reg, zero)
        yield Sqrt(out_reg, out_reg, cond='GT')
        yield Imm(out_reg, 1, cond='LE')  # non-positive radicand: result is 1
def gen_gather_local_max(code, block_size, args):
    ''' Generate code that stores, per pixel, the maximum over its masked neighbourhood.

    args['filter'] provides size() and a 2D mask; only neighbours whose mask
    entry is truthy take part. args['in_ptr'] (default 0) is the source
    buffer, args['out_ptr'] (default rows*cols) the destination buffer.
    '''
    mask_filter = args['filter']
    rows, cols = block_size
    mask_rows, mask_cols = mask_filter.size()
    half_rows, half_cols = mask_rows//2, mask_cols//2
    dst = args.get('out_ptr', rows*cols)
    src = args.get('in_ptr', 0)

    # (dy, dx) offsets of all enabled mask entries, relative to the mask centre
    taps = [(r - half_rows, c - half_cols)
            for r in xrange(mask_rows)
            for c in xrange(mask_cols)
            if mask_filter.mask[r][c]]

    for y in xrange(rows):
        for x in xrange(cols):
            with scoped_alloc(code, 1) as best:
                # start from minus infinity so the first candidate always wins
                yield Imm(best, -float('inf'))
                for dy, dx in taps:
                    with scoped_alloc(code, 1) as cand:
                        # the position is given as (x, y) and may lie outside the block
                        for instr in load_mem_value(code, src, (x + dx, y + dy), cand, block_size):
                            yield instr
                        yield Cmp(cand, best)
                        yield Mov(best, cand, cond='GT')
                yield MemWImm(dst + y*cols + x, best)
def gen_threshold(code, block_size, args):
    ''' Generate binary threshold code using register-held pointers.

    Reads block_size[0]*block_size[1] values starting at address 0 and, for
    each one, writes 0 or (when the compare with args['th'] yields 'GT') 255
    to the buffer that starts at address block_size[0]*block_size[1].
    '''
    threshold = args['th']
    last_i = block_size[0] - 1
    last_j = block_size[1] - 1
    with scoped_alloc(code, 6) as (dst_r, limit_r, one, white, black, src_r):
        # destination pointer
        yield Imm(dst_r, block_size[0]*block_size[1])
        # constants
        yield Imm(limit_r, threshold)
        yield Imm(one, 1)
        yield Imm(white, 255)
        yield Imm(black, 0)
        # source pointer starts at address 0
        yield Mov(src_r, black)
        for i in xrange(block_size[0]):
            for j in xrange(block_size[1]):
                with scoped_alloc(code, 1) as pixel:
                    yield MemR(pixel, src_r)
                    yield Cmp(pixel, limit_r)
                    # default to 0, overwrite with 255 on 'GT'
                    yield Mov(pixel, black)
                    yield Mov(pixel, white, 'GT')
                    yield MemW(dst_r, pixel)
                # advance both pointers, except after the very last pixel
                if i != last_i or j != last_j:
                    yield Add(src_r, src_r, one)
                    yield Add(dst_r, dst_r, one)
def codegen(code, block_size, args):
    # Instruction-sequence fixture: adds to and subtracts from acc values that
    # are loaded from a fixed address (2) and from a loop-dependent address (i).
    # block_size and args are not used.
    # NOTE(review): looks like input for a memory-load optimisation test -- confirm with the caller.
    # NOTE(review): nesting reconstructed from a flattened source; verify that
    # the final Add belongs inside the loop.
    with scoped_alloc(code, 1) as acc:
        yield Xor(acc, acc, acc)  # acc xor acc
        with scoped_alloc(code, 1) as m:
            for i in xrange(3):
                # load of the same fixed address in every iteration
                yield MemRImm(m, 2)
                yield Add(acc, acc, m)
                yield Sub(acc, acc, m)
                # address i is loaded twice with no write in between
                yield MemRImm(m, i)
                yield Add(acc, acc, m)
                yield MemRImm(m, i)
                yield Sub(acc, acc, m)
                # m itself is overwritten after the loads
                yield Add(m, m, m)
def gen_abs_value(code, block_size, args):
    ''' Generate code writing the absolute value of every buffer element.

    The source buffer starts at args['in_ptr'] (default 0), the destination
    buffer at args['out_ptr'] (default rows*cols); both hold rows*cols values.
    '''
    rows, cols = block_size
    dst = args.get('out_ptr', rows*cols)
    src = args.get('in_ptr', 0)
    with scoped_alloc(code, 1) as zero:
        yield Imm(zero, 0)
        for y in xrange(rows):
            row_off = y*cols
            for x in xrange(cols):
                off = row_off + x
                with scoped_alloc(code, 1) as value:
                    yield MemRImm(value, src + off)
                    # negate only when the value compares 'LT' against zero
                    yield Cmp(value, zero)
                    yield Neg(value, value, cond='LT')
                    yield MemWImm(dst + off, value)
def gen_full_integral_image(code, src_ptr, integral_ptr, sq_integral_ptr, pe_array_size, block_size):
    ''' Generate integral and squared integral images spanning the PE array.

    First the per-block images are generated with gen_integral_image. Then,
    for each of the two buffers, edge values are propagated between blocks:
    row by row through code.out/code.west (pe_width-1 steps) and afterwards
    column by column through code.out/code.north (pe_height-1 steps).
    '''
    width, height = block_size
    pe_cols, pe_rows = pe_array_size

    for instr in gen_integral_image(code, src_ptr, integral_ptr, sq_integral_ptr, block_size):
        yield instr

    for base in (integral_ptr, sq_integral_ptr):
        with scoped_alloc(code, 1) as acc:
            # horizontal propagation
            for row in xrange(height):
                row_base = base + row*width
                # publish the last value of this row
                yield MemRImm(code.out, row_base + width - 1)
                for _ in xrange(pe_cols - 1):
                    for col in xrange(width):
                        addr = row_base + col
                        yield MemRImm(acc, addr)
                        yield Add(acc, acc, code.west)
                        yield MemWImm(addr, acc)
                    # hand the received value on for the next step
                    yield Mov(code.out, code.west)

            # vertical propagation
            bottom_row = base + width*(height - 1)
            for col in xrange(width):
                # publish the last value of this column
                yield MemRImm(code.out, bottom_row + col)
                for _ in xrange(pe_rows - 1):
                    for row in xrange(height):
                        addr = base + row*width + col
                        yield MemRImm(acc, addr)
                        yield Add(acc, acc, code.north)
                        yield MemWImm(addr, acc)
                    # hand the received value on for the next step
                    yield Mov(code.out, code.north)
def map_neighborhood_to_pixel(code, in_ptr, out_ptr, neighborhood, pixel_op, args, block_size):
    """ Apply neighborhood to pixel operations.

    For every pixel of the block an accumulator register is cleared, pixel_op
    is run once for each truthy neighborhood entry, and the accumulator is
    written to the output buffer.

    code         -- code object that registers are allocated from
    in_ptr       -- base address of the source buffer
    out_ptr      -- base address of the destination buffer
    neighborhood -- list of rows of mask values (booleans or coefficients),
                    both dimensions must be odd
    pixel_op     -- generator function, called as
                    pixel_op(code, mask_value, value_reg, acc_reg, args, block_size)
    args         -- handed through to pixel_op unchanged
    block_size   -- (width, height) of the block

    Yields the generated instructions. An AssertionError is raised on the
    first iteration when a neighborhood dimension is even.
    """
    bwidth, bheight = block_size
    # neighborhood is indexed [row][column]: the number of rows is its height,
    # the length of a row its width. The centre offsets below rely on this,
    # which matters for non-square masks.
    nheight, nwidth = len(neighborhood), len(neighborhood[0])
    assert nheight % 2 != 0 and nwidth % 2 != 0 # mask size must be odd
    h_nheight = nheight // 2
    h_nwidth = nwidth // 2

    def process_pixel(code, in_ptr, pos, acc, mask, pixel_op, args, block_size):
        ''' Run pixel_op for every enabled entry of mask, centred on pos. '''
        j, i = pos
        for ii, row in enumerate(mask):
            for jj, m in enumerate(row):
                if m: # works implicitly for booleans and coefficients
                    # (x, y) of the neighbour; may lie outside this block
                    apos = (j + jj - h_nwidth, i + ii - h_nheight)
                    with scoped_alloc(code, 1) as v:
                        for x in load_mem_value(code, in_ptr, apos, v, block_size):
                            yield x
                        for x in pixel_op(code, m, v, acc, args, block_size):
                            yield x

    for i in xrange(bheight):
        for j in xrange(bwidth):
            pos = (j, i)
            with scoped_alloc(code, 1) as acc:
                # XXX apply assignment-instead-of-accum-on-first-iteration optimalisation
                yield Xor(acc, acc, acc)
                for x in process_pixel(code, in_ptr, pos, acc, neighborhood, pixel_op, args, block_size):
                    yield x
                yield MemWImm(out_ptr + bwidth * i + j, acc)
def codegen(code, block_size, args):
    # Instruction-sequence fixture: reads addresses 3 and 4, adds them, writes
    # the sum to address 3 and then reads address 3 again.
    # block_size and args are not used.
    with scoped_alloc(code, 2) as (a, b):
        yield MemRImm(a, 3)
        yield MemRImm(b, 4)
        yield Add(a, a, b)
        yield MemWImm(3, a)
        yield MemRImm(a, 3)  # reload of the address that was just written
def pixel_op(code, pin, pout, args, block_size):
    ''' Threshold pixel operation.

    Generates code that compares register pin with args['th'] and sets
    register pout to 255 on 'GT' and clears it (xor with itself) on 'LE'.

    code       -- code object that registers are allocated from
    pin        -- register holding the input pixel
    pout       -- register receiving the result
    args       -- dict with the threshold under key 'th'
    block_size -- not used

    Yields the generated instructions.
    '''
    th = args['th']
    # only the threshold needs a scratch register; allocating more would just
    # take registers away from the caller
    with scoped_alloc(code, 1) as th_r:
        yield Imm(th_r, th)
        yield Cmp(pin, th_r)
        yield Imm(pout, 255, cond='GT')
        yield Xor(pout, pout, pout, cond='LE')
def gen_bbs(code, block_size, args):
    ''' Generate background subtraction code with a running-average background.

    For every pixel p of the block:
        background[p] = alpha*image[p] + (1-alpha)*background[p]
        result[p]     = 255 if abs(image[p] - background[p]) > th else 0
    where the difference is taken against the freshly updated background.
    alpha and th come from args['alpha'] and args['th'].

    Memory layout: image at 0, result at width*height, background at
    2*width*height.
    '''
    th = args['th']
    alpha = args['alpha']
    width, height = block_size
    n_pixels = width * height

    # buffer base addresses
    src_ptr = 0
    res_ptr = n_pixels
    back_ptr = 2*n_pixels

    with scoped_alloc(code, 3) as (alpha_r, inv_alpha_r, th_r):
        # parameters that stay constant over all pixels
        yield Imm(alpha_r, alpha)
        yield Imm(inv_alpha_r, 1-alpha)
        yield Imm(th_r, th)

        for p in xrange(n_pixels):
            with scoped_alloc(code, 5) as (pixel, old_bg, new_bg, diff, out):
                # new_bg = pixel*alpha + old_bg*(1-alpha)
                yield MemRImm(pixel, src_ptr + p)
                yield Mul(new_bg, pixel, alpha_r)
                yield MemRImm(old_bg, back_ptr + p)
                yield Mul(old_bg, old_bg, inv_alpha_r)
                yield Add(new_bg, new_bg, old_bg)
                yield MemWImm(back_ptr + p, new_bg)

                # diff = abs(pixel - new_bg): subtract in whichever order
                # the compare selects
                yield Cmp(pixel, new_bg)
                yield Sub(diff, pixel, new_bg, cond='GE')
                yield Sub(diff, new_bg, pixel, cond='LT')

                # out = 255 on diff 'GT' th, else 0
                yield Cmp(diff, th_r)
                yield Imm(out, 0)
                yield Imm(out, 255, cond='GT')
                yield MemWImm(res_ptr + p, out)
def codegen(code, block_size, args):
    # Instruction-sequence fixture around conditional instructions: after one
    # compare, c is written by an Inv with cond='LE' and by a Mov with
    # cond='GT', while a is overwritten in between.
    # block_size and args are not used.
    with scoped_alloc(code, 4) as (a, b, c, d):
        yield Imm(a, 1)
        yield Imm(b, 2)
        yield Cmp(a, b)  # both conditional instructions below follow this compare
        yield Inv(c, a, cond='LE')
        yield Imm(a, 3) # to check if a is captured before new assignment
        yield Mov(c, b, cond='GT')
        yield Mov(d, c)
def codegen(code, block_size, args):
    # Instruction-sequence fixture: four repetitions of the same register
    # shuffle, each with a fresh allocation of three registers and with
    # immediates that depend on the iteration (x*4 and x).
    # block_size and args are not used.
    for x in xrange(4):
        with scoped_alloc(code, 3) as (a, b, c):
            yield Imm(b, x*4)
            yield Mov(a, b)
            yield Imm(a, x)  # overwrites the value just moved into a
            yield Mov(b, a)
            yield Add(a, a, b)
            yield Sub(c, a, b)
            yield Mul(c, c, c)
def codegen(code, block_size, args):
    # Instruction-sequence fixture: adds to and subtracts from acc an
    # immediate that is the same in every iteration (2) and one that depends
    # on the iteration (i). block_size and args are not used.
    # NOTE(review): nesting reconstructed from a flattened source; verify
    # that all statements after the for belong inside the loop.
    with scoped_alloc(code, 2) as (acc, imm_r):
        yield Xor(acc, acc, acc)  # acc xor acc
        for i in xrange(3):
            # same immediate in every iteration
            yield Imm(imm_r, 2)
            yield Add(acc, acc, imm_r)
            yield Sub(acc, acc, imm_r)
            # iteration-dependent immediate
            yield Imm(imm_r, i)
            yield Add(acc, acc, imm_r)
            yield Sub(acc, acc, imm_r)
def gen_copy_to_out(code, block_size, args):
    ''' Generate code copying the input buffer to the output buffer.

    Values are read starting at address 0 and written starting at address
    block_size[0]*block_size[1]; both pointers live in registers and are
    incremented after every element. args is not used.
    '''
    with scoped_alloc(code, 3) as (dst_r, src_r, one):
        # destination pointer starts right behind the input buffer
        yield Imm(dst_r, block_size[0]*block_size[1])
        # source pointer: xor with itself
        yield Xor(src_r, src_r, src_r)
        # pointer increment
        yield Imm(one, 1)

        for _i in xrange(block_size[0]):
            for _j in xrange(block_size[1]):
                with scoped_alloc(code, 1) as value:
                    yield MemR(value, src_r)
                    yield MemW(dst_r, value)
                    yield Add(src_r, src_r, one)
                    yield Add(dst_r, dst_r, one)
def gen_integral_sum(code, out_reg, position, shape, ptr, block_size):
    ''' Generate the integral image sum over one shape.

    The shape (x, y, w, h) is relative to position and must lie within a
    single block that is at most one block to the right and/or below the
    originating block.

    Unlike the python implementation a block covers x[0,w[, y[0,h[, so the
    caller has to pass width and height incremented by one to stay
    compatible with the violajones sum function.

    The sum ends up in out_reg as v1 - v2 - v3 + v4 with
        v1 = im[top   ][left ]    v2 = im[top   ][right]
        v3 = im[bottom][left ]    v4 = im[bottom][right]
    '''
    px, py = position
    x, y, w, h = shape
    width, height = block_size
    left = px + x
    top = py + y

    # A shape that crosses the right/lower block edge is evaluated at the
    # wrapped coordinates and the result is fetched from the neighbour below.
    from_right = left + w - 1 >= width
    if from_right:
        left -= width
    from_below = top + h - 1 >= height
    if from_below:
        top -= height

    right = left + w - 1
    bottom = top + h - 1

    with scoped_alloc(code, 1) as corner:
        yield MemRImm(out_reg, ptr + top * width + left)      # v1
        yield MemRImm(corner, ptr + top * width + right)      # v2
        yield Sub(out_reg, out_reg, corner)                   # v1 - v2
        yield MemRImm(corner, ptr + bottom * width + left)    # v3
        yield Sub(out_reg, out_reg, corner)                   # v1 - v2 - v3
        yield MemRImm(corner, ptr + bottom * width + right)   # v4
        yield Add(out_reg, out_reg, corner)                   # v1 - v2 - v3 + v4

    # fetch results that were computed for a neighbouring block
    if from_right and from_below:
        # two hops: first from the east, then from the south
        yield Mov(code.out, out_reg)
        yield Mov(code.out, code.east)
        yield Mov(out_reg, code.south)
    elif from_right:
        yield Mov(code.out, out_reg)
        yield Mov(out_reg, code.east)
    elif from_below:
        yield Mov(code.out, out_reg)
        yield Mov(out_reg, code.south)
def gen_calc_variance(code, out_reg, position, integral_ptr, sq_integral_ptr, haar_size, block_size):
    ''' Generate the variance term of a detection window from per-block integral images.

    The window is split with split_shape_across_blocks; the partial sums of
    all parts are accumulated for the integral image (S1) and the squared
    integral image (S2). Instructions generated for every part after the
    first are passed to code.tag_com_overhead_instr.

    out_reg receives
        sqrt(area*S2 - S1*S1)   when area*S2 - S1*S1 is greater than 0
        1.                      otherwise
    with area = haar_width*haar_height.
    '''
    width, height = block_size  # unpacked only; the values are not used below
    haar_w, haar_h = haar_size
    # the window is one larger in each direction so that results match the
    # python VJ implementation, where x,y: [0, w], [0, h]
    window = (0, 0, haar_w + 1, haar_h + 1)
    parts = split_shape_across_blocks(window, position, block_size)

    with scoped_alloc(code, 2) as (sum_acc, sq_sum_acc):
        # clear both accumulators
        yield Xor(sum_acc, sum_acc, sum_acc)
        yield Xor(sq_sum_acc, sq_sum_acc, sq_sum_acc)

        for idx, part in enumerate(parts):
            tag_it = idx > 0
            # integral image first, squared integral image second
            for buf_ptr, target in ((integral_ptr, sum_acc), (sq_integral_ptr, sq_sum_acc)):
                with scoped_alloc(code, 1) as partial:
                    for instr in gen_integral_sum(code, partial, position, part, buf_ptr, block_size):
                        if tag_it:
                            code.tag_com_overhead_instr(instr)
                        yield instr
                    yield Add(target, target, partial)

        # sq_sum_acc = S2*area
        with scoped_alloc(code, 1) as area_reg:
            yield Imm(area_reg, haar_w*haar_h)
            yield Mul(sq_sum_acc, sq_sum_acc, area_reg)

        # sum_acc = S2*area - S1^2
        yield Mul(sum_acc, sum_acc, sum_acc)
        yield Sub(sum_acc, sq_sum_acc, sum_acc)

        # compare the radicand with zero, then take the root conditionally
        with scoped_alloc(code, 1) as zero:
            yield Imm(zero, 0.)
            yield Cmp(sum_acc, zero)
        yield Sqrt(out_reg, sum_acc, cond='GT')
        yield Imm(out_reg, 1., cond='LE')  # non-positive radicand: result is 1
def map_pixel_to_pixel(code, in_ptr, out_ptr, pixel_op, args, block_size):
    """ Apply a one to one pixel operation to every pixel of a block.

    Each pixel is loaded from the buffer at in_ptr into a register, pixel_op
    generates the code that produces the result register, and the result is
    stored at the same offset in the buffer at out_ptr.
    pixel_op is called as pixel_op(code, in_reg, out_reg, args, block_size).
    """
    bwidth, bheight = block_size
    for row in xrange(bheight):
        row_off = bwidth * row
        for col in xrange(bwidth):
            off = row_off + col
            with scoped_alloc(code, 2) as (src_reg, dst_reg):
                yield MemRImm(src_reg, in_ptr + off)
                for instr in pixel_op(code, src_reg, dst_reg, args, block_size):
                    yield instr
                yield MemWImm(out_ptr + off, dst_reg)
def pixel_op(code, pos, in_ptr, out_ptr, args, block_size):
    ''' Image shift pixel operation.

    Loads a value through load_mem_value using the source address
    in_ptr + width*y + x + args['offset'] and stores it at
    out_ptr + width*y + x, where (x, y) = pos.
    '''
    shift = args['offset']
    col, row = pos
    width, height = block_size
    row_off = width*row
    src_addr = in_ptr + row_off + (col + shift)
    dst_addr = out_ptr + row_off + col
    # NOTE(review): src_addr already contains the pixel offset and pos is
    # handed over as well -- confirm against load_mem_value that the position
    # is not applied twice.
    with scoped_alloc(code, 1) as value:
        for instr in load_mem_value(code, src_addr, pos, value, block_size):
            yield instr
        yield MemWImm(dst_addr, value)
def process_pixel(code, in_ptr, pos, acc, neigborhood, pixel_op, args, block_size):
    ''' Generate the pixel_op code for all enabled neighbours of one pixel.

    code        -- code object that registers are allocated from
    in_ptr      -- base address of the source buffer
    pos         -- (x, y) position of the centre pixel
    acc         -- accumulator register handed to pixel_op
    neigborhood -- list of rows of mask values (booleans or coefficients)
    pixel_op    -- generator function, called as
                   pixel_op(code, mask_value, value_reg, acc, args, block_size)
    args        -- handed through to pixel_op unchanged
    block_size  -- handed through to load_mem_value and pixel_op

    The centre offsets h_nwidth and h_nheight are read from the enclosing
    scope. Yields the generated instructions.
    '''
    j, i = pos
    # iterate the mask that was passed in, not a similarly named outer variable
    for ii, row in enumerate(neigborhood):
        for jj, m in enumerate(row):
            if m: # works implicitly for booleans and coefficients
                # (x, y) of the neighbour; may lie outside this block
                apos = (j + jj - h_nwidth, i + ii - h_nheight)
                with scoped_alloc(code, 1) as v:
                    for x in load_mem_value(code, in_ptr, apos, v, block_size):
                        yield x
                    for x in pixel_op(code, m, v, acc, args, block_size):
                        yield x
def gen_integral_image(code, src_ptr, integral_ptr, sq_integral_ptr, block_size):
    ''' Generate the integral image and squared integral image of one block.

    Pass 1 builds running sums along every row: of the source values into
    the buffer at integral_ptr and of their squares into the buffer at
    sq_integral_ptr. Pass 2 adds, column by column, the value of the row
    above to every element of both buffers.
    '''
    width, height = block_size
    with scoped_alloc(code, 2) as (acc, tmp):
        # pass 1: row-wise running sums
        for row in xrange(height):
            for col in xrange(width):
                off = width*row + col
                if col == 0:
                    # first element of a row starts the sums
                    yield MemRImm(acc, src_ptr + off)            # acc = image
                    yield MemWImm(integral_ptr + off, acc)
                    yield Mul(acc, acc, acc)                     # acc = image^2
                    yield MemWImm(sq_integral_ptr + off, acc)
                else:
                    # integral = left neighbour + image
                    yield MemRImm(tmp, src_ptr + off)            # tmp = image
                    yield MemRImm(acc, integral_ptr + off - 1)   # acc = left neighbour
                    yield Add(acc, acc, tmp)
                    yield MemWImm(integral_ptr + off, acc)
                    # squared integral = left neighbour + image^2
                    yield Mul(tmp, tmp, tmp)                     # tmp = image^2
                    yield MemRImm(acc, sq_integral_ptr + off - 1)
                    yield Add(acc, acc, tmp)
                    yield MemWImm(sq_integral_ptr + off, acc)

        # pass 2: column-wise accumulation, the first row stays as it is
        for col in xrange(width):
            for row in xrange(1, height):
                for base in (integral_ptr, sq_integral_ptr):
                    here = base + row*width + col
                    above = base + (row - 1)*width + col
                    yield MemRImm(acc, here)
                    yield MemRImm(tmp, above)
                    yield Add(acc, acc, tmp)
                    yield MemWImm(here, acc)
def gen_calc_planarity_inlined(code, block_size, args):
    ''' Optimised version by manually inlining all code.

    For every filter in args['filterbank'].filters the generated code
      1. convolves the input buffer with the filter's sparse coefficients
         and stores the absolute value of the result in a scratch buffer,
      2. takes, per pixel, the maximum of the scratch buffer over the
         neighbours enabled in the filter's mask,
      3. for all but the first filter, keeps the larger of that maximum and
         the value already in the output buffer,
    and writes the result to the output buffer.

    args['in_ptr'] defaults to 0 and args['out_ptr'] to rows*cols; the
    scratch buffer starts at 2*rows*cols.
    '''
    filterbank = args['filterbank']
    # the first filter determines the neighbourhood size used for all filters
    # NOTE(review): presumably every filter in the bank has the same size -- confirm
    f = filterbank.filters[0]
    rows, cols = block_size
    frows, fcols = f.size()
    hfrow, hfcol = [x//2 for x in f.size()]
    out_ptr = args['out_ptr'] if 'out_ptr' in args else rows*cols
    in_ptr = args['in_ptr'] if 'in_ptr' in args else 0
    # scratch buffer for the per-filter absolute convolution result
    buffer_ptr = rows*cols*2
    assert buffer_ptr != in_ptr
    assert buffer_ptr != out_ptr
    # note: f is rebound to the current filter from here on
    for filter_nr, f in enumerate(filterbank.filters):
        # convolution + abs
        for i in xrange(rows):
            for j in xrange(cols):
                with scoped_alloc(code, 1) as acc:
                    # convolution
                    yield Xor(acc, acc, acc)
                    for x, y, coeff in f.coefficients:
                        # coefficient position relative to the filter centre
                        ii = i + y - hfrow
                        jj = j + x - hfcol
                        with scoped_alloc(code, 2) as (coeff_reg, v):
                            yield Imm(coeff_reg, coeff)
                            for instr in load_mem_value(code, in_ptr, (jj, ii), v, block_size):
                                yield instr
                            yield Mul(v, v, coeff_reg)
                            yield Add(acc, acc, v)
                    # absolute value: negate when the sum compares 'LT' against 0
                    with scoped_alloc(code, 1) as const0:
                        yield Imm(const0, 0)
                        yield Cmp(acc, const0)
                        yield Neg(acc, acc, cond='LT')
                    yield MemWImm(buffer_ptr+i*cols + j, acc)
        # gather
        for i in xrange(rows):
            for j in xrange(cols):
                with scoped_alloc(code, 1) as max_v:
                    # local max
                    yield Imm(max_v, -float('inf'))
                    for ii in xrange(frows):
                        for jj in xrange(fcols):
                            if not f.mask[ii][jj]: continue # skip if not enabled
                            iii = i + ii - hfrow
                            jjj = j + jj - hfcol
                            with scoped_alloc(code, 1) as v:
                                # read from the scratch buffer, not from the input
                                for instr in load_mem_value(code, buffer_ptr, (jjj, iii), v, block_size):
                                    yield instr
                                yield Cmp(v, max_v)
                                yield Mov(max_v, v, cond='GT')
                    # global max
                    # the first filter has no earlier result to compare with
                    if filter_nr != 0:
                        with scoped_alloc(code, 1) as old_v:
                            yield MemRImm(old_v, out_ptr+i*cols+j)
                            yield Cmp(old_v, max_v)
                            yield Mov(max_v, old_v, cond='GT')
                    yield MemWImm(out_ptr+i*cols + j, max_v)
def test_codegen(code, block_size, args):
    # Instruction-sequence fixture with nested register scopes, repeated
    # immediates and repeated identical instructions.
    # block_size and args are not used.
    # NOTE(review): nesting reconstructed from a flattened source; verify the
    # extent of each with/for block against the original file.
    with scoped_alloc(code, 2) as (a, b):
        yield Imm(b, 3)
        yield Mov(a, b)
        with scoped_alloc(code, 1) as c:
            for x in xrange(3):
                # e is written here and not read again in this function
                with scoped_alloc(code, 2) as (e, f):
                    yield Imm(f, 2)
                    yield Add(e, b, f)
                with scoped_alloc(code, 2) as (g, h):
                    yield Imm(g, 1)
                    # the same Add is emitted twice
                    yield Add(c, a, g)
                    yield Add(c, a, g)
                    yield Imm(h, 1)
                    # the same Mov is emitted twice
                    yield Mov(c, h)
                    yield Mov(c, h)
        yield Xor(a, a, a)
        with scoped_alloc(code, 1) as const_1:
            yield Imm(const_1, 1)
            yield Mov(a, const_1)
        yield Mov(code.out, a)
        #yield Mov(b, code.east)
        yield Mov(b, a)
def gen_apply_sparse_filter(code, block_size, args):
    ''' Generate code applying a sparse filter to every pixel of a block.

    args['filter'] provides size() and coefficients, an iterable of
    (x, y, coeff) entries. Per pixel the products of coeff and the input
    value at the corresponding neighbour position are summed.
    args['in_ptr'] (default 0) is the source buffer, args['out_ptr']
    (default rows*cols) the destination buffer.
    '''
    sparse = args['filter']
    rows, cols = block_size
    half_rows, half_cols = [n//2 for n in sparse.size()]
    dst = args.get('out_ptr', rows*cols)
    src = args.get('in_ptr', 0)

    for y in xrange(rows):
        for x in xrange(cols):
            with scoped_alloc(code, 1) as acc:
                yield Xor(acc, acc, acc)
                for cx, cy, coeff in sparse.coefficients:
                    # (x, y) of the sample, relative to the filter centre;
                    # it may lie outside this block
                    sample_pos = (x + cx - half_cols, y + cy - half_rows)
                    with scoped_alloc(code, 2) as (coeff_reg, sample):
                        yield Imm(coeff_reg, coeff)
                        for instr in load_mem_value(code, src, sample_pos, sample, block_size):
                            yield instr
                        yield Mul(sample, sample, coeff_reg)
                        yield Add(acc, acc, sample)
                yield MemWImm(dst + y*cols + x, acc)
def gen_gray_image_code(code, block_size, args):
    ''' Generate code filling the output buffer with the constant 128.

    The output buffer starts at address block_size[0]*block_size[1] and
    receives one value per pixel of the block. args is not used.
    '''
    with scoped_alloc(code, 3) as (dst_r, one, gray):
        # destination pointer starts right behind the input buffer
        yield Imm(dst_r, block_size[0]*block_size[1])
        yield Imm(one, 1)
        # the gray value written to every pixel
        yield Imm(gray, 128)
        for _i in xrange(block_size[0]):
            for _j in xrange(block_size[1]):
                yield MemW(dst_r, gray)
                yield Add(dst_r, dst_r, one)
def gen_global_max(code, block_size, args):
    ''' Generate code taking the element-wise maximum of two buffers.

    The buffers start at args['in_ptr_1'] and args['in_ptr_2']; the result
    goes to args['out_ptr'] (default rows*cols).
    '''
    rows, cols = block_size
    first = args['in_ptr_1']
    second = args['in_ptr_2']
    dst = args.get('out_ptr', rows*cols)
    for y in xrange(rows):
        row_off = y*cols
        for x in xrange(cols):
            off = row_off + x
            with scoped_alloc(code, 2) as (result, other):
                yield MemRImm(other, first + off)
                yield MemRImm(result, second + off)
                # result holds the second value; replace it when the first compares 'GT'
                yield Cmp(other, result)
                yield Mov(result, other, cond='GT')
                yield MemWImm(dst + off, result)
def pixel_op(code, mask_val, image_val, acc, args, block_size):
    ''' Convolution step: adds mask_val times the image_val register to acc.

    mask_val is loaded as an immediate; args and block_size are not used.
    '''
    with scoped_alloc(code, 2) as (product, coeff_r):
        yield Imm(coeff_r, mask_val)
        yield Mul(product, coeff_r, image_val)
        yield Add(acc, acc, product)
def codegen(code):
    # Instruction-sequence fixture: one immediate load, one register move and
    # one addition on three freshly allocated registers.
    with scoped_alloc(code, 3) as (a, b, c):
        yield Imm(b, 3)
        yield Mov(a, b)
        yield Add(c, b, a)
def codegen(code, block_size, args):
    # Instruction-sequence fixture using the communication ports: a is
    # written to code.out, b is read from code.east, then a = a + b.
    # block_size and args are not used.
    with scoped_alloc(code, 2) as (a, b):
        yield Imm(a, 1)
        yield Mov(code.out, a)
        yield Mov(b, code.east)
        yield Add(a, a, b)
def codegen(code, block_size, args):
    # Instruction-sequence fixture: two immediate loads followed by one
    # addition. block_size and args are not used.
    with scoped_alloc(code, 2) as (a, b):
        yield Imm(a, 1)
        yield Imm(b, 2)
        yield Add(a, a, b)
def codegen(code, block_size, args):
    # Instruction-sequence fixture: address 3 is loaded into code.out twice,
    # each time followed by a read from a different port (code.west into a,
    # code.east into b). block_size and args are not used.
    with scoped_alloc(code, 2) as (a, b):
        yield MemRImm(code.out, 3)
        yield Mov(a, code.west)
        yield MemRImm(code.out, 3)
        yield Mov(b, code.east)