예제 #1
0
def gen_calc_variance_fullintegral(code, out_reg, position, integral_ptr, sq_integral_ptr, haar_size, block_size):
	''' Emit the variance computation for a window on a full integral image.

	Yields the instructions that leave in out_reg
	sqrt(sq_sum*haar_width*haar_height - sum*sum) when that difference
	compares greater than zero, and 1 otherwise. sum and sq_sum are the
	window sums emitted by gen_fullintegral_sum for integral_ptr and
	sq_integral_ptr respectively.
	'''
	# block dimensions (not needed further down)
	width, height = block_size
	haar_width, haar_height = haar_size

	# inclusive ranges x: [0, w], y: [0, h], to match the python VJ implementation
	window = (0, 0, haar_width + 1, haar_height + 1)

	with scoped_alloc(code, 1) as sq_sum:
		# out_reg = window sum taken from the integral image
		for instr in gen_fullintegral_sum(code, out_reg, position, window, integral_ptr, block_size):
			yield instr

		# sq_sum = window sum taken from the squared integral image
		for instr in gen_fullintegral_sum(code, sq_sum, position, window, sq_integral_ptr, block_size):
			yield instr

		# variance
		with scoped_alloc(code, 1) as area_reg:
			yield Imm(area_reg, haar_width*haar_height)
			yield Mul(sq_sum, sq_sum, area_reg)  # sq_sum = sq_sum * area
		yield Mul(out_reg, out_reg, out_reg)  # out_reg = sum^2
		yield Sub(out_reg, sq_sum, out_reg)  # out_reg = sq_sum*area - sum^2
		with scoped_alloc(code, 1) as zero:
			yield Imm(zero, 0.)
			yield Cmp(out_reg, zero)  # compare against 0
		yield Sqrt(out_reg, out_reg, cond='GT')  # positive: take the square root
		yield Imm(out_reg, 1, cond='LE')  # not positive: variance = 1
예제 #2
0
def gen_gather_local_max(code, block_size, args):
	''' Emit code writing, for every pixel, the largest mask-enabled neighbour.

	args['filter'] provides size() and mask. Optional args['in_ptr']
	(default 0) and args['out_ptr'] (default rows*cols) select the source
	and destination buffers. Neighbour values are fetched through
	load_mem_value.
	'''
	flt = args['filter']
	rows, cols = block_size
	frows, fcols = flt.size()
	hfrow, hfcol = [dim//2 for dim in flt.size()]

	out_ptr = args.get('out_ptr', rows*cols)
	in_ptr = args.get('in_ptr', 0)

	for i in xrange(rows):
		for j in xrange(cols):
			with scoped_alloc(code, 1) as best:
				# start below every possible value
				yield Imm(best, -float('inf'))
				for fi in xrange(frows):
					for fj in xrange(fcols):
						if flt.mask[fi][fj]:  # only enabled mask entries take part
							# neighbour position with the mask centred on (j, i)
							ny = i + fi - hfrow
							nx = j + fj - hfcol
							with scoped_alloc(code, 1) as cand:
								for instr in load_mem_value(code, in_ptr, (nx, ny), cand, block_size):
									yield instr
								yield Cmp(cand, best)
								yield Mov(best, cand, cond='GT')
				yield MemWImm(out_ptr + i*cols + j, best)
예제 #3
0
def gen_threshold(code, block_size, args):
    """ Emit code that binarises a block against args['th'].

    Each value read through the input pointer (which starts at address 0)
    becomes 255 when it compares greater than the threshold and 0
    otherwise. Results are stored through the output pointer, which starts
    at block_size[0]*block_size[1]. Both pointers advance by one after
    every pixel except the last.
    """
    threshold = args['th']
    with scoped_alloc(code, 6) as (dst_ptr, limit, one, white, black, src_ptr):
        # output pointer
        yield Imm(dst_ptr, block_size[0]*block_size[1])

        # constants
        yield Imm(limit, threshold)
        yield Imm(one, 1)
        yield Imm(white, 255)
        yield Imm(black, 0)

        # input pointer starts at zero
        yield Mov(src_ptr, black)

        for i in xrange(block_size[0]):
            for j in xrange(block_size[1]):
                with scoped_alloc(code, 1) as pixel:
                    yield MemR(pixel, src_ptr)

                    yield Cmp(pixel, limit)
                    yield Mov(pixel, black)
                    yield Mov(pixel, white, 'GT')

                    yield MemW(dst_ptr, pixel)
                    is_last = (i == block_size[0]-1) and (j == block_size[1]-1)
                    if not is_last:
                        # no pointer update needed after the final pixel
                        yield Add(src_ptr, src_ptr, one)
                        yield Add(dst_ptr, dst_ptr, one)
예제 #4
0
	def codegen(code, block_size, args):
		''' Emit a fixed sequence of memory loads with add/sub pairs on an accumulator.

		The accumulator is cleared first. Each of the three iterations loads
		address 2 once (add then sub) and address i twice (add, reload, sub).
		A final Add doubles the load register.
		'''
		with scoped_alloc(code, 1) as total:
			yield Xor(total, total, total)
			with scoped_alloc(code, 1) as loaded:
				for idx in xrange(3):
					# constant address: one load feeds both add and sub
					yield MemRImm(loaded, 2)
					yield Add(total, total, loaded)
					yield Sub(total, total, loaded)
					# loop-dependent address: loaded again before the sub
					yield MemRImm(loaded, idx)
					yield Add(total, total, loaded)
					yield MemRImm(loaded, idx)
					yield Sub(total, total, loaded)
				yield Add(loaded, loaded, loaded)
예제 #5
0
def gen_abs_value(code, block_size, args):
	''' Emit code computing the element-wise absolute value of a buffer.

	Reads from args['in_ptr'] (default 0) and writes to args['out_ptr']
	(default rows*cols); a value is negated when it compares less than zero.
	'''
	rows, cols = block_size
	out_ptr = args.get('out_ptr', rows*cols)
	in_ptr = args.get('in_ptr', 0)

	with scoped_alloc(code, 1) as zero:
		yield Imm(zero, 0)
		for r in xrange(rows):
			row_off = r*cols
			for c in xrange(cols):
				offset = row_off + c
				with scoped_alloc(code, 1) as val:
					yield MemRImm(val, in_ptr + offset)
					yield Cmp(val, zero)
					yield Neg(val, val, cond='LT')  # flip the sign of negative values only
					yield MemWImm(out_ptr + offset, val)
예제 #6
0
def gen_full_integral_image(code, src_ptr, integral_ptr, sq_integral_ptr, pe_array_size, block_size):
	''' Emit code that extends the per-block integral images over the PE array.

	First yields everything from gen_integral_image. Then, for the integral
	and the squared-integral buffer in turn:
	  * horizontal pass: per row, the last row element is put on code.out,
	    followed by pe_width-1 rounds in which code.west is added to every
	    element of the row and then forwarded to code.out;
	  * vertical pass: the same per column, using the bottom element,
	    pe_height-1 rounds and code.north.
	NOTE(review): code.out / code.west / code.north look like the inter-PE
	communication ports -- confirm against the Code class.
	'''
	width, height = block_size
	pe_width, pe_height = pe_array_size
	for instr in gen_integral_image(code, src_ptr, integral_ptr, sq_integral_ptr, block_size):
		yield instr
	for base in (integral_ptr, sq_integral_ptr):
		with scoped_alloc(code, 1) as acc:
			# horizontal propagation
			for r in xrange(height):
				row_base = base + r*width
				# publish the right-most value of this row
				yield MemRImm(code.out, row_base + width - 1)
				for _hop in xrange(pe_width-1):
					for c in xrange(width):
						addr = row_base + c
						yield MemRImm(acc, addr)
						yield Add(acc, acc, code.west)
						yield MemWImm(addr, acc)
					# pass the received value on
					yield Mov(code.out, code.west)

			# vertical propagation
			for c in xrange(width):
				# publish the bottom value of this column
				yield MemRImm(code.out, base + width*(height-1) + c)
				for _hop in xrange(pe_height-1):
					for r in xrange(height):
						addr = base + r*width + c
						yield MemRImm(acc, addr)
						yield Add(acc, acc, code.north)
						yield MemWImm(addr, acc)
					# pass the received value on
					yield Mov(code.out, code.north)
예제 #7
0
def map_neighborhood_to_pixel(code, in_ptr, out_ptr, neighborhood, pixel_op, args, block_size):
    """ Apply a neighborhood-to-pixel operation to every pixel of a block.

    code         -- code object handed to scoped_alloc and the helpers
    in_ptr       -- base pointer of the input buffer
    out_ptr      -- base pointer of the output buffer
    neighborhood -- list of rows of mask entries; falsy entries are skipped,
                    every other entry is passed on to pixel_op
    pixel_op     -- generator pixel_op(code, m, v, acc, args, block_size)
                    that folds neighbour register v with mask entry m into acc
    args         -- passed through to pixel_op unchanged
    block_size   -- (width, height) of the block

    For every pixel an accumulator is cleared, pixel_op is run for every
    enabled mask entry with the mask centred on the pixel, and the
    accumulator is stored at out_ptr + width*y + x.
    """

    bwidth, bheight = block_size
    # neighborhood is a list of rows: the row count is the mask height and
    # the number of entries per row is the mask width
    nheight, nwidth = len(neighborhood), len(neighborhood[0])
    assert nheight % 2 != 0 and nwidth % 2 != 0  # mask size must be odd
    h_nheight = nheight // 2
    h_nwidth = nwidth // 2

    def process_pixel(code, in_ptr, pos, acc, neighborhood, pixel_op, args, block_size):
        """ Yield the instructions folding all enabled neighbours of pos into acc. """
        j, i = pos
        for ii, row in enumerate(neighborhood):
            for jj, m in enumerate(row):
                if m:  # works implicitly for booleans and coefficients
                    # jj walks along a row (x direction), ii over the rows (y direction)
                    apos = (j + jj - h_nwidth, i + ii - h_nheight)
                    with scoped_alloc(code, 1) as v:
                        for x in load_mem_value(code, in_ptr, apos, v, block_size):
                            yield x
                        for x in pixel_op(code, m, v, acc, args, block_size):
                            yield x

    for i in xrange(bheight):
        for j in xrange(bwidth):
            pos = (j, i)
            with scoped_alloc(code, 1) as acc:
                # XXX apply assignment-instead-of-accum-on-first-iteration optimisation
                yield Xor(acc, acc, acc)
                for x in process_pixel(code, in_ptr, pos, acc, neighborhood, pixel_op, args, block_size):
                    yield x
                yield MemWImm(out_ptr + bwidth * i + j, acc)
예제 #8
0
	def codegen(code, block_size, args):
		''' Emit: load addresses 3 and 4, add them, store to 3, then read 3 back. '''
		with scoped_alloc(code, 2) as (lhs, rhs):
			yield MemRImm(lhs, 3)
			yield MemRImm(rhs, 4)
			yield Add(lhs, lhs, rhs)
			yield MemWImm(3, lhs)
			# read back the address that was just written
			yield MemRImm(lhs, 3)
예제 #9
0
	def pixel_op(code, pin, pout, args, block_size):
		''' Threshold a single pixel.

		pin  -- register holding the input value
		pout -- register receiving 255 when pin compares greater than
		        args['th'], and 0 (via Xor with itself) otherwise
		'''
		th = args['th']
		# the threshold is the only value that needs a scratch register
		with scoped_alloc(code, 1) as th_r:
			yield Imm(th_r, th)
			yield Cmp(pin, th_r)
			yield Imm(pout, 255, cond='GT')
			yield Xor(pout, pout, pout, cond='LE')
예제 #10
0
def gen_bbs(code, block_size, args):
    """ Emit code for a running-average background update plus thresholded difference.

    Uses args['alpha'] as blend factor and args['th'] as threshold.
    Memory layout (each region is width*height values):
      [0, n)    source image
      [n, 2n)   result image (0 or 255)
      [2n, 3n)  background model, updated in place
    """
    threshold = args['th']
    blend = args['alpha']
    width, height = block_size
    n_pixels = width * height
    # buffer base addresses
    src_base = 0
    res_base = n_pixels
    back_base = 2*n_pixels

    with scoped_alloc(code, 3) as (alpha_r, inv_alpha_r, th_r):
        # load the parameters once
        yield Imm(alpha_r, blend)
        yield Imm(inv_alpha_r, 1-blend)
        yield Imm(th_r, threshold)

        for off in xrange(n_pixels):
            # cur      : I[n]
            # prev_bg  : I_background[n-1]
            # new_bg   : I_background[n]
            # diff     : abs(I[n] - I_background[n])
            with scoped_alloc(code, 5) as (cur, prev_bg, new_bg, diff, out_val):
                # I_background[n] = I*alpha + I_background[n-1]*(1-alpha)
                yield MemRImm(cur, src_base + off)
                yield Mul(new_bg, cur, alpha_r)
                yield MemRImm(prev_bg, back_base + off)
                yield Mul(prev_bg, prev_bg, inv_alpha_r)
                yield Add(new_bg, new_bg, prev_bg)
                yield MemWImm(back_base + off, new_bg)

                # abs(I - I_background): subtract in whichever order is non-negative
                yield Cmp(cur, new_bg)
                yield Sub(diff, cur, new_bg, cond='GE')
                yield Sub(diff, new_bg, cur, cond='LT')
                # result is 255 where the difference exceeds the threshold, else 0
                yield Cmp(diff, th_r)
                yield Imm(out_val, 0)
                yield Imm(out_val, 255, cond='GT')
                yield MemWImm(res_base + off, out_val)
예제 #11
0
	def codegen(code, block_size, args):
		''' Emit a compare followed by conditional writes to one register.

		The first source register is overwritten between the two conditional
		instructions, and the conditionally written register is copied last.
		'''
		with scoped_alloc(code, 4) as (first, second, third, fourth):
			yield Imm(first, 1)
			yield Imm(second, 2)
			yield Cmp(first, second)
			yield Inv(third, first, cond='LE')
			# reassigned here to check that the earlier value of `first` was captured
			yield Imm(first, 3)
			yield Mov(third, second, cond='GT')
			yield Mov(fourth, third)
예제 #12
0
	def codegen(code, block_size, args):
		''' Emit four rounds of immediate loads, moves and arithmetic on three registers. '''
		for k in xrange(4):
			with scoped_alloc(code, 3) as (r0, r1, r2):
				yield Imm(r1, k*4)
				yield Mov(r0, r1)
				# r0 is overwritten right away, then copied back into r1
				yield Imm(r0, k)
				yield Mov(r1, r0)
				yield Add(r0, r0, r1)
				yield Sub(r2, r0, r1)
				yield Mul(r2, r2, r2)
예제 #13
0
	def codegen(code, block_size, args):
		''' Emit add/sub pairs of immediates on a cleared accumulator.

		Each of the three iterations adds and subtracts the constant 2 and then
		the loop index.
		'''
		with scoped_alloc(code, 2) as (total, operand):
			yield Xor(total, total, total)
			for idx in xrange(3):
				# constant immediate
				yield Imm(operand, 2)
				yield Add(total, total, operand)
				yield Sub(total, total, operand)
				# loop-dependent immediate
				yield Imm(operand, idx)
				yield Add(total, total, operand)
				yield Sub(total, total, operand)
예제 #14
0
def gen_copy_to_out(code, block_size, args):
    """ Emit code copying the input buffer to the output buffer.

    The source pointer starts at address 0 and the destination pointer at
    block_size[0]*block_size[1]; both advance by one after every copied
    value. args is not used.
    """
    with scoped_alloc(code, 3) as (out_ptr_r, in_ptr_r, const_1):
        # init pointer to output memory
        yield Imm(out_ptr_r, block_size[0]*block_size[1])

        # init src ptr (Xor with itself clears the register)
        yield Xor(in_ptr_r, in_ptr_r, in_ptr_r)

        # inc value
        yield Imm(const_1, 1)

        for i in xrange(block_size[0]):
            for j in xrange(block_size[1]):
                with scoped_alloc(code, 1) as tmp:
                    yield MemR(tmp, in_ptr_r)
                    yield MemW(out_ptr_r, tmp)

                    yield Add(in_ptr_r, in_ptr_r, const_1)
                    yield Add(out_ptr_r, out_ptr_r, const_1)
예제 #15
0
def gen_integral_sum(code, out_reg, position, shape, ptr, block_size):
	''' Emit the four-corner sum of a shape on an integral image.

	Assumes each shape lies within a single block that is at most one
	block away from the originating block.

	In contrast with the python implementation a block covers the ranges
	x: [0, w[ and y: [0, h[, so the width and height of the shape must be
	incremented by one to be compatible with the violajones sum function.

	Shapes reaching past the right or bottom edge are computed with
	wrapped coordinates and the result is then fetched through
	code.east / code.south.
	'''
	px, py = position
	x, y, w, h = shape
	width, height = block_size
	left = px + x
	top = py + y

	# detect shapes outside the block range and wrap their coordinates;
	# the value is computed as usual and copied to the right block afterwards
	from_right = (left + w - 1) >= width
	if from_right:
		left -= width
	from_below = (top + h - 1) >= height
	if from_below:
		top -= height

	right = left + w - 1
	bottom = top + h - 1

	# sum = im[top][left] - im[top][right] - im[bottom][left] + im[bottom][right]
	with scoped_alloc(code, 1) as corner:
		yield MemRImm(out_reg, ptr + top * width + left)  # v1
		yield MemRImm(corner, ptr + top * width + right)  # v2
		yield Sub(out_reg, out_reg, corner)  # v1 - v2
		yield MemRImm(corner, ptr + bottom * width + left)  # v3
		yield Sub(out_reg, out_reg, corner)  # v1 - v2 - v3
		yield MemRImm(corner, ptr + bottom * width + right)  # v4
		yield Add(out_reg, out_reg, corner)  # v1 - v2 - v3 + v4

	# fetch the result for shapes that lie outside this PE block
	if from_right or from_below:
		yield Mov(code.out, out_reg)
		if from_right and from_below:
			# two hops: east first, then south
			yield Mov(code.out, code.east)
			yield Mov(out_reg, code.south)
		elif from_right:
			yield Mov(out_reg, code.east)
		else:
			yield Mov(out_reg, code.south)
예제 #16
0
def gen_calc_variance(code, out_reg, position, integral_ptr, sq_integral_ptr, haar_size, block_size):
	''' Emit the variance computation for a window that may span several blocks.

	The window is cut into per-block shapes with split_shape_across_blocks.
	For every shape the integral and squared-integral sums from
	gen_integral_sum are accumulated; instructions belonging to every shape
	after the first are tagged through code.tag_com_overhead_instr.
	out_reg receives sqrt(sq_sum*haar_width*haar_height - sum*sum) when that
	difference compares greater than zero, and 1 otherwise.
	'''
	# block dimensions (not needed further down)
	width, height = block_size
	haar_width, haar_height = haar_size

	# inclusive ranges x: [0, w], y: [0, h], to match the python VJ implementation
	window = (0, 0, haar_width + 1, haar_height + 1)
	parts = split_shape_across_blocks(window, position, block_size)

	with scoped_alloc(code, 2) as (sum_acc, sq_sum_acc):
		# clear both accumulators
		yield Xor(sum_acc, sum_acc, sum_acc)
		yield Xor(sq_sum_acc, sq_sum_acc, sq_sum_acc)
		for idx, part in enumerate(parts):
			# integral image first, squared integral image second
			for image_ptr, accum in ((integral_ptr, sum_acc), (sq_integral_ptr, sq_sum_acc)):
				with scoped_alloc(code, 1) as part_sum:
					for instr in gen_integral_sum(code, part_sum, position, part, image_ptr, block_size):
						if idx > 0:
							code.tag_com_overhead_instr(instr)
						yield instr
					yield Add(accum, accum, part_sum)

		# variance
		with scoped_alloc(code, 1) as area_reg:
			yield Imm(area_reg, haar_width*haar_height)
			yield Mul(sq_sum_acc, sq_sum_acc, area_reg)  # sq_sum * area
		yield Mul(sum_acc, sum_acc, sum_acc)  # sum^2
		yield Sub(sum_acc, sq_sum_acc, sum_acc)  # sq_sum*area - sum^2
		with scoped_alloc(code, 1) as zero:
			yield Imm(zero, 0.)
			yield Cmp(sum_acc, zero)  # compare against 0
		yield Sqrt(out_reg, sum_acc, cond='GT')  # positive: take the square root
		yield Imm(out_reg, 1., cond='LE')  # not positive: variance = 1
예제 #17
0
def map_pixel_to_pixel(code, in_ptr, out_ptr, pixel_op, args, block_size):
    """ Apply a one-to-one pixel operation to every pixel of a block.

    For each pixel the input value is loaded into a register, pixel_op
    (a generator taking code, input register, output register, args and
    block_size) emits the operation, and the output register is stored at
    the same offset in the output buffer.
    """
    bwidth, bheight = block_size
    for row in xrange(bheight):
        row_off = bwidth * row
        for col in xrange(bwidth):
            offset = row_off + col
            with scoped_alloc(code, 2) as (src_reg, dst_reg):
                yield MemRImm(src_reg, in_ptr + offset)
                for instr in pixel_op(code, src_reg, dst_reg, args, block_size):
                    yield instr
                yield MemWImm(out_ptr + offset, dst_reg)
예제 #18
0
	def pixel_op(code, pos, in_ptr, out_ptr, args, block_size):
		''' Simple image shift: copy the value args['offset'] columns away to pos. '''
		shift = args['offset']
		col, row = pos
		width, height = block_size
		row_start = width*row
		src_addr = in_ptr + row_start + (col + shift)
		dst_addr = out_ptr + row_start + col
		# NOTE(review): load_mem_value is handed an address that already
		# includes the pixel offset together with pos -- verify this matches
		# the contract of load_mem_value.
		with scoped_alloc(code, 1) as val:
			for instr in load_mem_value(code, src_addr, pos, val, block_size):
				yield instr
			yield MemWImm(dst_addr, val)
예제 #19
0
 def process_pixel(code, in_ptr, pos, acc, neigborhood, pixel_op, args, block_size):
     """ Yield the instructions folding all enabled neighbours of pos into acc.

     pos is (x, y). neigborhood is a list of rows of mask entries; falsy
     entries are skipped. h_nwidth and h_nheight (half mask sizes) come
     from the enclosing scope.
     """
     j, i = pos
     # iterate the mask that was passed in rather than a name from the outer scope
     for ii, row in enumerate(neigborhood):
         for jj, m in enumerate(row):
             if m:  # works implicitly for booleans and coefficients
                 apos = (j + jj - h_nwidth, i + ii - h_nheight)
                 with scoped_alloc(code, 1) as v:
                     for x in load_mem_value(code, in_ptr, apos, v, block_size):
                         yield x
                     for x in pixel_op(code, m, v, acc, args, block_size):
                         yield x
예제 #20
0
def gen_integral_image(code, src_ptr, integral_ptr, sq_integral_ptr, block_size):
	''' Emit code building the integral image and the squared integral image of a block.

	Pass one produces running sums along every row for both images; pass
	two adds, column by column, the value of the row above to every row
	but the first.
	'''
	width, height = block_size
	with scoped_alloc(code, 2) as (acc, tmp):
		# pass 1: running sums along the rows
		for row in xrange(height):
			for col in xrange(width):
				offset = width*row + col
				if col == 0:
					# first column: int_im = image, sq_int_im = image*image
					yield MemRImm(acc, src_ptr + offset)
					yield MemWImm(integral_ptr + offset, acc)

					yield Mul(acc, acc, acc)
					yield MemWImm(sq_integral_ptr + offset, acc)
					continue

				# int_im[row][col] = image[row][col] + int_im[row][col-1]
				yield MemRImm(tmp, src_ptr + offset)  # tmp = image value
				yield MemRImm(acc, integral_ptr + offset - 1)  # acc = left neighbour
				yield Add(acc, acc, tmp)
				yield MemWImm(integral_ptr + offset, acc)

				# sq_int_im[row][col] = image[row][col]^2 + sq_int_im[row][col-1]
				yield Mul(tmp, tmp, tmp)  # tmp = image value squared
				yield MemRImm(acc, sq_integral_ptr + offset - 1)  # acc = left neighbour
				yield Add(acc, acc, tmp)

				yield MemWImm(sq_integral_ptr + offset, acc)

		# pass 2: accumulate down the columns (the first row has nothing above it)
		for col in xrange(width):
			for row in xrange(1, height):
				here = row*width + col
				above = (row-1)*width + col

				# int_im[row][col] += int_im[row-1][col]
				yield MemRImm(acc, integral_ptr + here)
				yield MemRImm(tmp, integral_ptr + above)

				yield Add(acc, acc, tmp)
				yield MemWImm(integral_ptr + here, acc)

				# sq_int_im[row][col] += sq_int_im[row-1][col]
				yield MemRImm(acc, sq_integral_ptr + here)
				yield MemRImm(tmp, sq_integral_ptr + above)

				yield Add(acc, acc, tmp)
				yield MemWImm(sq_integral_ptr + here, acc)
예제 #21
0
def gen_calc_planarity_inlined(code, block_size, args):
	''' Optimised planarity code generation with every stage inlined manually.

	For each filter of args['filterbank']:
	  1. convolve the input with the filter coefficients, take the absolute
	     value and store it in a scratch buffer at rows*cols*2;
	  2. gather the maximum over the mask-enabled neighbours in that buffer;
	  3. for every filter after the first, keep the larger of that maximum
	     and the value already stored in the output buffer.
	The half sizes used for centring are taken from the first filter.
	Optional args['in_ptr'] defaults to 0, args['out_ptr'] to rows*cols.
	'''
	bank = args['filterbank']
	first = bank.filters[0]
	rows, cols = block_size
	frows, fcols = first.size()
	hfrow, hfcol = [dim//2 for dim in first.size()]

	out_ptr = args.get('out_ptr', rows*cols)
	in_ptr = args.get('in_ptr', 0)
	buffer_ptr = rows*cols*2
	# the scratch buffer may not coincide with the input or output buffer
	assert buffer_ptr != in_ptr
	assert buffer_ptr != out_ptr

	for filter_nr, flt in enumerate(bank.filters):
		# stage 1: convolution + abs into the scratch buffer
		for i in xrange(rows):
			for j in xrange(cols):
				with scoped_alloc(code, 1) as acc:
					yield Xor(acc, acc, acc)
					for dx, dy, coeff in flt.coefficients:
						sy = i + dy - hfrow
						sx = j + dx - hfcol
						with scoped_alloc(code, 2) as (coeff_reg, val):
							yield Imm(coeff_reg, coeff)
							for instr in load_mem_value(code, in_ptr, (sx, sy), val, block_size):
								yield instr
							yield Mul(val, val, coeff_reg)
							yield Add(acc, acc, val)

					# absolute value: negate when below zero
					with scoped_alloc(code, 1) as zero:
						yield Imm(zero, 0)
						yield Cmp(acc, zero)
					yield Neg(acc, acc, cond='LT')
					yield MemWImm(buffer_ptr + i*cols + j, acc)

		# stage 2: gather
		for i in xrange(rows):
			for j in xrange(cols):
				with scoped_alloc(code, 1) as best:
					# local max over the enabled mask entries
					yield Imm(best, -float('inf'))
					for fi in xrange(frows):
						for fj in xrange(fcols):
							if flt.mask[fi][fj]:
								ny = i + fi - hfrow
								nx = j + fj - hfcol
								with scoped_alloc(code, 1) as cand:
									for instr in load_mem_value(code, buffer_ptr, (nx, ny), cand, block_size):
										yield instr
									yield Cmp(cand, best)
									yield Mov(best, cand, cond='GT')
					# global max against the result of the previous filters
					if filter_nr != 0:
						with scoped_alloc(code, 1) as prev:
							yield MemRImm(prev, out_ptr + i*cols + j)
							yield Cmp(prev, best)
							yield Mov(best, prev, cond='GT')
					yield MemWImm(out_ptr + i*cols + j, best)
예제 #22
0
	def test_codegen(code, block_size, args):
		''' Emit a fixed instruction sequence using nested register scopes.

		Contains repeated identical instructions and registers that are
		written without being read afterwards.
		'''
		with scoped_alloc(code, 2) as (copy, seed):
			yield Imm(seed, 3)
			yield Mov(copy, seed)
			with scoped_alloc(code, 1) as scratch:
				for _round in xrange(3):
					with scoped_alloc(code, 2) as (summed, two):
						yield Imm(two, 2)
						yield Add(summed, seed, two)
					with scoped_alloc(code, 2) as (one, other):
						yield Imm(one, 1)
						# same add emitted twice
						yield Add(scratch, copy, one)
						yield Add(scratch, copy, one)
						yield Imm(other, 1)
						# same move emitted twice
						yield Mov(scratch, other)
						yield Mov(scratch, other)
			yield Xor(copy, copy, copy)
			with scoped_alloc(code, 1) as unit:
				yield Imm(unit, 1)
				yield Mov(copy, unit)
			yield Mov(code.out, copy)
			yield Mov(seed, copy)
예제 #23
0
def gen_apply_sparse_filter(code, block_size, args):
	''' Emit code applying a sparse filter to every pixel of a block.

	args['filter'] provides size() and coefficients, an iterable of
	(x, y, coeff) triples. Each output value is the sum of coeff times the
	input value at the position given by (x, y) with the filter centred on
	the pixel. Optional args['in_ptr'] defaults to 0, args['out_ptr'] to
	rows*cols.
	'''
	flt = args['filter']
	rows, cols = block_size
	hfrow, hfcol = [dim//2 for dim in flt.size()]

	out_ptr = args.get('out_ptr', rows*cols)
	in_ptr = args.get('in_ptr', 0)

	for i in xrange(rows):
		for j in xrange(cols):
			with scoped_alloc(code, 1) as total:
				yield Xor(total, total, total)
				for dx, dy, coeff in flt.coefficients:
					# source position for this coefficient
					sy = i + dy - hfrow
					sx = j + dx - hfcol
					with scoped_alloc(code, 2) as (coeff_reg, val):
						yield Imm(coeff_reg, coeff)
						for instr in load_mem_value(code, in_ptr, (sx, sy), val, block_size):
							yield instr
						yield Mul(val, val, coeff_reg)
						yield Add(total, total, val)
				yield MemWImm(out_ptr + i*cols + j, total)
예제 #24
0
def gen_gray_image_code(code, block_size, args):
    ''' Emit code filling the output buffer with the flat gray value 128.

    The output pointer starts at block_size[0]*block_size[1] and advances
    by one after every store. args is not used.
    '''

    with scoped_alloc(code, 3) as (dst_ptr, step, gray):
        # output pointer and its increment
        yield Imm(dst_ptr, block_size[0]*block_size[1])
        yield Imm(step, 1)

        # the gray level written to every pixel
        yield Imm(gray, 128)
        for _row in xrange(block_size[0]):
            for _col in xrange(block_size[1]):
                yield MemW(dst_ptr, gray)
                yield Add(dst_ptr, dst_ptr, step)
예제 #25
0
def gen_global_max(code, block_size, args):
	''' Emit code computing the element-wise maximum of two buffers.

	Reads from args['in_ptr_1'] and args['in_ptr_2'] and writes to
	args['out_ptr'] (default rows*cols).
	'''
	rows, cols = block_size

	first_ptr = args['in_ptr_1']
	second_ptr = args['in_ptr_2']
	out_ptr = args.get('out_ptr', rows*cols)

	for r in xrange(rows):
		row_off = r*cols
		for c in xrange(cols):
			offset = row_off + c
			with scoped_alloc(code, 2) as (result, other):
				yield MemRImm(other, first_ptr + offset)
				yield MemRImm(result, second_ptr + offset)
				yield Cmp(other, result)
				# result starts as the second value; replaced when the first is greater
				yield Mov(result, other, cond='GT')
				yield MemWImm(out_ptr + offset, result)
예제 #26
0
	def pixel_op(code, mask_val, image_val, acc, args, block_size):
		''' Convolution step: acc += mask_val * image_val. '''
		with scoped_alloc(code, 2) as (product, coeff):
			# the mask coefficient is loaded as an immediate
			yield Imm(coeff, mask_val)
			yield Mul(product, coeff, image_val)
			yield Add(acc, acc, product)
예제 #27
0
	def codegen(code):
		''' Emit: load 3, copy it to a second register, add both into a third. '''
		with scoped_alloc(code, 3) as (dup, src, total):
			yield Imm(src, 3)
			yield Mov(dup, src)
			yield Add(total, src, dup)
예제 #28
0
	def codegen(code, block_size, args):
		''' Emit: put 1 on code.out, read code.east and add it to the local value. '''
		with scoped_alloc(code, 2) as (local, received):
			yield Imm(local, 1)
			yield Mov(code.out, local)
			yield Mov(received, code.east)
			yield Add(local, local, received)
예제 #29
0
	def codegen(code, block_size, args):
		''' Emit: load the immediates 1 and 2 and add them into the first register. '''
		with scoped_alloc(code, 2) as (lhs, rhs):
			yield Imm(lhs, 1)
			yield Imm(rhs, 2)
			yield Add(lhs, lhs, rhs)
예제 #30
0
	def codegen(code, block_size, args):
		''' Emit two rounds of loading address 3 onto code.out, reading code.west then code.east. '''
		with scoped_alloc(code, 2) as (from_west, from_east):
			yield MemRImm(code.out, 3)
			yield Mov(from_west, code.west)
			# code.out is loaded again before the second read
			yield MemRImm(code.out, 3)
			yield Mov(from_east, code.east)