def test_4comp(): proc = env.Processor(0) prgm = env.Program() code = prgm.get_stream() inp = proc.alloc_remote('i', 1, 4, 1) out = proc.alloc_remote('i', 4, 1, 1) for i in xrange(0, 4): inp[i] = i + 1 out[i] = 0 print "inp", inp[0:4] print "out", out[0:4] cal.set_active_code(code) cal.dcl_output(reg.o0, USAGE=cal.usage.generic) cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions r_cnt = prgm.acquire_register() r = prgm.acquire_registers(4) cal.mov(r_cnt, r_cnt('0000')) for i in xrange(0, 4): cal.sample(0, 0, r[i].x000, r_cnt.x) cal.add(r_cnt, r_cnt, r_cnt('1111')) cal.iadd(r[0], r[0], r[1]('0x00')) cal.iadd(r[0], r[0], r[2]('00x0')) cal.iadd(r[0], r[0], r[3]('000x')) cal.iadd(r[0], r[0], r[0]) cal.mov(reg.o0, r[0]) prgm.set_binding(reg.i0, inp) prgm.set_binding(reg.o0, out) prgm.add(code) prgm.print_code() proc.execute(prgm, (0, 0, 1, 1)) print "inp", inp[0:4] print "out", out[0:4] for i in xrange(0, 4): assert (out[i] == (i + 1) * 2) return
def test_4comp(): proc = env.Processor(0) prgm = env.Program() code = prgm.get_stream() inp = proc.alloc_remote('i', 1, 4, 1) out = proc.alloc_remote('i', 4, 1, 1) for i in xrange(0, 4): inp[i] = i + 1 out[i] = 0 print "inp", inp[0:4] print "out", out[0:4] cal.set_active_code(code) cal.dcl_output(reg.o0, USAGE=cal.usage.generic) cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions r_cnt = prgm.acquire_register() r = prgm.acquire_registers(4) cal.mov(r_cnt, r_cnt('0000')) for i in xrange(0, 4): cal.sample(0, 0, r[i].x000, r_cnt.x) cal.add(r_cnt, r_cnt, r_cnt('1111')) cal.iadd(r[0], r[0], r[1]('0x00')) cal.iadd(r[0], r[0], r[2]('00x0')) cal.iadd(r[0], r[0], r[3]('000x')) cal.iadd(r[0], r[0], r[0]) cal.mov(reg.o0, r[0]) prgm.set_binding(reg.i0, inp) prgm.set_binding(reg.o0, out) prgm.add(code) prgm.print_code() proc.execute(prgm, (0, 0, 1, 1)) print "inp", inp[0:4] print "out", out[0:4] for i in xrange(0, 4): assert(out[i] == (i + 1) * 2) return
def TestSimpleKernelNPy(): import corepy.arch.cal.isa as isa SIZE = 128 proc = Processor(0) arr_input = proc.alloc_remote_npy('f', 4, SIZE, SIZE) arr_output = proc.alloc_remote_npy('f', 4, SIZE, SIZE) #for i in xrange(0, SIZE * SIZE * 4): # arr_input[i] = float(i + 1) # arr_output[i] = 0.0 #print arr_input.shape #print arr_output.shape #print type(arr_input.data) val = 0.0 for i in xrange(0, SIZE): for j in xrange(0, SIZE): for k in xrange(0, 4): arr_input[i][j][k] = val arr_output[i][j][k] = 0.0 val += 1.0 # build and run the kernel prgm = Program() code = prgm.get_stream() #code.add(isa.dcl_input('v0', USAGE=isa.usage.pos, INTERP='linear_noperspective')) code.add("dcl_input_position_interp(constant) v0.xy__") code.add(isa.dcl_output('o0', USAGE=isa.usage.generic)) code.add(isa.dcl_resource(0, '2d', isa.fmt.float, UNNORM=True)) code.add(isa.sample(0, 0, 'o0', 'v0.xy')) #code.add(isa.load(0, 'o0', 'v0.g')) domain = (0, 0, SIZE, SIZE) prgm.set_binding("o0", arr_output) prgm.set_binding("i0", arr_input) prgm.add(code) prgm.cache_code() prgm.print_code() proc.execute(prgm, domain) # Check the output val = 0.0 for i in xrange(0, SIZE): for j in xrange(0, SIZE): for k in xrange(0, 4): if arr_output[i][j][k] != val: print "ERROR index %d is %f, should be %f" % (i, arr_output[i], val) val += 1.0 return
def TestSimpleKernel(): import corepy.arch.cal.isa as isa SIZE = 128 proc = Processor(0) ext_input = proc.alloc_remote('f', 4, SIZE, SIZE) ext_output = proc.alloc_remote('f', 4, SIZE, SIZE) for i in xrange(0, SIZE * SIZE * 4): ext_input[i] = float(i + 1) ext_output[i] = 0.0 # build and run the kernel prgm = Program() code = prgm.get_stream() #code.add(isa.dcl_input('v0', USAGE=isa.usage.pos, INTERP='linear_noperspective')) code.add("dcl_input_position_interp(constant) v0.xy__") code.add(isa.dcl_output('o0', USAGE=isa.usage.generic)) code.add(isa.dcl_resource(0, '2d', isa.fmt.float, UNNORM=True)) code.add(isa.sample(0, 0, 'o0', 'v0.xy')) #code.add(isa.load(0, 'o0', 'v0.g')) domain = (0, 0, SIZE, SIZE) prgm.set_binding("o0", ext_output) prgm.set_binding("i0", ext_input) prgm.add(code) prgm.cache_code() prgm.print_code() proc.execute(prgm, domain) # Check the output for i in xrange(0, SIZE * SIZE * 4): if ext_output[i] != float(i + 1): print "ERROR index %d is %f, should be %f" % (i, ext_output[i], float(i + 1)) proc.free(ext_input) proc.free(ext_output) return
# Fragment: build and run a kernel that samples a 1D float input at the
# position v0.x and writes the sample to o0, then print both buffers and the
# generated instruction stream.  `proc` and `prgm` are defined outside this
# fragment.
code = prgm.get_stream()

inp = proc.alloc_remote('f', 4, 64)
out = proc.alloc_remote('f', 4, 64)
out.clear()

# NOTE(review): only indices 0..63 are written; presumably the buffer holds
# 4 * 64 scalars, leaving the rest uninitialised - confirm.
for i in xrange(0, 64):
    inp[i] = float(i + 1)

# Instructions created through `cal` below are appended to `code`.
cal.set_active_code(code)

cal.dcl_input(reg.v0.x, USAGE=cal.usage.pos)
cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM = True)
cal.dcl_output(reg.o0, USAGE=cal.usage.generic)

cal.sample(0, 0, reg.o0, reg.v0.x)

prgm.set_binding(reg.i0, inp)
prgm.set_binding(reg.o0, out)

prgm.add(code)
prgm.print_code()

# No explicit domain is passed here.
proc.execute(prgm)

print "inp", inp
print "out", out

# Dump the instruction stream as CAL assembly.
import corepy.lib.printer as printer
printer.PrintInstructionStream(code, printer.CAL_Asm())
def cal_nb_generate(n_bodies, dt):
    """Build the instruction stream for one n-body time step (1D layout).

    The generated kernel reads positions from resource 0 and velocities from
    resource 1, both 1D float resources.  For the body at ``v0.x`` it loops
    over all ``n_bodies`` positions, accumulates a force, divides it by the
    local ``w`` component, and writes the updated position to ``o0`` and the
    updated velocity to ``o1``.  The existing comments treat the ``w``
    component of a position as the body's mass.

    Parameters:
      n_bodies -- number of bodies; converted to float and used as loop bound.
      dt       -- time step, loaded into all four lanes of a register.

    Returns the new ``env.InstructionStream``.  The gravitational constant is
    read from the module-level name ``G``.
    """
    code = env.InstructionStream()
    cal.set_active_code(code)

    fn_bodies = float(n_bodies)

    # Working registers.  The acquisition order is part of the generated
    # program, so it is kept exactly as is.
    r_count = code.acquire_register()
    r_lpos = code.acquire_register()
    r_rpos = code.acquire_register()
    r_force = code.acquire_register()
    r_diff = code.acquire_register()
    r_dist_vec = code.acquire_register()
    r_dist = code.acquire_register()
    r_force_tmp = code.acquire_register()
    r_force_vec = code.acquire_register()
    r_vel = code.acquire_register()

    #code.add("dcl_input_position_interp(linear_noperspective) v0.x___")
    cal.dcl_input(reg.v0.x___, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective)

    # Registers acquired with an initial value replicated across four lanes.
    r_bodies = code.acquire_register((fn_bodies,) * 4)
    r_G = code.acquire_register((G,) * 4)
    r_dt = code.acquire_register((dt,) * 4)

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
    cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions
    cal.dcl_resource(1, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # velocities

    # Loop over all other points to calculate the force
    cal.mov(r_count, r_count('0000')) # loop counter
    cal.sample(0, 0, r_lpos, reg.v0.x) # Local position
    cal.mov(r_force, r_force('0000')) # total force

    # Compute force using input from every other point
    cal.whileloop()

    # Break if end of points reached
    cal.breakc(cal.relop.ge, r_count, r_bodies)

    cal.sample(0, 0, r_rpos, r_count.x) # Remote position

    # d_xyz
    cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0) # local pos - remote pos

    # dist_tmp: squared distance, dx*dx + dy*dy + dz*dz, in every lane
    cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx)
    cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec)
    cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec)

    # distance
    # TODO - skip rest of force computation if distance is 0
    cal.sqrt_vec(r_dist, r_dist_vec)

    # force G * ((m[i] * m[j]) / dist_tmp)
    cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww)
    cal.div(cal.zeroop.zero, r_force_tmp, r_force_tmp, r_dist_vec)
    cal.mul(r_force_tmp, r_force_tmp, r_G)

    # f_xyz: difference divided by distance (w lane divides by 1), scaled by
    # the force magnitude and subtracted from the running total
    cal.div(cal.zeroop.zero, r_force_vec, r_diff.xyz0, r_dist.xyz1)
    cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0)
    cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0)

    # Increment loop counter, end loop
    cal.add(r_count, r_count, r_count('1111'))
    cal.endloop()

    # Acceleration
    cal.div(cal.zeroop.zero, r_force, r_force.xyz0, r_lpos.wwww)

    # Velocity
    cal.sample(1, 1, r_vel, reg.v0.x) # Load velocity
    cal.mad(r_vel, r_force, r_dt, r_vel)
    cal.mov(reg.o1, r_vel)

    # Position: new position = velocity * dt + old position (w lane unchanged)
    cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw)

    return code
def cal_nb_generate_local(n_bodies, dt, steps):
    """Build an n-body kernel that runs several steps using the ``reg.g`` buffer.

    The generated kernel samples its own position and velocity from the 2D
    resources 0 and 1, stores both into ``reg.g`` at computed indices, and
    then runs ``steps`` iterations of a loop that reads every other body's
    position back from ``reg.g``.  Outputs: ``o0`` position, ``o1`` velocity
    and ``o2`` the ``r_foo`` counter, which is incremented once per
    inner-loop iteration.

    Parameters:
      n_bodies -- number of bodies; converted to float and used as loop bound.
      dt       -- time step, loaded into all four lanes of a register.
      steps    -- number of outer-loop iterations; converted to float.

    Returns the new ``env.InstructionStream``.  The gravitational constant is
    read from the module-level name ``G``.
    """
    code = env.InstructionStream()
    cal.set_active_code(code)

    fn_bodies = float(n_bodies)
    steps = float(steps)

    # Working registers.  The acquisition order is part of the generated
    # program, so it is kept exactly as is.
    r_count = code.acquire_register()
    r_step = code.acquire_register()
    r_lpos = code.acquire_register()
    r_rpos = code.acquire_register()
    r_force = code.acquire_register()
    r_diff = code.acquire_register()
    r_dist_vec = code.acquire_register()
    r_dist = code.acquire_register()
    r_force_tmp = code.acquire_register()
    r_force_vec = code.acquire_register()
    r_vel = code.acquire_register()

    print "fn_bodies", fn_bodies

    code.add("dcl_input_position_interp(linear_noperspective) v0.xy__")
    #cal.dcl_input(reg.v0.x___, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective)

    # Registers acquired with an initial value replicated across four lanes.
    r_numsteps = code.acquire_register((steps,) * 4)
    r_bodies = code.acquire_register((fn_bodies,) * 4)
    #r_bodiesquare = code.acquire_register((float(fn_bodies**2),) * 4)
    r_G = code.acquire_register((G,) * 4)
    r_dt = code.acquire_register((dt,) * 4)

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
    cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # positions
    cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # velocities

    # Counter of inner-loop iterations; written to o2 at the end.
    r_foo = code.acquire_register()
    cal.mov(r_foo, r_foo('0000'))

    # Index for this body's position: v0.y * n_bodies + v0.x
    r_gpos = code.acquire_register()
    cal.mad(r_gpos, reg.v0.y, r_bodies.x, reg.v0.x)

    # Index for this body's velocity: n_bodies * n_bodies + position index
    r_gvel = code.acquire_register()
    cal.mad(r_gvel, r_bodies.x, r_bodies.x, r_gpos)

    cal.ftoi(r_gpos, r_gpos)
    cal.ftoi(r_gvel, r_gvel)

    cal.sample(0, 0, r_lpos, reg.v0.xy) # Local position
    cal.sample(1, 1, r_vel, reg.v0.xy) # Load velocity

    # Publish the initial position and velocity through reg.g.
    cal.mov(reg.g[r_gpos.x], r_lpos)
    cal.mov(reg.g[r_gvel.x], r_vel)

    # Outer loop: one iteration per time step.
    cal.mov(r_step, r_step('0000'))
    cal.whileloop()
    cal.breakc(cal.relop.ge, r_step.x, r_numsteps)

    # NOTE(review): r_force is never set to zero in this kernel, neither
    # before nor between steps - confirm whether that is intended.
    cal.mov(r_count, r_count('0000')) # loop counter

    # Inner loop: one iteration per remote body.
    cal.whileloop()
    cal.breakc(cal.relop.ge, r_count.x, r_bodies)
    cal.add(r_foo, r_foo, r_foo('1111'))

    # calculate force
    r_tmp = code.acquire_register()
    cal.ftoi(r_tmp, r_count)
    cal.mov(r_rpos, reg.g[r_tmp.x])

    # d_xyz
    cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0) # local pos - remote pos

    # dist_tmp: squared distance, dx*dx + dy*dy + dz*dz, in every lane
    cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx)
    cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec)
    cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec)

    # distance
    # TODO - skip rest of force computation if distance is 0
    cal.sqrt_vec(r_dist, r_dist_vec)

    # force G * ((m[i] * m[j]) / dist_tmp)
    cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww)
    cal.div(r_force_tmp, r_force_tmp, r_dist_vec, ZEROOP = cal.zeroop.zero)
    cal.mul(r_force_tmp, r_force_tmp, r_G)

    # f_xyz
    # TODO - whats going on, is this right?
    cal.div(r_force_vec, r_diff.xyz0, r_dist.xyz1, ZEROOP = cal.zeroop.zero)
    cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0)
    cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0)

    cal.add(r_count, r_count, r_count('1111'))
    cal.endloop()

    # Acceleration
    cal.div(r_force, r_force.xyz0, r_lpos.wwww, ZEROOP = cal.zeroop.zero)

    # Velocity
    cal.mad(r_vel, r_force, r_dt, r_vel)

    # Position
    # NOTE(review): the new position is written to reg.o0, not r_lpos, so the
    # r_lpos stored to reg.g just below is still the originally sampled
    # position - confirm whether the destination should be r_lpos.
    cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw)

    # store updated pos and vel
    cal.mov(reg.g[r_gpos.x], r_lpos)
    cal.mov(reg.g[r_gvel.x], r_vel)

    cal.add(r_step, r_step, r_step('1111'))
    cal.endloop()

    cal.mov(reg.o0, r_lpos)
    cal.mov(reg.o1, r_vel)
    cal.mov(reg.o2, r_foo)

    return code
def cal_nb_generate_2d(prgm, n_bodies, dt):
    """Append an n-body time-step kernel using 2D resources to ``prgm``'s stream.

    The generated kernel reads positions from resource 0 and velocities from
    resource 1, both 2D float resources.  For the body at ``v0.xy`` it walks
    every (x, y) coordinate with both counters bounded by ``n_bodies``,
    accumulates a force, and writes the updated position to ``o0`` and the
    updated velocity to ``o1``.

    Parameters:
      prgm     -- program object supplying the instruction stream and the
                  registers.
      n_bodies -- loop bound for both the x and the y counter; converted to
                  float.
      dt       -- time step, loaded into all four lanes of a register.

    Returns the stream obtained from ``prgm.get_stream()``.  The gravitational
    constant is read from the module-level name ``G``.
    """
    code = prgm.get_stream()
    cal.set_active_code(code)

    fn_bodies = float(n_bodies)

    # Working registers.  The acquisition order is part of the generated
    # program, so it is kept exactly as is.
    #r_cx = prgm.acquire_register()
    #r_cy = prgm.acquire_register()
    r_count = prgm.acquire_register()
    r_lpos = prgm.acquire_register()
    r_rpos = prgm.acquire_register()
    r_force = prgm.acquire_register()
    r_diff = prgm.acquire_register()
    r_dist_vec = prgm.acquire_register()
    r_dist = prgm.acquire_register()
    r_force_tmp = prgm.acquire_register()
    r_force_vec = prgm.acquire_register()
    r_vel = prgm.acquire_register()

    #code.add("dcl_input_position_interp(linear_noperspective) v0.xy__")
    cal.dcl_input(reg.v0.xy__, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective)

    # Registers acquired with an initial value replicated across four lanes.
    r_bodies = prgm.acquire_register((fn_bodies,) * 4)
    r_G = prgm.acquire_register((G,) * 4)
    r_dt = prgm.acquire_register((dt,) * 4)

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
    cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # positions
    cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # velocities

    # Loop over all other points to calculate the force
    cal.mov(r_count, r_count('0000')) # loop counter
    #cal.mov(r_cx, r_cx('0000')) # loop counter
    #cal.mov(r_cy, r_cy('0000')) # loop counter
    cal.sample(0, 0, r_lpos, reg.v0.xy) # Local position
    cal.mov(r_force, r_force('0000')) # total force

    # Compute force using input from every other point
    # Outer loop: r_count.x is the x coordinate of the remote body.
    cal.whileloop()
    cal.breakc(cal.relop.ge, r_count.x, r_bodies)

    # Reset the y counter to 0 for each x, keeping the other lanes.
    cal.mov(r_count, r_count.x0zw)

    # Inner loop: r_count.y is the y coordinate of the remote body.
    cal.whileloop()
    cal.breakc(cal.relop.ge, r_count.y, r_bodies)

    #for i in xrange(0, 4):
    #cal.add(r_count, r_cx('x000'), r_cy('0x00'))
    cal.sample(0, 0, r_rpos, r_count.xy) # Remote position

    # d_xyz
    cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0) # local pos - remote pos

    # dist_tmp: a single dp3 replaces the mul/mad sequence kept below
    #cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx)
    #cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec)
    #cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec)
    cal.dp3(r_dist_vec, r_diff, r_diff, IEEE = False)

    # distance
    # TODO - skip rest of force computation if distance is 0
    cal.sqrt_vec(r_dist, r_dist_vec)

    # force G * ((m[i] * m[j]) / dist_tmp)
    cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww, IEEE = False)
    cal.div(r_force_tmp, r_force_tmp, r_dist_vec, ZEROOP = cal.zeroop.zero)
    cal.mul(r_force_tmp, r_force_tmp, r_G, IEEE = False)

    # f_xyz
    # TODO - whats going on, is this right?
    cal.div(r_force_vec, r_diff.xyz0, r_dist.xyz1, ZEROOP = cal.zeroop.zero)
    cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0, IEEE = False)
    cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0)

    #cal.add(r_cy, r_cy, r_count('1111'))
    #cal.add(r_count, r_count, r_count('0100'))
    #cal.ifc(cal.relop.ge, r_count.y, r_bodies.y)
    ## TODO - can I merge these two?
    #cal.mov(r_count('_y__'), r_count('x0zw'))
    #cal.add(r_count, r_count, r_count('1000'))
    #cal.endif()

    # Increment loop counter, end loop
    cal.add(r_count, r_count, r_count('0100'))
    cal.endloop()

    # Advance the x counter and close the outer loop.
    cal.add(r_count, r_count, r_count('1000'))
    #cal.add(r_cx, r_cx, r_cx('1111'))
    cal.endloop()

    # Acceleration
    cal.div(r_force, r_force.xyz0, r_lpos.wwww, ZEROOP = cal.zeroop.zero)

    # Velocity
    cal.sample(1, 1, r_vel, reg.v0.xy) # Load velocity
    cal.mad(r_vel, r_force, r_dt, r_vel, IEEE = False)
    cal.mov(reg.o1, r_vel)

    # Position: new position = velocity * dt + old position (w lane unchanged)
    cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw, IEEE = False)

    #cal.mov(reg.g[0], r_vel)
    return code
def ParMD5Transform(parcontext, parblock, blocki):
    """Apply the MD5 block transform to many contexts at once on the CAL device.

    ``parblock`` is decoded with ``ParDecode`` into 16 words per context.  The
    four state arrays of ``parcontext`` and the 16 word planes are uploaded
    into N x N four-component unsigned buffers, with
    ``N = int(sqrt(parcontext.number / 4))``.  The kernel runs over an N x N
    domain and its four outputs are added back into ``parcontext.statea`` ..
    ``parcontext.stated``.

    The kernel is generated only on the first call and cached in the
    module-level ``xcode``; later calls just rebind buffers.  The wall-clock
    time spent in ``proc.execute`` is added to the module-level ``TIME``.
    All device buffers are freed before returning.

    Parameters:
      parcontext -- object providing ``number`` and the mutable sequences
                    ``statea``, ``stateb``, ``statec``, ``stated``.
      parblock   -- input data, passed through to ``ParDecode``.
      blocki     -- passed through to ``ParDecode``.
    """
    global xcode
    global TIME

    num = parcontext.number

    temp_block = extarray.extarray('I', 16 * num)
    ParDecode(num, temp_block, parblock, blocki, 64)

    proc = env.Processor(0)

    N = int(math.sqrt(num / 4))

    def address_4_2d(x, y, pitch=64):
        # Scalar offset of the 4-component element (x, y) when each row
        # occupies `pitch` elements.
        return x * 4 + y * pitch * 4

    input_statea = proc.alloc_remote('I', 4, N, N)
    input_stateb = proc.alloc_remote('I', 4, N, N)
    input_statec = proc.alloc_remote('I', 4, N, N)
    input_stated = proc.alloc_remote('I', 4, N, N)
    input_block = [proc.alloc_remote('I', 4, N, N) for i in range(16)]
    outputa = proc.alloc_remote('I', 4, N, N)
    outputb = proc.alloc_remote('I', 4, N, N)
    outputc = proc.alloc_remote('I', 4, N, N)
    outputd = proc.alloc_remote('I', 4, N, N)

    # Upload the four state words of every context.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                dev = address_4_2d(i, j) + k
                host = k + (i + j * N) * 4
                input_statea[dev] = parcontext.statea[host]
                input_stateb[dev] = parcontext.stateb[host]
                input_statec[dev] = parcontext.statec[host]
                input_stated[dev] = parcontext.stated[host]

    # Upload the decoded message words: buffer i holds word i of each context.
    for k in range(N):
        for j in range(N):
            for l in range(4):
                dev = address_4_2d(j, k) + l
                base = (j + k * N) * 4 * 16 + l * 16
                for i in range(16):
                    input_block[i][dev] = temp_block[i + base]

    # Generate the kernel once.  `is None` is an identity test, so it cannot
    # be affected by whatever comparison the stream class may define.
    if xcode is None:
        xcode = env.InstructionStream()
        cal.set_active_code(xcode)

        # Per-round shift amounts, replicated across the four lanes.
        S11 = xcode.acquire_register((7, 7, 7, 7))
        S12 = xcode.acquire_register((12, 12, 12, 12))
        S13 = xcode.acquire_register((17, 17, 17, 17))
        S14 = xcode.acquire_register((22, 22, 22, 22))
        S21 = xcode.acquire_register((5, 5, 5, 5))
        S22 = xcode.acquire_register((9, 9, 9, 9))
        S23 = xcode.acquire_register((14, 14, 14, 14))
        S24 = xcode.acquire_register((20, 20, 20, 20))
        S31 = xcode.acquire_register((4, 4, 4, 4))
        S32 = xcode.acquire_register((11, 11, 11, 11))
        S33 = xcode.acquire_register((16, 16, 16, 16))
        S34 = xcode.acquire_register((23, 23, 23, 23))
        S41 = xcode.acquire_register((6, 6, 6, 6))
        S42 = xcode.acquire_register((10, 10, 10, 10))
        S43 = xcode.acquire_register((15, 15, 15, 15))
        S44 = xcode.acquire_register((21, 21, 21, 21))

        a = xcode.acquire_register()
        b = xcode.acquire_register()
        c = xcode.acquire_register()
        d = xcode.acquire_register()
        x = [xcode.acquire_register() for i in range(16)]
        # Not referenced below; kept so the kernel's register allocation is
        # unchanged.
        r = xcode.acquire_register()

        cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # statea
        cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # stateb
        cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # statec
        cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # stated
        for i in range(16):
            cal.dcl_resource(i + 4, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)
        cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

        cal.sample(0, 0, a, reg.v0.xy)
        cal.sample(1, 0, b, reg.v0.xy)
        cal.sample(2, 0, c, reg.v0.xy)
        cal.sample(3, 0, d, reg.v0.xy)
        for i in range(16):
            cal.sample(i + 4, 0, x[i], reg.v0.xy)

        # FF/GG/HH/II are defined elsewhere in this module; presumably each
        # emits one MD5 step into the active stream.

        # Round 1
        FF(a, b, c, d, x[0], S11, 0xd76aa478) # 1
        FF(d, a, b, c, x[1], S12, 0xe8c7b756) # 2
        FF(c, d, a, b, x[2], S13, 0x242070db) # 3
        FF(b, c, d, a, x[3], S14, 0xc1bdceee) # 4
        FF(a, b, c, d, x[4], S11, 0xf57c0faf) # 5
        FF(d, a, b, c, x[5], S12, 0x4787c62a) # 6
        FF(c, d, a, b, x[6], S13, 0xa8304613) # 7
        FF(b, c, d, a, x[7], S14, 0xfd469501) # 8
        FF(a, b, c, d, x[8], S11, 0x698098d8) # 9
        FF(d, a, b, c, x[9], S12, 0x8b44f7af) # 10
        FF(c, d, a, b, x[10], S13, 0xffff5bb1) # 11
        FF(b, c, d, a, x[11], S14, 0x895cd7be) # 12
        FF(a, b, c, d, x[12], S11, 0x6b901122) # 13
        FF(d, a, b, c, x[13], S12, 0xfd987193) # 14
        FF(c, d, a, b, x[14], S13, 0xa679438e) # 15
        FF(b, c, d, a, x[15], S14, 0x49b40821) # 16

        # Round 2
        GG(a, b, c, d, x[1], S21, 0xf61e2562) # 17
        GG(d, a, b, c, x[6], S22, 0xc040b340) # 18
        GG(c, d, a, b, x[11], S23, 0x265e5a51) # 19
        GG(b, c, d, a, x[0], S24, 0xe9b6c7aa) # 20
        GG(a, b, c, d, x[5], S21, 0xd62f105d) # 21
        GG(d, a, b, c, x[10], S22, 0x2441453) # 22
        GG(c, d, a, b, x[15], S23, 0xd8a1e681) # 23
        GG(b, c, d, a, x[4], S24, 0xe7d3fbc8) # 24
        GG(a, b, c, d, x[9], S21, 0x21e1cde6) # 25
        GG(d, a, b, c, x[14], S22, 0xc33707d6) # 26
        GG(c, d, a, b, x[3], S23, 0xf4d50d87) # 27
        GG(b, c, d, a, x[8], S24, 0x455a14ed) # 28
        GG(a, b, c, d, x[13], S21, 0xa9e3e905) # 29
        GG(d, a, b, c, x[2], S22, 0xfcefa3f8) # 30
        GG(c, d, a, b, x[7], S23, 0x676f02d9) # 31
        GG(b, c, d, a, x[12], S24, 0x8d2a4c8a) # 32

        # Round 3
        HH(a, b, c, d, x[5], S31, 0xfffa3942) # 33
        HH(d, a, b, c, x[8], S32, 0x8771f681) # 34
        HH(c, d, a, b, x[11], S33, 0x6d9d6122) # 35
        HH(b, c, d, a, x[14], S34, 0xfde5380c) # 36
        HH(a, b, c, d, x[1], S31, 0xa4beea44) # 37
        HH(d, a, b, c, x[4], S32, 0x4bdecfa9) # 38
        HH(c, d, a, b, x[7], S33, 0xf6bb4b60) # 39
        HH(b, c, d, a, x[10], S34, 0xbebfbc70) # 40
        HH(a, b, c, d, x[13], S31, 0x289b7ec6) # 41
        HH(d, a, b, c, x[0], S32, 0xeaa127fa) # 42
        HH(c, d, a, b, x[3], S33, 0xd4ef3085) # 43
        HH(b, c, d, a, x[6], S34, 0x4881d05) # 44
        HH(a, b, c, d, x[9], S31, 0xd9d4d039) # 45
        HH(d, a, b, c, x[12], S32, 0xe6db99e5) # 46
        HH(c, d, a, b, x[15], S33, 0x1fa27cf8) # 47
        HH(b, c, d, a, x[2], S34, 0xc4ac5665) # 48

        # Round 4
        II(a, b, c, d, x[0], S41, 0xf4292244) # 49
        II(d, a, b, c, x[7], S42, 0x432aff97) # 50
        II(c, d, a, b, x[14], S43, 0xab9423a7) # 51
        II(b, c, d, a, x[5], S44, 0xfc93a039) # 52
        II(a, b, c, d, x[12], S41, 0x655b59c3) # 53
        II(d, a, b, c, x[3], S42, 0x8f0ccc92) # 54
        II(c, d, a, b, x[10], S43, 0xffeff47d) # 55
        II(b, c, d, a, x[1], S44, 0x85845dd1) # 56
        II(a, b, c, d, x[8], S41, 0x6fa87e4f) # 57
        II(d, a, b, c, x[15], S42, 0xfe2ce6e0) # 58
        II(c, d, a, b, x[6], S43, 0xa3014314) # 59
        II(b, c, d, a, x[13], S44, 0x4e0811a1) # 60
        II(a, b, c, d, x[4], S41, 0xf7537e82) # 61
        II(d, a, b, c, x[11], S42, 0xbd3af235) # 62
        II(c, d, a, b, x[2], S43, 0x2ad7d2bb) # 63
        II(b, c, d, a, x[9], S44, 0xeb86d391) # 64

        cal.mov('o0', a)
        cal.mov('o1', b)
        cal.mov('o2', c)
        cal.mov('o3', d)

        xcode.release_register(a)
        xcode.release_register(b)
        xcode.release_register(c)
        xcode.release_register(d)
        for xi in x:
            xcode.release_register(xi)

    # The buffers are new on every call, so they are (re)bound every time.
    xcode.set_remote_binding('i0', input_statea)
    xcode.set_remote_binding('i1', input_stateb)
    xcode.set_remote_binding('i2', input_statec)
    xcode.set_remote_binding('i3', input_stated)
    for i in range(16):
        xcode.set_remote_binding('i' + str(i + 4), input_block[i])
    xcode.set_remote_binding('o0', outputa)
    xcode.set_remote_binding('o1', outputb)
    xcode.set_remote_binding('o2', outputc)
    xcode.set_remote_binding('o3', outputd)

    domain = (0, 0, N, N)

    start_time = time.time()
    proc.execute(xcode, domain)
    end_time = time.time()
    TIME += (end_time - start_time)

    # Add the kernel outputs to the running state.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                dev = address_4_2d(i, j) + k
                host = k + (i + j * N) * 4
                parcontext.statea[host] += outputa[dev]
                parcontext.stateb[host] += outputb[dev]
                parcontext.statec[host] += outputc[dev]
                parcontext.stated[host] += outputd[dev]

    proc.free_remote(input_statea)
    proc.free_remote(input_stateb)
    proc.free_remote(input_statec)
    proc.free_remote(input_stated)
    for block in input_block:
        proc.free_remote(block)
    proc.free_remote(outputa)
    proc.free_remote(outputb)
    proc.free_remote(outputc)
    proc.free_remote(outputd)
# Fragment: build and run a kernel that samples a 1D float input at the
# position v0.x and writes the sample to o0, then print both buffers and the
# generated instruction stream.  `proc` and `prgm` are defined outside this
# fragment.
code = prgm.get_stream()

inp = proc.alloc_remote('f', 4, 64)
out = proc.alloc_remote('f', 4, 64)
out.clear()

# NOTE(review): only indices 0..63 are written; presumably the buffer holds
# 4 * 64 scalars, leaving the rest uninitialised - confirm.
for i in xrange(0, 64):
    inp[i] = float(i + 1)

# Instructions created through `cal` below are appended to `code`.
cal.set_active_code(code)

cal.dcl_input(reg.v0.x, USAGE=cal.usage.pos)
cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True)
cal.dcl_output(reg.o0, USAGE=cal.usage.generic)

cal.sample(0, 0, reg.o0, reg.v0.x)

prgm.set_binding(reg.i0, inp)
prgm.set_binding(reg.o0, out)

prgm.add(code)
prgm.print_code()

# No explicit domain is passed here.
proc.execute(prgm)

print "inp", inp
print "out", out

# Dump the instruction stream as CAL assembly.
import corepy.lib.printer as printer
printer.PrintInstructionStream(code, printer.CAL_Asm())
def ParMD5Transform(parcontext, parblock, blocki):
    """Apply the MD5 block transform to many contexts at once on the CAL device.

    ``parblock`` is decoded with ``ParDecode`` into 16 words per context.  The
    four state arrays of ``parcontext`` are uploaded into N x N
    four-component unsigned buffers, with
    ``N = int(sqrt(parcontext.number / 4))``.  All message words go into one
    packed buffer 16 * N elements wide and N high; the kernel samples that
    buffer relative to its window coordinate and transposes groups of four
    registers.  The four kernel outputs are added back into
    ``parcontext.statea`` .. ``parcontext.stated``.

    The kernel is generated only on the first call and cached in the
    module-level ``xcode``; later calls just rebind buffers.  The wall-clock
    time spent in ``proc.execute`` is added to the module-level ``TIME``.
    All device buffers are freed before returning.

    Parameters:
      parcontext -- object providing ``number`` and the mutable sequences
                    ``statea``, ``stateb``, ``statec``, ``stated``.
      parblock   -- input data, passed through to ``ParDecode``.
      blocki     -- passed through to ``ParDecode``.
    """
    global xcode
    global TIME

    num = parcontext.number

    temp_block = extarray.extarray('I', 16*num)
    ParDecode(num, temp_block, parblock, blocki, 64)

    proc = env.Processor(0)

    N = int(math.sqrt(num/4))

    def address_4_2d(x, y, pitch=64):
        # Scalar offset of the 4-component element (x, y) when each row
        # occupies `pitch` elements.
        return x*4 + y*pitch*4

    input_statea = proc.alloc_remote('I', 4, N, N)
    input_stateb = proc.alloc_remote('I', 4, N, N)
    input_statec = proc.alloc_remote('I', 4, N, N)
    input_stated = proc.alloc_remote('I', 4, N, N)
    input_block = proc.alloc_remote('I', 4, N*4*4, N)
    outputa = proc.alloc_remote('I', 4, N, N)
    outputb = proc.alloc_remote('I', 4, N, N)
    outputc = proc.alloc_remote('I', 4, N, N)
    outputd = proc.alloc_remote('I', 4, N, N)

    # Upload the four state words of every context.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                dev = address_4_2d(i, j) + k
                host = k + (i + j*N)*4
                input_statea[dev] = parcontext.statea[host]
                input_stateb[dev] = parcontext.stateb[host]
                input_statec[dev] = parcontext.statec[host]
                input_stated[dev] = parcontext.stated[host]

    # Upload the decoded message words, 16 consecutive scalars per context.
    # NOTE(review): j runs over 4*N contexts per row but the host index uses
    # a row stride of N (j + k*N), so rows overlap when N > 1 - confirm
    # whether the stride should be 4*N.
    for k in range(N):
        for j in range(0, N*4):
            dev = address_4_2d(j*4, k)
            base = (j + k*N)*16
            for i in range(16):
                input_block[dev + i] = temp_block[i + base]

    # Generate the kernel once.  `is None` is an identity test, so it cannot
    # be affected by whatever comparison the stream class may define.
    if xcode is None:
        xcode = env.InstructionStream()
        cal.set_active_code(xcode)

        # Per-round shift amounts, replicated across the four lanes.
        S11 = xcode.acquire_register((7, 7, 7, 7))
        S12 = xcode.acquire_register((12, 12, 12, 12))
        S13 = xcode.acquire_register((17, 17, 17, 17))
        S14 = xcode.acquire_register((22, 22, 22, 22))
        S21 = xcode.acquire_register((5, 5, 5, 5))
        S22 = xcode.acquire_register((9, 9, 9, 9))
        S23 = xcode.acquire_register((14, 14, 14, 14))
        S24 = xcode.acquire_register((20, 20, 20, 20))
        S31 = xcode.acquire_register((4, 4, 4, 4))
        S32 = xcode.acquire_register((11, 11, 11, 11))
        S33 = xcode.acquire_register((16, 16, 16, 16))
        S34 = xcode.acquire_register((23, 23, 23, 23))
        S41 = xcode.acquire_register((6, 6, 6, 6))
        S42 = xcode.acquire_register((10, 10, 10, 10))
        S43 = xcode.acquire_register((15, 15, 15, 15))
        S44 = xcode.acquire_register((21, 21, 21, 21))

        a = xcode.acquire_register()
        b = xcode.acquire_register()
        c = xcode.acquire_register()
        d = xcode.acquire_register()

        # TODO: Ensure these are all contiguous - necessary for the transposes
        x = [xcode.acquire_register() for i in range(16)]
        # Not referenced below; kept so the kernel's register allocation is
        # unchanged.
        r = xcode.acquire_register()

        cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # statea
        cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # stateb
        cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # statec
        cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # stated
        cal.dcl_resource(4, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)
        cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

        cal.sample(0, 0, a, reg.v0.xy)
        cal.sample(1, 0, b, reg.v0.xy)
        cal.sample(2, 0, c, reg.v0.xy)
        cal.sample(3, 0, d, reg.v0.xy)

        cal.dclpi(('0', '0', '-', '-'), reg.vWinCoord0, CENTERED=True)

        buffer_index = xcode.acquire_register()
        temp = xcode.acquire_register()
        consts = xcode.acquire_register((1.0, 2.0, 3.0, 4.0))
        one = consts.x
        two = consts.y
        three = consts.z
        four = consts.w
        sixteen = xcode.acquire_register((16.0,)*4)

        # Each window x coordinate covers 16 elements of the packed buffer.
        cal.mov(buffer_index, reg.vWinCoord0.xy)
        cal.mul(buffer_index.x, buffer_index, sixteen)

        # For each group of four registers, sample at x offsets 0, 4, 8 and
        # 12 from the group's start; the next group starts one element on.
        for i in range(4):
            cal.mov(temp.xy, buffer_index.xy)
            cal.sample(4, 0, x[i*4], buffer_index.xy)
            cal.add(buffer_index.x, buffer_index, four)
            cal.sample(4, 0, x[i*4 + 1], buffer_index.xy)
            cal.add(buffer_index.x, buffer_index, four)
            cal.sample(4, 0, x[i*4 + 2], buffer_index.xy)
            cal.add(buffer_index.x, buffer_index, four)
            cal.sample(4, 0, x[i*4 + 3], buffer_index.xy)
            cal.mov(buffer_index.xy, temp.xy)
            cal.add(buffer_index.x, buffer_index, one)

        # Presumably each transpose covers the four consecutive registers
        # starting at the named one (see the TODO above).
        cal.transpose(x[0], x[0])
        cal.transpose(x[4], x[4])
        cal.transpose(x[8], x[8])
        cal.transpose(x[12], x[12])

        # FF/GG/HH/II are defined elsewhere in this module; presumably each
        # emits one MD5 step into the active stream.

        # Round 1
        FF(a, b, c, d, x[0], S11, 0xd76aa478) # 1
        FF(d, a, b, c, x[1], S12, 0xe8c7b756) # 2
        FF(c, d, a, b, x[2], S13, 0x242070db) # 3
        FF(b, c, d, a, x[3], S14, 0xc1bdceee) # 4
        FF(a, b, c, d, x[4], S11, 0xf57c0faf) # 5
        FF(d, a, b, c, x[5], S12, 0x4787c62a) # 6
        FF(c, d, a, b, x[6], S13, 0xa8304613) # 7
        FF(b, c, d, a, x[7], S14, 0xfd469501) # 8
        FF(a, b, c, d, x[8], S11, 0x698098d8) # 9
        FF(d, a, b, c, x[9], S12, 0x8b44f7af) # 10
        FF(c, d, a, b, x[10], S13, 0xffff5bb1) # 11
        FF(b, c, d, a, x[11], S14, 0x895cd7be) # 12
        FF(a, b, c, d, x[12], S11, 0x6b901122) # 13
        FF(d, a, b, c, x[13], S12, 0xfd987193) # 14
        FF(c, d, a, b, x[14], S13, 0xa679438e) # 15
        FF(b, c, d, a, x[15], S14, 0x49b40821) # 16

        # Round 2
        GG(a, b, c, d, x[1], S21, 0xf61e2562) # 17
        GG(d, a, b, c, x[6], S22, 0xc040b340) # 18
        GG(c, d, a, b, x[11], S23, 0x265e5a51) # 19
        GG(b, c, d, a, x[0], S24, 0xe9b6c7aa) # 20
        GG(a, b, c, d, x[5], S21, 0xd62f105d) # 21
        GG(d, a, b, c, x[10], S22, 0x2441453) # 22
        GG(c, d, a, b, x[15], S23, 0xd8a1e681) # 23
        GG(b, c, d, a, x[4], S24, 0xe7d3fbc8) # 24
        GG(a, b, c, d, x[9], S21, 0x21e1cde6) # 25
        GG(d, a, b, c, x[14], S22, 0xc33707d6) # 26
        GG(c, d, a, b, x[3], S23, 0xf4d50d87) # 27
        GG(b, c, d, a, x[8], S24, 0x455a14ed) # 28
        GG(a, b, c, d, x[13], S21, 0xa9e3e905) # 29
        GG(d, a, b, c, x[2], S22, 0xfcefa3f8) # 30
        GG(c, d, a, b, x[7], S23, 0x676f02d9) # 31
        GG(b, c, d, a, x[12], S24, 0x8d2a4c8a) # 32

        # Round 3
        HH(a, b, c, d, x[5], S31, 0xfffa3942) # 33
        HH(d, a, b, c, x[8], S32, 0x8771f681) # 34
        HH(c, d, a, b, x[11], S33, 0x6d9d6122) # 35
        HH(b, c, d, a, x[14], S34, 0xfde5380c) # 36
        HH(a, b, c, d, x[1], S31, 0xa4beea44) # 37
        HH(d, a, b, c, x[4], S32, 0x4bdecfa9) # 38
        HH(c, d, a, b, x[7], S33, 0xf6bb4b60) # 39
        HH(b, c, d, a, x[10], S34, 0xbebfbc70) # 40
        HH(a, b, c, d, x[13], S31, 0x289b7ec6) # 41
        HH(d, a, b, c, x[0], S32, 0xeaa127fa) # 42
        HH(c, d, a, b, x[3], S33, 0xd4ef3085) # 43
        HH(b, c, d, a, x[6], S34, 0x4881d05) # 44
        HH(a, b, c, d, x[9], S31, 0xd9d4d039) # 45
        HH(d, a, b, c, x[12], S32, 0xe6db99e5) # 46
        HH(c, d, a, b, x[15], S33, 0x1fa27cf8) # 47
        HH(b, c, d, a, x[2], S34, 0xc4ac5665) # 48

        # Round 4
        II(a, b, c, d, x[0], S41, 0xf4292244) # 49
        II(d, a, b, c, x[7], S42, 0x432aff97) # 50
        II(c, d, a, b, x[14], S43, 0xab9423a7) # 51
        II(b, c, d, a, x[5], S44, 0xfc93a039) # 52
        II(a, b, c, d, x[12], S41, 0x655b59c3) # 53
        II(d, a, b, c, x[3], S42, 0x8f0ccc92) # 54
        II(c, d, a, b, x[10], S43, 0xffeff47d) # 55
        II(b, c, d, a, x[1], S44, 0x85845dd1) # 56
        II(a, b, c, d, x[8], S41, 0x6fa87e4f) # 57
        II(d, a, b, c, x[15], S42, 0xfe2ce6e0) # 58
        II(c, d, a, b, x[6], S43, 0xa3014314) # 59
        II(b, c, d, a, x[13], S44, 0x4e0811a1) # 60
        II(a, b, c, d, x[4], S41, 0xf7537e82) # 61
        II(d, a, b, c, x[11], S42, 0xbd3af235) # 62
        II(c, d, a, b, x[2], S43, 0x2ad7d2bb) # 63
        II(b, c, d, a, x[9], S44, 0xeb86d391) # 64

        cal.mov('o0', a)
        cal.mov('o1', b)
        cal.mov('o2', c)
        cal.mov('o3', d)

        xcode.release_register(a)
        xcode.release_register(b)
        xcode.release_register(c)
        xcode.release_register(d)
        for xi in x:
            xcode.release_register(xi)

    # The buffers are new on every call, so they are (re)bound every time.
    xcode.set_remote_binding('i0', input_statea)
    xcode.set_remote_binding('i1', input_stateb)
    xcode.set_remote_binding('i2', input_statec)
    xcode.set_remote_binding('i3', input_stated)
    xcode.set_remote_binding('i4', input_block)
    xcode.set_remote_binding('o0', outputa)
    xcode.set_remote_binding('o1', outputb)
    xcode.set_remote_binding('o2', outputc)
    xcode.set_remote_binding('o3', outputd)

    domain = (0, 0, N, N)

    start_time = time.time()
    proc.execute(xcode, domain)
    end_time = time.time()
    TIME += (end_time - start_time)

    # Add the kernel outputs to the running state.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                dev = address_4_2d(i, j) + k
                host = k + (i + j*N)*4
                parcontext.statea[host] += outputa[dev]
                parcontext.stateb[host] += outputb[dev]
                parcontext.statec[host] += outputc[dev]
                parcontext.stated[host] += outputd[dev]

    proc.free_remote(input_statea)
    proc.free_remote(input_stateb)
    proc.free_remote(input_statec)
    proc.free_remote(input_stated)
    proc.free_remote(input_block)
    proc.free_remote(outputa)
    proc.free_remote(outputb)
    proc.free_remote(outputc)
    proc.free_remote(outputd)