def test_4comp(): proc = env.Processor(0) prgm = env.Program() code = prgm.get_stream() inp = proc.alloc_remote('i', 1, 4, 1) out = proc.alloc_remote('i', 4, 1, 1) for i in xrange(0, 4): inp[i] = i + 1 out[i] = 0 print "inp", inp[0:4] print "out", out[0:4] cal.set_active_code(code) cal.dcl_output(reg.o0, USAGE=cal.usage.generic) cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions r_cnt = prgm.acquire_register() r = prgm.acquire_registers(4) cal.mov(r_cnt, r_cnt('0000')) for i in xrange(0, 4): cal.sample(0, 0, r[i].x000, r_cnt.x) cal.add(r_cnt, r_cnt, r_cnt('1111')) cal.iadd(r[0], r[0], r[1]('0x00')) cal.iadd(r[0], r[0], r[2]('00x0')) cal.iadd(r[0], r[0], r[3]('000x')) cal.iadd(r[0], r[0], r[0]) cal.mov(reg.o0, r[0]) prgm.set_binding(reg.i0, inp) prgm.set_binding(reg.o0, out) prgm.add(code) prgm.print_code() proc.execute(prgm, (0, 0, 1, 1)) print "inp", inp[0:4] print "out", out[0:4] for i in xrange(0, 4): assert (out[i] == (i + 1) * 2) return
def test_4comp(): proc = env.Processor(0) prgm = env.Program() code = prgm.get_stream() inp = proc.alloc_remote('i', 1, 4, 1) out = proc.alloc_remote('i', 4, 1, 1) for i in xrange(0, 4): inp[i] = i + 1 out[i] = 0 print "inp", inp[0:4] print "out", out[0:4] cal.set_active_code(code) cal.dcl_output(reg.o0, USAGE=cal.usage.generic) cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions r_cnt = prgm.acquire_register() r = prgm.acquire_registers(4) cal.mov(r_cnt, r_cnt('0000')) for i in xrange(0, 4): cal.sample(0, 0, r[i].x000, r_cnt.x) cal.add(r_cnt, r_cnt, r_cnt('1111')) cal.iadd(r[0], r[0], r[1]('0x00')) cal.iadd(r[0], r[0], r[2]('00x0')) cal.iadd(r[0], r[0], r[3]('000x')) cal.iadd(r[0], r[0], r[0]) cal.mov(reg.o0, r[0]) prgm.set_binding(reg.i0, inp) prgm.set_binding(reg.o0, out) prgm.add(code) prgm.print_code() proc.execute(prgm, (0, 0, 1, 1)) print "inp", inp[0:4] print "out", out[0:4] for i in xrange(0, 4): assert(out[i] == (i + 1) * 2) return
def block(self, d, a, value):
    """Emit ``d = a + value`` with ``value`` replicated across four components.

    A temporary register is acquired from the active code's program with
    the tuple ``(value, value, value, value)``, used as the second operand
    of a ``cal.add`` appended to the active code, and then handed back.

    Parameters:
      d     -- destination operand passed to ``cal.add``.
      a     -- first source operand passed to ``cal.add``.
      value -- scalar placed in all four slots of the temporary register.

    The temporary is released even when building or appending the
    instruction raises, so a failed emit does not leak a register.
    """
    code = self.get_active_code()
    temp = code.prgm.acquire_register((value, value, value, value))
    try:
        code.add(cal.add(d, a, temp))
    finally:
        code.prgm.release_register(temp)
def cleanup(self):
    """Emit the end-of-loop counter update for this iterator.

    In ``DEC`` mode the step register is subtracted from the count
    register; in ``INC`` mode it is added.  Any other mode emits nothing.
    """
    if self.mode == DEC:
        step_op = cal.sub
    elif self.mode == INC:
        step_op = cal.add
    else:
        # Neither counting direction applies: nothing to update.
        return

    self.code.add(step_op(self.r_count, self.r_count, self.r_step))
def generate(prgm):
    """Emit a doubly nested counting loop and return its stream.

    The kernel keeps two loop indices in the ``x`` and ``y`` components of
    one register.  The outer loop runs while ``x`` is below the limit
    register (64.0 in every component); the inner loop does the same for
    ``y``.  Each inner iteration adds ``total('1111')`` to the running
    total, which is finally moved to ``o0``.

    Parameters:
      prgm -- program object supplying the stream and the registers.

    Returns the stream obtained from ``prgm.get_stream()``.
    """
    stream = prgm.get_stream()
    cal.set_active_code(stream)

    counter = prgm.acquire_register()
    total = prgm.acquire_register()
    limit = prgm.acquire_register((64.0, 64.0, 64.0, 64.0))
    # Not referenced by any emitted instruction; still acquired so the
    # sequence of register acquisitions stays exactly as before.
    cmp_scratch = prgm.acquire_register()

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)

    cal.mov(counter, counter('0000'))
    cal.mov(total, total('0000'))

    # Outer loop over counter.x
    cal.whileloop()
    cal.breakc(cal.relop.ge, counter.x, limit)

    # Restart the inner index (y) on every outer iteration.
    cal.mov(counter, counter('x0zw'))

    # Inner loop over counter.y
    cal.whileloop()
    cal.breakc(cal.relop.ge, counter.y, limit)
    cal.add(total, total, total('1111'))
    cal.add(counter, counter, counter('0100'))
    cal.endloop()

    cal.add(counter, counter, counter('1000'))
    cal.endloop()

    cal.mov(reg.o0, total)
    return stream
def generate(prgm):
    """Emit a doubly nested counting loop and return its stream.

    The kernel keeps two loop indices in the ``x`` and ``y`` components of
    one register.  The outer loop runs while ``x`` is below the limit
    register (64.0 in every component); the inner loop does the same for
    ``y``.  Each inner iteration adds ``total('1111')`` to the running
    total, which is finally moved to ``o0``.

    Parameters:
      prgm -- program object supplying the stream and the registers.

    Returns the stream obtained from ``prgm.get_stream()``.
    """
    stream = prgm.get_stream()
    cal.set_active_code(stream)

    counter = prgm.acquire_register()
    total = prgm.acquire_register()
    limit = prgm.acquire_register((64.0, 64.0, 64.0, 64.0))
    # Not referenced by any emitted instruction; still acquired so the
    # sequence of register acquisitions stays exactly as before.
    cmp_scratch = prgm.acquire_register()

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)

    cal.mov(counter, counter('0000'))
    cal.mov(total, total('0000'))

    # Outer loop over counter.x
    cal.whileloop()
    cal.breakc(cal.relop.ge, counter.x, limit)

    # Restart the inner index (y) on every outer iteration.
    cal.mov(counter, counter('x0zw'))

    # Inner loop over counter.y
    cal.whileloop()
    cal.breakc(cal.relop.ge, counter.y, limit)
    cal.add(total, total, total('1111'))
    cal.add(counter, counter, counter('0100'))
    cal.endloop()

    cal.add(counter, counter, counter('1000'))
    cal.endloop()

    cal.mov(reg.o0, total)
    return stream
def block(self, d, a, b):
    """Append ``cal.add(d, b, a)`` to the currently active code.

    Parameters:
      d -- destination operand.
      a -- source operand, passed as the second source of the add.
      b -- source operand, passed as the first source of the add.
    """
    self.get_active_code().add(cal.add(d, b, a))
def cal_nb_generate(n_bodies, dt):
    """Build the instruction stream for one N-body step over 1D buffers.

    For the body selected by ``v0.x`` the emitted kernel loops over all
    ``n_bodies`` entries of resource 0 (positions), accumulates the
    pairwise force, divides by the local ``w`` component to get the
    acceleration, and writes the updated velocity to ``o1`` and the
    updated position to ``o0``.

    Parameters:
      n_bodies -- number of bodies; converted to float and broadcast into
                  the loop-limit register.
      dt       -- time step, broadcast into a register.

    Returns the newly created ``env.InstructionStream``.

    The module-level constant ``G`` is broadcast into a register and used
    as the force scale factor.  Per the inline comments, the ``w``
    component of a position holds the body's mass.
    """
    code = env.InstructionStream()
    cal.set_active_code(code)

    fn_bodies = float(n_bodies)

    # Working registers.
    r_count = code.acquire_register()
    r_lpos = code.acquire_register()
    r_rpos = code.acquire_register()
    r_force = code.acquire_register()
    r_diff = code.acquire_register()
    r_dist_vec = code.acquire_register()
    r_dist = code.acquire_register()
    r_force_tmp = code.acquire_register()
    r_force_vec = code.acquire_register()
    r_vel = code.acquire_register()

    #code.add("dcl_input_position_interp(linear_noperspective) v0.x___")
    cal.dcl_input(reg.v0.x___, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective)

    # Registers acquired with a 4-tuple of identical values.
    r_bodies = code.acquire_register((fn_bodies,) * 4)
    r_G = code.acquire_register((G,) * 4)
    r_dt = code.acquire_register((dt,) * 4)

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
    cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions
    cal.dcl_resource(1, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # velocities

    # Loop over all other points to calculate the force
    cal.mov(r_count, r_count('0000')) # loop counter
    cal.sample(0, 0, r_lpos, reg.v0.x) # Local position
    cal.mov(r_force, r_force('0000')) # total force

    # Compute force using input from every other point
    cal.whileloop()

    # Break if end of points reached
    # NOTE(review): the sibling generators compare r_count.x here; this one
    # passes the unswizzled register -- confirm that is equivalent.
    cal.breakc(cal.relop.ge, r_count, r_bodies)

    cal.sample(0, 0, r_rpos, r_count.x) # Remote position

    # d_xyz
    cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0) # local pos - remote pos

    # dist_tmp: squared distance, dx*dx + dy*dy + dz*dz, in every component
    cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx)
    cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec)
    cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec)

    # distance
    # TODO - skip rest of force computation if distance is 0
    cal.sqrt_vec(r_dist, r_dist_vec)

    # force G * ((m[i] * m[j]) / dist_tmp)
    cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww)
    cal.div(cal.zeroop.zero, r_force_tmp, r_force_tmp, r_dist_vec)
    cal.mul(r_force_tmp, r_force_tmp, r_G)

    # f_xyz: difference vector divided by the distance, scaled by the force
    # magnitude, then subtracted from the running total
    cal.div(cal.zeroop.zero, r_force_vec, r_diff.xyz0, r_dist.xyz1)
    cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0)
    cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0)

    # Increment loop counter, end loop
    cal.add(r_count, r_count, r_count('1111'))
    cal.endloop()

    # Acceleration
    cal.div(cal.zeroop.zero, r_force, r_force.xyz0, r_lpos.wwww)

    # Velocity
    cal.sample(1, 1, r_vel, reg.v0.x) # Load velocity
    cal.mad(r_vel, r_force, r_dt, r_vel)
    cal.mov(reg.o1, r_vel)

    # Position
    cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw)

    return code
def cal_nb_generate_local(n_bodies, dt, steps): code = env.InstructionStream() cal.set_active_code(code) fn_bodies = float(n_bodies) steps = float(steps) r_count = code.acquire_register() r_step = code.acquire_register() r_lpos = code.acquire_register() r_rpos = code.acquire_register() r_force = code.acquire_register() r_diff = code.acquire_register() r_dist_vec = code.acquire_register() r_dist = code.acquire_register() r_force_tmp = code.acquire_register() r_force_vec = code.acquire_register() r_vel = code.acquire_register() print "fn_bodies", fn_bodies code.add("dcl_input_position_interp(linear_noperspective) v0.xy__") #cal.dcl_input(reg.v0.x___, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective) r_numsteps = code.acquire_register((steps,) * 4) r_bodies = code.acquire_register((fn_bodies,) * 4) #r_bodiesquare = code.acquire_register((float(fn_bodies**2),) * 4) r_G = code.acquire_register((G,) * 4) r_dt = code.acquire_register((dt,) * 4) cal.dcl_output(reg.o0, USAGE=cal.usage.generic) cal.dcl_output(reg.o1, USAGE=cal.usage.generic) cal.dcl_output(reg.o2, USAGE=cal.usage.generic) cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # positions cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # velocities r_foo = code.acquire_register() cal.mov(r_foo, r_foo('0000')) r_gpos = code.acquire_register() cal.mad(r_gpos, reg.v0.y, r_bodies.x, reg.v0.x) r_gvel = code.acquire_register() cal.mad(r_gvel, r_bodies.x, r_bodies.x, r_gpos) cal.ftoi(r_gpos, r_gpos) cal.ftoi(r_gvel, r_gvel) cal.sample(0, 0, r_lpos, reg.v0.xy) # Local position cal.sample(1, 1, r_vel, reg.v0.xy) # Load velocity cal.mov(reg.g[r_gpos.x], r_lpos) cal.mov(reg.g[r_gvel.x], r_vel) cal.mov(r_step, r_step('0000')) cal.whileloop() cal.breakc(cal.relop.ge, r_step.x, r_numsteps) cal.mov(r_count, r_count('0000')) # loop counter cal.whileloop() cal.breakc(cal.relop.ge, r_count.x, r_bodies) cal.add(r_foo, r_foo, r_foo('1111')) # calculate force r_tmp = 
code.acquire_register() cal.ftoi(r_tmp, r_count) cal.mov(r_rpos, reg.g[r_tmp.x]) # d_xyz cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0) # local pos - remote pos # dist_tmp cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx) cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec) cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec) # distance # TODO - skip rest of force computation if distance is 0 cal.sqrt_vec(r_dist, r_dist_vec) # force G * ((m[i] * m[j]) / dist_tmp) cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww) cal.div(r_force_tmp, r_force_tmp, r_dist_vec, ZEROOP = cal.zeroop.zero) cal.mul(r_force_tmp, r_force_tmp, r_G) # f_xyz # TODO - whats going on, is this right? cal.div(r_force_vec, r_diff.xyz0, r_dist.xyz1, ZEROOP = cal.zeroop.zero) cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0) cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0) cal.add(r_count, r_count, r_count('1111')) cal.endloop() # Acceleration cal.div(r_force, r_force.xyz0, r_lpos.wwww, ZEROOP = cal.zeroop.zero) # Velocity cal.mad(r_vel, r_force, r_dt, r_vel) # Position cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw) # store updated pos and vel cal.mov(reg.g[r_gpos.x], r_lpos) cal.mov(reg.g[r_gvel.x], r_vel) cal.add(r_step, r_step, r_step('1111')) cal.endloop() cal.mov(reg.o0, r_lpos) cal.mov(reg.o1, r_vel) cal.mov(reg.o2, r_foo) return code
def cal_nb_generate_2d(prgm, n_bodies, dt):
    """Emit one N-body step over 2D position/velocity buffers.

    For the body selected by ``v0.xy`` the emitted kernel runs two nested
    loops whose indices live in ``r_count.x`` and ``r_count.y``, each
    bounded by ``n_bodies``.  Every inner iteration samples a remote
    position from resource 0 at ``r_count.xy`` and accumulates the pairwise
    force.  The result is turned into an acceleration, the velocity
    (resource 1) is updated and written to ``o1``, and the new position is
    written to ``o0``.

    Parameters:
      prgm     -- program object supplying the stream and the registers.
      n_bodies -- loop bound used for both dimensions; converted to float.
      dt       -- time step, broadcast into a register.

    Returns the stream obtained from ``prgm.get_stream()``.

    The module-level constant ``G`` is broadcast into a register and used
    as the force scale factor.  Per the inline comments, the ``w``
    component of a position holds the body's mass.
    """
    code = prgm.get_stream()
    cal.set_active_code(code)

    fn_bodies = float(n_bodies)

    #r_cx = prgm.acquire_register()
    #r_cy = prgm.acquire_register()
    # Working registers.
    r_count = prgm.acquire_register()
    r_lpos = prgm.acquire_register()
    r_rpos = prgm.acquire_register()
    r_force = prgm.acquire_register()
    r_diff = prgm.acquire_register()
    r_dist_vec = prgm.acquire_register()
    r_dist = prgm.acquire_register()
    r_force_tmp = prgm.acquire_register()
    r_force_vec = prgm.acquire_register()
    r_vel = prgm.acquire_register()

    #code.add("dcl_input_position_interp(linear_noperspective) v0.xy__")
    cal.dcl_input(reg.v0.xy__, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective)

    # Registers acquired with a 4-tuple of identical values.
    r_bodies = prgm.acquire_register((fn_bodies,) * 4)
    r_G = prgm.acquire_register((G,) * 4)
    r_dt = prgm.acquire_register((dt,) * 4)

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
    cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # positions
    cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # velocities

    # Loop over all other points to calculate the force
    cal.mov(r_count, r_count('0000')) # loop counter
    #cal.mov(r_cx, r_cx('0000')) # loop counter
    #cal.mov(r_cy, r_cy('0000')) # loop counter
    cal.sample(0, 0, r_lpos, reg.v0.xy) # Local position
    cal.mov(r_force, r_force('0000')) # total force

    # Compute force using input from every other point
    # Outer loop over r_count.x
    cal.whileloop()
    cal.breakc(cal.relop.ge, r_count.x, r_bodies)

    # Restart the inner index (y) on every outer iteration.
    cal.mov(r_count, r_count.x0zw)

    # Inner loop over r_count.y
    cal.whileloop()
    cal.breakc(cal.relop.ge, r_count.y, r_bodies)

    #for i in xrange(0, 4):
    #cal.add(r_count, r_cx('x000'), r_cy('0x00'))
    cal.sample(0, 0, r_rpos, r_count.xy) # Remote position

    # d_xyz
    cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0) # local pos - remote pos

    # dist_tmp: squared distance via a 3-component dot product, replacing
    # the mul/mad sequence kept below for reference
    #cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx)
    #cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec)
    #cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec)
    cal.dp3(r_dist_vec, r_diff, r_diff, IEEE = False)

    # distance
    # TODO - skip rest of force computation if distance is 0
    cal.sqrt_vec(r_dist, r_dist_vec)

    # force G * ((m[i] * m[j]) / dist_tmp)
    cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww, IEEE = False)
    cal.div(r_force_tmp, r_force_tmp, r_dist_vec, ZEROOP = cal.zeroop.zero)
    cal.mul(r_force_tmp, r_force_tmp, r_G, IEEE = False)

    # f_xyz
    # TODO - whats going on, is this right?
    cal.div(r_force_vec, r_diff.xyz0, r_dist.xyz1, ZEROOP = cal.zeroop.zero)
    cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0, IEEE = False)
    cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0)

    #cal.add(r_cy, r_cy, r_count('1111'))
    #cal.add(r_count, r_count, r_count('0100'))
    #cal.ifc(cal.relop.ge, r_count.y, r_bodies.y)
    ## TODO - can I merge these two?
    #cal.mov(r_count('_y__'), r_count('x0zw'))
    #cal.add(r_count, r_count, r_count('1000'))
    #cal.endif()

    # Increment loop counter, end loop
    cal.add(r_count, r_count, r_count('0100'))
    cal.endloop()

    # Advance the outer index and close the outer loop.
    cal.add(r_count, r_count, r_count('1000'))
    #cal.add(r_cx, r_cx, r_cx('1111'))
    cal.endloop()

    # Acceleration
    cal.div(r_force, r_force.xyz0, r_lpos.wwww, ZEROOP = cal.zeroop.zero)

    # Velocity
    cal.sample(1, 1, r_vel, reg.v0.xy) # Load velocity
    cal.mad(r_vel, r_force, r_dt, r_vel, IEEE = False)
    cal.mov(reg.o1, r_vel)

    # Position
    cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw, IEEE = False)

    #cal.mov(reg.g[0], r_vel)
    return code
def ParMD5Transform(parcontext, parblock, blocki):
    """Run one MD5 block transform for many independent contexts on the GPU.

    The input block is decoded into 32-bit words with ``ParDecode``, the
    four state arrays of ``parcontext`` and the decoded words are copied
    into remote buffers, the MD5 kernel is executed over an ``N`` x ``N``
    domain, and the four outputs are added back into
    ``parcontext.statea`` .. ``parcontext.stated``.

    Parameters:
      parcontext -- object providing ``number`` and the ``statea``,
                    ``stateb``, ``statec``, ``stated`` arrays, which are
                    updated in place.
      parblock   -- input data, passed through to ``ParDecode``.
      blocki     -- block index, passed through to ``ParDecode``.

    Side effects: builds the kernel once and caches it in the module-level
    ``xcode``; adds the wall-clock time of ``proc.execute`` to the
    module-level ``TIME``.

    Remote buffers are released in a ``finally`` clause so they are not
    leaked when decoding, code generation or execution raises.
    """
    global xcode
    global TIME

    num = parcontext.number
    temp_block = extarray.extarray('I', 16*num)
    ParDecode(num, temp_block, parblock, blocki, 64)

    proc = env.Processor(0)

    # Four contexts are packed into the four components of each element of
    # an N x N domain.
    N = int(math.sqrt(num // 4))

    def address_4_2d(x, y, pitch=64):
        # Offset of 4-component element (x, y) in a buffer whose rows are
        # `pitch` elements wide.
        # NOTE(review): pitch is fixed at 64 -- confirm it matches the row
        # pitch of the buffers returned by alloc_remote for the sizes used.
        return x*4 + y*pitch*4

    # Every remote allocation is recorded so the finally clause can free
    # exactly what was allocated, in allocation order.
    remote_buffers = []

    def alloc(width, height):
        buf = proc.alloc_remote('I', 4, width, height)
        remote_buffers.append(buf)
        return buf

    try:
        input_statea = alloc(N, N)
        input_stateb = alloc(N, N)
        input_statec = alloc(N, N)
        input_stated = alloc(N, N)
        input_block = alloc(N*4*4, N)
        outputa = alloc(N, N)
        outputb = alloc(N, N)
        outputc = alloc(N, N)
        outputd = alloc(N, N)

        # Upload the current state of every context.
        for j in range(N):
            for i in range(N):
                dst = address_4_2d(i, j)
                src = (i + j*N)*4
                for k in range(4):
                    input_statea[dst + k] = parcontext.statea[src + k]
                    input_stateb[dst + k] = parcontext.stateb[src + k]
                    input_statec[dst + k] = parcontext.statec[src + k]
                    input_stated[dst + k] = parcontext.stated[src + k]

        # Upload the decoded message words, 16 per block.
        # NOTE(review): j spans N*4 blocks per row but the source row stride
        # is k*N, so rows overlap in temp_block when N > 1 -- confirm whether
        # the stride should be k*N*4.  Left unchanged.
        for k in range(N):
            for j in range(0, N*4):
                dst = address_4_2d(j*4, k)
                src = (j + k*N)*16
                for i in range(16):
                    input_block[dst + i] = temp_block[src + i]

        # Generate the kernel on first use only; later calls reuse the
        # cached stream and just rebind the buffers.
        if xcode is None:
            xcode = env.InstructionStream()
            cal.set_active_code(xcode)

            # Per-round rotate amounts, replicated across four components.
            S11 = xcode.acquire_register((7, 7, 7, 7))
            S12 = xcode.acquire_register((12, 12, 12, 12))
            S13 = xcode.acquire_register((17, 17, 17, 17))
            S14 = xcode.acquire_register((22, 22, 22, 22))
            S21 = xcode.acquire_register((5, 5, 5, 5))
            S22 = xcode.acquire_register((9, 9, 9, 9))
            S23 = xcode.acquire_register((14, 14, 14, 14))
            S24 = xcode.acquire_register((20, 20, 20, 20))
            S31 = xcode.acquire_register((4, 4, 4, 4))
            S32 = xcode.acquire_register((11, 11, 11, 11))
            S33 = xcode.acquire_register((16, 16, 16, 16))
            S34 = xcode.acquire_register((23, 23, 23, 23))
            S41 = xcode.acquire_register((6, 6, 6, 6))
            S42 = xcode.acquire_register((10, 10, 10, 10))
            S43 = xcode.acquire_register((15, 15, 15, 15))
            S44 = xcode.acquire_register((21, 21, 21, 21))

            a = xcode.acquire_register()
            b = xcode.acquire_register()
            c = xcode.acquire_register()
            d = xcode.acquire_register()

            # TODO: Ensure these are all contiguous - necessary for the transposes
            x = [xcode.acquire_register() for i in range(16)]
            # Not used below; still acquired so register allocation order
            # is unchanged.
            r = xcode.acquire_register()

            cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # statea
            cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # stateb
            cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # statec
            cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # stated
            cal.dcl_resource(4, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # message block
            cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
            cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
            cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
            cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

            # Load the four state words for this element.
            cal.sample(0, 0, a, reg.v0.xy)
            cal.sample(1, 0, b, reg.v0.xy)
            cal.sample(2, 0, c, reg.v0.xy)
            cal.sample(3, 0, d, reg.v0.xy)

            cal.dclpi(('0', '0', '-', '-'), reg.vWinCoord0, CENTERED=True)

            buffer_index = xcode.acquire_register()
            temp = xcode.acquire_register()
            consts = xcode.acquire_register((1.0, 2.0, 3.0, 4.0))
            one = consts.x
            two = consts.y
            three = consts.z
            four = consts.w
            sixteen = xcode.acquire_register((16.0,)*4)

            # Each element owns 16 consecutive x positions of the block
            # buffer.
            cal.mov(buffer_index, reg.vWinCoord0.xy)
            cal.mul(buffer_index.x, buffer_index, sixteen)

            # Load the 16 message registers: four samples spaced four
            # apart per pass, then restart one position further along.
            for i in range(4):
                cal.mov(temp.xy, buffer_index.xy)
                cal.sample(4, 0, x[i*4], buffer_index.xy)
                cal.add(buffer_index.x, buffer_index, four)
                cal.sample(4, 0, x[i*4+1], buffer_index.xy)
                cal.add(buffer_index.x, buffer_index, four)
                cal.sample(4, 0, x[i*4+2], buffer_index.xy)
                cal.add(buffer_index.x, buffer_index, four)
                cal.sample(4, 0, x[i*4+3], buffer_index.xy)
                cal.mov(buffer_index.xy, temp.xy)
                cal.add(buffer_index.x, buffer_index, one)

            cal.transpose(x[0], x[0])
            cal.transpose(x[4], x[4])
            cal.transpose(x[8], x[8])
            cal.transpose(x[12], x[12])

            # Round 1
            FF(a, b, c, d, x[0], S11, 0xd76aa478)  # 1
            FF(d, a, b, c, x[1], S12, 0xe8c7b756)  # 2
            FF(c, d, a, b, x[2], S13, 0x242070db)  # 3
            FF(b, c, d, a, x[3], S14, 0xc1bdceee)  # 4
            FF(a, b, c, d, x[4], S11, 0xf57c0faf)  # 5
            FF(d, a, b, c, x[5], S12, 0x4787c62a)  # 6
            FF(c, d, a, b, x[6], S13, 0xa8304613)  # 7
            FF(b, c, d, a, x[7], S14, 0xfd469501)  # 8
            FF(a, b, c, d, x[8], S11, 0x698098d8)  # 9
            FF(d, a, b, c, x[9], S12, 0x8b44f7af)  # 10
            FF(c, d, a, b, x[10], S13, 0xffff5bb1)  # 11
            FF(b, c, d, a, x[11], S14, 0x895cd7be)  # 12
            FF(a, b, c, d, x[12], S11, 0x6b901122)  # 13
            FF(d, a, b, c, x[13], S12, 0xfd987193)  # 14
            FF(c, d, a, b, x[14], S13, 0xa679438e)  # 15
            FF(b, c, d, a, x[15], S14, 0x49b40821)  # 16

            # Round 2
            GG(a, b, c, d, x[1], S21, 0xf61e2562)  # 17
            GG(d, a, b, c, x[6], S22, 0xc040b340)  # 18
            GG(c, d, a, b, x[11], S23, 0x265e5a51)  # 19
            GG(b, c, d, a, x[0], S24, 0xe9b6c7aa)  # 20
            GG(a, b, c, d, x[5], S21, 0xd62f105d)  # 21
            GG(d, a, b, c, x[10], S22, 0x2441453)  # 22
            GG(c, d, a, b, x[15], S23, 0xd8a1e681)  # 23
            GG(b, c, d, a, x[4], S24, 0xe7d3fbc8)  # 24
            GG(a, b, c, d, x[9], S21, 0x21e1cde6)  # 25
            GG(d, a, b, c, x[14], S22, 0xc33707d6)  # 26
            GG(c, d, a, b, x[3], S23, 0xf4d50d87)  # 27
            GG(b, c, d, a, x[8], S24, 0x455a14ed)  # 28
            GG(a, b, c, d, x[13], S21, 0xa9e3e905)  # 29
            GG(d, a, b, c, x[2], S22, 0xfcefa3f8)  # 30
            GG(c, d, a, b, x[7], S23, 0x676f02d9)  # 31
            GG(b, c, d, a, x[12], S24, 0x8d2a4c8a)  # 32

            # Round 3
            HH(a, b, c, d, x[5], S31, 0xfffa3942)  # 33
            HH(d, a, b, c, x[8], S32, 0x8771f681)  # 34
            HH(c, d, a, b, x[11], S33, 0x6d9d6122)  # 35
            HH(b, c, d, a, x[14], S34, 0xfde5380c)  # 36
            HH(a, b, c, d, x[1], S31, 0xa4beea44)  # 37
            HH(d, a, b, c, x[4], S32, 0x4bdecfa9)  # 38
            HH(c, d, a, b, x[7], S33, 0xf6bb4b60)  # 39
            HH(b, c, d, a, x[10], S34, 0xbebfbc70)  # 40
            HH(a, b, c, d, x[13], S31, 0x289b7ec6)  # 41
            HH(d, a, b, c, x[0], S32, 0xeaa127fa)  # 42
            HH(c, d, a, b, x[3], S33, 0xd4ef3085)  # 43
            HH(b, c, d, a, x[6], S34, 0x4881d05)  # 44
            HH(a, b, c, d, x[9], S31, 0xd9d4d039)  # 45
            HH(d, a, b, c, x[12], S32, 0xe6db99e5)  # 46
            HH(c, d, a, b, x[15], S33, 0x1fa27cf8)  # 47
            HH(b, c, d, a, x[2], S34, 0xc4ac5665)  # 48

            # Round 4
            II(a, b, c, d, x[0], S41, 0xf4292244)  # 49
            II(d, a, b, c, x[7], S42, 0x432aff97)  # 50
            II(c, d, a, b, x[14], S43, 0xab9423a7)  # 51
            II(b, c, d, a, x[5], S44, 0xfc93a039)  # 52
            II(a, b, c, d, x[12], S41, 0x655b59c3)  # 53
            II(d, a, b, c, x[3], S42, 0x8f0ccc92)  # 54
            II(c, d, a, b, x[10], S43, 0xffeff47d)  # 55
            II(b, c, d, a, x[1], S44, 0x85845dd1)  # 56
            II(a, b, c, d, x[8], S41, 0x6fa87e4f)  # 57
            II(d, a, b, c, x[15], S42, 0xfe2ce6e0)  # 58
            II(c, d, a, b, x[6], S43, 0xa3014314)  # 59
            II(b, c, d, a, x[13], S44, 0x4e0811a1)  # 60
            II(a, b, c, d, x[4], S41, 0xf7537e82)  # 61
            II(d, a, b, c, x[11], S42, 0xbd3af235)  # 62
            II(c, d, a, b, x[2], S43, 0x2ad7d2bb)  # 63
            II(b, c, d, a, x[9], S44, 0xeb86d391)  # 64

            #cal.mov(buffer_index, reg.vWinCoord0.xy)
            #cal.mul(buffer_index.x, buffer_index, sixteen)
            #cal.add(buffer_index, buffer_index, reg.v0)
            #cal.mov('o0', buffer_index)

            cal.mov('o0', a)
            cal.mov('o1', b)
            cal.mov('o2', c)
            cal.mov('o3', d)

            xcode.release_register(a)
            xcode.release_register(b)
            xcode.release_register(c)
            xcode.release_register(d)
            for xi in x:
                xcode.release_register(xi)

            #xcode.cache_code()
            #print xcode.render_string

        # The buffers are new on every call, so rebind them each time.
        xcode.set_remote_binding('i0', input_statea)
        xcode.set_remote_binding('i1', input_stateb)
        xcode.set_remote_binding('i2', input_statec)
        xcode.set_remote_binding('i3', input_stated)
        xcode.set_remote_binding('i4', input_block)
        xcode.set_remote_binding('o0', outputa)
        xcode.set_remote_binding('o1', outputb)
        xcode.set_remote_binding('o2', outputc)
        xcode.set_remote_binding('o3', outputd)

        domain = (0, 0, N, N)

        # Only the execute call is timed.
        start_time = time.time()
        proc.execute(xcode, domain)
        end_time = time.time()
        TIME += (end_time - start_time)

        # Add the kernel outputs back into the caller's state arrays.
        for j in range(N):
            for i in range(N):
                src = address_4_2d(i, j)
                dst = (i + j*N)*4
                for k in range(4):
                    parcontext.statea[dst + k] += outputa[src + k]
                    parcontext.stateb[dst + k] += outputb[src + k]
                    parcontext.statec[dst + k] += outputc[src + k]
                    parcontext.stated[dst + k] += outputd[src + k]
    finally:
        for buf in remote_buffers:
            proc.free_remote(buf)