def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async=True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert (result == (i + 0x10)) proc.join(spe_id) return
def TestStreamBufferDouble(n_spus = 1):
    """Double every element of a 2048-word array via a double-buffered stream.

    The array is streamed through the SPU with stream_buffer in 'double'
    buffer mode; each quadword of every buffer is loaded, added to itself
    and stored back.  With n_spus > 1 a ParallelProgram is used and the
    stream is wrapped in parallel().  The host then asserts a[i] == i + i.
    """
    n_words = 2048
    data = extarray.extarray('I', range(n_words))
    buffer_size = 32

    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    word = var.SignedWord(0, code)

    stream = stream_buffer(code, data.buffer_info()[0], n_words * 4,
                           buffer_size, 0, buffer_mode='double', save = True)
    if n_spus > 1:
        stream = parallel(stream)

    for buf in stream:
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(word, lsa, buf))
            word.v = word + word
            code.add(spu.stqx(word, lsa, buf))

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for idx in range(len(data)):
        assert (data[idx] == idx + idx)
def TestFloatArray():
    """Add two 4-element float vectors on the SPU and reduce them to a scalar.

    The element-wise sum of x and y is accumulated into the fp return
    register one lane at a time (the sum vector is rotated by 4 bytes
    between additions).  The value returned by the processor in 'fp' mode
    is compared against the same reduction computed on the host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])

    lane_sums = SingleFloat(0.0)
    lane_sums.v = spu.fa.ex(x, y)

    acc = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)
    for _ in range(4):
        acc.v = spu.fa.ex(lane_sums, acc)
        spu.rotqbyi(lane_sums, lane_sums, 4)

    prgm.add(code)
    result = env.Processor().execute(prgm, mode='fp')

    # Host-side reference, using single-precision storage like the SPU.
    x_ref = array.array('f', [1.0, 2.0, 3.0, 4.0])
    y_ref = array.array('f', [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_val, y_val in zip(x_ref, y_ref):
        expected += x_val + y_val

    assert (result == expected)
def TestVecIter(n_spus = 1):
    """Double a 1024-word array using spu_vec_iter over streamed buffers.

    Each buffer produced by stream_buffer is walked with spu_vec_iter over
    a memory_desc('i', 0, buffer_size); every vector it yields is added to
    itself.  With n_spus > 1 a ParallelProgram is used and the stream is
    wrapped in parallel().  The host asserts a[i] == i + i afterwards.
    """
    n_words = 1024
    data = extarray.extarray('I', range(n_words))
    buffer_size = 16

    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    # Created before the stream, as in the original setup; the loop
    # variable below rebinds this name to the iterator's vectors.
    vec = var.SignedWord(0, code)

    stream = stream_buffer(code, data.buffer_info()[0], n_words * 4,
                           buffer_size, 0, save = True)
    if n_spus > 1:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)

    for buf in stream:
        for vec in spu_vec_iter(code, md):
            vec.v = vec + vec

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for idx in range(n_words):
        assert (data[idx] == idx + idx)
def TestAll():
    """Smoke-test the extended instruction helpers.

    Synthesizes one call to each of shr, cneq, cge, cgei, lt, lti,
    a_immediate (twice) and sf_immediate on freshly acquired registers,
    prints the generated code and executes it.  No results are checked.
    """
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    r_lhs = code.prgm.acquire_register()
    r_rhs = code.prgm.acquire_register()
    r_out = code.prgm.acquire_register()

    # Register/register forms.
    shr(r_out, r_lhs, r_rhs)
    cneq(r_out, r_lhs, r_rhs)
    cge(r_out, r_lhs, r_rhs)
    # Register/immediate and mixed forms.
    cgei(r_out, r_lhs, 10)
    lt(r_out, r_lhs, r_rhs)
    lti(r_out, r_lhs, 10)

    a_immediate(r_out, r_lhs, 10)
    a_immediate(r_out, r_lhs, 10000)
    sf_immediate(r_out, r_lhs, 10000)

    prgm.add(code)
    prgm.print_code()

    env.Processor().execute(prgm)
def SimpleSPU():
    """Run two tiny SPU programs as a sanity check.

    The first computes 11 + 31 in the integer return register and stops
    with 0x100A when the sum equals 42 (0x100B otherwise); the returned
    value and stop code are asserted.  The second loads 3.14 into the
    float return register and prints whatever the processor returns.
    """
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()
    spu.set_active_code(code)

    r_sum = code.gp_return
    r_test = prgm.acquire_register(reg_name=55)

    spu.xor(r_sum, r_sum, r_sum)    # r_sum = 0
    spu.ai(r_sum, r_sum, 11)        # r_sum += 11
    spu.ai(r_sum, r_sum, 31)        # r_sum += 31
    spu.ceqi(r_test, r_sum, 42)     # r_test = (r_sum == 42)

    # When the comparison failed (all zeros) branch over stop(0x100A).
    spu.brz(r_test, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    prgm.add(code)
    prgm.print_code(hex=True)

    ret = proc.execute(prgm, mode='int', stop=True, debug=True)
    assert (ret[0] == 42)
    assert (ret[1] == 0x100A)

    # Second program: return a float constant.
    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    util.load_float(code, code.fp_return, 3.14)

    prgm.add(code)
    prgm.print_code(hex=True)

    ret = proc.execute(prgm, mode='fp')
    print(ret)
def fb_draw():
    """Continuously draw into the framebuffer, alternating between two programs.

    Two FBDraw instances are synthesized into separate programs with their
    buffer addresses swapped (0/1 and 1/0).  The loop executes one
    program, waits for vsync, flips to the corresponding buffer, then does
    the same with the other program.

    The loop never terminates on its own; it ends only when an exception
    (e.g. KeyboardInterrupt) propagates.  The framebuffer is closed in a
    ``finally`` clause so it is released in that case.
    """
    prgm0 = synspu.Program()
    prgm1 = synspu.Program()
    code0 = prgm0.get_stream()
    code1 = prgm1.get_stream()
    prgm0 += code0
    prgm1 += code1

    proc = synspu.Processor()

    fb = cell_fb.framebuffer()
    cell_fb.fb_open(fb)
    try:
        draw0 = FBDraw()
        draw0.set_buffers(cell_fb.fb_addr(fb, 0), cell_fb.fb_addr(fb, 1))
        draw0.set_stride(fb.stride)
        draw0.synthesize(code0)

        # Same drawing code with the two buffers exchanged.
        draw1 = FBDraw()
        draw1.set_buffers(cell_fb.fb_addr(fb, 1), cell_fb.fb_addr(fb, 0))
        draw1.set_stride(fb.stride)
        draw1.synthesize(code1)

        while True:
            # cell_fb.fb_clear(fb, 0)
            proc.execute(prgm0)
            cell_fb.fb_wait_vsync(fb)
            cell_fb.fb_flip(fb, 0)

            # cell_fb.fb_clear(fb, 1)
            proc.execute(prgm1)
            cell_fb.fb_wait_vsync(fb)
            cell_fb.fb_flip(fb, 1)
    finally:
        # Previously unreachable after the infinite loop.
        cell_fb.fb_close(fb)
def RunTest(test):
    """Synthesize, print and execute the instructions emitted by ``test``.

    A fresh program and stream are created and made the active SPU code,
    ``test()`` is called with no arguments so it can emit instructions,
    then the program is printed and run on a new Processor.
    """
    import corepy.arch.spu.platform as spu_platform

    prgm = spu_platform.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    test()

    prgm.add(code)
    prgm.print_code()

    spu_platform.Processor().execute(prgm)
def TestStreamBufferSingle(n_spus = 1):
    """Double every element of a 1024-word array via stream_buffer.

    stream_buffer is called without a buffer_mode argument and with
    128-byte buffers.  Each quadword of every buffer is loaded, added to
    itself and stored back.  With n_spus > 1 a ParallelProgram is used and
    the stream is wrapped in parallel().  The host asserts a[i] == i + i.
    """
    n_words = 1024
    data = extarray.extarray('I', range(n_words))
    buffer_size = 128

    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    word = var.SignedWord(0, code)

    stream = stream_buffer(code, data.buffer_info()[0], n_words * 4,
                           buffer_size, 0, save = True)
    if n_spus > 1:
        stream = parallel(stream)

    for buf in stream:
        # syn_iter steps the local-store offset through the buffer in
        # 16-byte increments.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(word, lsa, buf))
            word.v = word + word
            code.add(spu.stqx(word, lsa, buf))

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for idx in range(n_words):
        assert (data[idx] == idx + idx)
def GenerateStream(self, step=None):
    """Build an instruction stream from the text in the editor control.

    Each non-blank, non-comment ('#') line is either a label (it ends with
    ':') or an SPU instruction expression evaluated as ``spu.<line>``.

    step -- when not None, only line ``step`` is synthesized as written;
            every other instruction line becomes ``stop(0x2FFF)`` (labels
            are still emitted).  Lines with a breakpoint set are also
            replaced by ``stop(0x2FFF)``.

    Returns the generated stream after adding it to a new Program and
    caching the program's code.
    """
    prgm = env.Program()
    # NOTE: this local must stay named ``code`` -- the rewritten Label(...)
    # expressions evaluated below refer to ``code.prgm`` by name.
    code = prgm.get_stream()

    lines = self.editCtrl.GetText().split('\n')

    for i, line in enumerate(lines):
        cmd = line.strip()
        is_blank = cmd == "" or cmd[0] == '#'

        # For the stop case, want all instructions except the current one
        # to be STOP instructions.
        if step is not None and i != step:
            if is_blank:
                continue
            if cmd[-1] == ":":
                code.add(code.prgm.get_label(cmd[:-1]))
            else:
                code.add(spu.stop(0x2FFF))
            continue

        if self.editCtrl.IsBreakSet(i):
            code.add(spu.stop(0x2FFF))
            continue

        if is_blank:
            continue

        inst = None
        if cmd[-1] == ":":
            # Label - better parsing?
            inst = code.prgm.get_label(cmd[:-1])
        else:
            # Instruction: rewrite Label(name) into a label lookup first.
            strcmd = re.sub(r"Label\((.*?)\)", "code.prgm.get_label('\\1')", cmd)
            try:
                # SECURITY: eval() runs arbitrary text from the editor;
                # only acceptable because the user types it themselves.
                inst = eval('spu.%s' % strcmd)
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                print('Error creating instruction: %s' % cmd)

        # NOTE(review): after a failed eval ``inst`` is still None here and
        # is passed to code.add() as before -- confirm what add() does
        # with None.
        code.add(inst)

    prgm.add(code)
    prgm.cache_code()
    return code
def TestContinueLabel(n_spus = 1):
    """Check that branching to a vector iterator's continue label works.

    Every vector is doubled, compared against 4, and the comparison bits
    are gathered; when the gathered result is zero the code branches to
    the iterator's continue_label, skipping a second doubling.  The host
    expects elements 0..3 to end up multiplied by four and all others by
    two.
    """
    n_words = 1024
    data = extarray.extarray('I', range(n_words))
    buffer_size = 16

    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    vec = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code, data.buffer_info()[0], n_words * 4,
                           buffer_size, 0, save = True)
    if n_spus > 1:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buf in stream:
        for vec in lsa_iter:
            vec.v = vec + vec

            test.v = (vec == four)
            code.add(spu.gbb(test, test))
            # No match: jump straight to the next iteration.
            code.add(spu.brz(test.reg, lsa_iter.continue_label))

            vec.v = vec + vec

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for idx in range(n_words):
        if idx < 4:
            assert (data[idx] == idx * 4)
        else:
            assert (data[idx] == idx + idx)
def TestFloatScalar():
    """Add two scalar SingleFloat values on the SPU and check the result.

    1.0 and 2.0 are added with ``fa`` into a variable bound to the fp
    return register; the value returned by executing in 'fp' mode must
    equal 1.0 + 2.0.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    lhs = SingleFloat(1.0)
    rhs = SingleFloat(2.0)
    out = SingleFloat(0.0, reg=code.fp_return)

    out.v = spu.fa.ex(lhs, rhs)

    prgm.add(code)
    result = env.Processor().execute(prgm, mode='fp')

    assert (result == (1.0 + 2.0))
def _startSPU(self): self.ctx = ctx = env.spu_exec.alloc_context() # Execute a no-op instruction stream so the prolog is executed prgm = env.Program() code = prgm.get_stream() code.add(spu.nop(code.r_zero)) prgm.cache_code() itemsize = prgm.render_code.itemsize code_len = len(prgm.render_code) * itemsize if code_len % 16 != 0: code_len += 16 - (code_len % 16) code_lsa = 0x40000 - code_len env.spu_exec.run_stream(ctx, prgm.inst_addr(), code_len, code_lsa, code_lsa) self.localstore = extarray.extarray('I', 262144 / 4) print "spuls %x" % (ctx.spuls), ctx.spuls, type(ctx.spuls) self.localstore.set_memory(ctx.spuls, 262144) return
def SpeedTest(n_spus=6, n_floats=6):
    """
    Get a rough estimate of the maximum flop count.

    Synthesizes two nested syn_iter loops (2**12 by 2**16 iterations)
    around a body of 128 unrolled rounds of ``n_floats`` independent fma
    instructions, runs it on ``n_spus`` SPUs and prints the elapsed time
    and the resulting GFlops figure (each fma is counted as 2 operations
    on 4 SIMD lanes).  On a PS3 using all 6 spus, this is 152 GFlops.
    """
    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    f_range = range(n_floats)
    # Four banks of float variables, created bank by bank: a, b, c, t.
    a, b, c, t = [[SingleFloat(0.0) for _ in f_range] for _ in range(4)]

    outer = 2**12
    inner = 2**16
    unroll = 128
    fuse = 2
    simd = 4

    for _outer_idx in syn_iter(code, outer):
        for _inner_idx in syn_iter(code, inner):
            for _ in xrange(unroll):
                for dst, f_a, f_b, f_c in zip(t, a, b, c):
                    dst.v = spu.fma.ex(f_a, f_b, f_c)

    # Run the synthetic program
    # TODO - AWF - use the SPU decrementers to time this
    proc = env.Processor()
    prgm += code

    start = time.time()
    proc.execute(prgm, n_spus=n_spus)
    stop = time.time()
    total = stop - start

    n_ops = (long(outer) * inner * long(unroll) * long(n_floats) *
             long(fuse) * long(simd) * long(n_spus))
    print('%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9))

    # Recorded measurements, kept for reference only.
    results = """ --> No optimizations Executing native code: a.out 14.805322 sec, 20.89 GFlops --> Synthetic Platform: linux.spre_linux_spu no raw data 65.023350 sec, 152.19 GFlops --> -O3 (fuse: 2, simd: 4) Executing native code: a.out 7.407939 sec, 41.74 GFlops --> -O3 (fuse: 1, simd: 1) Executing native code: a.out 7.403702 sec, 5.22 GFlops """
def TestSPUIter():
    """Round-trip a 32-word array through local store, doubling it with syn_iter.

    The array is fetched to local-store address 0 with mfc_get (tag 12),
    each quadword is added to itself inside a syn_iter loop, and the
    result is written back with mfc_put (tag 13).  The host asserts
    data[i] == i + i.
    """
    size = 32
    n_bytes = size * 4
    data = extarray.extarray('I', range(size))

    prgm = env.Program()
    code = prgm.get_stream()

    r_ea = prgm.acquire_register()
    r_lsa = prgm.acquire_register()
    r_nbytes = prgm.acquire_register()
    r_tag = prgm.acquire_register()

    # Effective address of the host array and the transfer size in bytes
    util.load_word(code, r_ea, data.buffer_info()[0])
    util.load_word(code, r_nbytes, n_bytes)

    # Tag 12, local-store address 0
    code.add(spu.ai(r_tag, code.r_zero, 12))
    code.add(spu.ai(r_lsa, code.r_zero, 0))

    # Fetch the data and wait for the tag-12 transfer to complete
    dma.mfc_get(code, r_lsa, r_ea, r_nbytes, r_tag)
    dma.mfc_write_tag_mask(code, 1<<12)
    dma.mfc_read_tag_status_all(code)

    # Double each quadword in place
    word = var.SignedWord(0, code)
    for lsa in syn_iter(code, n_bytes, 16):
        code.add(spu.lqx(word, code.r_zero, lsa))
        word.v = word + word
        code.add(spu.stqx(word, code.r_zero, lsa))

    # Write the values back to main memory using tag 13 and wait for it
    code.add(spu.ai(r_tag, code.r_zero, 13))
    dma.mfc_put(code, r_lsa, r_ea, r_nbytes, r_tag)
    dma.mfc_write_tag_mask(code, 1<<13)
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    for reg in (r_ea, r_lsa, r_nbytes, r_tag):
        prgm.release_register(reg)

    # Execute the code
    prgm.add(code)
    env.Processor().execute(prgm)

    for idx in range(size):
        assert (data[idx] == idx + idx)
def generate(self, results, patterns, r1_range, r2_range, max_init, max_n, size, n_spus=6): # Connect to the framebuffer fb = cell_fb.framebuffer() cell_fb.fb_open(fb) # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = [0 for i in range(n_spus)] #a_ranges = [0 for i in range(n_spus)] # Slice and dice for parallel execution spu_slices = [[size[0], size[1] / n_spus] for ispu in range(n_spus)] spu_slices[-1][1] += size[1] % n_spus offset = 0.0 for ispu in range(n_spus): ranges[ispu] = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[ispu][ i] = r1_range[0] + float(i) * r1_inc # horizontal is simd ranges[ispu][4 + i] = r2_range[0] + offset ranges[ispu][8 + i] = r1_inc * 4.0 ranges[ispu][12 + i] = r2_inc # print ranges # Copy the paramters to aligned buffers #a_ranges[ispu] = synspu.aligned_memory(len(ranges[ispu]), typecode='I') #a_ranges[ispu].copy_to(ranges[ispu].buffer_info()[0], len(ranges[ispu])) offset += r2_inc * spu_slices[ispu][1] # Setup the pattern vector for pattern in patterns: if len(pattern) != len(patterns[0]): raise Exception('All patterns must be the same length') bits = [_pattern2vector(pattern) for pattern in patterns] #a_pattern = synspu.aligned_memory(len(bits[0]), typecode='I') pattern = extarray.extarray('I', len(bits[0])) # Create the instruction streams prgms = [] n = len(patterns) * 10 offset = 0 for ispu in range(n_spus): renderer = FBRenderer() renderer.set_lsa(0x100) renderer.set_addr(cell_fb.fb_addr(fb, 0) + offset) renderer.set_width(size[0]) renderer.set_stride(fb.stride) ly_block = LyapunovBlock() ly_block.set_size(*spu_slices[i]) #ly_block.set_range(a_ranges[ispu]) ly_block.set_range(ranges[ispu]) #ly_block.set_pattern(a_pattern) ly_block.set_pattern(pattern) ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) prgm = synspu.Program() code = env.get_stream() prgm += code # code.set_debug(True) 
prgms.append(prgm) offset += spu_slices[i][1] * fb.stride * 4 # for i in spuiter.syn_range(code, n): ly_block.synthesize(code) # code.print_code() proc = synspu.Processor() cell_fb.fb_clear(fb, 0) import time ids = [0 for i in range(n_spus)] start = time.time() ipattern = 0 n_patterns = len(patterns) len_bits = len(bits[0]) pattern_inc = 1 for i in range(n): #a_pattern.copy_to(bits[ipattern].buffer_info()[0], len_bits) # TODO - better/faster for j in xrange(0, len_bits): pattern[j] = bits[ipattern][j] for ispu in range(n_spus): ids[ispu] = proc.execute(prgms[ispu], async=True) for ispu in range(n_spus): proc.join(ids[ispu]) cell_fb.fb_wait_vsync(fb) cell_fb.fb_flip(fb, 0) ipattern += pattern_inc if (ipattern == (n_patterns - 1)) or (ipattern == 0): pattern_inc *= -1 print ipattern stop = time.time() print '%.2f fps (%.6f)' % (float(n) / (stop - start), (stop - start)) cell_fb.fb_close(fb) return
def generate(self, results, pattern, r1_range, r2_range, max_init, max_n, size): # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[i] = r1_range[0] ranges[4 + i] = r2_range[0] ranges[8 + i] = r1_inc ranges[12 + i] = r2_inc # Setup the pattern vector bits = _pattern2vector(pattern) # Copy the paramters to aligned buffers #a_ranges = synspu.aligned_memory(len(ranges), typecode='I') #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges)) #a_pattern = synspu.aligned_memory(len(bits), typecode='I') #a_pattern.copy_to(bits.buffer_info()[0], len(bits)) renderer = MailboxRenderer() ly_block = LyapunovBlock() ly_block.set_size(size[0], size[1]) #ly_block.set_range(a_ranges) #ly_block.set_pattern(a_pattern) ly_block.set_range(ranges) ly_block.set_pattern(bits) ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) prgm = synspu.Program() code = prgm.get_stream() prgm += code ly_block.synthesize(code) proc = synspu.Processor() spe_id = proc.execute(prgm, async=True) for i in range(size[0] * size[1]): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) # for x in range(size[0]): # r2 = r2_range[0] + r2_inc # print 'col:', x, r1, r2 # for y in range(size[1]): # results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n) # r2 += r2_inc # r1 += r1_inc return
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import corepy.lib.extarray as extarray import corepy.arch.spu.isa as spu import corepy.arch.spu.platform as env import corepy.arch.spu.lib.dma as dma from corepy.arch.spu.lib.util import load_word import time if __name__ == '__main__': ITERS = 500000 #ITERS = 15 prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) psmap = extarray.extarray('I', 131072 / 4) data = extarray.extarray('I', range(0, 16)) r_sum = prgm.gp_return r_cnt = prgm.acquire_register() spu.xor(r_sum, r_sum, r_sum) load_word(code, r_cnt, ITERS) lbl_loop = prgm.get_label("loop") code.add(lbl_loop)
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = prgm.gp_return test = prgm.acquire_register() lbl_brz = prgm.get_label("BRZ") lbl_skip = prgm.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode='int', stop=True) print "ret", r assert (r[0] == 42) assert (r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) lbl_loop = prgm.get_label("LOOP") lbl_break = prgm.get_label("BREAK") r_cnt = prgm.acquire_register() r_stop = prgm.acquire_register() r_cmp = prgm.acquire_register() r_foo = prgm.gp_return spu.ori(r_foo, prgm.r_zero, 0) spu.ori(r_cnt, prgm.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode='int', stop=True) print "ret", r assert (r[0] == 55) return