def _load_buffer(self):
    """Emit the code that starts the DMA get for the stream buffer.

    The get is guarded by a conditional branch so that it is not performed
    the last time through the loop: the generated code compares
    ``self.r_stop`` with ``self.r_count`` and, when they are equal, jumps
    to a label placed immediately after the transfer.
    """
    # TODO - AWF - some optimization is possible here.
    # Rather than skipping around the DMA get on the last iteration, short
    # out of the loop completely; that saves doing the check twice.
    # Also, as soon as this first check has been done we know we are going
    # through the loop again, so there is no need for a second conditional
    # at the end: just increment the counters and always branch.  A hint
    # could be added right before the DMA get.
    code = self.code
    prgm = code.prgm

    # Branch target that sits just past the DMA get
    past_get = prgm.get_unique_label("STREAM_BUFFER_SKIP")

    # Scratch register holding the result of (r_stop == r_count)
    r_last = prgm.acquire_register()
    code.add(spu.ceq(r_last, self.r_stop, self.r_count))

    # Don't perform the load the last time through the loop
    code.add(spu.brnz(r_last, past_get))

    # Start the DMA get for the current position of the range
    dma.mfc_get(code, self.ls, syn_range.get_current(self),
                self.buffer_size, self.tag)

    # Landing point for the skip branch; the scratch register is free again
    code.add(past_get)
    prgm.release_register(r_last)
def TestSPUIter():
    """Round-trip an array through SPU local store, doubling every word.

    Builds a program that DMAs a 32-element unsigned-int array into local
    store address 0, walks it with ``syn_iter`` adding each loaded value to
    itself, DMAs the result back to the same main-memory address, executes
    the program, and asserts that every element is twice its original
    value.
    """
    n_words = 32
    n_bytes = n_words * 4          # 'I' elements, four bytes each
    data = extarray.extarray('I', range(n_words))

    prgm = env.Program()
    code = prgm.get_stream()

    # Registers for the DMA parameters (acquisition order is preserved)
    r_ea_data = prgm.acquire_register()
    r_ls_data = prgm.acquire_register()
    r_size = prgm.acquire_register()
    r_tag = prgm.acquire_register()

    # Effective address of the array, transfer size in bytes, tag 12 and
    # local store address 0
    util.load_word(code, r_ea_data, data.buffer_info()[0])
    util.load_word(code, r_size, n_bytes)
    code.add(spu.ai(r_tag, code.r_zero, 12))
    code.add(spu.ai(r_ls_data, code.r_zero, 0))

    # Pull the data into local store address 0 and wait on tag 12
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << 12)
    dma.mfc_read_tag_status_all(code)

    # Double the values in place, stepping through local store with an
    # SPU iterator (byte extent n_bytes, step 16)
    current = var.SignedWord(0, code)
    for lsa in syn_iter(code, n_bytes, 16):
        code.add(spu.lqx(current, code.r_zero, lsa))
        current.v = current + current
        code.add(spu.stqx(current, code.r_zero, lsa))

    # Push the values back to main memory under tag 13 and wait on it
    code.add(spu.ai(r_tag, code.r_zero, 13))
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << 13)
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    for reg in (r_ea_data, r_ls_data, r_size, r_tag):
        prgm.release_register(reg)

    # For debugging, a stop can be added here: code.add(spu.stop(0xA))

    # Execute the code
    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm)

    # Every element must now be twice its initial value
    for i in range(n_words):
        assert data[i] == i + i
# Registers for the four DMA parameters, acquired in this order
r_lsa, r_mma, r_size, r_tag = (
    prgm.acquire_register(),  # Local Store address
    prgm.acquire_register(),  # Main Memory address
    prgm.acquire_register(),  # Size in bytes
    prgm.acquire_register(),  # DMA Tag
)

# ---- GET: array a in main memory -> local store ----
# NOTE(review): the spu.il(...) calls below are not passed to code.add();
# presumably an active instruction stream has been set elsewhere -- confirm.
abi = a.buffer_info()
spu.il(r_lsa, 0x1000)                 # Local Store address 0x1000
load_word(code, r_mma, abi[0])        # Main Memory address of array a
spu.il(r_size, a.itemsize * abi[1])   # Size of array a in bytes
spu.il(r_tag, 12)                     # DMA tag 12

# Issue the DMA GET command
dma.mfc_get(code, r_lsa, r_mma, r_size, r_tag)

# Wait for completion: r_tag is reused to hold the completion mask, which
# selects tag 12
spu.il(r_tag, 1 << 12)
dma.mfc_write_tag_mask(code, r_tag)
dma.mfc_read_tag_status_all(code)

# ---- PUT: parameters for local store -> array b in main memory ----
# (only the parameters are set up within these lines)
bbi = b.buffer_info()
spu.il(r_lsa, 0x1000)                 # Local Store address 0x1000
load_word(code, r_mma, bbi[0])        # Main Memory address of array b
spu.il(r_size, b.itemsize * bbi[1])   # Size of array b in bytes
spu.il(r_tag, 12)                     # DMA tag 12 (replaces the mask value)
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True):
    """Double every word of ``data`` buffer-by-buffer using a parallel iterator.

    For each buffer produced by ``parallel(syn_range(...))`` the generated
    code DMAs the buffer into local store address 0, adds each loaded value
    to itself while counting iterations, stores the counter at local store
    offset 0, and DMAs the buffer back to the address it came from.

    Parameters:
      data        -- object providing ``buffer_info()``; element 0 of the
                     result is used as the main-memory start address.
      size        -- number of 4-byte words to process (``size * 4`` is the
                     byte extent of the range).
      n_spus      -- forwarded to ``proc.execute`` as ``n_spus``.
      buffer_size -- number of 4-byte words moved per DMA transfer.
      run_code    -- when false, the synthesized stream is returned without
                     being executed.

    Returns the wall-clock execution time in seconds (``time.time()``
    difference), or the instruction stream when ``run_code`` is false.
    """
    import time

    buffer_bytes = buffer_size * 4
    base_ea = data.buffer_info()[0]

    code = env.ParallelInstructionStream()

    # Working registers (acquisition order is preserved)
    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    # Load the effective address, bumped by 8 when it is not a multiple of 16.
    # NOTE(review): r_ea_data is not read again in this function; the range
    # below starts from the unadjusted address -- confirm this is intended.
    if base_ea % 16 == 0:
        aligned_ea = base_ea
    else:
        aligned_ea = base_ea + 8
    util.load_word(code, r_ea_data, aligned_ea)

    ea_start = base_ea

    # Iterate over each buffer
    for ea in parallel(syn_range(code, ea_start, ea_start + size * 4, buffer_bytes)):
        # Transfer size in bytes, tag 12 and local store address 0
        util.load_word(code, r_size, buffer_bytes)
        code.add(spu.ai(r_tag, r_zero, 12))
        code.add(spu.ai(r_ls_data, r_zero, 0))

        # Pull this buffer into local store address 0 and wait on tag 12
        dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)
        dma.mfc_write_tag_mask(code, 1 << 12)
        dma.mfc_read_tag_status_all(code)

        # Double the values in place with an SPU iterator, counting the
        # iterations as we go
        current = var.SignedWord(0, code)
        count = var.SignedWord(0, code)
        for lsa in syn_iter(code, buffer_bytes, 16):
            code.add(spu.lqx(current, r_zero, lsa))
            current.v = current + current
            code.add(spu.stqx(current, r_zero, lsa))
            count.v = count + 1

        # Store the iteration counter
        code.add(spu.stqx(count, r_zero, 0))

        current.release_registers(code)

        # Push the buffer back to main memory under tag 13 and wait on it.
        # NOTE(review): the get above passes `ea` while the put passes
        # `ea.reg`; kept exactly as found.
        code.add(spu.ai(r_tag, r_zero, 13))
        dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)
        dma.mfc_write_tag_mask(code, 1 << 13)
        dma.mfc_read_tag_status_all(code)
    # /for ea address

    # Cleanup
    for reg in (r_zero, r_ea_data, r_ls_data, r_size, r_tag):
        code.release_register(reg)

    if not run_code:
        return code

    # For debugging, a stop can be added here: code.add(spu.stop(0xA))

    # Execute the code
    proc = env.Processor()

    def print_blocks():
        # Debug helper (not called): prints one element per buffer on a
        # single line.  The prints are parenthesized so they parse under
        # both Python 2 and 3; the trailing comma is the Python 2
        # "no newline" form.
        for i in range(0, size, buffer_size):
            print(data[i + buffer_size]),
        print('')

    started = time.time()
    proc.execute(code, n_spus = n_spus)
    elapsed = time.time() - started

    return elapsed