Ejemplo n.º 1
0
def TestSetSlotValue():
    """Check set_slot_value() against all four word slots of one register.

    Slots 0 and 2 are set from immediates (0x10, 0x12) and slots 1 and 3 from
    SignedWord variables holding 0x11 and 0x13.  The synthesized code then
    writes the register to the outbound mailbox channel four times, issuing
    ``rotqbyi`` by 4 bytes after each write, and the host asserts that the
    values read back are 0x10, 0x11, 0x12, 0x13 in that order.
    """
    import corepy.arch.spu.platform as synspu
    import corepy.arch.spu.types.spu_types as var
    import corepy.arch.spu.lib.dma as dma

    prgm = synspu.Program()
    code = prgm.get_stream()
    proc = synspu.Processor()
    spu.set_active_code(code)
    a = var.SignedWord(0x11)
    b = var.SignedWord(0x13)
    # The initial value differs from every value expected in the slots.
    r = var.SignedWord(0xFFFFFFFF)

    set_slot_value(code, r, 0, 0x10)
    set_slot_value(code, r, 1, a)
    set_slot_value(code, r, 2, 0x12)
    set_slot_value(code, r, 3, b)

    # This Python loop runs at synthesis time, so the write/rotate pair is
    # emitted four times.
    for i in range(4):
        spu.wrch(r, dma.SPU_WrOutMbox)
        spu.rotqbyi(r, r, 4)

    prgm.add(code)
    # ``async`` is a reserved word from Python 3.7 on, so writing the keyword
    # literally is a SyntaxError there.  Unpacking a dict passes exactly the
    # same keyword argument and behaves identically on Python 2.
    spe_id = proc.execute(prgm, **{'async': True})

    for i in range(4):
        # Busy-wait until the SPE has placed a word in its outbound mailbox.
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        result = synspu.spu_exec.read_out_mbox(spe_id)
        assert (result == (i + 0x10))

    proc.join(spe_id)

    return
Ejemplo n.º 2
0
  def setup(self):
    """
    Create the SPU variables used by this object, then set up the save op.

    Creates `_count` and `_save_value` (both initialised to 0) and
    `_word_mask`, whose first word is 0xFFFFFFFF and whose other three words
    are zero.  If a save operation is attached, its own setup() is called.
    """
    self._count = var.SignedWord(0)
    self._save_value = var.SignedWord(0)

    # Only the first of the four words is set.
    first_word_only = array.array('I', [0xFFFFFFFF, 0, 0, 0])
    self._word_mask = var.SignedWord(first_word_only)

    save_op = self._save_op
    if save_op is None:
      return

    save_op.setup()
    return
Ejemplo n.º 3
0
def TestStreamBufferDouble(n_spus = 1):
  """
  Double every word of a 2048-element array through a stream_buffer created
  with buffer_mode='double', then verify the result on the host.

  n_spus -- number of SPUs to execute on; values above 1 select the parallel
            program type and wrap the stream with parallel().
  """
  n = 2048
  buffer_size = 32
  a = extarray.extarray('I', range(n))

  use_parallel = n_spus > 1
  prgm = env.ParallelProgram() if use_parallel else env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                         buffer_mode='double', save = True)
  if use_parallel:
    stream = parallel(stream)

  for buf in stream:
    # Walk the buffer 16 bytes at a time: load, add to itself, store back.
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buf))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buf))

  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm, n_spus = n_spus)

  # Every element started as its own index, so it must now be twice that.
  for i in range(len(a)):
    assert a[i] == i + i

  return
Ejemplo n.º 4
0
def TestVecIter(n_spus = 1):
  """
  Double every word of a 1024-element array using spu_vec_iter over each
  buffer delivered by stream_buffer, then verify the result on the host.

  n_spus -- number of SPUs to execute on; values above 1 select the parallel
            program type and wrap the stream with parallel().
  """
  n = 1024
  buffer_size = 16
  a = extarray.extarray('I', range(n))

  use_parallel = n_spus > 1
  prgm = env.ParallelProgram() if use_parallel else env.Program()
  code = prgm.get_stream()

  # This name is rebound by the inner loop below; the variable is still
  # created here so the synthesized code stays the same.
  current = var.SignedWord(0, code)

  stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                         save = True)
  if use_parallel:
    stream = parallel(stream)

  md = memory_desc('i', 0, buffer_size)

  for buf in stream:
    for current in spu_vec_iter(code, md):
      current.v = current + current

  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm, n_spus = n_spus)

  # Every element started as its own index, so it must now be twice that.
  for i in range(n):
    assert a[i] == i + i

  return
Ejemplo n.º 5
0
  def synthesize(self, code):
    """
    Emit code that fills one line with a constant colour and puts it to the
    first frame buffer once per row.

    A 256-pixel line (4 bytes per pixel) is stored 16 bytes at a time with
    stqx, then a memory descriptor whose address register is `addr` issues a
    put inside a syn_iter(code, 128) loop, advancing `addr` by `self.stride`
    after each put.

    code -- instruction stream to emit into.  It is the active code while
            this method runs; the previously active code is restored on
            exit, including when an exception is raised.

    Raises Exception if `self.buffers` or `self.stride` has not been set.
    """
    # Validate before touching the global active-code state, so that a
    # configuration error cannot leave `code` installed as the active code.
    if self.buffers is None: raise Exception('Please set buffers')
    if self.stride is None: raise Exception('Please set stride')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    try:
      color  = var.SignedWord(0x0F0F0FFF)
      fb0    = var.Word(self.buffers[0])
      # fb1 is not referenced below; it is still created so the synthesized
      # code and register usage stay exactly as before.
      fb1    = var.Word(self.buffers[1])
      stride = var.Word(self.stride)
      addr   = var.Word(0)

      # Draw one line: store the colour at addr + i for every 16-byte step.
      line_pixels = 256
      for i in spuiter.syn_iter(code, line_pixels*4, step = 16):
        spu.stqx(color, addr, i)

      # Transfer the line to the frame buffer.  The descriptor reads its
      # address from addr's register, so updating addr moves the target.
      md_fb = spuiter.memory_desc('I', size = line_pixels)
      md_fb.set_addr_reg(addr.reg)

      addr.v = fb0

      for i in spuiter.syn_iter(code, 128):
        md_fb.put(code, 0)
        addr.v = addr + stride
    finally:
      # Restore the caller's active code on both the normal and error paths.
      spu.set_active_code(old_code)
    return
Ejemplo n.º 6
0
def TestContinueLabel(n_spus=1):
    """Exercise a branch to the iterator's continue label inside a vec iter.

    Each vector is doubled and compared with 4; the comparison result is
    passed through ``gbb`` and a ``brz`` on it targets
    ``lsa_iter.continue_label``.  When the branch is not taken the vector is
    doubled a second time.  The host then expects ``a[i] == i * 4`` for
    ``i < 4`` and ``a[i] == i + i`` for the rest.

    n_spus -- number of SPUs to execute on; values above 1 select the
              parallel instruction stream and wrap the stream with parallel().
    """
    n = 1024
    buffer_size = 16
    a = extarray.extarray('I', range(n))

    use_parallel = n_spus > 1
    if use_parallel:
        code = env.ParallelInstructionStream()
    else:
        code = env.InstructionStream()

    # `current` is rebound by the inner loop below, but is still created here
    # so the synthesized code stays the same.
    current = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if use_parallel:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buf in stream:
        for current in lsa_iter:
            current.v = current + current

            test.v = (current == four)
            code.add(spu.gbb(test, test))
            # Skip the second doubling by jumping to the continue label.
            code.add(spu.brz(test.reg, lsa_iter.continue_label))
            current.v = current + current

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for i in range(n):
        expected = i + i if i >= 4 else i * 4
        assert a[i] == expected
    return
Ejemplo n.º 7
0
 def _inc_ea(self):
   """
   Advance the ea/count register by one step (used for double buffering).

   The step is taken from the step register when one has been allocated
   (r_step is not None); otherwise the value returned by step_size() is used
   directly.
   """
   if self.r_step is None:
     step = self.step_size()
   else:
     # Wrap the existing step register so it can take part in the expression.
     step = var.SignedWord(code = self.code, reg = self.r_step)

   self.current_count.v = self.current_count + step
   return
Ejemplo n.º 8
0
  def _start_post(self):
    """
    Create the buffer-size, local-store and tag variables, preload the first
    buffer when not in 'single' mode, and emit a fresh start label.
    """
    code = self.code

    # Initialize the buffer size
    self.buffer_size = var.SignedWord(self.ibuffer_size, code)

    # Pick the initial ls/tag values: scalars in 'single' mode, otherwise
    # four-word vectors that alternate between the two buffers/tags.
    single = self.buffer_mode == 'single'
    if single:
      ls_init = self.lsa
      tag_init = 1
    else:
      ls_init = array.array('i', [self.lsa, self.lsb, self.lsa, self.lsb])
      tag_init = array.array('i', [1, 2, 1, 2])

    self.ls  = var.SignedWord(ls_init, code)
    self.tag = var.SignedWord(tag_init, code)

    # Only the non-single (double-buffered) mode loads the first buffer here.
    if not single:
      self._load_buffer()

    # Update the start label (make a new one and add it)
    self.start_label = code.prgm.get_unique_label("STREAM_BUFFER_START")
    code.add(self.start_label)
    return
Ejemplo n.º 9
0
def TestStreamBufferSingle(n_spus = 1):
  """
  Double every word of a 1024-element array through a stream_buffer created
  with its default buffer mode, then verify the result on the host.

  n_spus -- number of SPUs to execute on; values above 1 select the parallel
            program type and wrap the stream with parallel().
  """
  n = 1024
  buffer_size = 128
  a = extarray.extarray('I', range(n))

  use_parallel = n_spus > 1
  prgm = env.ParallelProgram() if use_parallel else env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                         save = True)
  if use_parallel:
    stream = parallel(stream)

  for buf in stream:
    # Walk the buffer 16 bytes at a time: load, add to itself, store back.
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buf))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buf))

  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm, n_spus = n_spus)

  # Every element started as its own index, so it must now be twice that.
  for i in range(n):
    assert a[i] == i + i

  return
Ejemplo n.º 10
0
    def start(self, align=True, branch=True):
        """Emit the code that runs once, before the first loop iteration.

        Acquires the counter register if needed, initialises it (and, in INC
        mode, the stop register), wraps the counter in ``current_count``,
        moves an out-of-range step size into a register, adds the start label
        and creates the branch/continue labels for later placement.

        align  -- accepted but not used in this method.
        branch -- when false in INC mode, no stop register is acquired or
                  loaded here.
        """
        code = self.code

        if self.r_count is None:
            self.r_count = code.acquire_register()

        def init_counter(get_constant):
            # With an external start register, adding 0 to it copies its
            # value into the counter; otherwise load the constant.
            if self._external_start:
                code.add(spu.ai(self.r_count, self.r_start, 0))
            else:
                util.load_word(code, self.r_count, get_constant())

        if self.mode == DEC:
            init_counter(self.get_count)

        elif self.mode == INC:
            if self.r_stop is None and branch:
                self.r_stop = code.acquire_register()

            init_counter(self.get_start)

            if branch and not self._external_stop:
                util.load_word(code, self.r_stop, self.get_count())

        if self.r_count is not None:
            self.current_count = var.SignedWord(code=code, reg=self.r_count)

        # A step size outside (-512, 511) is not used as an immediate value;
        # it is loaded into its own register instead.
        fits_immediate = -512 < self.step_size() < 511
        if not fits_immediate:
            self.r_step = code.acquire_register()
            util.load_word(code, self.r_step, self.step_size())

        # The start label is added to the code right away.
        start_name = "SYN_ITER_START_%d" % random.randint(0, 2**32)
        self.start_label = code.get_label(start_name)
        code.add(self.start_label)

        # The branch and continue labels are only created here so they can be
        # referenced; they are added to the code at their proper locations
        # later.
        branch_name = "SYN_ITER_BRANCH_%d" % random.randint(0, 2**32)
        self.branch_label = code.get_label(branch_name)

        continue_name = "SYN_ITER_CONTINUE_%d" % random.randint(0, 2**32)
        self.continue_label = code.get_label(continue_name)
        return
Ejemplo n.º 11
0
def TestSaveBuffer1():
    """Run SaveBuffer.save_register() in a synthesized loop and dump results.

    A SaveBuffer is set up with a local-store buffer (0, 128) and a
    main-memory buffer backed by a 2**14-element array.  The value 0xCAFEBABE
    is saved n // 4 times on the SPU.  For every iteration the host reads
    three words from the SPE's outbound mailbox and prints them labelled
    'size', 'offset' and 'test', then prints the first ten array elements.
    """
    import array

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    code.set_debug(True)
    spu.set_active_code(code)

    n = 2**14
    # Floor division keeps the iteration count an int on Python 3 too; for
    # these ints it gives the same value as ``n / 4`` does on Python 2.
    n_saves = n // 4
    data = array.array('I', range(n))

    save_buffer = SaveBuffer()

    save_buffer.setup()
    save_buffer.init_ls_buffer(0, 128)
    save_buffer.init_mm_buffer(data.buffer_info()[0], n)

    value = var.SignedWord(0xCAFEBABE)

    for i in spuiter.syn_iter(code, n_saves):
        save_buffer.save_register(value)

    code.print_code()
    spe_id = proc.execute(code, mode='async')

    def read_mbox():
        """Busy-wait for an outbound mailbox entry and return it."""
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        return synspu.spu_exec.read_out_mbox(spe_id)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    for i in range(n_saves):
        print('size: 0x%X' % (read_mbox()))
        print('offset: 0x%X' % (read_mbox()))
        print('test: 0x%X' % (read_mbox()))

    proc.join(spe_id)

    print(data[:10])
    return
Ejemplo n.º 12
0
  def _wait_buffer(self):
    """
    Emit code that writes the tag mask (1 << self.tag) and then reads the tag
    status, releasing the register returned by that read.
    """
    code = self.code

    # TODO - known bug: the mask register is deliberately NOT released here.
    # Releasing it through tag_mask.release_register() sets tag_mask.reg to
    # None, but the variable stays alive because the instructions already
    # added to the code still reference it.  If those instructions are
    # rendered again later (for example by print_code()), the missing
    # register makes that impossible.  Until this is resolved the register
    # is never released.
    tag_mask = var.SignedWord(1, code)
    tag_mask.v = tag_mask << self.tag

    dma.mfc_write_tag_mask(code, tag_mask)
    status_reg = dma.mfc_read_tag_status_all(code)
    code.prgm.release_register(status_reg)

    return
Ejemplo n.º 13
0
  def _transfer_data(self, code, kernel, lsa, tag):
    """
    Synthesize a chunked transfer between main memory and SPU local store.

    Emits into `code` a loop that calls
    kernel(code, ls_addr, ea_load, transfer_size, tag_var) once per
    iteration, advancing both addresses by 16384 bytes each time, and then
    writes the tag mask for `tag` and reads the tag status.

    code   -- spe.InstructionStream to emit into; it is the active code
              while this method runs.
    kernel -- callable that emits the actual transfer; presumably
              dma.mfc_get or dma.mfc_put -- TODO confirm against callers.
    lsa    -- local store address: an int, or an spe.Register / spe.Variable.
    tag    -- DMA tag; used both as a variable value and in `1<<tag`.

    The address and size come from self.r_addr / self.r_size when those are
    set, otherwise from self.addr and self.size * INT_SIZES[self.typecode].

    Raises Exception if `code` or `lsa` has an unsupported type.
    """

    # Check the types
    if not isinstance(code, spe.InstructionStream):
      raise Exception('Code must be an InstructionStream')
    if not (isinstance(lsa, int) or issubclass(type(lsa), (spe.Register, spe.Variable))):
      raise Exception('lsa must be an integer, Register, or Variable')
    
    # Remember the previously active code so it can be restored at the end.
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire registers for address and size, if they were not supplied by the user
    if self.r_addr is None: r_ea_data = code.prgm.acquire_register()
    else:                   r_ea_data = self.r_addr
      
    # NOTE(review): when acquired here, r_size is not referenced again in
    # this method and is not in the release list at the end -- confirm
    # whether this acquisition is needed.
    if self.r_size is None: r_size = code.prgm.acquire_register()
    else:                   r_size = self.r_size

    # Create variables
    ea_addr      = var.SignedWord(reg = r_ea_data)
    aligned_size = var.SignedWord(0)
    # NOTE(review): mod_16's register is not in the release list at the end.
    mod_16       = var.SignedWord(0xF)

    # Initialize the lsa_addr variable.
    if isinstance(lsa, int):
      # From a constant
      ls_addr   = var.SignedWord(lsa)
    elif issubclass(type(lsa), (spe.Register, spe.Variable)):
      # From a variable
      ls_addr   = var.SignedWord()      
      ls_addr.v = lsa
      
      
    tag_var = var.SignedWord(tag)
    cmp = var.SignedWord(0)

    # Load the effective address
    if self.r_addr is None:
      # A misaligned address is only reported; the transfer is still emitted.
      if self.addr % 16 != 0:
        print '[get_memory] Misaligned data'

      util.load_word(code, ea_addr, self.addr)

    # Load the size, rounding up as required to be 16-byte aligned
    if self.r_size is None:
      # Size is known at synthesis time: round it up in Python.
      rnd_size = self.size * var.INT_SIZES[self.typecode]
      if rnd_size < 16:
        rnd_size = 16
      elif (rnd_size % 16) != 0:
        rnd_size += (16 - (rnd_size % 16))
      util.load_word(code, aligned_size, rnd_size)
    else:
      # TODO: !!! UNIT TEST THIS !!!
      # Same as above, but using SPU arithmetic to round
      size  = var.SignedWord(reg = r_size)
      sixteen  = var.SignedWord(16)
      # NOTE(review): this compares the low four bits of size with size
      # itself rather than with zero -- confirm that this is the intended
      # "no rounding needed" test.
      cmp.v = ((size & mod_16) == size)
      aligned_size.v = size + (sixteen - (size & mod_16))
      # Pick between the original and the rounded size based on cmp.
      spu.selb(aligned_size.reg, size.reg, aligned_size.reg, cmp.reg, order = _mi(spu.selb))
      code.release_register(sixteen.reg)

    # Use an auxiliary register for the moving ea value if the
    # caller supplied the address register
    if self.r_addr is not None:
      ea_load   = var.SignedWord(0)
      ea_load.v = ea_addr
    else:
      ea_load = ea_addr # note that this is reference, not .v assignment

    # Transfer parameters
    buffer_size   = var.SignedWord(16384)
    remaining     = var.SignedWord(0)
    transfer_size = var.SignedWord(0)
    remaining.v   = aligned_size

    # Set up the iterators to transfer at most 16k at a time
    xfer_iter = syn_iter(code, 0, 16384)
    xfer_iter.set_stop_reg(aligned_size.reg)

    for offset in xfer_iter:
      # Choose between a full buffer_size chunk and what is left.
      cmp.v = buffer_size > remaining
      spu.selb(transfer_size, buffer_size, remaining, cmp)

      # Transfer the data
      kernel(code, ls_addr, ea_load, transfer_size, tag_var)
      ls_addr.v = ls_addr + buffer_size
      ea_load.v = ea_load + buffer_size

      remaining.v = remaining - buffer_size

    # Set the tag bit to tag
    dma.mfc_write_tag_mask(code, 1<<tag);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Release the registers
    code.release_register(buffer_size.reg)
    code.release_register(remaining.reg)
    code.release_register(aligned_size.reg)    
    code.release_register(transfer_size.reg)
    code.release_register(cmp.reg)
    code.release_register(ls_addr.reg)
    code.release_register(tag_var.reg)
    code.release_register(ea_load.reg)

    # The previous active code is only restored when there was one.
    if old_code is not None:
      spu.set_active_code(old_code)
    return 
Ejemplo n.º 14
0
 def setup(self):
     """
     Create the `ls_buffer` and `mm_buffer` SPU variables, both initialised
     to 0 (ls_buffer first).
     """
     self.ls_buffer = var.SignedWord(0)
     self.mm_buffer = var.SignedWord(0)
Ejemplo n.º 15
0
def TestTanimotoBlock(n_vecs = 4):
  """
  Synthesize and time a TanimotoBlock over a 128 x 64 block.

  The block is configured with n_bits = 128 * n_vecs and a LocalSave op that
  forwards to a MemorySave op backed by a host array.  tb.synthesize() is
  emitted inside a 10000-iteration syn_iter loop, after which the SPU writes
  to its outbound mailbox.  The host measures the time from execute() until
  the mailbox is non-empty and prints throughput figures derived from it.

  n_vecs -- multiplier for the bit-vector length and for the local buffer
            and y-address offsets.
  """
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  spu.set_active_code(code)
  
  tb = TanimotoBlock()
  ls_save = LocalSave()
  mm_save = MemorySave()

  # NOTE(review): set_debug(True) was already called above; this second call
  # looks redundant but is kept to leave the setup sequence unchanged.
  code.set_debug(True)

  # Input block parameters
  m = 128
  n = 64
  n_bits = 128 * n_vecs

  # Main memory results buffer
  max_results = 16384
  words_per_result = 4

  # Every word of the results array starts out as 12.
  mm_results_data = array.array('I', [12] * (max_results * words_per_result))

  mm_results = spuiter.memory_desc('I')
  mm_results.from_array(mm_results_data)

  mm_save.set_md_save_buffer(mm_results)
    
  # Local Results buffer
  buffer_size = var.SignedWord(16384)
  buffer_addr = var.SignedWord(m * n * n_vecs * 4)
  ls_results = spuiter.memory_desc('B')
  ls_results.set_size_reg(buffer_size)
  ls_results.set_addr_reg(buffer_addr)

  ls_save.set_md_results(ls_results)
  ls_save.set_mm_save_op(mm_save)

  # Setup the TanimotoBlock class
  tb.set_n_bits(n_bits)
  tb.set_block_size(m, n)

  tb.set_x_addr(0)
  tb.set_y_addr(m * n_vecs * 16)
  tb.set_save_op(ls_save)

  # Main test loop
  n_samples = 10000
  for samples in spuiter.syn_iter(code, n_samples):
    tb.synthesize(code)

  # Signal the host through the outbound mailbox.
  spu.wrch(buffer_size, dma.SPU_WrOutMbox)
  
  spu.stop(0x2000) 

  # "Function" Calls: emitted after the stop instruction.
  ls_save.block()
  mm_save.block()

  start = time.time()
  # ``async`` is a reserved word from Python 3.7 on, so writing the keyword
  # literally is a SyntaxError there.  Unpacking a dict passes exactly the
  # same keyword argument and behaves identically on Python 2.
  spe_id = proc.execute(code, **{'async': True})
  
  # Busy-wait until the SPE has written to its outbound mailbox.
  while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
  stop = time.time()

  proc.join(spe_id)
  total = stop - start
  bits_sec = (m * n * n_bits * n_samples) / total / 1e9
  ops_per_compare = 48 * 4 + 8  # 48 SIMD instructions, 8 scalar
  insts_per_compare = 56
  gops = (m * n * n_vecs * n_samples * ops_per_compare ) / total / 1e9
  ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9  
  # Single-argument print(...) gives the same output under the Python 2
  # print statement and the Python 3 print function.
  print('%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % (
    total, bits_sec, gops, ginsts, code.size()))
  return
Ejemplo n.º 16
0
def TestSPUIter():
  """
  Get a 32-word array into local store, double each quadword inside a
  syn_iter loop, put it back to main memory and verify the result on the
  host.
  """
  size = 32
  n_bytes = size * 4
  data = extarray.extarray('I', range(size))
  prgm = env.Program()
  code = prgm.get_stream()

  # Registers for the DMA parameters, acquired in this order.
  r_ea_data, r_ls_data, r_size, r_tag = [prgm.acquire_register()
                                         for _ in range(4)]

  # Effective address of the array and transfer size in bytes
  util.load_word(code, r_ea_data, data.buffer_info()[0])
  util.load_word(code, r_size, n_bytes)

  # Tag 12 for the inbound transfer, local store address 0
  code.add(spu.ai(r_tag, code.r_zero, 12))
  code.add(spu.ai(r_ls_data, code.r_zero, 0))

  # Get the data, select tag 12 in the tag mask and read the tag status
  dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
  dma.mfc_write_tag_mask(code, 1 << 12)
  dma.mfc_read_tag_status_all(code)

  # Add every quadword to itself, 16 bytes per iteration
  current = var.SignedWord(0, code)
  for lsa in syn_iter(code, n_bytes, 16):
    code.add(spu.lqx(current, code.r_zero, lsa))
    current.v = current + current
    code.add(spu.stqx(current, code.r_zero, lsa))

  # Tag 13 for the outbound transfer
  code.add(spu.ai(r_tag, code.r_zero, 13))

  # Put the data back, select tag 13 in the tag mask and read the tag status
  dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
  dma.mfc_write_tag_mask(code, 1 << 13)
  dma.mfc_read_tag_status_all(code)

  # Cleanup, in acquisition order
  for reg in (r_ea_data, r_ls_data, r_size, r_tag):
    prgm.release_register(reg)

  # Execute the code
  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm)

  # Every element started as its own index, so it must now be twice that.
  for i in range(size):
    assert data[i] == i + i

  return
Ejemplo n.º 17
0
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True):
  """
  Synthesize (and optionally run and time) a parallel loop that processes
  `data` one buffer at a time.

  For each effective address produced by parallel(syn_range(...)) the
  synthesized code gets buffer_size * 4 bytes into local store, adds every
  quadword to itself, stores an iteration counter at local-store offset 0,
  and puts the buffer back to the same effective address.

  data        -- object providing buffer_info(); its first element is used as
                 the start address.
  size        -- number of 4-byte elements to cover.
  n_spus      -- number of SPUs passed to proc.execute().
  buffer_size -- elements per buffer.
  run_code    -- when false, the instruction stream is returned unexecuted.

  Returns the instruction stream if run_code is false, otherwise the
  wall-clock time of proc.execute() in seconds.
  """
  import time
  # n_spus = 8
  # buffer_size = 16 # 16 ints/buffer
  # n_buffers   = 4  # 4 buffers/spu
  # n_buffers = size / buffer_size
  # size = buffer_size * n_buffers * n_spus
  # data = array.array('I', range(size + 2))

  #data = env.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16)

  code = env.ParallelInstructionStream()
  # code = env.InstructionStream()

  r_zero    = code.acquire_register()
  r_ea_data = code.acquire_register()
  r_ls_data = code.acquire_register()
  r_size    = code.acquire_register()
  r_tag     = code.acquire_register()  

  # Load zero
  util.load_word(code, r_zero, 0)

  # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0]))
  # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % (
  #   r_zero, r_ea_data, r_ls_data, r_size, r_tag)

  # Load the effective address, offset by 8 when it is not 16-byte aligned.
  # NOTE(review): r_ea_data is not referenced by the DMA calls below, which
  # use the loop variable `ea` (starting at the unadjusted ea_start) instead.
  if data.buffer_info()[0] % 16 == 0:
    util.load_word(code, r_ea_data, data.buffer_info()[0])
  else: 
    util.load_word(code, r_ea_data, data.buffer_info()[0] + 8)

  ea_start = data.buffer_info()[0]
  # Iterate over each buffer: ea steps from ea_start in buffer-sized strides
  for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)):
    # ea = var.SignedWord(code = code, reg = r_ea_data)
  
    # print 'n_iters:', size / buffer_size
    # for i in syn_range(code, size / buffer_size):

    # code.add(spu.stop(0xB))
  
    # Load the size
    util.load_word(code, r_size, buffer_size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Get the buffer at ea into local-store address 0
    dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1<<12);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Add every quadword to itself inside a synthesized loop
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    # Counts the inner-loop iterations.
    # NOTE(review): count is never released, unlike current below.
    count = var.SignedWord(0, code)
    # Use an SPU iter
    for lsa in syn_iter(code, buffer_size * 4, 16):
      code.add(spu.lqx(current, r_zero, lsa))
      # code.add(spu.ai(1, r_current, r_current))
      current.v = current + current
      code.add(spu.stqx(current, r_zero, lsa))    
      count.v = count + 1

    # Store the counter at offset 0, over the first processed quadword.
    code.add(spu.stqx(count, r_zero, 0))
  
    # code.release_register(r_current)
    current.release_registers(code)

    # Store the values back to main memory

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 13))

    # Put the buffer from local-store address 0 back to ea.
    # NOTE(review): this passes ea.reg, whereas mfc_get above receives ea.
    dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1<<13);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);


    # code.add(spu.stop(0xB))

    # Update ea
    # ea.v = ea + (buffer_size * 4)
  # /for ea address 


  # Cleanup
  code.release_register(r_zero)
  code.release_register(r_ea_data)
  code.release_register(r_ls_data)  
  code.release_register(r_size)
  code.release_register(r_tag)  

  # Callers that only want the synthesized stream stop here.
  if not run_code:
    return code

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  proc = env.Processor()
  #data.copy_from(data_array.buffer_info()[0], len(data_array))  
  # Debug helper; only referenced by the commented-out calls below.
  # NOTE(review): it indexes data[i + buffer_size], which reaches data[size]
  # on the last step -- confirm data has elements beyond `size`.
  def print_blocks():
    for i in range(0, size, buffer_size):
      # print data[i:(i + buffer_size)]
      print data[i + buffer_size],
    print '' 
  
  # print_blocks()
  # Time only the execute() call.
  s = time.time()
  r = proc.execute(code, n_spus = n_spus)
  # r = proc.execute(code)
  t = time.time() - s
  # print_blocks()

  return t