Example #1
0
  def save_register(self, reg): # , branch_to_save = False):
    """
    Emit SPU instructions that store `reg` through self.ls_buffer and,
    when the computed offset equals the size, fall into save_ls_buffer.

    The instructions are appended to the currently active code stream
    (spu.get_active_code()).  Three temporary registers are acquired for
    the synthesis and released before returning.

    NOTE(review): the author marked this routine as unfinished (see the
    'STOPPED HERE' comment below).  The three mailbox writes look like
    debugging output, and no instruction in this routine writes the
    incremented offset back to self.ls_buffer -- confirm whether
    save_ls_buffer is expected to do that.
    """
    code = spu.get_active_code()

    # Temporaries; all three are handed back via release_registers below
    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]
    
    # Rotate self.ls_buffer by 4 and by 8 bytes into the temporaries.
    # Presumably this brings the buffer's offset and size words into the
    # slot the following instructions read -- TODO confirm the layout of
    # self.ls_buffer.
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size,   self.ls_buffer, 8)

    # Store the register being saved, addressed by ls_buffer + offset
    spu.stqx(reg, self.ls_buffer, offset)
    
    # Advance by 16 bytes and compare the new offset against the size;
    # `test` is what the patched-in branch below examines
    spu.ai(offset, offset, 16)
    spu.ceq(test,  offset, size)

    # Push size, offset and test to the outbound mailbox channel
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Remember where the next instruction lands.  The stop emitted here is
    # only a placeholder: it is overwritten with a brz further down, once
    # the length of the save sequence is known.
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size = size)

    spu.nop(0)
    # Patch the placeholder: when `test` is zero, branch forward by the
    # number of instructions emitted since lbl_ls_full, i.e. skip the save.
    # ignore_active = True presumably builds the instruction without
    # appending it to the active stream -- TODO confirm.
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active = True)

    code.release_registers(regs)
    return
Example #2
0
  def synthesize(self, code):
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    if self.buffers is None: raise Exception('Please set buffers')
    if self.stride is None: raise Exception('Please set stride')
    
    # Draw a square
    color  = var.SignedWord(0x0F0F0FFF)
    fb0    = var.Word(self.buffers[0])
    fb1    = var.Word(self.buffers[1])
    stride = var.Word(self.stride)
    addr   = var.Word(0)
    
    # Draw one line
    line_pixels = 256
    for i in spuiter.syn_iter(code, line_pixels*4, step = 16):
      spu.stqx(color, addr, i)

    # Transfer the line to the frame buffer
    md_fb = spuiter.memory_desc('I', size = line_pixels)
    md_fb.set_addr_reg(addr.reg)
    
    addr.v = fb0

    for i in spuiter.syn_iter(code, 128):
      md_fb.put(code, 0)
      addr.v = addr + stride
    
    spu.set_active_code(old_code)
    return
Example #3
0
    def save_register(self, reg):  # , branch_to_save = False):
        """
        Synthesize the instructions that store `reg` through self.ls_buffer.

        Instructions go to the active code stream.  A placeholder
        instruction is emitted ahead of the save_ls_buffer sequence and
        later overwritten with a brz on the offset/size comparison, so the
        save sequence is skipped whenever that comparison is zero.

        NOTE(review): the original author flagged the mailbox values
        written here as wrong; the routine is unfinished.
        """
        stream = spu.get_active_code()

        # Scratch registers, released together at the end
        r_offset = stream.acquire_register()
        r_size = stream.acquire_register()
        r_full = stream.acquire_register()

        # Pull two words out of ls_buffer by rotating it 4 and 8 bytes
        spu.rotqbyi(r_offset, self.ls_buffer, 4)
        spu.rotqbyi(r_size, self.ls_buffer, 8)

        # Store the register, then step the offset one quadword and compare
        spu.stqx(reg, self.ls_buffer, r_offset)
        spu.ai(r_offset, r_offset, 16)
        spu.ceq(r_full, r_offset, r_size)

        # Outbound mailbox writes, in the order size, offset, test
        # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!
        for r_out in (r_size, r_offset, r_full):
            spu.wrch(r_out, dma.SPU_WrOutMbox)

        # Placeholder slot; replaced by the conditional branch below
        branch_slot = stream.size()
        spu.stop(0xB)
        self.save_ls_buffer(ls_size=r_size)

        spu.nop(0)
        skip_distance = stream.size() - branch_slot
        stream[branch_slot] = spu.brz(r_full, skip_distance,
                                      ignore_active=True)

        stream.release_registers([r_offset, r_size, r_full])
Example #4
0
    def block(self):
        """
        Emit the out-of-line block that records one result and loops back.

        The slot at self._branch_idx is filled with a nop, the x offset,
        y offset and score are packed into self._save_value and stored at
        self._md_results.r_addr + self._count, the count is advanced by 16,
        and a branch back towards the slot after self._branch_idx is
        emitted.
        """
        stream = spu.get_active_code()
        self._block_idx = len(stream)

        # --> branch slot: currently a nop (never branch).  brnz / brz on
        # self._cmp with distance (self._block_idx - self._branch_idx) are
        # the alternatives that were tried here.
        stream[self._branch_idx] = spu.nop(0, ignore_active=True)

        # Pack result into vector
        #   [x][y][score][--]
        packed = self._save_value
        mask = self._word_mask

        # Start from zero
        spu.xor(packed, packed, packed)

        # Merge score, then y; each merge is followed by a 12-byte rotate
        for field in (self._score, self._y_off):
            spu.selb(packed, packed, field, mask)
            spu.rotqbyi(packed, packed, 12)

        # x is merged last and is not rotated
        spu.selb(packed, packed, self._x_off, mask)

        # Write the packed quadword to local store and advance the count
        spu.stqx(packed, self._count, self._md_results.r_addr)
        self._count.v = self._count.v + 16

        # --> MemorySave test; the save register is free again, so reuse it
        is_full = packed
        spu.ceq.ex(is_full, self._count, self._md_results.r_size)

        if self._save_op is not None:
            self._save_op.test(is_full, self._count)

        # Just reset for now
        spu.selb(self._count, self._count, 0, is_full)

        # Return to the loop
        here = len(stream)
        spu.br(-(here - self._branch_idx - 1))
Example #5
0
def TestStreamBufferDouble(n_spus = 1):
  """
  Double every word of a 2048-element array through a double-buffered
  stream_buffer and assert that each element ends up as i + i.

  n_spus: number of SPUs passed to execute(); any value above 1 switches
          to a ParallelProgram and wraps the stream in parallel().
  """
  n_words = 2048
  values = extarray.extarray('I', range(n_words))

  chunk_bytes = 32

  use_parallel = n_spus > 1
  prgm = env.ParallelProgram() if use_parallel else env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  base_addr = values.buffer_info()[0]
  total_bytes = n_words * 4

  stream = stream_buffer(code, base_addr, total_bytes, chunk_bytes, 0,
                         buffer_mode='double', save = True)
  if use_parallel:
    stream = parallel(stream)

  # For every buffer, walk it a quadword at a time: load, double, store
  for buf in stream:
    for lsa in syn_iter(code, chunk_bytes, 16):
      code.add(spu.lqx(current, lsa, buf))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buf))

  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm, n_spus = n_spus)

  # The array was seeded with range(n), so element i must now be i + i
  for i in range(len(values)):
    assert values[i] == i + i
Example #6
0
  def block(self):
    """
    Emit the block that packs one result, stores it and returns to the loop.

    Instructions are appended to the active code stream.  The slot at
    self._branch_idx (reserved elsewhere) is filled in here, the current
    x offset, y offset and score are packed into self._save_value, the
    packed quadword is stored at self._md_results.r_addr + self._count,
    and the count is advanced by 16 bytes.
    """
    code = spu.get_active_code()
    # Index at which this block starts in the stream
    self._block_idx = len(code)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    # The nop is the variant currently enabled, i.e. the slot never branches
    # to this block; the two commented lines are the branching alternatives.
    code[self._branch_idx] = spu.nop(0, ignore_active = True)
    # code[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # code[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]
    # Each selb merges the bits selected by self._word_mask from the new
    # value into the save register; each rotqbyi then rotates the register
    # by 12 bytes to make room for the next value.
    # NOTE(review): which word self._word_mask selects is not visible here.

    # Zero the save value
    spu.xor(self._save_value, self._save_value, self._save_value)

    # Copy the score
    spu.selb(self._save_value, self._save_value, self._score, self._word_mask)    
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the y value
    spu.selb(self._save_value, self._save_value, self._y_off, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)        

    # Copy the x value
    spu.selb(self._save_value, self._save_value, self._x_off, self._word_mask)
    
    # Save value to local store
    spu.stqx(self._save_value, self._count, self._md_results.r_addr)
    
    # One quadword was written, so advance the byte count by 16
    self._count.v = self._count.v + 16

    # --> MemorySave test
    # Compare the count against the results size; the outcome lands in cmp
    cmp = self._save_value # reuse the save register
    spu.ceq.ex(cmp, self._count, self._md_results.r_size)

    # Optional hook: hand the comparison and the count to the save operation
    if self._save_op is not None:
      self._save_op.test(cmp, self._count)
      
    # Just reset for now
    # NOTE(review): the third operand is the literal 0; presumably this
    # selects a zero value into the count where cmp is set -- confirm how
    # selb treats a literal operand.
    spu.selb(self._count, self._count, 0, cmp)

    # Return to the loop
    # The distance works out to (self._branch_idx + 1) - idx, i.e. the
    # instruction just after the branch slot, assuming the br is emitted at
    # index idx and offsets are counted in instructions.
    idx = len(code)
    spu.br(- (idx - self._branch_idx - 1))
    
    return
Example #7
0
def DoubleBufferExample(n_spus=6):
    """
    Subtract 2 from every word of an array using a double-buffered stream.

    stream_buffer is an iterator that streams data from main memory to
    SPU local store in blocked buffers.  The buffers can be managed
    using single or double buffering semantics.  The induction variable
    returned by the buffer returns the address of the current buffer.

    Note: stream_buffer was designed before memory descriptors and has
          not been updated to support them yet.  The interface will
          change slightly when the memory classes are finalized.

    n_spus: number of SPUs to execute on.  Any value above 1 selects a
            ParallelInstructionStream and wraps the stream in parallel().

    Mismatches between the result and the expected value are printed, one
    line per element; nothing is raised and nothing is returned.
    """
    n = 30000
    buffer_size = 16

    # Create the source array.
    # NOTE(review): nothing here aligns the data; confirm stream_buffer
    # accepts whatever address array.array hands back.
    a = array.array('I', range(n))

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1:
        code = ParallelInstructionStream()
    else:
        code = InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1:
        stream = parallel(stream)

    # Loop over the buffers
    for buffer in stream:

        # Create an iterator that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buffer))

    # Run the synthetic program; the results land back in the array
    proc = Processor()
    proc.execute(code, n_spus=n_spus)

    # Verify with an explicit comparison instead of assert + bare except:
    # asserts are stripped under `python -O`, which silently disabled the
    # check, and a bare except hides unrelated errors.  The loop starts at 2
    # presumably because the array is unsigned ('I'), so i - 2 is not a
    # valid expectation for the first two elements.
    for i in range(2, len(a)):
        expected = i - 2
        if a[i] != expected:
            # Single-argument form prints the same text on Python 2 and 3
            print('DoubleBuffer error: %s %s' % (a[i], expected))

    return
Example #8
0
def DoubleBufferExample(n_spus = 6):
  """
  Subtract 2 from every word of an array using a double-buffered stream.

  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.

  n_spus: number of SPUs to execute on.  Any value above 1 selects an
          env.ParallelInstructionStream and wraps the stream in parallel().

  Mismatches between the result and the expected value are printed, one
  line per element; nothing is raised and nothing is returned.
  """
  n = 30000
  buffer_size = 16

  # Create the source array.
  # NOTE(review): nothing here aligns the data explicitly; presumably
  # extarray takes care of that -- confirm.
  a = extarray.extarray('I', range(n))

  addr = a.buffer_info()[0]
  n_bytes = n * 4

  if n_spus > 1:  code = env.ParallelInstructionStream()
  else:           code = env.InstructionStream()

  current = SignedWord(0, code)
  two = SignedWord(2, code)

  # Create the stream buffer, parallelizing it if using more than 1 SPU
  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True)
  if n_spus > 1:  stream = parallel(stream)

  # Loop over the buffers
  for buffer in stream:

    # Create an iterator that computes the address offsets within the
    # buffer.  Note: this will be supported by var/vec iters soon.
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current - two
      code.add(spu.stqx(current, lsa, buffer))

  # Run the synthetic program; the results land back in the array
  proc = env.Processor()
  proc.execute(code, n_spus = n_spus)

  # Verify with an explicit comparison instead of assert + bare except:
  # asserts are stripped under `python -O`, which silently disabled the
  # check, and a bare except hides unrelated errors.  The loop starts at 2
  # presumably because the array is unsigned ('I'), so i - 2 is not a
  # valid expectation for the first two elements.
  for i in range(2, len(a)):
    expected = i - 2
    if a[i] != expected:
      # Single-argument form prints the same text on Python 2 and 3
      print('DoubleBuffer error: %s %s' % (a[i], expected))

  return
Example #9
0
def TestStreamBufferSingle(n_spus = 1):
  """
  Double every word of a 1024-element array through a stream_buffer in
  its default buffer mode and assert that each element ends up as i + i.

  n_spus: number of SPUs passed to execute(); any value above 1 switches
          to a ParallelProgram and wraps the stream in parallel().
  """
  n_words = 1024
  values = extarray.extarray('I', range(n_words))
  chunk_bytes = 128

  use_parallel = n_spus > 1
  prgm = env.ParallelProgram() if use_parallel else env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  base_addr = values.buffer_info()[0]
  stream = stream_buffer(code, base_addr, n_words * 4, chunk_bytes, 0, save = True)
  if use_parallel:
    stream = parallel(stream)

  # An earlier version of this body was a hand-rolled loop over raw
  # registers (lqx / a / stqx with an explicit counter and a brnz back to
  # the loop head); syn_iter now generates the loop instead.
  for buf in stream:
    for lsa in syn_iter(code, chunk_bytes, 16):
      code.add(spu.lqx(current, lsa, buf))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buf))

  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm, n_spus = n_spus)

  # The array was seeded with range(n), so element i must now be i + i
  for i in range(n_words):
    assert values[i] == i + i
Example #10
0
  spu.il(r_cnt, 0)
  spu.il(r_sum, 16 * 4)

  r_data = prgm.acquire_register()
  r_cmp = prgm.acquire_register()
  r_lsa = prgm.acquire_register()

  spu.il(r_lsa, 0x1000)

  lbl_incloop = prgm.get_label("incloop")
  code.add(lbl_incloop)

  spu.lqx(r_data, r_cnt, r_lsa)
  spu.ai(r_data, r_data, 2)
  spu.stqx(r_data, r_cnt, r_lsa)

  spu.ai(r_cnt, r_cnt, 16)
  spu.ceq(r_cmp, r_cnt, r_sum)
  spu.brz(r_cmp, lbl_incloop)

  dma.spu_write_out_mbox(code, code.r_zero)

  prgm += code

  t3 = time.time()
  id = proc.execute(prgm, async = True, mode = 'int')


  t1 = time.time()
  for i in xrange(0, ITERS):
Example #11
0
    spu.il(r_cnt, 0)
    spu.il(r_sum, 16 * 4)

    r_data = prgm.acquire_register()
    r_cmp = prgm.acquire_register()
    r_lsa = prgm.acquire_register()

    spu.il(r_lsa, 0x1000)

    lbl_incloop = prgm.get_label("incloop")
    code.add(lbl_incloop)

    spu.lqx(r_data, r_cnt, r_lsa)
    spu.ai(r_data, r_data, 2)
    spu.stqx(r_data, r_cnt, r_lsa)

    spu.ai(r_cnt, r_cnt, 16)
    spu.ceq(r_cmp, r_cnt, r_sum)
    spu.brz(r_cmp, lbl_incloop)

    dma.spu_write_out_mbox(code, code.r_zero)

    prgm += code

    t3 = time.time()
    id = proc.execute(prgm, async=True, mode='int')

    t1 = time.time()
    for i in xrange(0, ITERS):
        #env.spu_exec.write_in_mbox(id, 1)
Example #12
0
def TestSPUIter():
  """
  DMA a 32-word array into local store, double every word with a
  syn_iter loop, DMA it back and assert that element i equals i + i.
  """
  n_words = 32
  n_bytes = n_words * 4
  data = extarray.extarray('I', range(n_words))
  prgm = env.Program()
  code = prgm.get_stream()

  # Registers for the DMA parameters, acquired in this order:
  # effective address, local-store address, transfer size, tag
  dma_regs = [prgm.acquire_register() for _ in range(4)]
  r_ea_data, r_ls_data, r_size, r_tag = dma_regs

  # Effective address of the array and the transfer size in bytes
  util.load_word(code, r_ea_data, data.buffer_info()[0])
  util.load_word(code, r_size, n_bytes)

  # Tag 12 for the inbound transfer, local-store address 0
  code.add(spu.ai(r_tag, code.r_zero, 12))
  code.add(spu.ai(r_ls_data, code.r_zero, 0))

  # Fetch the data into local store and wait on tag 12
  dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
  dma.mfc_write_tag_mask(code, 1<<12)
  dma.mfc_read_tag_status_all(code)

  # Double each quadword in place: load, add to itself, store
  current = var.SignedWord(0, code)
  for lsa in syn_iter(code, n_bytes, 16):
    code.add(spu.lqx(current, code.r_zero, lsa))
    current.v = current + current
    code.add(spu.stqx(current, code.r_zero, lsa))

  # Tag 13 for the outbound transfer
  code.add(spu.ai(r_tag, code.r_zero, 13))

  # Write the data back to main memory and wait on tag 13
  dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
  dma.mfc_write_tag_mask(code, 1<<13)
  dma.mfc_read_tag_status_all(code)

  # Cleanup, in acquisition order
  for reg in dma_regs:
    prgm.release_register(reg)

  # Execute the code
  prgm.add(code)
  proc = env.Processor()
  proc.execute(prgm)

  # The array was seeded with range(size), so element i must now be i + i
  for i in range(n_words):
    assert data[i] == i + i
Example #13
0
 def store_current(self):
   """Add a stqx of r_current, addressed by r_addr and r_count, to self.code.

   Returns whatever self.code.add() returns.
   """
   store_inst = spu.stqx(self.r_current, self.r_addr, self.r_count)
   return self.code.add(store_inst)
Example #14
0
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True):
  """
  Build (and optionally run) a parallel SPU program that doubles `data`
  one buffer at a time.

  For every buffer the generated code fetches buffer_size * 4 bytes into
  local store, adds each quadword to itself, and writes the buffer back
  to the effective address it came from.

  data:        object exposing buffer_info(); element [0] is used as the
               starting effective address.
  size:        number of 4-byte elements to process (size * 4 bytes are
               covered by the outer loop).
  n_spus:      number of SPUs passed to execute().
  buffer_size: elements per buffer (buffer_size * 4 bytes per transfer).
  run_code:    when false, the synthesized stream is returned unexecuted.

  Returns the instruction stream when run_code is false, otherwise the
  wall-clock time in seconds spent inside proc.execute().

  NOTE(review): several things look unfinished and should be confirmed:
    - r_ea_data is loaded (with a +8 adjustment for unaligned data) but is
      never passed to a DMA call; the loop uses ea_start, which carries no
      such adjustment.
    - mfc_get receives `ea` while mfc_put receives `ea.reg`.
    - `current` is released at the end of each buffer body, `count` is not.
    - print_blocks is defined but every call to it is commented out.
  """
  import time
  # n_spus = 8
  # buffer_size = 16 # 16 ints/buffer
  # n_buffers   = 4  # 4 buffers/spu
  # n_buffers = size / buffer_size
  # size = buffer_size * n_buffers * n_spus
  # data = array.array('I', range(size + 2))

  #data = env.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16)

  code = env.ParallelInstructionStream()
  # code = env.InstructionStream()

  # Registers used for the zero constant and the DMA parameters
  r_zero    = code.acquire_register()
  r_ea_data = code.acquire_register()
  r_ls_data = code.acquire_register()
  r_size    = code.acquire_register()
  r_tag     = code.acquire_register()  

  # Load zero
  util.load_word(code, r_zero, 0)

  # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0]))
  # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % (
  #   r_zero, r_ea_data, r_ls_data, r_size, r_tag)

  # Load the effective address; when the address is not a multiple of 16,
  # 8 is added to it first
  if data.buffer_info()[0] % 16 == 0:
    util.load_word(code, r_ea_data, data.buffer_info()[0])
  else: 
    util.load_word(code, r_ea_data, data.buffer_info()[0] + 8)

  ea_start = data.buffer_info()[0]
  # Iterate over each buffer: ea steps from ea_start to ea_start + size * 4
  # in increments of one buffer (buffer_size * 4 bytes)
  for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)):
    # ea = var.SignedWord(code = code, reg = r_ea_data)
  
    # print 'n_iters:', size / buffer_size
    # for i in syn_range(code, size / buffer_size):

    # code.add(spu.stop(0xB))
  
    # Load the size
    util.load_word(code, r_size, buffer_size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1<<12);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Double the data values (each quadword is added to itself)
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    # Incremented once per quadword processed in the loop below
    count = var.SignedWord(0, code)
    # Use an SPU iter
    for lsa in syn_iter(code, buffer_size * 4, 16):
      code.add(spu.lqx(current, r_zero, lsa))
      # code.add(spu.ai(1, r_current, r_current))
      current.v = current + current
      code.add(spu.stqx(current, r_zero, lsa))    
      count.v = count + 1

    # Store the count; the last operand is the literal 0.
    # NOTE(review): presumably this writes the count over the start of the
    # buffer -- confirm that this is intended.
    code.add(spu.stqx(count, r_zero, 0))
  
    # code.release_register(r_current)
    current.release_registers(code)

    # Store the values back to main memory

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 13))

    # Write the buffer from local store back to its effective address
    dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1<<13);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);


    # code.add(spu.stop(0xB))

    # Update ea
    # ea.v = ea + (buffer_size * 4)
  # /for ea address 


  # Cleanup
  code.release_register(r_zero)
  code.release_register(r_ea_data)
  code.release_register(r_ls_data)  
  code.release_register(r_size)
  code.release_register(r_tag)  

  # Callers that only want the synthesized stream stop here
  if not run_code:
    return code

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  proc = env.Processor()
  #data.copy_from(data_array.buffer_info()[0], len(data_array))  
  # Debug helper: prints one element per buffer on a single line.  It
  # indexes data[i + buffer_size], i.e. one buffer ahead of block i.
  def print_blocks():
    for i in range(0, size, buffer_size):
      # print data[i:(i + buffer_size)]
      print data[i + buffer_size],
    print '' 
  
  # print_blocks()
  # Time only the execute() call
  s = time.time()
  r = proc.execute(code, n_spus = n_spus)
  # r = proc.execute(code)
  t = time.time() - s
  # print_blocks()

  return t