Beispiel #1
0
  def synthesize(self, code):
    """
    Synthesize SPU code that builds one line of pixels and writes it to
    the first frame buffer 128 times, one stride apart.

    code -- instruction stream to emit into.  It is made the active code
            stream while synthesizing; the previously active stream is
            restored before returning.

    Raises Exception when self.buffers or self.stride has not been set.
    """
    # Validate first: raising after set_active_code() would leave `code`
    # installed as the active stream for the caller.
    if self.buffers is None: raise Exception('Please set buffers')
    if self.stride is None: raise Exception('Please set stride')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Draw a square
    color  = var.SignedWord(0x0F0F0FFF)
    fb0    = var.Word(self.buffers[0])
    # NOTE(review): fb1 is never read below.  It is kept because creating
    # the variable presumably reserves a register and emits a load --
    # confirm before removing.
    fb1    = var.Word(self.buffers[1])
    stride = var.Word(self.stride)
    addr   = var.Word(0)

    # Draw one line: store `color` at addr + i while i runs over
    # line_pixels * 4 in steps of 16.
    line_pixels = 256
    for i in spuiter.syn_iter(code, line_pixels*4, step = 16):
      spu.stqx(color, addr, i)

    # Transfer the line to the frame buffer.  The descriptor is given
    # addr's register, so (presumably) updating addr moves the target.
    md_fb = spuiter.memory_desc('I', size = line_pixels)
    md_fb.set_addr_reg(addr.reg)

    addr.v = fb0

    # 128 puts, advancing addr by `stride` after each one.
    for i in spuiter.syn_iter(code, 128):
      md_fb.put(code, 0)
      addr.v = addr + stride

    spu.set_active_code(old_code)
    return
Beispiel #2
0
    def synthesize(self, code):
        """
        Synthesize the per-point loops over the self.h by self.w grid.

        Every inner iteration synthesizes self.ly_point and then adds
        r1_inc to r1; every row adds r2_inc to r2 and reloads r1 at its
        start.  The inner loop runs self.w / 4 times (presumably four
        points are handled per iteration -- confirm).  When self.renderer
        is set it is handed the result register after each point and gets
        a row_complete() call after each row.

        code -- instruction stream to emit into.  It is the active code
                stream during the call; the previous one is restored
                before returning.
        """
        saved_code = spu.get_active_code()
        spu.set_active_code(code)

        self._load_parameters(code)

        spu_logger = spu_log.SPULog()
        spu_logger.setup(code)

        renderer = self.renderer
        point = self.ly_point

        if renderer is not None:
            renderer.setup(code)
            renderer.set_one(spu_logger.consts['ONE'])

        # Registers are created in this fixed order.
        step_r1 = var.SingleFloat()
        step_r2 = var.SingleFloat()
        cur_r1 = var.SingleFloat()
        cur_r2 = var.SingleFloat()
        point_result = var.SingleFloat()
        pattern_reg = var.Word(0)

        point.set_pattern_reg(pattern_reg)
        point.set_result_reg(point_result)
        point.set_r_regs(cur_r1, cur_r2)
        point.set_log(spu_logger)
        point.setup(code)

        # Fetch the starting values, the increments and the pattern.
        spu.lqa(cur_r1, 0)
        spu.lqa(cur_r2, 4)
        spu.lqa(step_r1, 8)
        spu.lqa(step_r2, 12)
        spu.lqa(pattern_reg, 16)

        for row in spuiter.syn_iter(code, self.h):
            # Each row starts again from the initial r1.
            spu.lqa(cur_r1, 0)

            for col in spuiter.syn_iter(code, self.w / 4):
                point.synthesize(code)
                cur_r1.v = spu.fa.ex(cur_r1, step_r1)

                if renderer is not None:
                    renderer.set_result_reg(point_result)
                    renderer.synthesize(code)

            if renderer is not None:
                renderer.row_complete(code)
            cur_r2.v = spu.fa.ex(cur_r2, step_r2)

        spu.set_active_code(saved_code)
        return
  def synthesize(self, code):
    """
    Synthesize the per-point loops over the self.h by self.w grid.

    Every inner iteration synthesizes self.ly_point and then adds r1_inc
    to r1; every row adds r2_inc to r2 and reloads r1 at its start.  The
    inner loop runs self.w / 4 times (presumably four points are handled
    per iteration -- confirm).  When self.renderer is set it is handed
    the result register after each point and gets a row_complete() call
    after each row.

    code -- instruction stream to emit into.  It is the active code
            stream during the call; the previous one is restored before
            returning.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    self._load_parameters(code)

    spu_logger = spu_log.SPULog()
    spu_logger.setup(code)

    renderer = self.renderer
    point = self.ly_point

    if renderer is not None:
      renderer.setup(code)
      renderer.set_one(spu_logger.consts['ONE'])

    # Registers are created in this fixed order.
    step_r1 = var.SingleFloat()
    step_r2 = var.SingleFloat()
    cur_r1 = var.SingleFloat()
    cur_r2 = var.SingleFloat()
    point_result = var.SingleFloat()
    pattern_reg = var.Word(0)

    point.set_pattern_reg(pattern_reg)
    point.set_result_reg(point_result)
    point.set_r_regs(cur_r1, cur_r2)
    point.set_log(spu_logger)
    point.setup(code)

    # Fetch the starting values, the increments and the pattern.
    spu.lqa(cur_r1, 0)
    spu.lqa(cur_r2, 4)
    spu.lqa(step_r1, 8)
    spu.lqa(step_r2, 12)
    spu.lqa(pattern_reg, 16)

    for row in spuiter.syn_iter(code, self.h):
      # Each row starts again from the initial r1.
      spu.lqa(cur_r1, 0)

      for col in spuiter.syn_iter(code, self.w / 4):
        point.synthesize(code)
        cur_r1.v = spu.fa.ex(cur_r1, step_r1)

        if renderer is not None:
          renderer.set_result_reg(point_result)
          renderer.synthesize(code)

      if renderer is not None:
        renderer.row_complete(code)
      cur_r2.v = spu.fa.ex(cur_r2, step_r2)

    spu.set_active_code(saved_code)
    return
Beispiel #4
0
def DoubleBufferExample(n_spus=6):
    """
    Subtract 2 from every word of an array through a double-buffered
    stream, then report any element that does not hold the expected value.

    stream_buffer is an iterator that streams data from main memory to
    SPU local store in blocked buffers.  The buffers can be managed
    using single or double buffering semantics.  The induction variable
    returned by the buffer returns the address of the current buffer.

    Note: stream_buffer was designed before memory descriptors and has
          not been updated to support them yet.  The interface will
          change slightly when the memory classes are finalized.

    n_spus -- number of SPUs to run on; more than one switches to a
              ParallelInstructionStream and a parallel() stream.
    """
    n = 30000
    buffer_size = 16

    # Source data: the unsigned words 0 .. n-1.
    a = array.array('I', range(n))

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1: code = ParallelInstructionStream()
    else: code = InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    # Loop over the buffers.  (`buf`, not `buffer`, to avoid shadowing the
    # Python 2 builtin.)
    for buf in stream:

        # Create an iterator that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buf))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buf))

    # Run the synthetic program; it updates the array in place.
    proc = Processor()
    proc.execute(code, n_spus=n_spus)

    # Verify with an explicit comparison rather than assert-in-try: an
    # assert is stripped under -O and a bare except hides unrelated
    # errors.  Indices 0 and 1 are skipped, presumably because i - 2 is
    # negative there and the array holds unsigned words.
    for i in range(2, len(a)):
        if a[i] != i - 2:
            print 'DoubleBuffer error:', a[i], i - 2

    return
Beispiel #5
0
def DoubleBufferExample(n_spus = 6):
  """
  Subtract 2 from every word of an array through a double-buffered
  stream, then report any element that does not hold the expected value.

  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.

  n_spus -- number of SPUs to run on; more than one switches to a
            ParallelInstructionStream and a parallel() stream.
  """
  n = 30000
  buffer_size = 16

  # Source data: the unsigned words 0 .. n-1.
  a = extarray.extarray('I', range(n))

  addr = a.buffer_info()[0]
  n_bytes = n * 4

  if n_spus > 1:  code = env.ParallelInstructionStream()
  else:           code = env.InstructionStream()

  current = SignedWord(0, code)
  two = SignedWord(2, code)

  # Create the stream buffer, parallelizing it if using more than 1 SPU
  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True)
  if n_spus > 1:  stream = parallel(stream)

  # Loop over the buffers.  (`buf`, not `buffer`, to avoid shadowing the
  # Python 2 builtin.)
  for buf in stream:

    # Create an iterator that computes the address offsets within the
    # buffer.  Note: this will be supported by var/vec iters soon.
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buf))
      current.v = current - two
      code.add(spu.stqx(current, lsa, buf))

  # Run the synthetic program; it updates the array in place.
  proc = env.Processor()
  proc.execute(code, n_spus = n_spus)

  # Verify with an explicit comparison rather than assert-in-try: an
  # assert is stripped under -O and a bare except hides unrelated
  # errors.  Indices 0 and 1 are skipped, presumably because i - 2 is
  # negative there and the array holds unsigned words.
  for i in range(2, len(a)):
    if a[i] != i - 2:
      print 'DoubleBuffer error:', a[i], i - 2

  return
Beispiel #6
0
def TestFloats():
    import math

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    spu.set_active_code(code)

    code.set_debug(True)

    # Create a simple SPU program that computes log for all values bettween
    # .01 and 10.0 with .01 increments

    start = .65
    stop = .75
    inc = .01

    sp_step = 0x3C23D70A
    # r_current = var.Word(0x3C23D70A) # .01 in single precision
    r_current = var.Word(0x3F266666)
    r_step = var.Word(sp_step)  # .01 in single precision
    result = var.Word(0)
    log = SPULog()

    log.setup(code)
    log.set_result(result)
    log.set_x(r_current)

    log_iter = syn_iter(code, int((stop - start) / inc))

    for i in log_iter:

        log.synthesize(code)
        spu.fa(r_current, r_current, r_step)
        spu.wrch(result, dma.SPU_WrOutMbox)

    # code.print_code()
    spe_id = proc.execute(code, mode='async')

    x = start
    for i in range(int((stop - start) / inc)):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        slog = synspu.spu_exec.read_out_mbox(spe_id)
        print '%.3f 0x%08X  %.08f %.08f ' % (x, slog, _sp_to_float(slog),
                                             math.log(x, 2))
        x += inc

    proc.join(spe_id)

    return
Beispiel #7
0
def TestFloats():
  import math
  
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  spu.set_active_code(code)

  code.set_debug(True)
  
  # Create a simple SPU program that computes log for all values bettween
  # .01 and 10.0 with .01 increments

  start = .65
  stop  = .75
  inc   = .01

  sp_step = 0x3C23D70A
  # r_current = var.Word(0x3C23D70A) # .01 in single precision
  r_current = var.Word(0x3F266666)
  r_step  = var.Word(sp_step)    # .01 in single precision
  result  = var.Word(0)
  log = SPULog()

  log.setup(code)
  log.set_result(result)
  log.set_x(r_current)
  
  log_iter = syn_iter(code, int((stop - start) / inc))

  for i in log_iter:
    
    log.synthesize(code)
    spu.fa(r_current, r_current, r_step)
    spu.wrch(result, dma.SPU_WrOutMbox)

  # code.print_code()
  spe_id = proc.execute(code, mode = 'async')

  x = start
  for i in range(int((stop - start) / inc)):
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    slog = synspu.spu_exec.read_out_mbox(spe_id)
    print '%.3f 0x%08X  %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2))
    x += inc

  proc.join(spe_id)

  return
Beispiel #8
0
def TestSaveBuffer1():
    import array

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    code.set_debug(True)
    spu.set_active_code(code)

    n = 2**14
    data = array.array('I', range(n))
    #data = synspu.aligned_memory(n, typecode = 'I')
    #data.copy_to(data_array.buffer_info()[0], len(data_array))

    save_buffer = SaveBuffer()

    save_buffer.setup()
    save_buffer.init_ls_buffer(0, 128)
    save_buffer.init_mm_buffer(data.buffer_info()[0], n)

    value = var.SignedWord(0xCAFEBABE)

    for i in spuiter.syn_iter(code, n / 4):
        save_buffer.save_register(value)

    code.print_code()
    spe_id = proc.execute(code, mode='async')

    for i in range(n / 4):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    #data.copy_from(data_array.buffer_info()[0], len(data_array))

    print data[:10]
    return
Beispiel #9
0
def TestSaveBuffer1():
  import array

  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  spu.set_active_code(code)
  
  n = 2**14
  data = array.array('I', range(n))
  #data = synspu.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  save_buffer = SaveBuffer()
  
  save_buffer.setup()
  save_buffer.init_ls_buffer(0, 128)
  save_buffer.init_mm_buffer(data.buffer_info()[0], n)

  value = var.SignedWord(0xCAFEBABE)
  
  for i in spuiter.syn_iter(code, n / 4):
    save_buffer.save_register(value)

  code.print_code()
  spe_id = proc.execute(code, mode='async')

  for i in range(n/4):
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

  proc.join(spe_id)

  #data.copy_from(data_array.buffer_info()[0], len(data_array))  

  print data[:10]
  return
Beispiel #10
0
def TestTanimotoBlock(n_vecs = 4):
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  spu.set_active_code(code)
  
  tb = TanimotoBlock()
  ls_save = LocalSave()
  mm_save = MemorySave()

  code.set_debug(True)

  # Input block parameters
  m = 128
  n = 64
  # n_vecs = 9
  n_bits = 128 * n_vecs

  # Main memory results buffer
  # max_results = 2**16
  max_results = 16384
  words_per_result = 4

  mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)])
  #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I')
  # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data))

  mm_results = spuiter.memory_desc('I')
  #mm_results.from_array(mm_results_buffer)
  mm_results.from_array(mm_results_data)

  mm_save.set_md_save_buffer(mm_results)
    
  # Local Results buffer
  buffer_size = var.SignedWord(16384)
  buffer_addr = var.SignedWord(m * n * n_vecs * 4)
  ls_results = spuiter.memory_desc('B')
  ls_results.set_size_reg(buffer_size)
  ls_results.set_addr_reg(buffer_addr)

  ls_save.set_md_results(ls_results)
  ls_save.set_mm_save_op(mm_save)

  # Setup the TanimotoBlock class
  tb.set_n_bits(n_bits)
  tb.set_block_size(m, n)

  tb.set_x_addr(0)
  tb.set_y_addr(m * n_vecs * 16)
  tb.set_save_op(ls_save)

  # Main test loop
  n_samples = 10000
  for samples in spuiter.syn_iter(code, n_samples):
    tb.synthesize(code)

  spu.wrch(buffer_size, dma.SPU_WrOutMbox)
  
  spu.stop(0x2000) 

  # "Function" Calls
  ls_save.block()
  mm_save.block()

  # code.print_code()
  start = time.time()
  spe_id = proc.execute(code, async=True)
  
  while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
  # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))
  stop = time.time()

  # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data))
  
  proc.join(spe_id)
  total = stop - start
  bits_sec = (m * n * n_bits * n_samples) / total / 1e9
  ops_per_compare = 48 * 4 + 8  # 48 SIMD instructions, 8 scalar
  insts_per_compare = 56
  gops = (m * n * n_vecs * n_samples * ops_per_compare ) / total / 1e9
  ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9  
  print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % (
    total, bits_sec, gops, ginsts, code.size())
  return
Beispiel #11
0
  def synthesize(self, code):
    """
    Synthesize the nested m by n loops that compare bit vectors with the
    Tanimoto kernel.

    The outer loop runs self._m times and loads an x bit vector; the
    inner loop runs self._n times, loads a y bit vector and synthesizes
    the kernel.  When a save op is set, each result is compared against
    the threshold (0.0 when none is set) with fcgt and passed to the
    save op's test().

    code -- instruction stream to emit into.  It is made the active code
            stream; the previous stream is restored on return when there
            was one.

    Raises Exception when x_addr, y_addr, n_bits, m or n is not set.
    """
    # Sanity checks.  Done before switching the active code stream so a
    # configuration error cannot leave `code` active for the caller.
    if self._x_addr is None: raise Exception("Please set x_addr")
    if self._y_addr is None: raise Exception("Please set y_addr")
    if self._n_bits is None: raise Exception("Please set n_bits")
    if self._m is None: raise Exception("Please set m")
    if self._n is None: raise Exception("Please set n")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire registers for the bit vectors and result
    # NOTE(review): these registers are not released in this method --
    # confirm whether the caller or the stream is expected to do so.
    n_vecs = self._n_bits / 128
    x_regs = [code.acquire_register() for i in range(n_vecs)]
    y_regs = [code.acquire_register() for i in range(n_vecs)]
    result = code.acquire_register()

    x_addr = var.Word()
    y_addr = var.Word()

    # threshold and bcmp exist only when there is a save op to use them.
    if self._save_op is not None:
      if self._threshold is not None:
        threshold = var.SingleFloat(self._threshold)
      else:
        threshold = var.SingleFloat(0.0)
      bcmp = var.Word(0)

    # Setup the Tanimoto kernel
    tan = Tanimoto()

    tan.set_n_bits(self._n_bits)
    tan.set_x_regs(x_regs)
    tan.set_y_regs(y_regs)
    tan.set_result(result)

    tan.synthesize_constants(code)

    # Setup the save op
    save_op = self._save_op
    if save_op is not None:
      save_op.setup()

    # Create the iterators
    xiter = spuiter.syn_iter(code, self._m)
    yiter = spuiter.syn_iter(code, self._n)

    # Synthesize the block comparison loops
    x_addr.v = self._x_addr

    for x_off in xiter:
      # NOTE(review): the address is advanced before the load, so the
      # first vector read is one vector past self._x_addr (same for y
      # below) -- confirm this is intended.
      x_addr.v = x_addr + 16 * n_vecs
      y_addr.v = self._y_addr

      self._load_bit_vector(x_addr, x_regs)

      for y_off in yiter:
        y_addr.v = y_addr + 16 * n_vecs

        self._load_bit_vector(y_addr, y_regs)
        tan.synthesize(code)

        if save_op is not None:
          spu.fcgt(bcmp, result, threshold)
          save_op.test(bcmp, result, x_off, y_off)

    if old_code is not None:
      spu.set_active_code(old_code)

    return
Beispiel #12
0
    def synthesize(self, code):
        """
        Synthesize the exponent computation for one point.

        Emits self.max_init warm-up iterations of x = r * x * (1 - x),
        then self.max_n iterations that also accumulate
        log(abs(r - 2 * r * x)), and finally divides the accumulated
        total by fmax into self.result using fdiv.

        code -- instruction stream to emit into.  It is made the active
                code stream; the previously active stream is restored
                before returning.
        """
        self._check_inputs()

        old_code = spu.get_active_code()
        spu.set_active_code(code)

        zero = var.Word(reg=code.r_zero)
        one = self.log.consts['ONE']
        two = self.consts['TWO']

        x = var.Word(self.x0)
        r = var.Word(0)
        cmp = var.Word(0)
        x_neg = var.Word(0)
        # NOTE(review): the divisor is built from self.max_init, while the
        # comment at the final division describes total / float(max_n) --
        # confirm which count is intended.
        fmax = var.Word(self.max_init)
        temp = var.SingleFloat()

        # Convert the integer count to floating point.
        fmax.v = spu.cuflt.ex(fmax, 155)

        # Init
        for i in spuiter.syn_iter(code, self.max_init):
            # x = r[i % r_max] * x * (1.0 - x)
            self._next_r(r)
            temp.v = spu.fs.ex(one, x)
            x.v = spu.fm.ex(x, temp)
            x.v = spu.fm.ex(r, x)

        # Derive Exponent
        total = var.Word(0)
        logx = var.SingleFloat()

        for i in spuiter.syn_iter(code, self.max_n):
            # x = ri * x * (1.0 - x)
            self._next_r(r)
            temp.v = spu.fs.ex(one, x)
            x.v = spu.fm.ex(x, temp)
            x.v = spu.fm.ex(r, x)

            # logx = ri - 2.0 * ri * x
            logx.v = spu.fm.ex(two, x)
            logx.v = spu.fm.ex(r, logx)
            logx.v = spu.fs.ex(r, logx)

            # abs(logx): keep logx where it is > 0, otherwise use 0 - logx
            x_neg.v = spu.fs.ex(zero, logx)
            cmp.v = spu.fcgt.ex(logx, zero)
            logx.v = spu.selb.ex(x_neg, logx, cmp)

            # log(logx), computed in place: logx is both input and result
            self.log.set_result(logx)
            self.log.set_x(logx)
            self.log.synthesize(code)

            # total = total + logx
            total.v = spu.fa.ex(total, logx)

        # result = total / fmax
        fdiv(code, self.result, total, fmax, one)

        # Restore the stream that was active on entry.  The original
        # re-installed `code` here and never used old_code.
        spu.set_active_code(old_code)
        return
Beispiel #13
0
def SpeedTest(n_spus=6, n_floats=6):
    """
  Get a rough estimate of the maximum flop count.
  On a PS3 using all 6 spus, this is 152 GFlops.
  """

    if n_spus > 1: code = ParallelInstructionStream()
    else: code = InstructionStream()

    spu.set_active_code(code)

    f_range = range(n_floats)
    a = [SingleFloat(0.0) for i in f_range]
    b = [SingleFloat(0.0) for i in f_range]
    c = [SingleFloat(0.0) for i in f_range]
    t = [SingleFloat(0.0) for i in f_range]

    outer = 2**12
    inner = 2**16
    unroll = 128
    fuse = 2
    simd = 4
    for x in syn_iter(code, outer):
        for y in syn_iter(code, inner):
            for u in range(unroll):
                for i in f_range:
                    t[i].v = spu.fma.ex(a[i], b[i], c[i])

    # Run the synthetic program and copy the results back to the array
    # TODO - AWF - use the SPU decrementers to time this
    proc = Processor()
    start = time.time()
    r = proc.execute(code, n_spus=n_spus)
    stop = time.time()
    total = stop - start
    n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(
        fuse) * long(simd) * long(n_spus)
    print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    #   # Run the native program and copy the results back to the array
    #   outer = 2**14
    #   inner = 2**16
    #   unroll = 1
    #   fuse = 1
    #   simd = 1

    #   proc = Processor()
    #   # ncode = NativeInstructionStream("a.out")
    #   start = time.time()
    #   r = proc.execute(ncode, n_spus = n_spus)
    #   stop = time.time()
    #   total = stop - start
    #   n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
    #   print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    results = """
  --> No optimizations
  Executing native code: a.out
  14.805322 sec, 20.89 GFlops

  --> Synthetic
  Platform: linux.spre_linux_spu
  no raw data
  65.023350 sec, 152.19 GFlops

  --> -O3 (fuse: 2, simd: 4)
  Executing native code: a.out
  7.407939 sec, 41.74 GFlops

  --> -O3 (fuse: 1, simd: 1)
  Executing native code: a.out
  7.403702 sec, 5.22 GFlops
  """
    return
Beispiel #14
0
def SpeedTest(n_spus = 6, n_floats = 6):
  """
  Get a rough estimate of the maximum flop count.
  On a PS3 using all 6 spus, this is 152 GFlops.
  """

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()

  spu.set_active_code(code)
  
  f_range = range(n_floats)
  a = [SingleFloat(0.0) for i in f_range]
  b = [SingleFloat(0.0) for i in f_range]
  c = [SingleFloat(0.0) for i in f_range]  
  t = [SingleFloat(0.0) for i in f_range]

  outer = 2**12
  inner = 2**16
  unroll = 128
  fuse = 2
  simd = 4
  for x in syn_iter(code, outer):
    for y in syn_iter(code, inner):
      for u in xrange(unroll):
        for i in f_range:
          t[i].v = spu.fma.ex(a[i], b[i], c[i])
    

  # Run the synthetic program and copy the results back to the array 
  # TODO - AWF - use the SPU decrementers to time this
  proc = env.Processor()
  prgm += code

  start = time.time()
  r = proc.execute(prgm, n_spus = n_spus)
  stop = time.time()
  total = stop - start
  n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
  print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

#   # Run the native program and copy the results back to the array
#   outer = 2**14
#   inner = 2**16
#   unroll = 1
#   fuse = 1
#   simd = 1

#   proc = Processor()
#   # ncode = NativeInstructionStream("a.out")
#   start = time.time()
#   r = proc.execute(ncode, n_spus = n_spus)
#   stop = time.time()
#   total = stop - start
#   n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
#   print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

  results = """
  --> No optimizations
  Executing native code: a.out
  14.805322 sec, 20.89 GFlops

  --> Synthetic
  Platform: linux.spre_linux_spu
  no raw data
  65.023350 sec, 152.19 GFlops

  --> -O3 (fuse: 2, simd: 4)
  Executing native code: a.out
  7.407939 sec, 41.74 GFlops

  --> -O3 (fuse: 1, simd: 1)
  Executing native code: a.out
  7.403702 sec, 5.22 GFlops
  """
  return
  def synthesize(self, code):
    """
    Synthesize the exponent computation for one point.

    Emits self.max_init warm-up iterations of x = r * x * (1 - x), then
    self.max_n iterations that also accumulate log(abs(r - 2 * r * x)),
    and finally divides the accumulated total by fmax into self.result
    using fdiv.

    code -- instruction stream to emit into.  It is made the active code
            stream; the previously active stream is restored before
            returning.
    """
    self._check_inputs()

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    zero = var.Word(reg = code.r_zero)
    one = self.log.consts['ONE']
    two = self.consts['TWO']

    x   = var.Word(self.x0)
    r   = var.Word(0)
    cmp = var.Word(0)
    x_neg = var.Word(0)
    # NOTE(review): the divisor is built from self.max_init, while the
    # comment at the final division describes total / float(max_n) --
    # confirm which count is intended.
    fmax  = var.Word(self.max_init)
    temp = var.SingleFloat()

    # Convert the integer count to floating point.
    fmax.v = spu.cuflt.ex(fmax, 155)

    # Init
    for i in spuiter.syn_iter(code, self.max_init):
      # x = r[i % r_max] * x * (1.0 - x)
      self._next_r(r)
      temp.v = spu.fs.ex(one, x)
      x.v = spu.fm.ex(x, temp)
      x.v = spu.fm.ex(r, x)

    # Derive Exponent
    total = var.Word(0)
    logx  = var.SingleFloat()

    for i in spuiter.syn_iter(code, self.max_n):
      # x = ri * x * (1.0 - x)
      self._next_r(r)
      temp.v = spu.fs.ex(one, x)
      x.v = spu.fm.ex(x, temp)
      x.v = spu.fm.ex(r, x)

      # logx = ri - 2.0 * ri * x
      logx.v = spu.fm.ex(two, x)
      logx.v = spu.fm.ex(r, logx)
      logx.v = spu.fs.ex(r, logx)

      # abs(logx): keep logx where it is > 0, otherwise use 0 - logx
      x_neg.v = spu.fs.ex(zero, logx)
      cmp.v = spu.fcgt.ex(logx, zero)
      logx.v = spu.selb.ex(x_neg, logx, cmp)

      # log(logx), computed in place: logx is both input and result
      self.log.set_result(logx)
      self.log.set_x(logx)
      self.log.synthesize(code)

      # total = total + logx
      total.v = spu.fa.ex(total, logx)

    # result = total / fmax
    fdiv(code, self.result, total, fmax, one)

    # Restore the stream that was active on entry.  The original
    # re-installed `code` here and never used old_code.
    spu.set_active_code(old_code)
    return