Beispiel #1
0
  def end(self, branch = True):
    """Emit the loop-closing instructions for this iterator.

    Acquires a predicate register, sets it by comparing the counter with
    the stop value ('gt' in DEC mode, 'lt' in INC mode), adds a branch to
    the start label predicated on it, and finally moves the start value
    back into the counter.

    The `branch` argument is accepted but not used by this method.
    """
    stream = self.code
    pred = stream.prgm.acquire_register('pred')

    # Pick the comparison for the iteration direction; any other mode adds
    # no setp instruction at all.
    if self.mode == DEC:
      compare = 'gt'
    elif self.mode == INC:
      compare = 'lt'
    else:
      compare = None

    if compare is not None:
      # r_stop is used when an external stop was set, n_stop otherwise
      stop = self.r_stop if self._external_stop else self.n_stop
      stream.add(ptx.setp(compare, pred, self.r_count, stop))

    # Jump back to the loop start while the predicate holds
    stream.add(ptx.bra(self.start_label, pred=pred))

    # Reset the current value in case this is a nested loop
    start = self.r_start if self._external_start else self.n_start
    stream.add(ptx.mov(self.r_count, start))

    # TODO: put the register release back in
    #for reg in self.get_acquired_registers():
    #  self.code.prgm.release_register(reg)

    return
Beispiel #2
0
    def end(self, branch=True):
        """Emit the loop-closing instructions for this iterator.

        Acquires a predicate register, sets it by comparing the counter with
        the stop value ('gt' in DEC mode, 'lt' in INC mode), adds a branch to
        the start label predicated on it, and finally moves the start value
        back into the counter.

        The `branch` argument is accepted but not used by this method.
        """
        stream = self.code
        pred = stream.prgm.acquire_register('pred')

        # Pick the comparison for the iteration direction; any other mode
        # adds no setp instruction at all.
        if self.mode == DEC:
            compare = 'gt'
        elif self.mode == INC:
            compare = 'lt'
        else:
            compare = None

        if compare is not None:
            # r_stop is used when an external stop was set, n_stop otherwise
            stop = self.r_stop if self._external_stop else self.n_stop
            stream.add(ptx.setp(compare, pred, self.r_count, stop))

        # Jump back to the loop start while the predicate holds
        stream.add(ptx.bra(self.start_label, pred=pred))

        # Reset the current value in case this is a nested loop
        start = self.r_start if self._external_start else self.n_start
        stream.add(ptx.mov(self.r_count, start))

        # TODO: put the register release back in
        #for reg in self.get_acquired_registers():
        #  self.code.prgm.release_register(reg)

        return
Beispiel #3
0
def TestSynIterInc():
    SIZE = 64

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()

    code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos))
    ones = prgm.acquire_register((1, 1, 1, 1))
    counter = prgm.acquire_register()
    code.add(ptx.mov(counter, ones))

    for i in syn_iter(code, 4, step=1, mode=INC):
        code.add(ptx.iadd(counter, counter, ones))

    code.add(ptx.mov(reg.o0, counter.x))

    domain = (0, 0, SIZE, SIZE)
    proc = env.Processor(0)

    ext_output = proc.alloc_remote('i', 1, SIZE)
    prgm.set_binding(reg.o0, ext_output)

    prgm.add(code)
    proc.execute(prgm, domain)

    passed = True
    for i in xrange(0, SIZE):
        if ext_output[i] != 5:
            passed = False
    print "Passed == ", passed

    proc.free(ext_output)

    return
Beispiel #4
0
def TestSynIterInc():
  SIZE = 64

  # build and run the kernel
  prgm = env.Program()
  code = prgm.get_stream()

  code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos))
  ones = prgm.acquire_register((1, 1, 1, 1))
  counter = prgm.acquire_register()
  code.add(ptx.mov(counter, ones))

  for i in syn_iter(code, 4, step=1, mode=INC):
    code.add(ptx.iadd(counter, counter, ones))

  code.add(ptx.mov(reg.o0, counter.x))

  domain = (0, 0, SIZE, SIZE)
  proc = env.Processor(0)

  ext_output=proc.alloc_remote('i', 1, SIZE)
  prgm.set_binding(reg.o0, ext_output)

  prgm.add(code)
  proc.execute(prgm, domain)

  passed = True
  for i in xrange(0, SIZE):
    if ext_output[i] != 5:
      passed = False
  print "Passed == ", passed

  proc.free(ext_output)

  return
Beispiel #5
0
  def start(self, align = True, branch = True):
    """Emit the loop prologue: initialise the counter and add the start label.

    A counter register of type self.type is acquired if none exists yet.
    The `align` and `branch` arguments are accepted but not used here.
    """
    stream = self.code

    # Acquire the counter register on first use
    if self.r_count is None:
      self.r_count = stream.prgm.acquire_register(self.type)

    # n_start is used only when _external_start compares equal to False
    initial = self.n_start if self._external_start == False else self.r_start
    stream.add(ptx.mov(self.r_count, initial))

    # Create a unique label, remember it and add it to the stream
    label = stream.prgm.get_unique_label("SYN_ITER_START")
    self.start_label = label
    stream.add(label)

    return
Beispiel #6
0
    def start(self, align=True, branch=True):
        """Emit the loop prologue: initialise the counter and add the start label.

        A counter register of type self.type is acquired if none exists yet.
        The `align` and `branch` arguments are accepted but not used here.
        """
        stream = self.code

        # Acquire the counter register on first use
        if self.r_count is None:
            self.r_count = stream.prgm.acquire_register(self.type)

        # n_start is used only when _external_start compares equal to False
        initial = self.n_start if self._external_start == False else self.r_start
        stream.add(ptx.mov(self.r_count, initial))

        # Create a unique label, remember it and add it to the stream
        label = stream.prgm.get_unique_label("SYN_ITER_START")
        self.start_label = label
        stream.add(label)

        return
Beispiel #7
0
def TestParamsFull():
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()

    a = 1.0
    b = 2.0

    ptx_mem_addr = proc.alloc_device("f32", 1)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    param_list = [ptx_mem_addr.address, a, b]

    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()
    for i in range(20):
        t1 = time.time()
        proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
        t2 = time.time()
        print "run time", t2 - t1
        print "#####"
    print "X", mem.buffer_info()[0], ptx_mem_addr.address
    proc.copy(mem, ptx_mem_addr)

    print param_list
    print mem

    return
Beispiel #8
0
def TestParamsFull():
  import time
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()

  a = 1.0
  b = 2.0

  ptx_mem_addr = proc.alloc_device('f32', 1)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  param_list = [ptx_mem_addr.address, a, b]

  proc.copy(ptx_mem_addr, mem)
  prgm.cache_code()
  for i in range(20):
    t1 = time.time()
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()
    print "run time", t2 - t1
    print "#####"
  print "X", mem.buffer_info()[0], ptx_mem_addr.address
  proc.copy(mem, ptx_mem_addr)

  print param_list
  print mem

  return
Beispiel #9
0
def TestSynIterIncFloatExtStopExtStart():
    SIZE = 64

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()

    code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos))
    ones = prgm.acquire_register((1, 1, 1, 1))
    counter = prgm.acquire_register()
    code.add(ptx.mov(counter, ones))

    stop = prgm.acquire_register((4.0, 4.0, 4.0, 4.0))
    start = prgm.acquire_register((2.0, 2.0, 2.0, 2.0))
    step = prgm.acquire_register((1.0, 1.0, 1.0, 1.0))

    fiter = syn_iter_float(code, stop, step=step, mode=INC)
    fiter.set_start_reg(start)
    for i in fiter:
        code.add(ptx.iadd(counter, counter, ones))

    code.add(ptx.mov(reg.o0, counter.x))

    domain = (0, 0, SIZE, SIZE)
    proc = env.Processor(0)

    ext_output = proc.alloc_remote('i', 1, SIZE, 1)
    prgm.set_binding(reg.o0, ext_output)

    prgm.add(code)
    proc.execute(prgm, domain)

    passed = True
    for i in xrange(0, SIZE):
        if ext_output[i] != 3:
            passed = False
    print "Passed == ", passed

    proc.free(ext_output)

    return
Beispiel #10
0
def TestSynIterIncFloatExtStopExtStart():
  SIZE = 64

  # build and run the kernel
  prgm = env.Program()
  code = prgm.get_stream()

  code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos))
  ones = prgm.acquire_register((1, 1, 1, 1))
  counter = prgm.acquire_register()
  code.add(ptx.mov(counter, ones))

  stop = prgm.acquire_register((4.0, 4.0, 4.0, 4.0))
  start = prgm.acquire_register((2.0, 2.0, 2.0, 2.0))
  step = prgm.acquire_register((1.0, 1.0, 1.0, 1.0))

  fiter = syn_iter_float(code, stop, step=step, mode=INC)
  fiter.set_start_reg(start)
  for i in fiter:
    code.add(ptx.iadd(counter, counter, ones))

  code.add(ptx.mov(reg.o0, counter.x))

  domain = (0, 0, SIZE, SIZE)
  proc = env.Processor(0)

  ext_output=proc.alloc_remote('i', 1, SIZE, 1)
  prgm.set_binding(reg.o0, ext_output)

  prgm.add(code)
  proc.execute(prgm, domain)

  passed = True
  for i in xrange(0, SIZE):
    if ext_output[i] != 3:
      passed = False
  print "Passed == ", passed

  proc.free(ext_output)

  return
Beispiel #11
0
def TestSynIterDec():
    import corepy.arch.ptx.isa as ptx
    import corepy.arch.ptx.types.registers as regs

    SIZE = 64

    proc = env.Processor(0)

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter('u64', name='_mem')

    memp = prgm.acquire_register('u64')
    counter = prgm.acquire_register('u32')
    code.add(ptx.ld('param', memp, regs.ptxAddress(_mem)))
    code.add(ptx.mov(counter, 0))
    for i in syn_iter(code, 5, step=1, mode=DEC):
        code.add(ptx.add(counter, counter, 1))
    code.add(ptx.st('global', regs.ptxAddress(memp), counter))
    prgm.add(code)

    ptx_mem_addr = proc.alloc_device('u32', 1)
    mem = extarray.extarray('I', 1)
    mem[0] = 5

    param_list = [
        ptx_mem_addr.address,
    ]

    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()
    print prgm.render_string
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    proc.copy(mem, ptx_mem_addr)

    print mem

    #passed = True
    #for i in xrange(0, SIZE):
    #  if ext_output[i] != 5:
    #    passed = False
    #print "Passed == ", passed

    return
Beispiel #12
0
def TestSynIterDec():
  import corepy.arch.ptx.isa as ptx
  import corepy.arch.ptx.types.registers as regs

  SIZE = 64

  proc = env.Processor(0)

  # build and run the kernel
  prgm = env.Program()
  code = prgm.get_stream()

  _mem = prgm.add_parameter('u64', name='_mem')

  memp = prgm.acquire_register('u64')
  counter = prgm.acquire_register('u32')
  code.add(ptx.ld('param', memp, regs.ptxAddress(_mem)))
  code.add(ptx.mov(counter, 0))
  for i in syn_iter(code, 5, step=1, mode=DEC):
    code.add(ptx.add(counter, counter, 1))
  code.add(ptx.st('global', regs.ptxAddress(memp), counter))
  prgm.add(code)

  ptx_mem_addr = proc.alloc_device('u32', 1)
  mem = extarray.extarray('I', 1)
  mem[0] = 5

  param_list = [ptx_mem_addr.address,]

  proc.copy(ptx_mem_addr, mem)
  prgm.cache_code()
  print prgm.render_string
  proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
  proc.copy(mem, ptx_mem_addr)

  print mem

  #passed = True
  #for i in xrange(0, SIZE):
  #  if ext_output[i] != 5:
  #    passed = False
  #print "Passed == ", passed

  return
Beispiel #13
0
def TestSimpleKernel():
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs
    import time

    SIZE = 128

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    #  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
    #  r1 = regs.ptxVariable('reg', 'f32', 'f1')
    #  r2 = regs.ptxVariable('reg', 'f32', 'f2')
    #  r3 = regs.ptxVariable('reg', 'f32', 'f3')
    #  r4 = regs.ptxVariable('reg', 'f32', 'f4')
    #  code.add('  .reg .u64 rd1;')
    #  code.add('  .reg .f32 f1;')
    #  code.add('  .reg .f32 f2;')
    #  code.add('  .reg .f32 f3;')
    #  code.add('  .reg .f32 f4;')

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    #  import pdb
    #  pdb.set_trace()
    # code.add(isa.add(r3, r2, r1))
    # code.add('add.f32 r3, r2, r1;')
    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    # temp = prgm.acquire_register('u32')
    # code.add(isa.cvt(temp, regs.tid.x))
    # code.add(isa.cvt(r4, temp, rnd='rn'))
    temp1 = prgm.acquire_register("u32")
    temp2 = prgm.acquire_register("u32")
    temp3 = prgm.acquire_register("u32")
    code.add(isa.mul(temp2, temp1, temp3, hlw="lo"))

    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()
    #   prgm.render_string = (
    #   '''
    #   .version 1.4
    #   .target sm_10, map_f64_to_f32
    #   .entry _main (
    #   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    #   {
    #   .reg .u64 %rd<3>;
    #   .reg .f32 %f<6>;
    #   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    #   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    #   add.f32 %f3, %f1, %f2;

    #   mov.f32         %f4, %f3;
    #   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    #   st.global.f32   [%rd1+0], %f4;

    #   exit;
    #   } // _Z16addArrayOnDevicePfff
    #   '''
    #   )
    #   prgm.render_code = ptx_exec.compile(prgm.render_string)

    ####
    # ptx_mem_addr = proc.alloc_device('f32', 1)
    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print mem.buffer_info()[0]
    param_list = [ptx_mem_addr, a, b]
    print map(type, param_list)
    #   # image, dev num, (x, y, w, h)

    # import pdb

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    # kernel = prgm.render_string
    # module = ptx_exec.compile(kernel)
    t1 = time.time()
    # ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()
    #  pdb.set_trace()
    print "run time", t2 - t1

    print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
    print int(ptx_mem_addr)
    print int(mem.buffer_info()[0])
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem
    ####

    return
Beispiel #14
0
#           self.params['rnd'] =  koperands['rnd']

#     return True

if __name__ == '__main__':
    import corepy.arch.ptx.isa as isa
    #import corepy.arch.ptx.platform as env

    #code = env.InstructionStream()
    #set_active_code(code)

    r1 = regs.ptxVariable('reg', 'u32', 'r1')
    r2 = regs.ptxVariable('reg', 'u32', 'r2')
    r3 = regs.ptxVariable('reg', 'u32', 'r3')
    r4 = regs.ptxVariable('reg', 'u32', 'r4')

    #x = add(r3, r2, r1, ignore_active = True)
    x = isa.add(r3, r2, r1)
    print x.render()
    y = isa.mov(r2, r1)
    print y.render()
    a = regs.ptxAddress(r4)
    z = isa.ld('param', r1, a)
    print z.render()

    f1 = regs.ptxVariable('reg', 'f32', 'f1')
    f2 = regs.ptxVariable('reg', 'f32', 'f2')
    f3 = regs.ptxVariable('reg', 'f32', 'f3')
    a = isa.add(f3, f2, f1)
    print a.render()
Beispiel #15
0
          
#     return True


if __name__ == '__main__':
  import corepy.arch.ptx.isa as isa
  #import corepy.arch.ptx.platform as env

  #code = env.InstructionStream()
  #set_active_code(code)

  r1 = regs.ptxVariable('reg', 'u32', 'r1')
  r2 = regs.ptxVariable('reg', 'u32', 'r2')
  r3 = regs.ptxVariable('reg', 'u32', 'r3')
  r4 = regs.ptxVariable('reg', 'u32', 'r4')

  #x = add(r3, r2, r1, ignore_active = True)
  x = isa.add(r3, r2, r1)
  print x.render()
  y = isa.mov(r2, r1)
  print y.render()
  a = regs.ptxAddress(r4)
  z = isa.ld('param', r1, a)
  print z.render()

  f1 = regs.ptxVariable('reg', 'f32', 'f1')
  f2 = regs.ptxVariable('reg', 'f32', 'f2')
  f3 = regs.ptxVariable('reg', 'f32', 'f3')
  a = isa.add(f3, f2, f1)
  print a.render()
Beispiel #16
0
 def copy_register(self, other):
   """Add a ptx.mov(self, other) instruction to self.code.

   Returns whatever self.code.add returns for that instruction.
   """
   add = self.code.add
   return add(ptx.mov(self, other))
Beispiel #17
0
def TestSimpleKernel():
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs
  import time

  SIZE = 128

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

#  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
#  r1 = regs.ptxVariable('reg', 'f32', 'f1')
#  r2 = regs.ptxVariable('reg', 'f32', 'f2')
#  r3 = regs.ptxVariable('reg', 'f32', 'f3')
#  r4 = regs.ptxVariable('reg', 'f32', 'f4')
#  code.add('  .reg .u64 rd1;')
#  code.add('  .reg .f32 f1;')
#  code.add('  .reg .f32 f2;')
#  code.add('  .reg .f32 f3;')
#  code.add('  .reg .f32 f4;')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

#  import pdb
#  pdb.set_trace()
  #code.add(isa.add(r3, r2, r1))
  #code.add('add.f32 r3, r2, r1;')
  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  #temp = prgm.acquire_register('u32')
  #code.add(isa.cvt(temp, regs.tid.x))
  #code.add(isa.cvt(r4, temp, rnd='rn'))
  temp1 = prgm.acquire_register('u32')
  temp2 = prgm.acquire_register('u32')
  temp3 = prgm.acquire_register('u32')
  code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))
  
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()
#   prgm.render_string = (
#   '''
#   .version 1.4
#   .target sm_10, map_f64_to_f32
#   .entry _main (
#   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
#   {
#   .reg .u64 %rd<3>;
#   .reg .f32 %f<6>;
#   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
#   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
#   add.f32 %f3, %f1, %f2;
  
#   mov.f32         %f4, %f3;
#   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
#   st.global.f32   [%rd1+0], %f4;

#   exit;
#   } // _Z16addArrayOnDevicePfff
#   '''
#   )
#   prgm.render_code = ptx_exec.compile(prgm.render_string)

  ####
  #ptx_mem_addr = proc.alloc_device('f32', 1)
  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  a = 1.0
  b = 2.0
  
  print mem.buffer_info()[0]
  param_list = [ptx_mem_addr, a, b]
  print map(type, param_list)
  #   # image, dev num, (x, y, w, h)

  #import pdb

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  #kernel = prgm.render_string
  #module = ptx_exec.compile(kernel)
  t1 = time.time()
  #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  proc.execute(prgm, (1,1,1,1,1), param_list)
  t2 = time.time()
#  pdb.set_trace()
  print "run time", t2 - t1

  print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
  print int(ptx_mem_addr)
  print int(mem.buffer_info()[0])
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem
  ####

  return