Ejemplo n.º 1
0
def TestParams():
    import time

    # SIZE = 1024
    kernel = """
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  """

    t1 = time.time()
    module = ptx_exec.compile(kernel)
    t2 = time.time()
    print "compile time", t2 - t1

    a = 1.0
    b = 2.0

    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    # mem.set_memory(ptx_mem_addr, 4)
    mem[0] = 5.0

    print ptx_mem_addr, type(ptx_mem_addr)
    print mem.buffer_info()[0], type(mem.buffer_info()[0])
    param_list = [ptx_mem_addr, a, b]
    # image, dev num, (x, y, w, h)

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    t1 = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    t2 = time.time()
    print "run time", t2 - t1
    print "X", mem.buffer_info()[0], ptx_mem_addr
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem

    # ptx_exec.free(input)
    # ptx_exec.free(output)
    ##ptx_exec.free(glob)
    # ptx_exec.unload_module(image)
    return
Ejemplo n.º 2
0
  def alloc_device(self, typecode, length, comps = 1):
    """
    Allocate local GPU memory and return a handle for copying/binding.

    Typecode is ptx typecode (u32, s32, f32, u64, etc.): the text after
    its first character is parsed as the scalar width in bits.  The
    allocation size is length * comps * (bits // 8) bytes.

    Returns a DeviceMemory built from the device address, typecode and
    length.  Raises ValueError if the text after the first character of
    typecode is not an integer.

    NOTE(review): comps changes the allocation size but is not passed on
    to DeviceMemory -- confirm this is intended.
    """
    # Floor division keeps the byte width an integer even when true
    # division is in effect (Python 3, or `from __future__ import
    # division`); a float byte count must not reach the allocator.
    scalar_byte_width = int(typecode[1:]) // 8

    # Allocate GPU memory and create a DeviceMemory handle
    address = ptx_exec.alloc_device(length * scalar_byte_width * comps)

    return DeviceMemory(address, typecode, length)
Ejemplo n.º 3
0
    def alloc_device(self, typecode, length, comps=1):
        """
        Allocate local GPU memory and return a handle for copying/binding.

        Typecode is ptx typecode (u32, s32, f32, u64, etc.): the text after
        its first character is parsed as the scalar width in bits.  The
        allocation size is length * comps * (bits // 8) bytes.

        Returns a DeviceMemory built from the device address, typecode and
        length.  Raises ValueError if the text after the first character of
        typecode is not an integer.

        NOTE(review): comps changes the allocation size but is not passed
        on to DeviceMemory -- confirm this is intended.
        """
        # Floor division keeps the byte width an integer even when true
        # division is in effect (Python 3, or `from __future__ import
        # division`); a float byte count must not reach the allocator.
        scalar_byte_width = int(typecode[1:]) // 8

        # Allocate GPU memory and create a DeviceMemory handle
        address = ptx_exec.alloc_device(length * scalar_byte_width * comps)

        return DeviceMemory(address, typecode, length)
Ejemplo n.º 4
0
def TestSimpleKernel():
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs
  import time

  SIZE = 128

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

#  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
#  r1 = regs.ptxVariable('reg', 'f32', 'f1')
#  r2 = regs.ptxVariable('reg', 'f32', 'f2')
#  r3 = regs.ptxVariable('reg', 'f32', 'f3')
#  r4 = regs.ptxVariable('reg', 'f32', 'f4')
#  code.add('  .reg .u64 rd1;')
#  code.add('  .reg .f32 f1;')
#  code.add('  .reg .f32 f2;')
#  code.add('  .reg .f32 f3;')
#  code.add('  .reg .f32 f4;')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

#  import pdb
#  pdb.set_trace()
  #code.add(isa.add(r3, r2, r1))
  #code.add('add.f32 r3, r2, r1;')
  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  #temp = prgm.acquire_register('u32')
  #code.add(isa.cvt(temp, regs.tid.x))
  #code.add(isa.cvt(r4, temp, rnd='rn'))
  temp1 = prgm.acquire_register('u32')
  temp2 = prgm.acquire_register('u32')
  temp3 = prgm.acquire_register('u32')
  code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))
  
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()
#   prgm.render_string = (
#   '''
#   .version 1.4
#   .target sm_10, map_f64_to_f32
#   .entry _main (
#   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
#   {
#   .reg .u64 %rd<3>;
#   .reg .f32 %f<6>;
#   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
#   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
#   add.f32 %f3, %f1, %f2;
  
#   mov.f32         %f4, %f3;
#   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
#   st.global.f32   [%rd1+0], %f4;

#   exit;
#   } // _Z16addArrayOnDevicePfff
#   '''
#   )
#   prgm.render_code = ptx_exec.compile(prgm.render_string)

  ####
  #ptx_mem_addr = proc.alloc_device('f32', 1)
  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  a = 1.0
  b = 2.0
  
  print mem.buffer_info()[0]
  param_list = [ptx_mem_addr, a, b]
  print map(type, param_list)
  #   # image, dev num, (x, y, w, h)

  #import pdb

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  #kernel = prgm.render_string
  #module = ptx_exec.compile(kernel)
  t1 = time.time()
  #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  proc.execute(prgm, (1,1,1,1,1), param_list)
  t2 = time.time()
#  pdb.set_trace()
  print "run time", t2 - t1

  print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
  print int(ptx_mem_addr)
  print int(mem.buffer_info()[0])
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem
  ####

  return
Ejemplo n.º 5
0
def TestParams():
  import time
  #SIZE = 1024
  kernel = (
  '''
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  '''
  )

  t1 = time.time()
  module = ptx_exec.compile(kernel)
  t2 = time.time()
  print "compile time", t2 - t1

  a = 1.0
  b = 2.0

  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  #mem.set_memory(ptx_mem_addr, 4)
  mem[0] = 5.0

  print ptx_mem_addr, type(ptx_mem_addr)
  print mem.buffer_info()[0], type(mem.buffer_info()[0])
  param_list = [ptx_mem_addr, a, b]
  # image, dev num, (x, y, w, h)

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  t1 = time.time()
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  t2 = time.time()
  print "run time", t2 - t1
  print "X", mem.buffer_info()[0], ptx_mem_addr
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem

  #ptx_exec.free(input)
  #ptx_exec.free(output)
  ##ptx_exec.free(glob)
  #ptx_exec.unload_module(image)
  return
Ejemplo n.º 6
0
def TestSimpleKernel():
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs
    import time

    SIZE = 128

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    #  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
    #  r1 = regs.ptxVariable('reg', 'f32', 'f1')
    #  r2 = regs.ptxVariable('reg', 'f32', 'f2')
    #  r3 = regs.ptxVariable('reg', 'f32', 'f3')
    #  r4 = regs.ptxVariable('reg', 'f32', 'f4')
    #  code.add('  .reg .u64 rd1;')
    #  code.add('  .reg .f32 f1;')
    #  code.add('  .reg .f32 f2;')
    #  code.add('  .reg .f32 f3;')
    #  code.add('  .reg .f32 f4;')

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    #  import pdb
    #  pdb.set_trace()
    # code.add(isa.add(r3, r2, r1))
    # code.add('add.f32 r3, r2, r1;')
    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    # temp = prgm.acquire_register('u32')
    # code.add(isa.cvt(temp, regs.tid.x))
    # code.add(isa.cvt(r4, temp, rnd='rn'))
    temp1 = prgm.acquire_register("u32")
    temp2 = prgm.acquire_register("u32")
    temp3 = prgm.acquire_register("u32")
    code.add(isa.mul(temp2, temp1, temp3, hlw="lo"))

    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()
    #   prgm.render_string = (
    #   '''
    #   .version 1.4
    #   .target sm_10, map_f64_to_f32
    #   .entry _main (
    #   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    #   {
    #   .reg .u64 %rd<3>;
    #   .reg .f32 %f<6>;
    #   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    #   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    #   add.f32 %f3, %f1, %f2;

    #   mov.f32         %f4, %f3;
    #   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    #   st.global.f32   [%rd1+0], %f4;

    #   exit;
    #   } // _Z16addArrayOnDevicePfff
    #   '''
    #   )
    #   prgm.render_code = ptx_exec.compile(prgm.render_string)

    ####
    # ptx_mem_addr = proc.alloc_device('f32', 1)
    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print mem.buffer_info()[0]
    param_list = [ptx_mem_addr, a, b]
    print map(type, param_list)
    #   # image, dev num, (x, y, w, h)

    # import pdb

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    # kernel = prgm.render_string
    # module = ptx_exec.compile(kernel)
    t1 = time.time()
    # ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()
    #  pdb.set_trace()
    print "run time", t2 - t1

    print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
    print int(ptx_mem_addr)
    print int(mem.buffer_info()[0])
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem
    ####

    return