def TestParams():
    import time

    # SIZE = 1024
    kernel = """
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  """

    t1 = time.time()
    module = ptx_exec.compile(kernel)
    t2 = time.time()
    print "compile time", t2 - t1

    a = 1.0
    b = 2.0

    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    # mem.set_memory(ptx_mem_addr, 4)
    mem[0] = 5.0

    print ptx_mem_addr, type(ptx_mem_addr)
    print mem.buffer_info()[0], type(mem.buffer_info()[0])
    param_list = [ptx_mem_addr, a, b]
    # image, dev num, (x, y, w, h)

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    t1 = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    t2 = time.time()
    print "run time", t2 - t1
    print "X", mem.buffer_info()[0], ptx_mem_addr
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem

    # ptx_exec.free(input)
    # ptx_exec.free(output)
    ##ptx_exec.free(glob)
    # ptx_exec.unload_module(image)
    return
Exemple #2
0
def TestSimpleKernel():
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs
  import time

  SIZE = 128

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

#  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
#  r1 = regs.ptxVariable('reg', 'f32', 'f1')
#  r2 = regs.ptxVariable('reg', 'f32', 'f2')
#  r3 = regs.ptxVariable('reg', 'f32', 'f3')
#  r4 = regs.ptxVariable('reg', 'f32', 'f4')
#  code.add('  .reg .u64 rd1;')
#  code.add('  .reg .f32 f1;')
#  code.add('  .reg .f32 f2;')
#  code.add('  .reg .f32 f3;')
#  code.add('  .reg .f32 f4;')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

#  import pdb
#  pdb.set_trace()
  #code.add(isa.add(r3, r2, r1))
  #code.add('add.f32 r3, r2, r1;')
  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  #temp = prgm.acquire_register('u32')
  #code.add(isa.cvt(temp, regs.tid.x))
  #code.add(isa.cvt(r4, temp, rnd='rn'))
  temp1 = prgm.acquire_register('u32')
  temp2 = prgm.acquire_register('u32')
  temp3 = prgm.acquire_register('u32')
  code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))
  
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()
#   prgm.render_string = (
#   '''
#   .version 1.4
#   .target sm_10, map_f64_to_f32
#   .entry _main (
#   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
#   {
#   .reg .u64 %rd<3>;
#   .reg .f32 %f<6>;
#   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
#   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
#   add.f32 %f3, %f1, %f2;
  
#   mov.f32         %f4, %f3;
#   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
#   st.global.f32   [%rd1+0], %f4;

#   exit;
#   } // _Z16addArrayOnDevicePfff
#   '''
#   )
#   prgm.render_code = ptx_exec.compile(prgm.render_string)

  ####
  #ptx_mem_addr = proc.alloc_device('f32', 1)
  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  a = 1.0
  b = 2.0
  
  print mem.buffer_info()[0]
  param_list = [ptx_mem_addr, a, b]
  print map(type, param_list)
  #   # image, dev num, (x, y, w, h)

  #import pdb

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  #kernel = prgm.render_string
  #module = ptx_exec.compile(kernel)
  t1 = time.time()
  #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  proc.execute(prgm, (1,1,1,1,1), param_list)
  t2 = time.time()
#  pdb.set_trace()
  print "run time", t2 - t1

  print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
  print int(ptx_mem_addr)
  print int(mem.buffer_info()[0])
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem
  ####

  return
Exemple #3
0
def TestParams():
  import time
  #SIZE = 1024
  kernel = (
  '''
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  '''
  )

  t1 = time.time()
  module = ptx_exec.compile(kernel)
  t2 = time.time()
  print "compile time", t2 - t1

  a = 1.0
  b = 2.0

  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  #mem.set_memory(ptx_mem_addr, 4)
  mem[0] = 5.0

  print ptx_mem_addr, type(ptx_mem_addr)
  print mem.buffer_info()[0], type(mem.buffer_info()[0])
  param_list = [ptx_mem_addr, a, b]
  # image, dev num, (x, y, w, h)

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  t1 = time.time()
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  t2 = time.time()
  print "run time", t2 - t1
  print "X", mem.buffer_info()[0], ptx_mem_addr
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem

  #ptx_exec.free(input)
  #ptx_exec.free(output)
  ##ptx_exec.free(glob)
  #ptx_exec.unload_module(image)
  return
Exemple #4
0
class Processor(spe.Processor):
  exec_module = ptx_exec

  def __init__(self, device=0):
    """Create a new Processor representing a particular GPU in the system, 
       indexed by device."""
    spe.Processor.__init__(self)

    if device < 0 or device > N_GPUS:
      raise Exception("Invalid device number %d" % device)

    print "Creating ctx"
    self.ctx = ptx_exec.alloc_ctx(device)
    self.device = device
    return


  def __del__(self):
    print "Destroying ctx"
    ptx_exec.free_ctx(self.ctx)
    return


  # ------------------------------
  # Memory Management
  # ------------------------------

  # def _get_fmt(self, typecode, comps = 1):
#     if typecode == 'f':
#       if comps == 1:
#         fmt = ptx_exec.FMT_FLOAT32_1
#       elif comps == 2:
#         fmt = ptx_exec.FMT_FLOAT32_2
#       elif comps == 4:
#         fmt = ptx_exec.FMT_FLOAT32_4
#       else:
#         raise Exception("Number of components must be 1, 2, or 4")
#     elif typecode == 'i':
#       if comps == 1:
#         fmt = ptx_exec.FMT_SIGNED_INT32_1
#       elif comps == 2:
#         fmt = ptx_exec.FMT_SIGNED_INT32_2
#       elif comps == 4:
#         fmt = ptx_exec.FMT_SIGNED_INT32_4
#       else:
#         raise Exception("Number of components must be 1, 2, or 4")
#     elif typecode == 'I':
#       if comps == 1:
#         fmt = ptx_exec.FMT_UNSIGNED_INT32_1
#       elif comps == 2:
#         fmt = ptx_exec.FMT_UNSIGNED_INT32_2
#       elif comps == 4:
#         fmt = ptx_exec.FMT_UNSIGNED_INT32_4
#       else:
#         raise Exception("Number of components must be 1, 2, or 4")
#     else:
#       raise Exception("Unsupported data type: " + str(typecode))
#     return fmt
  

  def alloc_device(self, typecode, length, comps = 1):
    """
    Allocate local GPU memory and return a handle for copying/binding.

    Typecode is ptx typecode (u32, s32, f32, u64, etc.)
    """
    #fmt = self._get_fmt(typecode, comps)

    scalar_byte_width = int(typecode[1:])/8
        
    # Allocate GPU memory and create a DeviceMemory handle
    address = ptx_exec.alloc_device(length*scalar_byte_width*comps)

    return DeviceMemory(address, typecode, length)

  def alloc_host(self, typecode, length, comps = 1):
    """
    Allocate local GPU memory and return a handle for copying/binding.

    Typecode is ptx typecode (u32, s32, f32, u64, etc.)
    """
    #fmt = self._get_fmt(typecode, comps)


    array_typecode = ''
    
    # This might be clearer, but not very efficient...
    #type_conversion_table = {}
    #type_conversion_table['32'] = {'f': 'f', 'u': 'I', 's', 'i'}
    #type_conversion_table['64'] = {'f': 'd', 'u': 'L', 's', 'l'}
    #type_conversion_table['16'] = {'u': 'H', 's', 'h'}
    #type_conversion_table['8'] = {'u': 'B', 's', 'b'}
    #
    #if typecode == 'b':
    #  typecode = 'u'
    #array_typecode = type_conversion_table[typecode[0]][typecode[1:]]
    
    scalar_width = int(typecode[1:])
    if typecode[0] == 'f':
      if scalar_width == 32:
        array_typecode = 'f'
      elif scalar_width == 64:
        array_typecode = 'd'
    elif typecode[0] == 'u':
      if scalar_width == 32:
        array_typecode = 'I'
      elif scalar_width == 64:
        array_typecode = 'L'
      elif scalar_width == 16:
        array_typecode = 'H'
      elif scalar_width == 8:
        array_typecode = 'b'
    elif typecode[0] == 's':
      if scalar_width == 32:
        array_typecode = 'i'
      elif scalar_width == 64:
        array_typecode = 'l'
      elif scalar_width == 16:
        array_typecode = 'h'
      elif scalar_width == 8:
        array_typecode = 'B'

    if array_typecode == '':
      raise Exception('Unable to convert type')
          
    mem = ptx_exec.alloc_host(length*scalar_byte_width*comps)
    
    arr = extarray.extarray(array_typecode, 0)
    arr.data_len = scalar_width/4 * length * comps
    arr.set_memory(mem, arr.data_len * 4)
    arr.gpu_mem_handle = mem
#    arr.gpu_device = self.device
    arr.gpu_width = length
#     arr.gpu_pitch = mem[2]
#     arr.gpu_height = height
    return arr

#   def alloc_remote(self, typecode, comps, width, height = 1, globl = False):
#     """Allocate an ExtArray backed by remote (main) memory."""
#     fmt = self._get_fmt(typecode, comps)

#     if globl:
#       globl = ptx_exec.GLOBAL_BUFFER

#     # Allocate and initialize the memory
#     # TODO - more operand error checking
#     mem = ptx_exec.alloc_remote(self.device, fmt, width, height, globl)


#   def alloc_remote_npy(self, typecode, comps, width, height = 1, globl = False):
#     """Allocate a NumPy ndarray backed by remote (main) memory."""
#     if not HAS_NUMPY:
#       raise ImportError("NumPy array support requires NumPy installation")

#     fmt = self._get_fmt(typecode, comps)
#     if typecode == 'f':
#       dtype = numpy.float32
#     elif typecode == 'i':
#       dtype = numpy.int32
#     elif typecode == 'I':
#       dtype = numpy.uint32
#     else:
#       raise Exception("Unsupported data type: " + str(typecode))

#     if globl:
#       globl = ptx_exec.GLOBAL_BUFFER

#     buf = ptx_exec.calmembuffer(self.device, fmt, width, height, globl)
#     arr = numpy.frombuffer(buf, dtype=dtype)

#     if height == 1:
#       arr.shape = (width, comps)
#     else:
#       arr.shape = (buf.pitch, height, comps)

#     return arr


  def free_device(self, hdl):
    ptx_exec.free_device(hdl.address)

  def free_host(self, arr):
    ptx_exec.free_host(arr.buffer_info()[0])

  def free(self, hdl):
    #if not (isinstance(arr, extarray.extarray) and hasattr(arr, "gpu_mem_handle")):
    #  raise Exception("Not a register or extarray with a GPU memory handle")

    if isinstance(hdl, extarray.extarray):
      if not hasattr(hdl, "gpu_mem_handle"):
        raise TypeError("Not an extarray with a GPU memory handle")

      ptx_exec.free_remote(hdl.gpu_mem_handle)

      del hdl.gpu_mem_handle
      del hdl.gpu_device
      del hdl.gpu_width
      del hdl.gpu_pitch

      hdl.set_memory(0, 0)
      hdl.data_len = 0
    elif isinstance(hdl, LocalMemory):
      ptx_exec.free_local(hdl.binding)
      hdl.res = None
    else:
      raise TypeError("Unknown handle type %s" % (type(hdl)))
    return


  # ------------------------------
  # Kernel Execution
  # ------------------------------

  def copy(self, dst, src, async = False):
    """Copy memory from src to dst, using this GPU."""

    # Figure out what dst and src are and extract bindings
    if isinstance(dst, extarray.extarray):
      ptx_exec.copy_dtoh(dst.buffer_info()[0], src.address, src.length*src.itemsize)
    elif isinstance(dst, DeviceMemory):
      ptx_exec.copy_htod(dst.address, src.buffer_info()[0], src.buffer_info()[1]*src.itemsize)
    #elif isinstance(dst, numpy.ndarray):
    #  # NumPy array.. do we support it, and does it use a CAL buffer?
    #  if not HAS_NUMPY:
    #    raise ImportError("NumPy array support requires NumPy installation")
    #  if not isinstance(arr.base, ptx_exec.calmembuffer):
    #    raise TypeError("Not NumPy with a GPU memory buffer")

    ## Start the copy
    #hdl = ptx_exec.copy_async(self.ctx, dst_binding, src_binding)
    #
    #if async:
    #  return hdl
    #
    ## Not async, complete the copy here.
    #ptx_exec.join_copy(self.ctx, hdl)

    return
def TestSimpleKernel():
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs
    import time

    SIZE = 128

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    #  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
    #  r1 = regs.ptxVariable('reg', 'f32', 'f1')
    #  r2 = regs.ptxVariable('reg', 'f32', 'f2')
    #  r3 = regs.ptxVariable('reg', 'f32', 'f3')
    #  r4 = regs.ptxVariable('reg', 'f32', 'f4')
    #  code.add('  .reg .u64 rd1;')
    #  code.add('  .reg .f32 f1;')
    #  code.add('  .reg .f32 f2;')
    #  code.add('  .reg .f32 f3;')
    #  code.add('  .reg .f32 f4;')

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    #  import pdb
    #  pdb.set_trace()
    # code.add(isa.add(r3, r2, r1))
    # code.add('add.f32 r3, r2, r1;')
    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    # temp = prgm.acquire_register('u32')
    # code.add(isa.cvt(temp, regs.tid.x))
    # code.add(isa.cvt(r4, temp, rnd='rn'))
    temp1 = prgm.acquire_register("u32")
    temp2 = prgm.acquire_register("u32")
    temp3 = prgm.acquire_register("u32")
    code.add(isa.mul(temp2, temp1, temp3, hlw="lo"))

    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()
    #   prgm.render_string = (
    #   '''
    #   .version 1.4
    #   .target sm_10, map_f64_to_f32
    #   .entry _main (
    #   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    #   {
    #   .reg .u64 %rd<3>;
    #   .reg .f32 %f<6>;
    #   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    #   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    #   add.f32 %f3, %f1, %f2;

    #   mov.f32         %f4, %f3;
    #   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    #   st.global.f32   [%rd1+0], %f4;

    #   exit;
    #   } // _Z16addArrayOnDevicePfff
    #   '''
    #   )
    #   prgm.render_code = ptx_exec.compile(prgm.render_string)

    ####
    # ptx_mem_addr = proc.alloc_device('f32', 1)
    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print mem.buffer_info()[0]
    param_list = [ptx_mem_addr, a, b]
    print map(type, param_list)
    #   # image, dev num, (x, y, w, h)

    # import pdb

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    # kernel = prgm.render_string
    # module = ptx_exec.compile(kernel)
    t1 = time.time()
    # ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()
    #  pdb.set_trace()
    print "run time", t2 - t1

    print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
    print int(ptx_mem_addr)
    print int(mem.buffer_info()[0])
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem
    ####

    return