Example #1
  def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on training set.
    
    @param params: network params
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    
    @return output matrix if returnOutputs=True, else None
    """
    if self.popSize != len(params):
      raise ValueError("Need %d Parameter structures (provided %d)" % (
        self.popSize, len(params)))
    
    paramArrayType = Parameters * len(params)
    driver.memcpy_htod(self.params, paramArrayType(*params))

    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)
    
    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      self.trainSet.size,
                                      self.params,
                                      self.popSize,
                                      self.outputs)

    driver.Context.synchronize()

    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, self.trainSet.size),
                                         dtype=np.float32)
    
    if returnOutputs:
      return self.outputsMat
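
A note on the memset call above: memset_d8 takes its count in bytes, which is why the size is multiplied by 4 for float32 outputs. A minimal standalone sketch of the same zeroing, with hypothetical dimensions and np.float32().nbytes in place of the literal 4 (assumes a context initialized via pycuda.autoinit):

import numpy as np
import pycuda.autoinit  # creates a default CUDA context
import pycuda.driver as driver

pop_size, set_size = 8, 128                          # hypothetical dimensions
nbytes = pop_size * set_size * np.float32().nbytes   # byte count, not element count

outputs = driver.mem_alloc(nbytes)
driver.memset_d8(outputs, 0, nbytes)                 # memset_d8 counts bytes

# read back to confirm the buffer is zeroed
mat = driver.from_device(outputs, shape=(pop_size, set_size), dtype=np.float32)
assert not mat.any()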
Example #2
    def add_ancilla(self, anc_st):
        """Add an ancilla in the ground or excited state as the highest new bit.
        """

        byte_size_of_smaller_dm = 2**(2 * self.no_qubits) * 8

        if self.allocated_qubits == self.no_qubits:
            # allocate larger memory
            new_dm = ga.zeros(self._size * 4, np.float64)
            offset = anc_st * 3 * byte_size_of_smaller_dm
            drv.memcpy_dtod(int(new_dm.gpudata) + offset,
                            self.data.gpudata, byte_size_of_smaller_dm)

            self.data = new_dm
        else:
            # reuse previously allocated memory
            if anc_st == 0:
                drv.memset_d8(int(self.data.gpudata) + byte_size_of_smaller_dm,
                              0, 3 * byte_size_of_smaller_dm)
            if anc_st == 1:
                drv.memcpy_dtod(int(self.data.gpudata) + 3 * byte_size_of_smaller_dm,
                                self.data.gpudata, byte_size_of_smaller_dm)
                drv.memset_d8(self.data.gpudata, 0, 3 *
                              byte_size_of_smaller_dm)

        self._set_no_qubits(self.no_qubits + 1)
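
The byte-offset arithmetic above (int(...gpudata) + offset) works because the driver-level copy and memset calls accept raw device addresses. A small sketch of the same idiom, with a made-up buffer size, duplicating the first half of an allocation into its second half:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

half = 1024                                   # hypothetical element count per half
half_bytes = half * np.float64().nbytes

host = np.zeros(2 * half, np.float64)
host[:half] = 1.0
buf = drv.to_device(host)

# the second half of the allocation starts half_bytes past the base address
drv.memcpy_dtod(int(buf) + half_bytes, buf, half_bytes)

assert (drv.from_device(buf, (2 * half,), np.float64) == 1.0).all()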
Example #3
def stepFunction():
  getModulo( psi_d, psiMod_d )
  maxVal = (gpuarray.max(psiMod_d)).get()
  multiplyByScalarReal( cudaPre(0.95/(maxVal)), psiMod_d )
  sendModuloToUCHAR( psiMod_d, plotData_d)
  copyToScreenArray()

  if volumeRender.nTextures == 2:
    if not realDynamics:
      cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
      findActivityKernel( cudaPre(0.001), psi_d, activity_d, grid=grid3D, block=block3D )
    if plotVar == 1: getActivityKernel( psiOther_d, activity_d, grid=grid3D, block=block3D )
    if plotVar == 0:
      if realTEXTURE:
        tex_psiReal.set_array( psiK2Real_array )
        tex_psiImag.set_array( psiK2Imag_array )
        getVelocity_texKernel( dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
      else: getVelocityKernel( np.int32(neighbors), dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
      maxVal = (gpuarray.max(psiOther_d)).get()
      if maxVal > 0: multiplyByScalarReal( cudaPre(1./maxVal), psiOther_d )
    sendModuloToUCHAR( psiOther_d, plotData_d_1)
    copyToScreenArray_1()
  if applyTransition: timeTransition()
  if realDynamics: realStep()
  else: imaginaryStep()
Example #4
	def prepare_dest_package_and_dest_devptr(new_data, dest_package):
		output_package = dest_package.copy()
		if new_data:
			# create new data
			# because we don't have cuda memory allocation for dest_package
			dest_devptr, new_usage = malloc_with_swap_out(output_package.data_bytes)
			output_package.set_usage(new_usage)
			
			cuda.memset_d8(dest_devptr, 0, output_package.data_bytes)
		else:
			# we already have a cuda memory allocation
			# if the existing halo is large enough, we can reuse the existing buffer
			# instead of dest_package; otherwise we have to allocate a new one
			
			new_data_halo = task.dest.data_halo
			exist_data_halo = data_list[u][ss][sp].data_halo
			
			if new_data_halo <= exist_data_halo: 
				output_package = data_list[u][ss][sp]
			else:
				output_package = dest_package
			dest_devptr = data_list[u][ss][sp].devptr

		return output_package, dest_devptr
Example #5
def rk4_iteration():
  cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
  findActivityKernel( cudaPre(0.00001), psi_d, activity_d, grid=grid3D, block=block3D )
  #Step 1
  slopeCoef = cudaPre( 1.0 )
  weight    = cudaPre( 0.5 )
  eulerStepKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiK2_d, psiK1_d, psiRunge_d, np.uint8(0), activity_d, grid=grid3D, block=block3D )
  #Step 2
  slopeCoef = cudaPre( 2.0 )
  weight    = cudaPre( 0.5 )
  eulerStepKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiK1_d, psiK2_d, psiRunge_d, np.uint8(0), activity_d, grid=grid3D, block=block3D )  
  #Step 3
  slopeCoef = cudaPre( 2.0 )
  weight    = cudaPre( 1. )
  eulerStepKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiK2_d, psiK1_d, psiRunge_d, np.uint8(0), activity_d, grid=grid3D, block=block3D )    
  #Step 4
  slopeCoef = cudaPre( 1.0 )
  weight    = cudaPre( 1. )
  eulerStepKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiK1_d, psiK2_d, psiRunge_d, np.uint8(1), activity_d, grid=grid3D, block=block3D ) 
Example #6
    def test_reshuffle_invertible(self):
        dm = random_dm10()

        dm_gpu = drv.to_device(dm)

        for i in range(no_qubits):
            bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                               block=block, grid=grid)

        dmreal = np.zeros(2**(2 * no_qubits))
        dmreal_gpu = drv.to_device(dmreal)

        pauli_reshuffle(dm_gpu, dmreal_gpu, np.int32(no_qubits), np.int32(0),
                        block=block, grid=grid)

        dm_gpu2 = drv.mem_alloc(dm.nbytes)
        drv.memset_d8(dm_gpu2, 0, dm.nbytes)

        pauli_reshuffle(dm_gpu2, dmreal_gpu, np.int32(no_qubits), np.int32(1),
                        block=block, grid=grid)

        for i in range(no_qubits):
            bit_to_pauli_basis(dm_gpu2, np.int32(1 << i), np.int32(no_qubits),
                               block=block, grid=grid)

        dm2 = drv.from_device_like(dm_gpu2, dm)

        assert np.allclose(dm, dm2)
Example #7
def set_ipc_handle(op, shared_queue, handle):
    lock = drv.mem_alloc(1)
    drv.memset_d8(lock, 0, 1)
    buf_ipc_hdl = drv.mem_get_ipc_handle(handle)
    lock_ipc_hdl = drv.mem_get_ipc_handle(lock)
    shared_queue.put((buf_ipc_hdl, lock_ipc_hdl))
    return lock
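
The single byte allocated here serves as a cross-process flag: the producer publishes IPC handles for the buffer and the lock, and a consumer opens them and polls the lock byte. A sketch of that consumer side, assuming pycuda's IPCMemoryHandle wrapper and that the consumer process holds its own context on the same device (shared_queue is the same hypothetical queue object passed to set_ipc_handle):

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

def open_ipc_handles(shared_queue):
    # counterpart to set_ipc_handle, run in the receiving process
    buf_ipc_hdl, lock_ipc_hdl = shared_queue.get()
    buf = drv.IPCMemoryHandle(buf_ipc_hdl)    # maps the sender's buffer
    lock = drv.IPCMemoryHandle(lock_ipc_hdl)  # maps the one-byte flag
    return buf, lock

def wait_and_clear(lock):
    # spin until the sender sets the flag, then reset it for the next round
    while drv.from_device(lock, (1,), np.int8)[0] == 0:
        pass
    drv.memset_d8(lock, 0, 1)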
Example #8
    def execute(self):
        sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
        while sender_ready == 0:
            sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
        drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                        self.tensor.tensor.size * self.op.dtype.itemsize)
        drv.memset_d8(self.sender_ready, 0, 1)
Example #9
def stepFunction():
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10),
                           concentrationIn_d,
                           activeBlocks_d,
                           grid=grid2D,
                           block=block2D)
        getActivityKernel(activeBlocks_d,
                          activeThreads_d,
                          grid=grid2D,
                          block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity),
                  activeThreads_d)
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    else:
        [oneIteration_sh() for i in range(nIterationsPerPlot // 2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
Example #10
    def execute(self):
        for i in range(len(self.op.from_id)):
            sender_ready = drv.from_device(self.sender_ready[i], (1, ),
                                           np.int8)
            while sender_ready == 0:
                sender_ready = drv.from_device(self.sender_ready[i], (1, ),
                                               np.int8)
            drv.memset_d8(self.sender_ready[i], 0, 1)
Example #11
def set_ipc_handle(op, shared_queue, handle, local=False):
    lock = drv.mem_alloc(1)
    drv.memset_d8(lock, 0, 1)
    if local:
        buf_ipc_hdl = int(handle)
        lock_ipc_hdl = int(lock)
    else:
        buf_ipc_hdl = drv.mem_get_ipc_handle(handle)
        lock_ipc_hdl = drv.mem_get_ipc_handle(lock)
    shared_queue.put((local, buf_ipc_hdl, lock_ipc_hdl))
    return lock
Example #12
def execute(positions, num_particles, num_frames):
    #Get host positions:
    cpuPos = numpy.array(positions, dtype=numpy.float32)
    #Allocate position space on device:
    devPos = cuda.mem_alloc(cpuPos.nbytes)
    #Copy positions:
    cuda.memcpy_htod(devPos, cpuPos)

    #Allocate device velocities:
    devVels = cuda.mem_alloc(2 * num_particles * numpy.float32().nbytes)
    cuda.memset_d32(devVels, 0, 2 * num_particles)
    # #Copy velocities:
    # cuda.memcpy_htod(devVels, cpuVels)

    #Allocate and initialize device in-bounds flags to true:
    #inBounds = numpy.zeros(num_particles, dtype=bool)
    devInBounds = cuda.mem_alloc(num_particles * numpy.bool_().nbytes)
    cuda.memset_d8(devInBounds, True, num_particles)  # True is written as byte 1

    # inB = numpy.zeros(num_particles, dtype=numpy.bool)
    # cuda.memcpy_dtoh(inB, devInBounds)
    # print inB

    # cuda.memcpy_htod(devInBounds, inBounds)
    # numBlocks = 1#(num_particles // 512) + 1;
    grid_dim = ((num_particles // NUM_THREADS) + 1, 1)
    print(grid_dim)
    runframe = module.get_function("runframe")
    frames = [None] * num_frames
    for i in range(num_frames):
        runframe(devPos,
                 devVels,
                 devInBounds,
                 numpy.int32(num_particles),
                 grid=grid_dim,
                 block=(NUM_THREADS, 1, 1))
        #Get the positions from device:
        cuda.memcpy_dtoh(cpuPos, devPos)
        frames[i] = cpuPos.copy()
        #frames[i] = copy(cpuPos)
        #write_frame(out, cpuPos, num_particles)

    #Simulation destination file:
    # out = open(OUTPUT_FILE, 'w')
    # write_header(out, num_particles)
    # for frame in frames:
    #     write_frame(out, frame, num_particles)

    #clean up...
    #out.close()
    devPos.free()
    devVels.free()
    devInBounds.free()
Example #13
def execute(positions, num_particles, num_frames):
    #Get host positions:
    cpuPos = numpy.array(positions, dtype=numpy.float32)
    #Allocate position space on device:
    devPos = cuda.mem_alloc(cpuPos.nbytes)
    #Copy positions:
    cuda.memcpy_htod(devPos, cpuPos)
    
    #Allocate device velocities:
    devVels = cuda.mem_alloc(2 * num_particles * numpy.float32().nbytes)
    cuda.memset_d32(devVels, 0, 2 * num_particles)
    # #Copy velocities:
    # cuda.memcpy_htod(devVels, cpuVels)
    
    #Allocate and initialize device in-bounds flags to true:
    #inBounds = numpy.zeros(num_particles, dtype=bool)
    devInBounds = cuda.mem_alloc(num_particles * numpy.bool_().nbytes)
    cuda.memset_d8(devInBounds, True, num_particles)  # True is written as byte 1
    
    # inB = numpy.zeros(num_particles, dtype=numpy.bool)
    # cuda.memcpy_dtoh(inB, devInBounds)
    # print inB
    
    # cuda.memcpy_htod(devInBounds, inBounds)
    # numBlocks = 1#(num_particles // 512) + 1;
    grid_dim = ((num_particles // NUM_THREADS) + 1, 1)
    print(grid_dim)
    runframe = module.get_function("runframe")
    frames = [None] * num_frames
    for i in range(num_frames):
        runframe(devPos, devVels, devInBounds, 
                 numpy.int32(num_particles),
                 grid=grid_dim,
                 block=(NUM_THREADS, 1, 1))
        #Get the positions from device:
        cuda.memcpy_dtoh(cpuPos, devPos)
        frames[i] = cpuPos.copy()
        #frames[i] = copy(cpuPos)
        #write_frame(out, cpuPos, num_particles)
    
    #Simulation destination file:
    # out = open(OUTPUT_FILE, 'w')
    # write_header(out, num_particles)
    # for frame in frames:
    #     write_frame(out, frame, num_particles)
    
    #clean up...
    #out.close()
    devPos.free()
    devVels.free()
    devInBounds.free()
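
The two memsets above use different count conventions: memset_d32 takes a count of 32-bit words (one per float32 velocity component), while memset_d8 takes a count of bytes (one per bool flag). A minimal sketch, with a hypothetical element count, showing the two equivalent ways to zero the same float32 buffer:

import numpy
import pycuda.autoinit
import pycuda.driver as cuda

n = 1024                                             # hypothetical element count
buf = cuda.mem_alloc(n * numpy.float32().nbytes)

cuda.memset_d32(buf, 0, n)                           # count in 32-bit words
cuda.memset_d8(buf, 0, n * numpy.float32().nbytes)   # same effect, count in bytes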
Example #14
def rk4_FFT_iteration():
  cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
  findActivityKernel( cudaPre(0.00001), psi_d, activity_d, grid=grid3D, block=block3D )
  #Step 1
  slopeCoef = cudaPre( 1.0 )
  weight    = cudaPre( 0.5 )
  fftPlan.execute( psiK2_d, psiFFT_d )
  getFFTderivatives( Lx, Ly, Lz, psiFFT_d, fftKx_d, fftKy_d, fftKz_d, partialX_d, partialY_d, laplacian_d, grid=grid3D, block=block3D )
  fftPlan.execute( partialX_d,  inverse=True )
  fftPlan.execute( partialY_d,  inverse=True )
  fftPlan.execute( laplacian_d, inverse=True )
  eulerStep_FFTKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, x0, y0, omega,
		  psi_d, psiK2_d, psiK1_d, psiRunge_d, laplacian_d, partialX_d, partialY_d,
		  np.uint8(0), activity_d, grid=grid3D, block=block3D )
  #Step 2
  slopeCoef = cudaPre( 2.0 )
  weight    = cudaPre( 0.5 )
  fftPlan.execute( psiK1_d, psiFFT_d )
  getFFTderivatives( Lx, Ly, Lz, psiFFT_d, fftKx_d, fftKy_d, fftKz_d, partialX_d, partialY_d, laplacian_d, grid=grid3D, block=block3D )
  fftPlan.execute( partialX_d,  inverse=True )
  fftPlan.execute( partialY_d,  inverse=True )
  fftPlan.execute( laplacian_d, inverse=True )
  eulerStep_FFTKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, x0, y0, omega,
		  psi_d, psiK1_d, psiK2_d, psiRunge_d, laplacian_d, partialX_d, partialY_d,
		  np.uint8(0), activity_d, grid=grid3D, block=block3D )  
  #Step 3
  slopeCoef = cudaPre( 2.0 )
  weight    = cudaPre( 1. )
  fftPlan.execute( psiK2_d, psiFFT_d )
  getFFTderivatives( Lx, Ly, Lz, psiFFT_d, fftKx_d, fftKy_d, fftKz_d, partialX_d, partialY_d, laplacian_d, grid=grid3D, block=block3D )
  fftPlan.execute( partialX_d,  inverse=True )
  fftPlan.execute( partialY_d,  inverse=True )
  fftPlan.execute( laplacian_d, inverse=True )
  eulerStep_FFTKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, x0, y0, omega,
		  psi_d, psiK2_d, psiK1_d, psiRunge_d, laplacian_d, partialX_d, partialY_d,
		  np.uint8(0), activity_d, grid=grid3D, block=block3D )    
  #Step 4
  slopeCoef = cudaPre( 1.0 )
  weight    = cudaPre( 1. )
  fftPlan.execute( psiK1_d, psiFFT_d )
  getFFTderivatives( Lx, Ly, Lz, psiFFT_d, fftKx_d, fftKy_d, fftKz_d, partialX_d, partialY_d, laplacian_d, grid=grid3D, block=block3D )
  fftPlan.execute( partialX_d,  inverse=True )
  fftPlan.execute( partialY_d,  inverse=True )
  fftPlan.execute( laplacian_d, inverse=True )
  eulerStep_FFTKernel( np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, x0, y0, omega,
		  psi_d, psiK1_d, psiK2_d, psiRunge_d, laplacian_d, partialX_d, partialY_d,
		  np.uint8(1), activity_d, grid=grid3D, block=block3D ) 
Example #15
    def memset(self, allocation, value, size):
        """set the memory in allocation to the value in value

        :param allocation: A GPU memory allocation unit
        :type allocation: pycuda.driver.DeviceAllocation

        :param value: The value to set the memory to
        :type value: a single 8-bit unsigned int

        :param size: The size of the allocation unit in bytes
        :type size: int

        """
        drv.memset_d8(allocation, value, size)
Example #16
    def memset(self, allocation, value, size):
        """set the memory in allocation to the value in value

        :param allocation: A GPU memory allocation unit
        :type allocation: pycuda.driver.DeviceAllocation

        :param value: The value to set the memory to
        :type value: a single 8-bit unsigned int

        :param size: The size of the allocation unit in bytes
        :type size: int

        """
        drv.memset_d8(allocation, value, size)
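
A short usage sketch for this wrapper, with a hypothetical CudaFunctions stand-in for the (unnamed) class the method belongs to: allocate a buffer, fill every byte with 0xFF, and read it back to verify.

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

class CudaFunctions:                      # hypothetical stand-in for the class above
    def memset(self, allocation, value, size):
        drv.memset_d8(allocation, value, size)

size = 256                                # hypothetical allocation size in bytes
allocation = drv.mem_alloc(size)
CudaFunctions().memset(allocation, 0xFF, size)

filled = drv.from_device(allocation, (size,), np.uint8)
assert (filled == 0xFF).all()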
Example #17
    def execute(self):
        if self.recvr_buf is None:
            # set_ipc_handle must be called before open_ipc_handle in certain cases to avoid a
            # hang, hence calling set_ in bind_buffers and open_ in execute.
            # See corresponding comment in ScatterRecv kernel for details.
            (self.tnsr_ipc_hdl, self.send_ready) = open_ipc_handle(
                self.op._shared_queues[self.op.idx])
            chunk_size = self.tensor.tensor.size * self.op.dtype.itemsize
            self.recvr_buf = int(self.tnsr_ipc_hdl) + self.op.idx * chunk_size

        # Push our fragment into its section of the larger recvr buffer, which assumes gather axis
        # is least contiguous.
        drv.memcpy_dtod(self.recvr_buf, self.tensor.tensor.gpudata,
                        self.tensor.tensor.size * self.op.dtype.itemsize)
        drv.memset_d8(self.send_ready, 1, 1)
Example #18
def stepFunction():
  global animIter
  if showActivity: 
    cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks )
    findActivityKernel( cudaPre(1.e-10), concentrationIn_d, activeBlocks_d, grid=grid2D, block=block2D  )
    getActivityKernel( activeBlocks_d, activeThreads_d, grid=grid2D, block=block2D ) 
  cuda.memcpy_dtod( plotData_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes )
  maxVal = gpuarray.max( plotData_d ).get()
  scalePlotData(100./maxVal, plotData_d, np.uint8(showActivity), activeThreads_d )
  if cudaP == "float": [ oneIteration_tex() for i in range(nIterationsPerPlot) ]
  else: [ oneIteration_sh() for i in range(nIterationsPerPlot//2) ]
  if plotting and animIter%25 == 0: 
    maxVals.append( maxVal )
    sumConc.append( gpuarray.sum(concentrationIn_d).get() )
    plotData( maxVals, sumConc )
  animIter += 1
Example #19
	def prepare_dest_package_and_dest_devptr(new_data, dest_package):
		output_package = dest_package.copy()
		if new_data:
			# create new data
			dest_devptr, new_usage = malloc_with_swap_out(output_package.buffer_bytes)
			output_package.usage = new_usage
			cuda.memset_d8(dest_devptr, 0, output_package.buffer_bytes)
		else:
			new_data_halo = task.dest.data_halo
			exist_data_halo = data_list[u][ss][sp]
			
			if new_data_halo < exist_data_halo: output_package = data_list[u][ss][sp]
			else: output_package = dest_package
			
			dest_devptr = data_list[u][ss][sp].devptr

		return output_package, dest_devptr
Example #20
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8( self.gpudata,
                                   unpack_from('B', value)[0],
                                   self.size)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16(self.gpudata,
                                   unpack_from('H', value)[0],
                                   self.size)
                else:
                    drv.memset_d32(self.gpudata,
                                   unpack_from('I', value)[0],
                                   self.size)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
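
The itemsize dispatch above generalizes memset to arbitrary scalar fills: memset_dN writes a raw N-bit pattern, and unpack_from reinterprets the numpy scalar's bytes as that pattern. A standalone sketch of the same trick for a float32 constant, with a hypothetical buffer size:

import numpy as np
from struct import unpack_from
import pycuda.autoinit
import pycuda.driver as drv

n = 512                                   # hypothetical element count
buf = drv.mem_alloc(n * np.float32().nbytes)

value = np.float32(3.5)
bits = unpack_from('I', value)[0]         # the float32's bytes, read as a uint32
drv.memset_d32(buf, bits, n)              # count is in 32-bit words

assert (drv.from_device(buf, (n,), np.float32) == 3.5).all()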
Example #21
def rk4_texture_iteration():
  cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
  findActivityKernel( cudaPre(0.00001), psi_d, activity_d, grid=grid3D, block=block3D )
  #Step 1
  slopeCoef = cudaPre( 1.0 )
  weight    = cudaPre( 0.5 )
  tex_psiReal.set_array( psiK2Real_array )
  tex_psiImag.set_array( psiK2Imag_array )
  surf_psiReal.set_array( psiK1Real_array )
  surf_psiImag.set_array( psiK1Imag_array )
  eulerStep_textKernel( slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiRunge_d, np.uint8(0), activity_d, grid=grid3D, block=block3D )
  #Step 2
  slopeCoef = cudaPre( 2.0 )
  weight    = cudaPre( 0.5 )
  tex_psiReal.set_array( psiK1Real_array )
  tex_psiImag.set_array( psiK1Imag_array )
  surf_psiReal.set_array( psiK2Real_array )
  surf_psiImag.set_array( psiK2Imag_array )
  eulerStep_textKernel(  slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiRunge_d, np.uint8(0), activity_d, grid=grid3D, block=block3D )  
  #Step 3
  slopeCoef = cudaPre( 2.0 )
  weight    = cudaPre( 1. )
  tex_psiReal.set_array( psiK2Real_array )
  tex_psiImag.set_array( psiK2Imag_array )
  surf_psiReal.set_array( psiK1Real_array )
  surf_psiImag.set_array( psiK1Imag_array )
  eulerStep_textKernel(  slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiRunge_d, np.uint8(0), activity_d, grid=grid3D, block=block3D )    
  #Step 4
  slopeCoef = cudaPre( 1.0 )
  weight    = cudaPre( 1. )
  tex_psiReal.set_array( psiK1Real_array )
  tex_psiImag.set_array( psiK1Imag_array )
  surf_psiReal.set_array( psiK2Real_array )
  surf_psiImag.set_array( psiK2Imag_array )
  eulerStep_textKernel(  slopeCoef, weight,
		  xMin, yMin, zMin, dx, dy, dz, dtReal, gammaX, gammaY, gammaZ, omega,
		  psi_d, psiRunge_d, np.uint8(1), activity_d, grid=grid3D, block=block3D ) 
Example #22
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8(self.gpudata,
                                  unpack_from('B', value)[0], self.size)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16(self.gpudata,
                                   unpack_from('H', value)[0], self.size)
                else:
                    drv.memset_d32(self.gpudata,
                                   unpack_from('I', value)[0], self.size)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
Example #23
    def prepare_dest_package_and_dest_devptr(new_data, dest_package):
        output_package = dest_package.copy()
        if new_data:
            # create new data
            dest_devptr, new_usage = malloc_with_swap_out(
                output_package.buffer_bytes)
            output_package.usage = new_usage
            cuda.memset_d8(dest_devptr, 0, output_package.buffer_bytes)
        else:
            new_data_halo = task.dest.data_halo
            exist_data_halo = data_list[u][ss][sp]

            if new_data_halo < exist_data_halo:
                output_package = data_list[u][ss][sp]
            else:
                output_package = dest_package

            dest_devptr = data_list[u][ss][sp].devptr

        return output_package, dest_devptr
Example #24
    def step_stage1(self):
        # Copy data to GPU memory
        cuda.memcpy_htod(self.mass_rx_array_g, self.mass_r_array[:, 0])
        cuda.memcpy_htod(self.mass_ry_array_g, self.mass_r_array[:, 1])
        cuda.memcpy_htod(self.mass_rz_array_g, self.mass_r_array[:, 2])
        cuda.memset_d8(self.mass_ax_array_g, 0, self.MEM_LEN)
        cuda.memset_d8(self.mass_ay_array_g, 0, self.MEM_LEN)
        cuda.memset_d8(self.mass_az_array_g, 0, self.MEM_LEN)
        # Run "pair" calculation: one object against a vector of objects per iteration
        for row_np, threads_per_block, blocks in self.index_range:
            self.sm_update_pair(row_np,
                                self.MASS_LEN_np,
                                self.mass_rx_array_g,
                                self.mass_ry_array_g,
                                self.mass_rz_array_g,
                                self.mass_ax_array_g,
                                self.mass_ay_array_g,
                                self.mass_az_array_g,
                                self.mass_m_array_g,
                                block=(threads_per_block, 1, 1),
                                grid=(blocks, 1))
        # Copy results back from GPU memory
        cuda.memcpy_dtoh(self.mass_a_array[:, 0], self.mass_ax_array_g)
        cuda.memcpy_dtoh(self.mass_a_array[:, 1], self.mass_ay_array_g)
        cuda.memcpy_dtoh(self.mass_a_array[:, 2], self.mass_az_array_g)
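
Zeroing the float64 acceleration accumulators byte-wise works because the IEEE-754 encoding of 0.0 is all-zero bytes, so MEM_LEN here must be a byte length. A small standalone sketch of the same reset, with a hypothetical array length:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

n = 4096                                  # hypothetical number of masses
mem_len = n * np.float64().nbytes         # byte count, as memset_d8 expects
ax_g = cuda.mem_alloc(mem_len)

cuda.memset_d8(ax_g, 0, mem_len)          # all-zero bytes == 0.0 for IEEE-754 floats
assert not cuda.from_device(ax_g, (n,), np.float64).any()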
Example #25
    def execute(self):
        # Push our fragment into its section of the larger recvr buffer, which
        # assumes the gather axis is least contiguous.
        drv.memcpy_dtod(self.recvr_buf, self.tensor.tensor.gpudata,
                        self.tensor.tensor.size * self.op.dtype.itemsize)
        drv.memset_d8(self.send_ready, 1, 1)
Example #26
    def func():
        drv.memset_d8(devU, 0, cpuU.nbytes)
        kernel.prepared_call(grid, block, *parms)
Example #27
    def execute(self):
        for i in range(len(self.op.to_id)):
            drv.memset_d8(self.send_ready[i], 1, 1)