Python sizeofの例、pycuda.characterize.sizeof Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cool.py プロジェクト: danrugeles/Heron

def get_rng_states(size, block, grid):
    """Allocate and initialise ``size`` cuRAND generator states on the device.

    One ``curandState`` slot is allocated per generator, and the ``init_rng``
    kernel compiled below initialises slot ``idx`` with
    ``curand_init(1234, idx, 0, ...)``.

    Args:
        size: Number of generator states. Also passed to the kernel as
            ``nthreads`` so that threads with ``idx >= size`` return early.
        block: Block dimensions, forwarded unchanged to the kernel launch.
        grid: Grid dimensions, forwarded unchanged to the kernel launch.

    Returns:
        The device allocation (from ``cuda.mem_alloc``) holding the states.
    """
    state_nbytes = characterize.sizeof('curandState',
                                       "#include <curand_kernel.h>")
    rng_states = cuda.mem_alloc(size * state_nbytes)

    kernel_source = """
	#include <curand_kernel.h>

	extern "C"
	{

	__global__ void init_rng(int nthreads, curandState *s )
	{
		   int idx = blockIdx.x*blockDim.x + threadIdx.x;

		   if (idx >= nthreads)
		           return;

		   curand_init(1234, idx, 0, &s[idx]);
	}

	} // extern "C"
	"""
    # The source wraps the kernel in its own extern "C" block, hence
    # no_extern_c=True.
    compiled = SourceModule(kernel_source, no_extern_c=True)
    launch_init = compiled.get_function('init_rng')
    launch_init(np.int32(size), rng_states, block=block, grid=grid)

    return rng_states

コード例 #2

0

ファイルを表示

    def __init__(self, detector, wavelengths=None, print_usage=False):
        """Build the GPU geometry for ``detector`` and upload its lookup tables.

        After the base ``GPUGeometry`` initialisation this uploads the
        solid-id to channel-index map and the time/charge CDF tables, then
        packs their device handles and a few scalars into a ``Detector``
        struct whose size is measured from ``detector.h``.

        Args:
            detector: Object providing ``solid_id_to_channel_index``,
                ``num_channels()``, ``time_cdf`` and ``charge_cdf``.
            wavelengths: Forwarded to ``GPUGeometry.__init__``.
            print_usage: Accepted for interface compatibility.
        """
        # NOTE(review): a literal False is forwarded here, so the caller's
        # print_usage value is never used -- confirm this is intentional.
        GPUGeometry.__init__(self,
                             detector,
                             wavelengths=wavelengths,
                             print_usage=False)

        channel_index = detector.solid_id_to_channel_index.astype(np.int32)
        self.solid_id_to_channel_index_gpu = ga.to_gpu(channel_index)
        self.nchannels = detector.num_channels()

        # Index 0 / 1 of each CDF table are uploaded as the x / y arrays.
        time_x = detector.time_cdf[0].astype(np.float32)
        self.time_cdf_x_gpu = ga.to_gpu(time_x)
        time_y = detector.time_cdf[1].astype(np.float32)
        self.time_cdf_y_gpu = ga.to_gpu(time_y)

        charge_x = detector.charge_cdf[0].astype(np.float32)
        self.charge_cdf_x_gpu = ga.to_gpu(charge_x)
        charge_y = detector.charge_cdf[1].astype(np.float32)
        self.charge_cdf_y_gpu = ga.to_gpu(charge_y)

        struct_nbytes = characterize.sizeof('Detector',
                                            get_cu_source('detector.h'))
        # Member order is significant: it is passed to make_gpu_struct as-is.
        struct_members = [
            self.solid_id_to_channel_index_gpu,
            self.time_cdf_x_gpu,
            self.time_cdf_y_gpu,
            self.charge_cdf_x_gpu,
            self.charge_cdf_y_gpu,
            np.int32(self.nchannels),
            np.int32(len(detector.time_cdf[0])),
            np.int32(len(detector.charge_cdf[0])),
            np.float32(detector.charge_cdf[0][-1] / 2**16),
        ]
        self.detector_gpu = make_gpu_struct(struct_nbytes, struct_members)

コード例 #3

0

ファイルを表示

    def get_rng_states(size):
        """Allocate and initialise ``size`` MRG32k3a cuRAND states.

        Each state ``tid`` is initialised by the kernel below with
        ``curand_init(tid, 0, 0, ...)``, i.e. the thread index is used as
        the seed.

        Args:
            size: Number of generator states to create.

        Returns:
            The device allocation (from ``cuda.mem_alloc``) holding the states.
        """
        init_rng_src = """
        #include <curand_kernel.h>

        extern "C"
        {

        __global__ void init_rng(int nthreads, curandStateMRG32k3a *s)
        {
            int tid = threadIdx.x + (blockIdx.x * blockDim.x);

            if (tid >= nthreads)
            {
                return;
            }

            curand_init(tid, 0, 0, &s[tid]);
        }

        } // extern "C"
        """

        state_nbytes = characterize.sizeof(
            'curandStateMRG32k3a', '#include <curand_kernel.h>')
        rng_states = cuda.mem_alloc(size * state_nbytes)

        module = SourceModule(init_rng_src, no_extern_c=True)
        init_rng = module.get_function('init_rng')

        # The kernel declares exactly two parameters (nthreads, s). The launch
        # previously passed a third value (numpy.uint64(0)) that the kernel
        # has no parameter for; only the declared arguments are passed now.
        # The grid is rounded up to whole 64-thread blocks; surplus threads
        # exit through the `tid >= nthreads` guard in the kernel.
        init_rng(numpy.int32(size),
                 rng_states,
                 block=(64, 1, 1),
                 grid=(size // 64 + 1, 1))

        return rng_states

コード例 #4

0

ファイルを表示

ファイル: random.py プロジェクト: fjarri-attic/beclab

	def seed(self, seed=None):
		"""(Re)create the device-side cuRAND states from a host seed.

		A host ``numpy.random.RandomState`` seeded with ``seed`` draws one
		32-bit seed per generator. The seeds are uploaded and handed to
		``self._initialize`` together with a freshly allocated byte buffer
		sized for ``curandStateXORWOW`` states.

		Sets ``self.states``, ``self.gsize`` and ``self.lsize``.
		"""
		from pycuda.characterize import sizeof, has_stack
		import pycuda.driver as cuda
		import pycuda.gpuarray as gpuarray

		host_rng = numpy.random.RandomState()
		host_rng.seed(seed)

		# Use the largest block size both kernels support, one block per
		# multiprocessor.
		threads_per_block = min(
			self._initialize.max_threads_per_block,
			self._sample.max_threads_per_block)
		multiprocessors = self._env.device.get_attribute(
			cuda.device_attribute.MULTIPROCESSOR_COUNT)

		block_shape = (threads_per_block, 1, 1)
		global_shape = (multiprocessors * threads_per_block, 1, 1)

		generator_count = threads_per_block * multiprocessors
		assert generator_count <= 20000

		host_seeds = host_rng.randint(0, 2**32 - 1, size=generator_count)
		seeds = gpuarray.to_gpu(host_seeds.astype(numpy.uint32))

		bytes_per_state = sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
		# The states are opaque to Python, so they live in a raw byte array.
		self.states = gpuarray.GPUArray(generator_count * bytes_per_state, numpy.uint8)

		self._initialize.customCall(global_shape, block_shape, self.states.gpudata, seeds.gpudata)
		self._env.synchronize()

		self.gsize = global_shape
		self.lsize = block_shape

コード例 #5

0

ファイルを表示

ファイル: cuda_lib.py プロジェクト: koszullab/centroID

    def __init__(self, init_data, n_generators):
        """Create a CUDA context, upload ``init_data`` and prepare the kernels.

        Args:
            init_data: Host array; ``shape[0]`` is stored as ``width_mat`` and
                the raw bytes are copied to the device.
            n_generators: Number of ``curandStateXORWOW`` states to allocate
                and initialise through the module's ``init_rng`` kernel.
        """

        def report_free_memory():
            # Print the free fraction and the free amount reported by the driver.
            (free, total) = cuda.mem_get_info()
            print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
            print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        self.ctx = curr_gpu.make_context()
        self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
        report_free_memory()

        # --- input matrix -------------------------------------------------
        self.width_mat = np.int32(init_data.shape[0])
        self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
        cuda.memcpy_htod(self.gpu_init_data, init_data)

        # --- zero-filled float32 output matrix -----------------------------
        self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
        print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
        report_free_memory()

        self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
        cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)

        # --- per-row accumulator vector --------------------------------------
        self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
        self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
        cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)

        # --- kernel handles, stored under the kernel's own name --------------
        for kernel_name in ("init_rng", "gen_rand_mat", "sum_along_axis",
                            "norm_along_axis", "init_vect_sum", "copy_mat"):
            setattr(self, kernel_name, self.module.get_function(kernel_name))

        # --- random number generator states ----------------------------------
        self.n_generators = n_generators
        seed = 1
        state_nbytes = characterize.sizeof(
            "curandStateXORWOW", "#include <curand_kernel.h>")
        self.rng_states = cuda.mem_alloc(n_generators * state_nbytes)
        self.init_rng(
            np.int32(n_generators),
            self.rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_generators // 64 + 1, 1),
        )
        # The original queries the driver once more here without using the
        # result; the call is kept so the sequence of driver calls is unchanged.
        cuda.mem_get_info()

        # --- 2D launch configuration covering a width_mat x width_mat matrix --
        tile = 32
        blocks_per_axis = int(self.width_mat) // tile + 1
        self.grid = (blocks_per_axis, blocks_per_axis, 1)
        self.block = (tile, tile, 1)
コード例 #6

0

ファイルを表示

ファイル: curandom.py プロジェクト: pikawika/VUB-CC-Project

    def state(self):
        """Return the device buffer of generator states, allocating it lazily.

        On first use the buffer is sized as
        ``block_count * generators_per_block * sizeof(state_type)`` and cached
        in ``self._state``; later calls return the cached allocation.
        """
        if self._state is not None:
            return self._state

        from pycuda.characterize import sizeof

        bytes_per_state = sizeof(self.state_type, "#include <curand_kernel.h>")
        total_bytes = (
            self.block_count * self.generators_per_block * bytes_per_state)
        self._state = drv.mem_alloc(total_bytes)
        return self._state

コード例 #7

0

ファイルを表示

ファイル: curandom.py プロジェクト: hannes-brt/pycuda

    def state(self):
        """Lazily allocate and return the cuRAND state buffer.

        The allocation happens once; its size in bytes is the size of
        ``self.state_type`` multiplied by
        ``self.block_count * self.generators_per_block``.
        """
        cached = self._state
        if cached is not None:
            return cached

        from pycuda.characterize import sizeof

        state_nbytes = sizeof(self.state_type, "#include <curand_kernel.h>")
        self._state = drv.mem_alloc(
            self.block_count * self.generators_per_block * state_nbytes)
        return self._state

コード例 #8

0

ファイルを表示

ファイル: tools.py プロジェクト: BenLand100/chroma

def get_rng_states(size, seed=1):
    """Return `size` number of CUDA random number generator states.

    The states are ``curandStateXORWOW`` structs initialised by the
    ``init_rng`` kernel compiled from ``init_rng_src``.

    Args:
        size: Number of generator states to allocate and initialise.
        seed: Value passed to the kernel as a 64-bit unsigned integer.

    Returns:
        The device allocation holding the initialised states.
    """
    state_nbytes = characterize.sizeof('curandStateXORWOW',
                                       '#include <curand_kernel.h>')
    rng_states = cuda.mem_alloc(size * state_nbytes)

    compiled = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
    launch_init = compiled.get_function('init_rng')

    # 64 threads per block, with enough blocks to cover `size` threads.
    threads_per_block = (64, 1, 1)
    blocks = (size // 64 + 1, 1)
    launch_init(np.int32(size), rng_states, np.uint64(seed), np.uint64(0),
                block=threads_per_block, grid=blocks)

    return rng_states

コード例 #9

0

ファイルを表示

def get_rng_states(size, seed=1):
    """Return `size` number of CUDA random number generator states.

    Allocates room for ``size`` ``curandStateXORWOW`` structs and launches
    the ``init_rng`` kernel (compiled from ``init_rng_src``) over them.

    Args:
        size: Number of generator states.
        seed: Seed value handed to the kernel as ``np.uint64``.

    Returns:
        The device allocation returned by ``cuda.mem_alloc``.
    """
    bytes_per_state = characterize.sizeof(
        'curandStateXORWOW', '#include <curand_kernel.h>')
    rng_states = cuda.mem_alloc(size * bytes_per_state)

    init_rng = pycuda.compiler.SourceModule(
        init_rng_src, no_extern_c=True).get_function('init_rng')

    kernel_args = (np.int32(size), rng_states, np.uint64(seed), np.uint64(0))
    # Launch in 64-thread blocks, rounding the block count up.
    init_rng(*kernel_args, block=(64, 1, 1), grid=(size // 64 + 1, 1))

    return rng_states

コード例 #10

0

ファイルを表示

def get_pInit(module, x, y, z):
    """Allocate one device-side ``vec3`` and fill it via ``init_pInit``.

    Args:
        module: Compiled module that provides the ``init_pInit`` kernel.
        x, y, z: Components passed to the kernel as 32-bit floats.

    Returns:
        The device allocation, sized for a single ``vec3``.
    """
    # NOTE(review): the include directory is a hard-coded absolute path on the
    # original author's machine; it must exist for the size query to compile.
    vec3_nbytes = characterize.sizeof(
        'vec3',
        "#include <vec3.h>",
        include_dirs='/home/thomas/Documents/toy-mc/photon_prob/cuda_tools')
    pInit = cuda.mem_alloc(vec3_nbytes)

    launch = module.get_function('init_pInit')
    # A single thread is enough: only one vec3 is initialised.
    launch(np.uint64(1), np.float32(x), np.float32(y), np.float32(z), pInit,
           block=(1, 1, 1), grid=(1, 1))

    return pInit

コード例 #11

0

ファイルを表示

def get_times(module, size):
    """Allocate a device buffer of ``size`` floats and run ``init_times`` on it.

    Args:
        module: Compiled module that provides the ``init_times`` kernel.
        size: Number of ``float`` elements. Also passed to the kernel as a
            64-bit unsigned integer.

    Returns:
        The device allocation holding the ``size`` floats.
    """
    float_nbytes = characterize.sizeof('float')
    times = cuda.mem_alloc(size * float_nbytes)

    launch = module.get_function('init_times')
    # 64 threads per block, with the block count rounded up to cover `size`.
    launch(np.uint64(size), times, block=(64, 1, 1), grid=(size // 64 + 1, 1))

    return times

コード例 #12

0

ファイルを表示

def get_rng_states(module, size, seed=1):
    """Return `size` number of CUDA random number generator states.

    The buffer is sized for ``curandStatePhilox4_32_10_t`` structs and
    initialised by the ``init_rng`` kernel of the supplied module.

    Args:
        module: Compiled module that provides the ``init_rng`` kernel.
        size: Number of generator states.
        seed: Seed passed to the kernel as a 64-bit unsigned integer.

    Returns:
        The device allocation holding the states.
    """
    state_nbytes = characterize.sizeof('curandStatePhilox4_32_10_t',
                                       '#include <curand_kernel.h>')
    rng_states = cuda.mem_alloc(size * state_nbytes)

    launch = module.get_function('init_rng')
    # 64 threads per block, with the block count rounded up to cover `size`.
    launch(np.uint64(size), rng_states, np.uint64(seed),
           block=(64, 1, 1), grid=(size // 64 + 1, 1))

    return rng_states

コード例 #13

0

ファイルを表示

ファイル: cpu_basic_python_implementation.py プロジェクト: Tamme/mutationalsignaturesNCSUT

def init_rng(seed):
    """Allocate the cuRAND states and compile the dropout / salt-pepper kernels.

    Reads the module globals ``_rng_threads`` and ``_rng_blocks`` to size the
    state buffer (one ``curandState`` per thread), and sets the module globals
    ``_rng_state``, ``_dropout_kernel`` and ``_saltpepper_kernel``.

    Args:
        seed: Seed passed to ``curand_init`` for every thread; each thread
            uses its own index as the sequence number.
    """
    global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks
    from pycuda.characterize import sizeof
    ds = sizeof("curandState", "#include <curand_kernel.h>")
    _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds)

    # Fix: the loop body of saltpepper_eltw is now braced. Without braces only
    # the first assignment belonged to the `for`, and the second one referred
    # to the loop variable `i` outside of its scope.
    kernel_source = '''
    #include <curand_kernel.h>

    extern "C"
    {
    __global__ void setup_rng(curandState* rng_state, const unsigned seed)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        curand_init(seed, tid, 0, &rng_state[tid]);
    }

    __global__ void dropout_eltw(float* x, const unsigned size,
                                 float dropout_rate,
                                 curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads)
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
        rng_state[tid] = localState;
    }

    __global__ void saltpepper_eltw(float* x, const unsigned size,
                                    float dropout_rate,
                                    curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads) {
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i];
        }
        rng_state[tid] = localState;
    }
    }
    '''
    src = SourceModule(kernel_source, no_extern_c=True)

    # Initialise one generator state per thread.
    # NOTE(review): PyCUDA documents prepared_call(grid, block, *args), so
    # _rng_threads is used as the grid width and _rng_blocks as the block
    # width here. The total thread count still matches the allocation above;
    # confirm the ordering is intended.
    setup_rng = src.get_function("setup_rng")
    setup_rng.prepare("Pi")
    setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1),
                            _rng_state, np.uint32(seed))

    _dropout_kernel = src.get_function("dropout_eltw")
    _dropout_kernel.prepare("PifP")
    _saltpepper_kernel = src.get_function("saltpepper_eltw")
    _saltpepper_kernel.prepare("PifP")

コード例 #14

0

ファイルを表示

ファイル: cpu_basic_python_implementation.py プロジェクト: Tamme/mutationalsignaturesNCSUT

def init_rng(seed):
    """Set up the device random state and the element-wise noise kernels.

    Uses the module globals ``_rng_threads`` and ``_rng_blocks`` to allocate
    one ``curandState`` per thread, and stores the results in the module
    globals ``_rng_state``, ``_dropout_kernel`` and ``_saltpepper_kernel``.

    Args:
        seed: Seed handed to ``curand_init``; the thread index is used as the
            sequence number.
    """
    global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks
    from pycuda.characterize import sizeof
    ds = sizeof("curandState", "#include <curand_kernel.h>")
    _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds)

    # Fix: saltpepper_eltw now braces its loop body. The unbraced version left
    # the second assignment outside the `for`, where the loop variable `i` is
    # no longer in scope.
    src = SourceModule(
    '''
    #include <curand_kernel.h>

    extern "C"
    {
    __global__ void setup_rng(curandState* rng_state, const unsigned seed)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        curand_init(seed, tid, 0, &rng_state[tid]);
    }

    __global__ void dropout_eltw(float* x, const unsigned size,
                                 float dropout_rate,
                                 curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads)
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
        rng_state[tid] = localState;
    }

    __global__ void saltpepper_eltw(float* x, const unsigned size,
                                    float dropout_rate,
                                    curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads) {
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i];
        }
        rng_state[tid] = localState;
    }
    }
    ''', no_extern_c=True)

    # Initialise one generator state per thread.
    # NOTE(review): PyCUDA documents prepared_call(grid, block, *args); the
    # tuples below therefore use _rng_threads as the grid width and
    # _rng_blocks as the block width -- confirm that is intended.
    setup_rng = src.get_function("setup_rng")
    setup_rng.prepare("Pi")
    setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1),
                            _rng_state, np.uint32(seed))

    _dropout_kernel = src.get_function("dropout_eltw")
    _dropout_kernel.prepare("PifP")
    _saltpepper_kernel = src.get_function("saltpepper_eltw")
    _saltpepper_kernel.prepare("PifP")

コード例 #15

0

ファイルを表示

ファイル: geometry.py プロジェクト: NuTufts/ChromaUBooNE

    def _package_material_data_cuda(self, geometry, wavelengths,
                                    wavelength_step):
        """Upload every unique material and pack one GPU struct per material.

        For each material, five wavelength-dependent properties are
        interpolated with ``self._interp_material_property`` and copied to the
        device. A ``Material`` struct (sized from ``geometry_types.h``) is then
        built from those arrays plus the wavelength count, step and first
        wavelength.

        Args:
            geometry: Object exposing ``unique_materials``.
            wavelengths: Wavelength grid used for interpolation.
            wavelength_step: Spacing of ``wavelengths``, stored in each struct.

        Returns:
            A tuple ``(material_data, material_ptrs, material_pointer_array)``:
            the flat list of uploaded property arrays, the per-material
            structs, and a struct holding the per-material pointers.

        Raises:
            Exception: If any entry of ``geometry.unique_materials`` is None.
        """
        geometry_source = cutools.get_cu_source('geometry_types.h')
        material_struct_size = characterize.sizeof('Material', geometry_source)

        # Upload order is also the order of the leading struct members.
        property_names = ('refractive_index', 'absorption_length',
                          'scattering_length', 'reemission_prob',
                          'reemission_cdf')

        material_data = []
        material_ptrs = []
        for material in geometry.unique_materials:
            if material is None:
                raise Exception('one or more triangles is missing a material.')

            gpu_arrays = []
            for name in property_names:
                interpolated = self._interp_material_property(
                    wavelengths, getattr(material, name))
                gpu_arrays.append(ga.to_gpu(interpolated))

            # Keep references to the arrays alongside the structs built from
            # them.
            material_data.extend(gpu_arrays)

            struct_members = gpu_arrays + [
                np.uint32(len(wavelengths)),
                np.float32(wavelength_step),
                np.float32(wavelengths[0]),
            ]
            material_ptrs.append(
                make_gpu_struct(material_struct_size, struct_members))

        # 8 bytes are reserved per entry of the pointer table.
        material_pointer_array = make_gpu_struct(8 * len(material_ptrs),
                                                 material_ptrs)
        return material_data, material_ptrs, material_pointer_array

コード例 #16

0

ファイルを表示

def get_doms(module, size, radius, d, dN):
    """Allocate ``size`` device-side ``dom`` structs and run ``create_doms``.

    Args:
        module: Compiled module that provides the ``create_doms`` kernel.
        size: Number of ``dom`` structs to allocate. Also determines the
            number of 64-thread blocks launched.
        radius: Passed to the kernel as a 32-bit float.
        d: Passed to the kernel as a 32-bit unsigned integer.
        dN: Passed to the kernel as a 32-bit unsigned integer.

    Returns:
        The device allocation holding the ``dom`` structs.
    """
    # NOTE(review): the include directory is a hard-coded absolute path on the
    # original author's machine; it must exist for the size query to compile.
    dom_nbytes = characterize.sizeof(
        'dom',
        "#include <dom_RT.h>",
        include_dirs='/home/thomas/Documents/toy-mc/photon_prob/cuda_tools')
    d_list = cuda.mem_alloc(size * dom_nbytes)

    launch = module.get_function('create_doms')

    spacing = np.uint32(d)
    count = np.uint32(dN)
    radius_f32 = np.float32(radius)
    launch(radius_f32, spacing, count, d_list,
           block=(64, 1, 1), grid=(size // 64 + 1, 1))

    return d_list

コード例 #17

0

ファイルを表示

ファイル: geometry.py プロジェクト: BenLand100/chroma

    def __init__(self, geometry, wavelengths=None, print_usage=False, min_free_gpu_mem=300e6):
        """Upload ``geometry`` to the GPU and build the device-side structs.

        The method interpolates and uploads the optical properties of every
        unique material and surface, stores the mesh vertices and triangles,
        packs the per-triangle material/surface codes, places as much of the
        BVH node array on the GPU as the free memory allows, and finally
        assembles everything into a single ``Geometry`` struct stored in
        ``self.gpudata``.

        Args:
            geometry: Object exposing ``unique_materials``,
                ``unique_surfaces``, ``mesh``, ``bvh``, ``material1_index``,
                ``material2_index``, ``surface_index``, ``colors`` and
                ``solid_id``.
            wavelengths: Equally spaced wavelength grid; defaults to
                ``standard_wavelengths`` when None.
            print_usage: When true, ``self.print_device_usage()`` is called at
                the end.
            min_free_gpu_mem: Amount subtracted from the free memory reported
                by ``cuda.mem_get_info()`` before deciding what to place on
                the GPU (presumably bytes, like the value it is subtracted
                from).

        Raises:
            ValueError: If ``wavelengths`` is not equally spaced.
            Exception: If an entry of ``geometry.unique_materials`` is None.
        """
        if wavelengths is None:
            wavelengths = standard_wavelengths

        # `.item()` only succeeds when exactly one distinct spacing exists;
        # the ValueError it raises otherwise is replaced by a clearer message.
        try:
            wavelength_step = np.unique(np.diff(wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        # Struct sizes are measured from the CUDA header itself.
        geometry_source = get_cu_source('geometry_types.h')
        material_struct_size = characterize.sizeof('Material', geometry_source)
        surface_struct_size = characterize.sizeof('Surface', geometry_source)
        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        # material_data holds the uploaded property arrays, material_ptrs the
        # per-material structs built from them.
        self.material_data = []
        self.material_ptrs = []

        def interp_material_property(wavelengths, property):
            # note that it is essential that the material properties be
            # interpolated linearly. this fact is used in the propagation
            # code to guarantee that probabilities still sum to one.
            return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)

        for i in range(len(geometry.unique_materials)):
            material = geometry.unique_materials[i]

            if material is None:
                raise Exception('one or more triangles is missing a material.')

            refractive_index = interp_material_property(wavelengths, material.refractive_index)
            refractive_index_gpu = ga.to_gpu(refractive_index)
            absorption_length = interp_material_property(wavelengths, material.absorption_length)
            absorption_length_gpu = ga.to_gpu(absorption_length)
            scattering_length = interp_material_property(wavelengths, material.scattering_length)
            scattering_length_gpu = ga.to_gpu(scattering_length)
            reemission_prob = interp_material_property(wavelengths, material.reemission_prob)
            reemission_prob_gpu = ga.to_gpu(reemission_prob)
            reemission_cdf = interp_material_property(wavelengths, material.reemission_cdf)
            reemission_cdf_gpu = ga.to_gpu(reemission_cdf)

            self.material_data.append(refractive_index_gpu)
            self.material_data.append(absorption_length_gpu)
            self.material_data.append(scattering_length_gpu)
            self.material_data.append(reemission_prob_gpu)
            self.material_data.append(reemission_cdf_gpu)

            # Five property arrays followed by the grid description
            # (count, step, first wavelength).
            material_gpu = \
                make_gpu_struct(material_struct_size,
                                [refractive_index_gpu, absorption_length_gpu,
                                 scattering_length_gpu,
                                 reemission_prob_gpu,
                                 reemission_cdf_gpu,
                                 np.uint32(len(wavelengths)),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0])])

            self.material_ptrs.append(material_gpu)

        # 8 bytes are reserved per entry of the pointer table.
        self.material_pointer_array = \
            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

        # Same layout as for the materials: uploaded arrays plus one struct
        # per surface.
        self.surface_data = []
        self.surface_ptrs = []

        for i in range(len(geometry.unique_surfaces)):
            surface = geometry.unique_surfaces[i]

            if surface is None:
                # need something to copy to the surface array struct
                # that is the same size as a 64-bit pointer.
                # this pointer will never be used by the simulation.
                self.surface_ptrs.append(np.uint64(0))
                continue

            detect = interp_material_property(wavelengths, surface.detect)
            detect_gpu = ga.to_gpu(detect)
            absorb = interp_material_property(wavelengths, surface.absorb)
            absorb_gpu = ga.to_gpu(absorb)
            reemit = interp_material_property(wavelengths, surface.reemit)
            reemit_gpu = ga.to_gpu(reemit)
            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
            reflect_specular_gpu = ga.to_gpu(reflect_specular)
            eta = interp_material_property(wavelengths, surface.eta)
            eta_gpu = ga.to_gpu(eta)
            k = interp_material_property(wavelengths, surface.k)
            k_gpu = ga.to_gpu(k)
            reemission_cdf = interp_material_property(wavelengths, surface.reemission_cdf)
            reemission_cdf_gpu = ga.to_gpu(reemission_cdf)

            self.surface_data.append(detect_gpu)
            self.surface_data.append(absorb_gpu)
            self.surface_data.append(reemit_gpu)
            self.surface_data.append(reflect_diffuse_gpu)
            self.surface_data.append(reflect_specular_gpu)
            self.surface_data.append(eta_gpu)
            self.surface_data.append(k_gpu)
            self.surface_data.append(reemission_cdf_gpu)

            # Eight property arrays followed by the scalar surface settings
            # and the grid description.
            surface_gpu = \
                make_gpu_struct(surface_struct_size,
                                [detect_gpu, absorb_gpu, reemit_gpu,
                                 reflect_diffuse_gpu,reflect_specular_gpu,
                                 eta_gpu, k_gpu, reemission_cdf_gpu,
                                 np.uint32(surface.model),
                                 np.uint32(len(wavelengths)),
                                 np.uint32(surface.transmissive),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0]),
                                 np.float32(surface.thickness)])

            self.surface_ptrs.append(surface_gpu)

        self.surface_pointer_array = \
            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

        # Vertices and triangles start out in buffers created by
        # mapped_empty; they may be moved onto the GPU further below.
        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3,
                                     write_combined=True)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3,
                                      write_combined=True)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
        
        self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        # Pack three 8-bit indices into one 32-bit code per triangle:
        # material1 in bits 24-31, material2 in bits 16-23, surface in
        # bits 8-15.
        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
        self.material_codes = ga.to_gpu(material_codes)
        colors = geometry.colors.astype(np.uint32)
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        gpu_free, gpu_total = cuda.mem_get_info()
        # NOTE(review): node_array_usage is not used anywhere in this method.
        node_array_usage = geometry.bvh.nodes.nbytes

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize),100),
            n_nodes
            )
        
        # Nodes before split_index go to the GPU; the remainder (if any) is
        # stored in a buffer created by mapped_empty.
        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index)) # forbid zero size
        self.extra_nodes = mapped_empty(shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype,
                                        write_combined=True)
        if split_index < n_nodes:
            logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

        # See if there is enough memory to put the vertices and/or triangles
        # back on the GPU. Free memory is queried again before each decision
        # because the previous upload may have reduced it.
        gpu_free, gpu_total = cuda.mem_get_info()
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            logger.info('Optimization: Sufficient memory to move triangles onto GPU')

        gpu_free, gpu_total = cuda.mem_get_info()
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            logger.info('Optimization: Sufficient memory to move vertices onto GPU')

        # Assemble the top-level Geometry struct; the member order below is
        # the order handed to make_gpu_struct.
        self.gpudata = make_gpu_struct(geometry_struct_size,
                                       [Mapped(self.vertices), 
                                        Mapped(self.triangles),
                                        self.material_codes,
                                        self.colors, self.nodes,
                                        Mapped(self.extra_nodes),
                                        self.material_pointer_array,
                                        self.surface_pointer_array,
                                        self.world_origin,
                                        self.world_scale,
                                        np.int32(len(self.nodes))])

        self.geometry = geometry

        # The usage summary is always logged; it is additionally printed
        # when print_usage is set.
        if print_usage:
            self.print_device_usage()
        logger.info(self.device_usage_str())

コード例 #18

0

ファイルを表示

ファイル: geometry.py プロジェクト: NuTufts/ChromaUBooNE

    def __init__(self,
                 geometry,
                 wavelengths=None,
                 print_usage=False,
                 min_free_gpu_mem=300e6,
                 cl_context=None,
                 cl_queue=None):
        log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)

        self.geometry = geometry
        self.instance_count += 1
        assert self.instance_count == 1, traceback.print_stack()

        self.metadata = Metadata()
        self.metadata(None, 'preinfo')
        self.metadata('a', "start")
        self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem

        if wavelengths is None:
            self.wavelengths = standard_wavelengths
        else:
            self.wavelengths = wavelengths

        try:
            self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        # this is where things get difficult.
        # pycuda and pyopencl gives us very different methods for working with structs
        #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        # Note, that unfortunately the data types returned are very different as the
        if api.is_gpu_api_cuda():
            self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
            self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
        elif api.is_gpu_api_opencl():
            self.material_data, materials_bytes_cl = self._package_material_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)
            self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)

        self.metadata('b', "after materials,surfaces")
        if api.is_gpu_api_opencl():
            self.metadata[
                'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl  # opencl, we have to track this ourselves

        # Load Vertices and Triangles
        if api.is_gpu_api_cuda():
            self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                         dtype=ga.vec.float3,
                                         write_combined=True)
            self.vertices4 = np.zeros(shape=(len(self.vertices), 4),
                                      dtype=np.float32)
            self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                          dtype=ga.vec.uint3,
                                          write_combined=True)
            self.triangles4 = np.zeros(shape=(len(self.triangles), 4),
                                       dtype=np.uint32)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.vertices4[:, :-1] = self.vertices.ravel().view(
                np.float32).reshape(len(self.vertices), 3)  # for textures
            self.triangles[:] = to_uint3(geometry.mesh.triangles)
            self.triangles4[:, :-1] = self.triangles.ravel().view(
                np.uint32).reshape(len(self.triangles), 3)  # for textures
        elif api.is_gpu_api_opencl():
            self.vertices = ga.empty(cl_queue,
                                     len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3)
            self.triangles = ga.empty(cl_queue,
                                      len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.triangles[:] = to_uint3(geometry.mesh.triangles)

        if api.is_gpu_api_cuda():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
        elif api.is_gpu_api_opencl():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
            #self.world_origin = geometry.bvh.world_coords.world_origin
            self.world_origin = ga.to_device(cl_queue, self.world_origin)
            print type(self.world_origin), self.world_origin
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        # Load material and surface indices into 8-bit codes
        # check if we've reached a complexity threshold
        if len(geometry.unique_materials) >= int(0xff):
            raise ValueError(
                'Number of materials to index has hit maximum of %d' %
                (int(0xff)))
        if len(geometry.unique_surfaces) >= int(0xff):
            raise ValueError(
                'Number of surfaces to index has hit maximum of %d' %
                (int(0xff)))
        # make bit code
        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(
                              np.uint32)
        if api.is_gpu_api_cuda():
            self.material_codes = ga.to_gpu(material_codes)
        elif api.is_gpu_api_opencl():
            self.material_codes = ga.to_device(cl_queue, material_codes)

        # assign color codes
        colors = geometry.colors.astype(np.uint32)
        if api.is_gpu_api_cuda():
            self.colors = ga.to_gpu(colors)
            self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
        elif api.is_gpu_api_opencl():
            self.colors = ga.to_device(cl_queue, colors)
            self.solid_id_map = ga.to_device(
                cl_queue, geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        self.metadata('c', "after colors, idmap")
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            meshdef_nbytes_cl = self.vertices.nbytes + self.triangles.nbytes + self.world_origin.nbytes + self.world_scale.nbytes + self.material_codes.nbytes + self.colors.nbytes + self.solid_id_map.nbytes
            self.metadata[
                'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
            gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl +
                                    meshdef_nbytes_cl)

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(
                int((gpu_free - min_free_gpu_mem) /
                    geometry.bvh.nodes.itemsize), 100), n_nodes)
        print "split index=", split_index, " vs. total nodes=", n_nodes

        # push nodes to GPU
        if api.is_gpu_api_cuda():
            self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        elif api.is_gpu_api_opencl():
            self.nodes = ga.to_device(cl_queue,
                                      geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index))  # forbid zero size

        # left over nodes
        if api.is_gpu_api_cuda():
            self.extra_nodes = mapped_empty(shape=n_extra,
                                            dtype=geometry.bvh.nodes.dtype,
                                            write_combined=True)
        elif api.is_gpu_api_opencl():
            self.extra_nodes = ga.empty(cl_queue,
                                        shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype)

        if split_index < n_nodes:
            log.info('Splitting BVH between GPU and CPU memory at node %d' %
                     split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
            splitting = 1
        else:
            splitting = 0

        self.metadata('d', "after nodes")
        if api.is_gpu_api_opencl():
            nodes_nbytes_cl = self.nodes.nbytes
            self.metadata[
                'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
        self.metadata.array("d_nodes", geometry.bvh.nodes)
        self.metadata['d_split_index'] = split_index
        self.metadata['d_extra_nodes_count'] = n_extra
        self.metadata['d_splitting'] = splitting
        self.print_device_usage(cl_context=cl_context)

        # CUDA See if there is enough memory to put the vertices and/or triangles back on the GPU
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']
        self.metadata.array('e_triangles', self.triangles)
        if api.is_gpu_api_cuda():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                self.triangles = ga.to_gpu(self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0
        elif api.is_gpu_api_opencl():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                #self.triangles = ga.to_device(cl_queue,self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0

        self.metadata('e', "after triangles")
        self.metadata['e_triangles_gpu'] = ftriangles_gpu

        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']

        self.metadata.array('f_vertices', self.vertices)

        if api.is_gpu_api_cuda():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0
        elif api.is_gpu_api_opencl():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                #self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0

        self.metadata('f', "after vertices")
        self.metadata['f_vertices_gpu'] = vertices_gpu

        if api.is_gpu_api_cuda():
            geometry_source = cutools.get_cu_source('geometry_types.h')
            geometry_struct_size = characterize.sizeof('Geometry',
                                                       geometry_source)
            self.gpudata = make_gpu_struct(geometry_struct_size, [
                Mapped(self.vertices),
                Mapped(self.triangles), self.material_codes, self.colors,
                self.nodes,
                Mapped(self.extra_nodes), self.material_pointer_array,
                self.surface_pointer_array, self.world_origin,
                self.world_scale,
                np.int32(len(self.nodes))
            ])
        elif api.is_gpu_api_opencl():
            # No relevant way to pass struct into OpenCL kernel. We have to pass everything by arrays
            # We then build a geometry struct later in the kernel
            # provided below is example/test of passing the data
            #if True: # for debuggin
            if False:  #
                print "loading geometry_structs.cl"
                geostructsmod = cltools.get_cl_module(
                    "geometry_structs.cl",
                    cl_context,
                    options=cltools.cl_options,
                    include_source_directory=True)
                geostructsfunc = GPUFuncs(geostructsmod)
                geostructsfunc.make_geostruct(
                    cl_queue, (3, ), None, self.vertices.data,
                    self.triangles.data, self.material_codes.data,
                    self.colors.data, self.nodes.data, self.extra_nodes.data,
                    np.int32(len(geometry.unique_materials)),
                    self.material_data['refractive_index'].data,
                    self.material_data['absorption_length'].data,
                    self.material_data['scattering_length'].data,
                    self.material_data['reemission_prob'].data,
                    self.material_data['reemission_cdf'].data,
                    np.int32(len(geometry.unique_surfaces)),
                    self.surface_data['detect'].data,
                    self.surface_data['absorb'].data,
                    self.surface_data['reemit'].data,
                    self.surface_data['reflect_diffuse'].data,
                    self.surface_data['reflect_specular'].data,
                    self.surface_data['eta'].data, self.surface_data['k'].data,
                    self.surface_data['reemission_cdf'].data,
                    self.surface_data['model'].data,
                    self.surface_data['transmissive'].data,
                    self.surface_data['thickness'].data,
                    self.surface_data['nplanes'].data,
                    self.surface_data['wire_diameter'].data,
                    self.surface_data['wire_pitch'].data,
                    self.world_origin.data, self.world_scale,
                    np.int32(len(self.nodes)), self.material_data['n'],
                    self.material_data['step'],
                    self.material_data["wavelength0"])
                cl_queue.finish()
                self.material_codes.get()
                raise RuntimeError('bail')
        if print_usage:
            self.print_device_usage(cl_context=cl_context)
        log.info(self.device_usage_str(cl_context=cl_context))
        self.metadata('g', "after geometry struct")

コード例 #19

0

ファイルを表示

ファイル: ghz.py プロジェクト: fjarri/thesis

def calculation(in_queue, out_queue):
    """Run one simulation job on a CUDA device and post the result to a queue.

    A single ``(device_num, params)`` tuple is read from ``in_queue``.
    ``params`` is a mapping from which the following keys are read:
    ``chunk_size``, ``chunks_num``, ``particles``, ``state``,
    ``representation``, ``quantities``, ``decoherence`` (``None`` or a mapping
    with ``steps`` and ``coeff``) and ``binning`` (``None`` or a sequence of
    ``(names, binnum, ranges)`` triples).

    The kernel source is rendered from the module-level ``TEMPLATE``, compiled,
    and its ``initialize``/``calculate``/``decoherence`` kernels are launched.

    Exactly one object is put on ``out_queue`` on success:

    * without binning: a dict mapping each quantity to a nested list of shape
      ``(decoherence_steps + 1, chunks_num)`` holding the real part of the
      per-chunk buffer means;
    * with binning: a list with one ``[histogram, edges]`` pair (both converted
      with ``tolist()``) per binning entry.  The decoherence kernel is not
      launched in this mode.

    Any exception propagates to the caller; nothing is put on ``out_queue`` in
    that case, but the CUDA context created here is still detached.
    """
    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    binning = params['binning']
    if binning is not None:
        # With binning, the requested quantities are replaced by the union of
        # the names referenced by the binning entries.
        names_seen = set()
        for names, _, _ in binning:
            names_seen.update(names)
        quantities = sorted(names_seen)

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_ctype = 'double'

    cuda.init()

    device = cuda.Device(device_num)
    ctx = device.make_context()
    try:
        _, total = cuda.mem_get_info()

        # Largest power of ten of elements per buffer such that one buffer per
        # quantity fits in the device's total memory with a 10% margin.
        max_chunk_size = (float(total) / len(quantities) /
                          numpy.dtype(c_dtype).itemsize / 1.1)
        max_chunk_size = 10 ** int(numpy.log(max_chunk_size) / numpy.log(10))

        if max_chunk_size > chunk_size:
            subchunk_size = chunk_size
            subchunks_num = 1
        else:
            assert chunk_size % max_chunk_size == 0
            subchunk_size = max_chunk_size
            # Explicit floor division: the count is used as an array dimension.
            subchunks_num = chunk_size // subchunk_size

        # Buffers are indexed in sorted-quantity order everywhere below.
        sorted_quantities = sorted(quantities)
        buffers = [GPUArray(subchunk_size, c_dtype) for _ in sorted_quantities]

        stream = cuda.Stream()

        # compile code
        try:
            source = TEMPLATE.render(
                c_ctype=c_ctype, s_ctype=s_ctype, particles=particles,
                state=state, representation=representation,
                quantities=quantities, decoherence_coeff=decoherence_coeff)
        except Exception:
            # Show the template engine's own error report before re-raising.
            print(exceptions.text_error_template().render())
            raise

        try:
            module = SourceModule(source, no_extern_c=True)
        except Exception:
            # Dump the rendered source with line numbers to help locate the
            # compiler error, then re-raise.
            for i, l in enumerate(source.split("\n")):
                print("%d : %s" % (i + 1, l))
            raise

        kernel_initialize = module.get_function("initialize")
        kernel_calculate = module.get_function("calculate")
        kernel_decoherence = module.get_function("decoherence")

        # prepare call parameters
        gen_block_size = min(
            kernel_initialize.max_threads_per_block,
            kernel_calculate.max_threads_per_block)
        gen_grid_size = device.get_attribute(
            cuda.device_attribute.MULTIPROCESSOR_COUNT)
        gen_block = (gen_block_size, 1, 1)
        gen_grid = (gen_grid_size, 1, 1)
        launch = dict(block=gen_block, grid=gen_grid, stream=stream)

        num_gen = gen_block_size * gen_grid_size
        assert num_gen <= 20000

        # prepare RNG states: one random 32-bit seed and one state per thread
        seeds = to_gpu(
            numpy.random.randint(0, 2**32 - 1,
                                 size=num_gen).astype(numpy.uint32))
        state_type_size = sizeof("curandStateXORWOW",
                                 "#include <curand_kernel.h>")
        states = cuda.mem_alloc(num_gen * state_type_size)

        kernel_initialize(states, seeds.gpudata, **launch)

        # run calculation
        args = ([states] + [buf.gpudata for buf in buffers] +
                [numpy.int32(subchunk_size)])
        total_runs = chunks_num * subchunks_num

        if binning is None:

            results = {
                quantity: numpy.zeros((decoherence_steps + 1, total_runs),
                                      c_dtype)
                for quantity in quantities}

            for i in xrange(total_runs):
                kernel_calculate(*args, **launch)

                for k in xrange(decoherence_steps + 1):
                    # Step 0 records the freshly calculated buffers; each later
                    # step applies the decoherence kernel first.
                    if k > 0:
                        kernel_decoherence(*args, **launch)

                    for j, quantity in enumerate(sorted_quantities):
                        F = (gpuarray.sum(buffers[j], stream=stream) /
                             buffers[j].size).get()
                        results[quantity][k, i] = F

            # Average the subchunks belonging to each chunk and keep the real
            # part only.
            for quantity in sorted_quantities:
                results[quantity] = results[quantity].reshape(
                    decoherence_steps + 1, chunks_num,
                    subchunks_num).mean(2).real.tolist()

            out_queue.put(results)

        else:

            bin_accums = [
                numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
                for vals, binnum, _ in binning]
            bin_edges = [None] * len(binning)

            for _ in xrange(total_runs):
                # Only the edges computed in the final iteration are reported.
                bin_edges = []
                kernel_calculate(*args, **launch)
                results = {
                    quantity: buffers[j].get().real
                    for j, quantity in enumerate(sorted_quantities)}

                for (qnames, binnum, ranges), bin_accum in zip(binning,
                                                               bin_accums):
                    # One column per quantity, one row per sample.
                    sample = numpy.concatenate(
                        [results[quantity].reshape(subchunk_size, 1)
                         for quantity in qnames],
                        axis=1)

                    hist, edges = numpy.histogramdd(sample, binnum, ranges)
                    # histogramdd returns float counts; cast explicitly because
                    # an in-place float -> int64 add is rejected by NumPy's
                    # 'same_kind' casting rule.
                    bin_accum += hist.astype(numpy.int64)
                    bin_edges.append(numpy.array(edges))

            results = [[acc.tolist(), edges.tolist()]
                       for acc, edges in zip(bin_accums, bin_edges)]

            out_queue.put(results)
    finally:
        # Release the context even when rendering, compilation or a kernel
        # launch fails, so the worker does not leak it.
        ctx.detach()

コード例 #20

0

ファイルを表示

def AlgorithmMCMC(graph_file, file_parameters=None):
    """Run the GPU MCMC graph-colouring loop on a graph and log the outcome.

    The graph is loaded from ``"Graph/" + graph_file`` through ``GraphColor``
    and the CUDA kernels are compiled from ``CUDA/Cuda.cu``.  Progress is
    printed and written to ``Out/<nodes>-<edges>-logFile.txt``; the initial
    colouring, final colouring and per-colour usage counts are written to
    ``Out/<nodes>-<edges>-resultsFile.txt``.

    Args:
        graph_file: Name of the graph file inside the ``Graph/`` directory.
        file_parameters: Optional path handed to
            ``parser_parameters.parser_parameters``.  The returned sequence is
            read at indices 0..4 as epsilon, lambda, ratioFreezed, maxRip and
            numThreads; entries that are ``None`` keep their defaults.

    Returns:
        None.  All results are emitted through stdout and the two files.
    """
    MyGraph = GraphColor("Graph/" + str(graph_file))

    nb_nodes = MyGraph.nb_nodes
    nb_edges = MyGraph.nb_edges
    seed = int(time.time())

    # Fixed 32-thread launch configuration used only for seeding the per-node
    # cuRAND states.  Floor division keeps the grid size an integer.
    threadPerBlock = (32, 1, 1)
    BlockPerGrid = ((nb_nodes + threadPerBlock[0] - 1) // threadPerBlock[0],
                    1, 1)
    rand_states = cuda.mem_alloc(
        nb_nodes *
        characterize.sizeof('curandState', '#include <curand_kernel.h>'))
    with open('CUDA/Cuda.cu', 'r') as myfile:
        cuda_code = myfile.read()

    mod = SourceModule(cuda_code, no_extern_c=True)
    func_initCurand = mod.get_function("initCurand")
    func_initCurand(rand_states,
                    np.uint32(seed),
                    np.uint32(nb_nodes),
                    block=threadPerBlock,
                    grid=BlockPerGrid,
                    time_kernel=True)

    # default params
    p_nb_col = MyGraph.maxDeg
    p_numThreads = 32
    p_epsilon = 1e-8
    p_lambda = 0.01
    p_ratioFreezed = 1e-2
    p_maxRip = 250

    # params if file given
    if file_parameters is not None:
        parameters_from_file = parser_parameters.parser_parameters(
            file_parameters)
        if parameters_from_file[0] is not None:
            p_epsilon = float(parameters_from_file[0])
        if parameters_from_file[1] is not None:
            p_lambda = float(parameters_from_file[1])
        if parameters_from_file[2] is not None:
            p_ratioFreezed = float(parameters_from_file[2])
        if parameters_from_file[3] is not None:
            p_maxRip = int(parameters_from_file[3])
        if parameters_from_file[4] is not None:
            p_numThreads = int(parameters_from_file[4])

    # grid configuration (ceil-divide the work items by the block size)
    threadsPerBlock = (p_numThreads, 1, 1)
    blocksPerGrid = ((nb_nodes + p_numThreads - 1) // p_numThreads, 1, 1)
    blocksPerGrid_edges = ((nb_edges + threadsPerBlock[0] - 1) //
                           threadsPerBlock[0], 1, 1)
    blocksPerGrid_half_edges = (((nb_edges // 2) + threadsPerBlock[0] - 1) //
                                threadsPerBlock[0], 1, 1)

    ##############################################
    # compute and print the allocation memory on the GPU
    sizeof_uint32 = sizeof("uint32_t", "#include <stdint.h>")
    sizeof_float = sizeof("float")
    sizeof_bool = sizeof("bool")

    free_mem, tot_mem = cuda.mem_get_info()
    print("total mem: " + str(tot_mem) + " free mem: " + str(free_mem))

    tot = nb_nodes * sizeof_uint32 * 3
    print("nb_nodes * sizeof(uint32_t): " + str(nb_nodes * sizeof_uint32) +
          " x3")
    tot = tot + nb_nodes * sizeof_float * 2
    print("nb_nodes * sizeof(np.float32): " + str(nb_nodes * sizeof_float) +
          " x2")
    tot = tot + nb_edges * sizeof_uint32
    print("nb_edges * sizeof(np.uint32): " + str(nb_edges * sizeof_uint32) +
          " x1")
    tot = tot + nb_nodes * p_nb_col * sizeof_bool
    print("nb_nodes * p_nb_col * sizeof(np.bool): " +
          str(nb_nodes * p_nb_col * sizeof_bool) + " x1")
    tot = tot + nb_nodes * p_nb_col * sizeof_uint32
    print("nb_nodes * p_nb_col * sizeof(np.uint32): " +
          str(nb_nodes * p_nb_col * sizeof_uint32) + " x1")
    print("TOTAL: " + str(tot) + " bytes")

    ###############################################################
    # Cuda Allocation
    coloring_d = gpuarray.zeros(nb_nodes, np.uint32)
    starColoring_d = gpuarray.zeros(nb_nodes, np.uint32)
    qStar_d = gpuarray.zeros(nb_nodes, np.float32)
    conflictCounter_d = gpuarray.zeros(nb_edges, np.uint32)
    colorsChecker_d = gpuarray.zeros(nb_nodes * p_nb_col, np.bool)
    orderedColors_d = gpuarray.zeros(nb_nodes * p_nb_col, np.uint32)

    free_mem, tot_mem = cuda.mem_get_info()
    print("total memory: " + str(tot_mem) + " free memory: " + str(free_mem))

    print("ColoringMCMC GPU")
    print("nbCol: " + str(p_nb_col))
    print("numThreads: " + str(p_numThreads))
    print("epsilon: " + str(p_epsilon))
    print("lambda: " + str(p_lambda))
    print("ratioFreezed: " + str(p_ratioFreezed))
    print("maxRip: " + str(p_maxRip))

    out_prefix = "Out/" + str(nb_nodes) + "-" + str(nb_edges)

    # Both output files are closed on every exit path, including errors raised
    # by a kernel launch.
    with open(out_prefix + "-logFile.txt", "wt") as logFile, \
            open(out_prefix + "-resultsFile.txt", "wt") as resultsFile:

        # The same parameter header goes to both files.
        header_lines = [
            "nbCol: " + str(p_nb_col) + "\n",
            "epsilon: " + str(p_epsilon) + "\n",
            "lambda: " + str(p_lambda) + "\n",
            "ratioFreezed: " + str(p_ratioFreezed) + "\n",
            "maxRip: " + str(p_maxRip) + "\n",
        ]
        logFile.writelines(header_lines)
        resultsFile.writelines(header_lines)

        def write_node_colors(node_colors):
            # One "node <index>: <color>" line per node.
            for index, color in enumerate(node_colors):
                resultsFile.write("node " + str(index) + ": " + str(color) +
                                  "\n")

        ####################################################################
        # Init a color for nb_nodes nodes
        func_initColoring = mod.get_function("initColoring")
        func_initColoring(np.uint32(nb_nodes),
                          coloring_d,
                          np.float32(p_nb_col),
                          rand_states,
                          block=threadsPerBlock,
                          grid=blocksPerGrid,
                          time_kernel=True)

        #####################################################################
        # run algorithm MCMC
        rip = 0

        colors = coloring_d.get()
        resultsFile.write("\n_______________\n")
        resultsFile.write("Initial colors:\n\n")
        write_node_colors(colors)

        func_conflictChecker = mod.get_function("conflictChecker")
        func_sumReduction = mod.get_function("sumReduction")
        func_selectStarColoring = mod.get_function("selectStarColoring")

        def count_conflicts(coloring):
            # Launch conflictChecker then sumReduction on the shared counter
            # buffer, and add up the first entry-per-block values on the host.
            func_conflictChecker(np.uint32(nb_edges),
                                 conflictCounter_d,
                                 coloring,
                                 MyGraph.cuda_edges,
                                 grid=blocksPerGrid_edges,
                                 block=threadsPerBlock,
                                 time_kernel=True)
            func_sumReduction(np.uint32(nb_edges),
                              conflictCounter_d,
                              grid=blocksPerGrid_half_edges,
                              block=threadsPerBlock,
                              shared=(threadsPerBlock[0] * sizeof_uint32),
                              time_kernel=True)
            partial_sums = conflictCounter_d.get()
            return sum(partial_sums[i]
                       for i in range(blocksPerGrid_half_edges[0]))

        tStart = tm.time()

        # compute nb of conflict before a tentative
        conflictCounter = count_conflicts(coloring_d)

        while rip < p_maxRip:
            rip = rip + 1

            if conflictCounter == 0:
                break

            print("<<< Tentative numero: " + str(rip) + " >>>")
            print("conflits relatifs avant: " + str(conflictCounter))

            logFile.write("<<< Tentative numero: " + str(rip) + " >>>\n")
            logFile.write("conflits relatifs avant: " + str(conflictCounter) +
                          "\n")

            colorsChecker_d.fill(0)
            orderedColors_d.fill(0)

            # NOTE(review): np.uint32(p_epsilon) truncates the default 1e-8
            # (and any value below 1) to 0 -- confirm against the kernel
            # signature whether a float argument was intended here.
            func_selectStarColoring(np.uint32(nb_nodes),
                                    starColoring_d,
                                    qStar_d,
                                    np.uint32(p_nb_col),
                                    coloring_d,
                                    MyGraph.cuda_listCumulDeg,
                                    MyGraph.cuda_listNeighbors,
                                    colorsChecker_d,
                                    orderedColors_d,
                                    rand_states,
                                    np.uint32(p_epsilon),
                                    grid=blocksPerGrid,
                                    block=threadsPerBlock,
                                    time_kernel=True)

            # The proposed colouring becomes the current one; the old buffer is
            # reused as scratch for the next proposal.
            coloring_d, starColoring_d = starColoring_d, coloring_d

            # compute nb of conflict after a tentative
            conflictCounter = count_conflicts(coloring_d)

            print("conflits relatifs apres: " + str(conflictCounter) + "\n")
            logFile.write("conflits relatifs apres: " + str(conflictCounter) +
                          "\n")

        # end of the algorithm
        print('Fin MCMC en : %.3f s' % (tm.time() - tStart))
        logFile.write("Fin MCMC\n")

        colors = coloring_d.get()
        resultsFile.write("\n_____________\n")
        resultsFile.write("Final colors:\n\n")
        write_node_colors(colors)

        resultsFile.write("\n___________________________\n")
        resultsFile.write("Counter of each color used:\n")
        counter_colors_used = Counter(colors)
        for key, value in counter_colors_used.items():
            resultsFile.write("\nColor " + str(key) + ": " + str(value))

コード例 #21

0

ファイルを表示

ファイル: cuda_lib.py プロジェクト: koszullab/centroID

def get_rng_states(size_output, seed=1):
    """Generate `size_output` Poisson-distributed integers on the GPU.

    Despite its name, this function does not return the cuRAND states: it
    allocates them, uses them to fill a device array and frees them again.

    Args:
        size_output: number of random integers to produce.
        seed: seed passed to ``curand_init`` for every generator state.

    Returns:
        A host ``np.int32`` array of length `size_output`. It is initialised
        to ones and overwritten with the device results (Poisson draws with
        the mean of 10 hard-coded in the kernel).
    """
    # NOTE(review): make_rand indexes the generator state by blockIdx.x, so
    # every thread of a block uses the same curandState concurrently --
    # confirm that this sharing is intended.
    init_rng_src = """
        #include <curand_kernel.h>

        extern "C"
        {

            __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
            {
                    int id = blockIdx.x*blockDim.x + threadIdx.x;

                    if (id >= nthreads)
                            return;
                    curand_init(seed, id, offset, &s[id]);
            }

            __global__ void make_rand(int nthreads, curandState *state, int *randArray)
            {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                int id_rng = blockIdx.x;
                double mean = 10;
                if (idx < nthreads){
                randArray[idx] = curand_poisson(&state[id_rng], mean);
                //randArray[idx] = curand_uniform(&state[idx]);
                }
            }

        } // extern "C"
    """

    curr_gpu.make_context()
    try:
        gpu_vect_rand = ga.GPUArray((size_output,), dtype=np.int32)
        cpu_vect_rand = np.ones((size_output,), dtype=np.int32)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))

        # module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True,arch="sm_30")
        module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
        init_rng = module.get_function("init_rng")
        make_rand = module.get_function("make_rand")
        size_block = 1024
        n_blocks = size_output // size_block + 1
        # One generator state per block of the make_rand launch.
        rng_states = cuda.mem_alloc(
            n_blocks
            * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
        )
        init_rng(
            np.int32(n_blocks),
            rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_blocks // 64 + 1, 1),
        )
        try:
            make_rand(
                np.int32(size_output),
                rng_states,
                gpu_vect_rand,
                block=(size_block, 1, 1),
                grid=(n_blocks, 1),
            )
        except cuda.LogicError:
            # Best effort: report the failure and still return whatever the
            # device array contains.
            print("random number generation failed ...")

        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        rng_states.free()
        gpu_vect_rand.get(ary=cpu_vect_rand)
        return cpu_vect_rand
    finally:
        # Always pop the context pushed by make_context(), even on failure.
        cuda.Context.pop()

コード例 #22

0

ファイルを表示

    def __init__(self, geometry, wavelengths=None, times=None, print_usage=False, min_free_gpu_mem=300e6):
        """Copy a geometry's materials, surfaces, mesh and BVH to the GPU.

        Material and surface properties are linearly interpolated onto the
        `wavelengths` (or `times`) grid, uploaded as float32 arrays and
        packed into 'Material' / 'Surface' structs whose sizes come from
        ``geometry_types.h``. The result is a single 'Geometry' struct
        stored in ``self.gpudata``.

        Args:
            geometry: object providing ``unique_materials``,
                ``unique_surfaces``, ``mesh``, ``bvh``, ``colors``,
                ``solid_id`` and the material/surface index arrays.
            wavelengths: equally spaced wavelength grid; defaults to
                ``standard_wavelengths``.
            times: equally spaced time grid; defaults to
                ``np.arange(0, 1000, 0.05)``.
            print_usage: if true, call ``self.print_device_usage()`` at the
                end.
            min_free_gpu_mem: amount of free GPU memory to leave untouched
                when deciding how many BVH nodes, and whether the triangle
                and vertex arrays, are placed in device memory.

        Raises:
            ValueError: if `wavelengths` or `times` is not equally spaced.
            Exception: if a material entry is None.
        """
        if wavelengths is None:
            wavelengths = standard_wavelengths

        # .item() raises ValueError unless there is exactly one distinct step.
        try:
            wavelength_step = np.unique(np.diff(wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        if times is None:
            time_step = 0.05
            times = np.arange(0,1000,time_step)
        else:
            try:
                time_step = np.unique(np.diff(times)).item()
            except ValueError:
                raise ValueError('times must be equally spaced apart.')

        geometry_source = get_cu_source('geometry_types.h')
        material_struct_size = characterize.sizeof('Material', geometry_source)
        surface_struct_size = characterize.sizeof('Surface', geometry_source)
        dichroicprops_struct_size = characterize.sizeof('DichroicProps', geometry_source)
        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        # material_data / surface_data hold references to everything whose
        # device address is stored inside a struct.
        self.material_data = []
        self.material_ptrs = []

        def interp_material_property(wavelengths, property):
            # note that it is essential that the material properties be
            # interpolated linearly. this fact is used in the propagation
            # code to guarantee that probabilities still sum to one.
            return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)

        for material in geometry.unique_materials:
            if material is None:
                raise Exception('one or more triangles is missing a material.')

            refractive_index = interp_material_property(wavelengths, material.refractive_index)
            refractive_index_gpu = ga.to_gpu(refractive_index)
            absorption_length = interp_material_property(wavelengths, material.absorption_length)
            absorption_length_gpu = ga.to_gpu(absorption_length)
            scattering_length = interp_material_property(wavelengths, material.scattering_length)
            scattering_length_gpu = ga.to_gpu(scattering_length)

            # Per-component tables: upload each component, keep the list of
            # arrays alive, then replace the list by a device array of
            # 8-byte pointers (or a NULL pointer when there are none).
            num_comp = len(material.comp_reemission_prob)
            comp_reemission_prob_gpu = [ga.to_gpu(interp_material_property(wavelengths, component)) for component in material.comp_reemission_prob]
            self.material_data.append(comp_reemission_prob_gpu)
            comp_reemission_prob_gpu = np.uint64(0) if len(comp_reemission_prob_gpu) == 0 else make_gpu_struct(8*len(comp_reemission_prob_gpu), comp_reemission_prob_gpu)
            assert num_comp == len(material.comp_reemission_wvl_cdf), 'component arrays must be same length'
            comp_reemission_wvl_cdf_gpu = [ga.to_gpu(interp_material_property(wavelengths, component)) for component in material.comp_reemission_wvl_cdf]
            self.material_data.append(comp_reemission_wvl_cdf_gpu)
            comp_reemission_wvl_cdf_gpu = np.uint64(0) if len(comp_reemission_wvl_cdf_gpu) == 0 else make_gpu_struct(8*len(comp_reemission_wvl_cdf_gpu), comp_reemission_wvl_cdf_gpu)
            assert num_comp == len(material.comp_reemission_time_cdf), 'component arrays must be same length'
            # The time CDF is the only table interpolated on the time grid.
            comp_reemission_time_cdf_gpu = [ga.to_gpu(interp_material_property(times, component)) for component in material.comp_reemission_time_cdf]
            self.material_data.append(comp_reemission_time_cdf_gpu)
            comp_reemission_time_cdf_gpu = np.uint64(0) if len(comp_reemission_time_cdf_gpu) == 0 else make_gpu_struct(8*len(comp_reemission_time_cdf_gpu), comp_reemission_time_cdf_gpu)
            assert num_comp == len(material.comp_absorption_length), 'component arrays must be same length'
            comp_absorption_length_gpu = [ga.to_gpu(interp_material_property(wavelengths, component)) for component in material.comp_absorption_length]
            self.material_data.append(comp_absorption_length_gpu)
            comp_absorption_length_gpu = np.uint64(0) if len(comp_absorption_length_gpu) == 0 else make_gpu_struct(8*len(comp_absorption_length_gpu), comp_absorption_length_gpu)

            self.material_data.append(refractive_index_gpu)
            self.material_data.append(absorption_length_gpu)
            self.material_data.append(scattering_length_gpu)
            self.material_data.append(comp_reemission_prob_gpu)
            self.material_data.append(comp_reemission_wvl_cdf_gpu)
            self.material_data.append(comp_reemission_time_cdf_gpu)
            self.material_data.append(comp_absorption_length_gpu)

            material_gpu = \
                make_gpu_struct(material_struct_size,
                                [refractive_index_gpu, absorption_length_gpu,
                                 scattering_length_gpu,
                                 comp_reemission_prob_gpu,
                                 comp_reemission_wvl_cdf_gpu,
                                 comp_reemission_time_cdf_gpu,
                                 comp_absorption_length_gpu,
                                 np.uint32(num_comp),
                                 np.uint32(len(wavelengths)),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0]),
                                 np.uint32(len(times)),
                                 np.float32(time_step),
                                 np.float32(times[0])])

            self.material_ptrs.append(material_gpu)

        self.material_pointer_array = \
            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

        self.surface_data = []
        self.surface_ptrs = []

        for surface in geometry.unique_surfaces:
            if surface is None:
                # need something to copy to the surface array struct
                # that is the same size as a 64-bit pointer.
                # this pointer will never be used by the simulation.
                self.surface_ptrs.append(np.uint64(0))
                continue

            detect = interp_material_property(wavelengths, surface.detect)
            detect_gpu = ga.to_gpu(detect)
            absorb = interp_material_property(wavelengths, surface.absorb)
            absorb_gpu = ga.to_gpu(absorb)
            reemit = interp_material_property(wavelengths, surface.reemit)
            reemit_gpu = ga.to_gpu(reemit)
            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
            reflect_specular_gpu = ga.to_gpu(reflect_specular)
            eta = interp_material_property(wavelengths, surface.eta)
            eta_gpu = ga.to_gpu(eta)
            k = interp_material_property(wavelengths, surface.k)
            k_gpu = ga.to_gpu(k)
            reemission_cdf = interp_material_property(wavelengths, surface.reemission_cdf)
            reemission_cdf_gpu = ga.to_gpu(reemission_cdf)

            if surface.dichroic_props:
                props = surface.dichroic_props
                transmit_pointers = []
                reflect_pointers = []
                angles_gpu = ga.to_gpu(np.asarray(props.angles,dtype=np.float32))
                self.surface_data.append(angles_gpu)

                # One reflect and one transmit table per tabulated angle.
                for angle_index, _ in enumerate(props.angles):
                    dichroic_reflect = interp_material_property(wavelengths, props.dichroic_reflect[angle_index])
                    dichroic_reflect_gpu = ga.to_gpu(dichroic_reflect)
                    self.surface_data.append(dichroic_reflect_gpu)
                    reflect_pointers.append(dichroic_reflect_gpu)

                    dichroic_transmit = interp_material_property(wavelengths, props.dichroic_transmit[angle_index])
                    dichroic_transmit_gpu = ga.to_gpu(dichroic_transmit)
                    self.surface_data.append(dichroic_transmit_gpu)
                    transmit_pointers.append(dichroic_transmit_gpu)

                reflect_arr_gpu = make_gpu_struct(8*len(reflect_pointers),reflect_pointers)
                self.surface_data.append(reflect_arr_gpu)
                transmit_arr_gpu = make_gpu_struct(8*len(transmit_pointers), transmit_pointers)
                self.surface_data.append(transmit_arr_gpu)
                dichroic_props = make_gpu_struct(dichroicprops_struct_size,[angles_gpu,reflect_arr_gpu,transmit_arr_gpu,np.uint32(len(props.angles))])
            else:
                dichroic_props = np.uint64(0) #NULL

            self.surface_data.append(detect_gpu)
            self.surface_data.append(absorb_gpu)
            self.surface_data.append(reemit_gpu)
            self.surface_data.append(reflect_diffuse_gpu)
            self.surface_data.append(reflect_specular_gpu)
            self.surface_data.append(eta_gpu)
            self.surface_data.append(k_gpu)
            # The Surface struct below stores this array too, so keep a
            # reference to it alongside the other tables.
            self.surface_data.append(reemission_cdf_gpu)
            self.surface_data.append(dichroic_props)

            surface_gpu = \
                make_gpu_struct(surface_struct_size,
                                [detect_gpu, absorb_gpu, reemit_gpu,
                                 reflect_diffuse_gpu,reflect_specular_gpu,
                                 eta_gpu, k_gpu, reemission_cdf_gpu,
                                 dichroic_props,
                                 np.uint32(surface.model),
                                 np.uint32(len(wavelengths)),
                                 np.uint32(surface.transmissive),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0]),
                                 np.float32(surface.thickness)])

            self.surface_ptrs.append(surface_gpu)

        self.surface_pointer_array = \
            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

        # Mesh arrays start out in mapped host memory; they may be moved to
        # the device further down if enough memory is free.
        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3,
                                     write_combined=True)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3,
                                      write_combined=True)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)

        self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        # Pack the low 8 bits of each index into one 32-bit code per
        # triangle: material1 | material2 | surface | (unused low byte).
        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
        self.material_codes = ga.to_gpu(material_codes)
        colors = geometry.colors.astype(np.uint32)
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        gpu_free, gpu_total = cuda.mem_get_info()

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize),100),
            n_nodes
            )

        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index)) # forbid zero size
        self.extra_nodes = mapped_empty(shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype,
                                        write_combined=True)
        if split_index < n_nodes:
            logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

        # See if there is enough memory to put the triangles and/or
        # vertices back on the GPU
        gpu_free, gpu_total = cuda.mem_get_info()
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            logger.info('Optimization: Sufficient memory to move triangles onto GPU')

        gpu_free, gpu_total = cuda.mem_get_info()
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            logger.info('Optimization: Sufficient memory to move vertices onto GPU')

        self.gpudata = make_gpu_struct(geometry_struct_size,
                                       [Mapped(self.vertices),
                                        Mapped(self.triangles),
                                        self.material_codes,
                                        self.colors, self.nodes,
                                        Mapped(self.extra_nodes),
                                        self.material_pointer_array,
                                        self.surface_pointer_array,
                                        self.world_origin,
                                        self.world_scale,
                                        np.int32(len(self.nodes))])

        self.geometry = geometry

        if print_usage:
            self.print_device_usage()
        logger.info(self.device_usage_str())

コード例 #23

0

ファイルを表示

ファイル: ghz.py プロジェクト: fjarri-attic/bellsim-letter

def calculation(in_queue, out_queue):
    """Run the sampling kernels on one CUDA device and post the results.

    A single ``(device_num, params)`` tuple is read from `in_queue`.
    `params` is a dict with the keys 'chunk_size', 'chunks_num',
    'particles', 'state', 'representation', 'quantities', 'decoherence'
    (None or a dict with 'steps' and 'coeff') and 'binning' (None or an
    iterable of ``(quantity_names, bin_count, ranges)`` triples).

    Exactly one object is put on `out_queue`:

    * without binning: a dict mapping each quantity to a nested list of
      shape ``(decoherence_steps + 1, chunks_num)`` holding the real part
      of the buffer means, averaged over sub-chunks;
    * with binning: a list of ``[histogram, edges]`` pairs (as nested
      lists), one per binning entry; the histograms are accumulated over
      all chunks and the edges come from the last chunk processed.

    The CUDA context created here is detached on every exit path.
    """
    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    binning = params['binning']
    if binning is not None:
        # With binning, only the quantities named by the bins are computed.
        s = set()
        for names, _, _ in binning:
            s.update(names)
        quantities = sorted(s)

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_dtype = numpy.float64
    s_ctype = 'double'

    cuda.init()

    device = cuda.Device(device_num)
    ctx = device.make_context()
    try:
        free, total = cuda.mem_get_info()
        # Largest power of ten of complex elements per quantity that fits in
        # total device memory, with a 10% margin.
        max_chunk_size = float(total) / len(quantities) / numpy.dtype(
            c_dtype).itemsize / 1.1
        max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10))
        #print free, total, max_chunk_size

        if max_chunk_size > chunk_size:
            subchunk_size = chunk_size
            subchunks_num = 1
        else:
            assert chunk_size % max_chunk_size == 0
            subchunk_size = max_chunk_size
            # Floor division keeps this an integer under true division too.
            subchunks_num = chunk_size // subchunk_size

        # One device buffer per quantity, addressed in sorted-quantity order.
        buffers = [GPUArray(subchunk_size, c_dtype) for _ in quantities]

        stream = cuda.Stream()

        # compile code
        try:
            source = TEMPLATE.render(c_ctype=c_ctype,
                                     s_ctype=s_ctype,
                                     particles=particles,
                                     state=state,
                                     representation=representation,
                                     quantities=quantities,
                                     decoherence_coeff=decoherence_coeff)
        except Exception:
            print(exceptions.text_error_template().render())
            raise

        try:
            module = SourceModule(source, no_extern_c=True)
        except Exception:
            # Dump the rendered source with line numbers to help locate the
            # compiler error.
            for i, l in enumerate(source.split("\n")):
                print("%d : %s" % (i + 1, l))
            raise

        kernel_initialize = module.get_function("initialize")
        kernel_calculate = module.get_function("calculate")
        kernel_decoherence = module.get_function("decoherence")

        # prepare call parameters

        gen_block_size = min(kernel_initialize.max_threads_per_block,
                             kernel_calculate.max_threads_per_block)
        gen_grid_size = device.get_attribute(
            cuda.device_attribute.MULTIPROCESSOR_COUNT)
        gen_block = (gen_block_size, 1, 1)
        gen_grid = (gen_grid_size, 1, 1)

        num_gen = gen_block_size * gen_grid_size
        assert num_gen <= 20000

        # prepare RNG states

        #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32))
        seeds = to_gpu(
            numpy.random.randint(0, 2**32 - 1,
                                 size=num_gen).astype(numpy.uint32))
        state_type_size = sizeof("curandStateXORWOW",
                                 "#include <curand_kernel.h>")
        states = cuda.mem_alloc(num_gen * state_type_size)

        #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE)
        #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k
        kernel_initialize(states,
                          seeds.gpudata,
                          block=gen_block,
                          grid=gen_grid,
                          stream=stream)
        #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size)

        # run calculation
        args = [states] + [buf.gpudata
                           for buf in buffers] + [numpy.int32(subchunk_size)]

        if binning is None:

            results = {
                quantity: numpy.zeros(
                    (decoherence_steps + 1, chunks_num * subchunks_num),
                    c_dtype)
                for quantity in quantities
            }
            for i in range(chunks_num * subchunks_num):
                kernel_calculate(*args,
                                 block=gen_block,
                                 grid=gen_grid,
                                 stream=stream)

                # Step 0 records the freshly calculated values; each later
                # step applies the decoherence kernel once more first.
                for k in range(decoherence_steps + 1):
                    if k > 0:
                        kernel_decoherence(*args,
                                           block=gen_block,
                                           grid=gen_grid,
                                           stream=stream)

                    for j, quantity in enumerate(sorted(quantities)):
                        F = (gpuarray.sum(buffers[j], stream=stream) /
                             buffers[j].size).get()
                        results[quantity][k, i] = F

            for quantity in sorted(quantities):
                results[quantity] = results[quantity].reshape(
                    decoherence_steps + 1, chunks_num,
                    subchunks_num).mean(2).real.tolist()

            out_queue.put(results)

        else:

            bin_accums = [
                numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
                for vals, binnum, _ in binning
            ]
            bin_edges = [None] * len(binning)

            for i in range(chunks_num * subchunks_num):
                # Edges are rebuilt every iteration; only the last set is
                # reported.
                bin_edges = []
                kernel_calculate(*args,
                                 block=gen_block,
                                 grid=gen_grid,
                                 stream=stream)
                results = {
                    quantity: buffers[j].get().real
                    for j, quantity in enumerate(sorted(quantities))
                }

                for binparam, bin_accum in zip(binning, bin_accums):
                    qnames, binnum, ranges = binparam
                    sample_lines = [results[quantity] for quantity in qnames]
                    sample = numpy.concatenate(
                        [arr.reshape(subchunk_size, 1)
                         for arr in sample_lines],
                        axis=1)

                    hist, edges = numpy.histogramdd(sample, binnum, ranges)
                    bin_accum += hist
                    bin_edges.append(numpy.array(edges))

            results = [[acc.tolist(), edges.tolist()]
                       for acc, edges in zip(bin_accums, bin_edges)]

            out_queue.put(results)
    finally:
        #ctx.pop()
        # Release the context even when compilation or a kernel launch fails.
        ctx.detach()

コード例 #24

0

ファイルを表示

ファイル: geometry.py プロジェクト: NuTufts/ChromaUBooNE

    def _package_surface_data_cuda(self, geometry, wavelengths,
                                   wavelength_step):
        """Upload every surface's optical tables and build 'Surface' structs.

        For each entry of ``geometry.unique_surfaces`` the eight
        wavelength-dependent properties are interpolated with
        ``self._interp_material_property``, copied to the GPU and packed,
        together with the scalar surface parameters, into a struct sized
        after 'Surface' in ``geometry_types.h``. A ``None`` surface
        contributes a zero 64-bit placeholder instead.

        Returns:
            ``(surface_data, surface_ptrs, surface_pointer_array)``: the
            uploaded property arrays, the per-surface structs (or zero
            placeholders), and a struct holding one 8-byte slot per entry
            of ``surface_ptrs``.
        """
        struct_size = characterize.sizeof(
            'Surface', cutools.get_cu_source('geometry_types.h'))

        # Attribute names in the order the struct expects its array members.
        table_names = ('detect', 'absorb', 'reemit', 'reflect_diffuse',
                       'reflect_specular', 'eta', 'k', 'reemission_cdf')

        uploaded_tables = []
        struct_ptrs = []

        for surface in geometry.unique_surfaces:
            if surface is None:
                # need something to copy to the surface array struct
                # that is the same size as a 64-bit pointer.
                # this pointer will never be used by the simulation.
                struct_ptrs.append(np.uint64(0))
                continue

            tables_gpu = [
                ga.to_gpu(
                    self._interp_material_property(wavelengths,
                                                   getattr(surface, name)))
                for name in table_names
            ]
            uploaded_tables.extend(tables_gpu)

            scalar_members = [
                np.uint32(surface.model),
                np.uint32(len(wavelengths)),
                np.uint32(surface.transmissive),
                np.float32(wavelength_step),
                np.float32(wavelengths[0]),
                np.float32(surface.thickness),
                np.float32(surface.nplanes),
                np.float32(surface.wire_diameter),
                np.float32(surface.wire_pitch),
            ]
            struct_ptrs.append(
                make_gpu_struct(struct_size, tables_gpu + scalar_members))

        pointer_array = make_gpu_struct(8 * len(struct_ptrs), struct_ptrs)
        return uploaded_tables, struct_ptrs, pointer_array

コード例 #25

0

ファイルを表示

    
    } // end extern "C"
"""

# Compile the CUDA source held in kernel_code. no_extern_c=True because the
# source closes its own extern "C" block.
mod = SourceModule(kernel_code, no_extern_c = True)

# Get kernel functions
local = mod.get_function('local_diffuse')
non_local = mod.get_function('non_local_diffuse')
survival_layer = mod.get_function('survival_of_the_fittest')
population_layer = mod.get_function('population_growth')
init_generators = mod.get_function('init_generators')

# Initialize random number generator
generator = curandom.XORWOWRandomNumberGenerator()
# Size of one generator state, as measured by sizeof() for the generator's
# state type.
data_type_size = sizeof(generator.state_type, "#include <curand_kernel.h>")
# Replace the generator's state buffer with one state per grid cell
# (matrix_size * matrix_size states).
# NOTE(review): this writes the private attribute `_state` directly --
# confirm the installed PyCUDA version still exposes it via `.state`.
generator._state = drv.mem_alloc((matrix_size * matrix_size) * data_type_size)
seed = 123456789
# Seed the states on the device with a 2-D launch.
init_generators(generator.state, np.int32(seed), np.int32(matrix_size),
    grid = (grid_dims, grid_dims), block = (block_dims, block_dims, 1))

# Run n_iters of the Brown Marmorated Stink Bug (BMSB) Diffusion Simulation
run_primitive(
    empty_grid.vars(matrix_size) == 
    initialize_grid.vars(matrix_size, initial_population, survival_probabilities, generator) ==
    bmsb_stop_condition.vars(n_iters) <= 
    local_diffusion.vars(local, matrix_size, p_local, grid_dims, block_dims) == 
    non_local_diffusion.vars(non_local, matrix_size, p_non_local, mu, gamma, grid_dims, block_dims) ==
    survival_function.vars(survival_layer, matrix_size, grid_dims, block_dims) ==
    population_growth.vars(population_layer, matrix_size, growth_rate, grid_dims, block_dims) ==
    bmsb_stop >=

コード例 #26

0

ファイルを表示

ファイル: detector.py プロジェクト: NuTufts/ChromaUBooNE

    def __init__(self,
                 detector,
                 wavelengths=None,
                 print_usage=False,
                 cl_context=None,
                 cl_queue=None):
        """Upload the detector's channel maps and time/charge CDFs.

        The geometry itself is handled by ``GPUGeometry.__init__`` (always
        called with ``print_usage=False``). Afterwards the channel lookup
        tables and the CDF arrays are copied to the device through whichever
        GPU API is active. Under CUDA the values are also packed into a
        'Detector' struct (``self.detector_gpu``); under OpenCL the scalar
        members are kept as attributes instead.

        Raises:
            RuntimeError: if neither the CUDA nor the OpenCL API is active.
        """
        GPUGeometry.__init__(self,
                             detector,
                             wavelengths=wavelengths,
                             print_usage=False,
                             cl_context=cl_context,
                             cl_queue=cl_queue)

        # Choose the host-to-device copy routine for the active backend.
        use_cuda = api.is_gpu_api_cuda()
        if use_cuda:
            upload = ga.to_gpu
        elif api.is_gpu_api_opencl():
            def upload(host_array):
                return ga.to_device(cl_queue, host_array)
        else:
            raise RuntimeError("GPU API is neither OpenCL nor CUDA")

        self.solid_id_to_channel_index_gpu = upload(
            detector.solid_id_to_channel_index.astype(np.int32))
        self.solid_id_to_channel_id_gpu = upload(
            detector.solid_id_to_channel_id.astype(np.int32))

        # The channel count stays a plain value under CUDA and is stored as
        # np.int32 under OpenCL.
        channel_count = detector.num_channels()
        self.nchannels = channel_count if use_cuda else np.int32(channel_count)

        self.time_cdf_x_gpu = upload(detector.time_cdf[0].astype(np.float32))
        self.time_cdf_y_gpu = upload(detector.time_cdf[1].astype(np.float32))
        self.charge_cdf_x_gpu = upload(
            detector.charge_cdf[0].astype(np.float32))
        self.charge_cdf_y_gpu = upload(
            detector.charge_cdf[1].astype(np.float32))

        time_cdf_len = np.int32(len(detector.time_cdf[0]))
        charge_cdf_len = np.int32(len(detector.charge_cdf[0]))
        charge_unit = np.float32(detector.charge_cdf[0][-1] / 2**16)

        if use_cuda:
            struct_size = characterize.sizeof(
                'Detector', cutools.get_cu_source('detector.h'))
            # solid_id_to_channel_id_gpu is uploaded above but is not a
            # member of this struct.
            self.detector_gpu = make_gpu_struct(struct_size, [
                self.solid_id_to_channel_index_gpu,
                self.time_cdf_x_gpu,
                self.time_cdf_y_gpu,
                self.charge_cdf_x_gpu,
                self.charge_cdf_y_gpu,
                np.int32(self.nchannels),
                time_cdf_len,
                charge_cdf_len,
                charge_unit,
            ])
        else:
            self.time_cdf_len = time_cdf_len
            self.charge_cdf_len = charge_cdf_len
            self.charge_unit = charge_unit

コード例 #27

0

ファイルを表示

__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
{
        int id = blockIdx.x*blockDim.x + threadIdx.x;

        if (id >= nthreads)
            return;

        curand_init(seed+id, id, offset, &s[id]);
}

} // extern "C"
"""
# Allocate one curandStateXORWOW per launched thread:
# NUM_RUNS blocks of (32 x NUM_RUNS_PER_BLOCK) threads.
rng_states_gpu = cuda.mem_alloc(
    NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK *
    characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
module = SourceModule(init_rng_src, no_extern_c=True)
init_rng = module.get_function('init_rng')
# NOTE(review): the kernel derives its index from blockIdx.x, blockDim.x and
# threadIdx.x only, while the launch below uses a 2-D block -- confirm that
# every allocated state actually gets initialised.
# The kernel declares `seed` and `offset` as unsigned long long, so both are
# passed as 64-bit values; the seed is the current wall-clock time.
init_rng(np.int32(NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK),
         rng_states_gpu,
         np.uint64(time.time()),
         np.uint64(0),
         block=(32, NUM_RUNS_PER_BLOCK, 1),
         grid=(NUM_RUNS, 1))

# Flag is 1 only for a single run. It is always defined, so later code can
# read it whatever NUM_RUNS is.
is_simulation = 0
if NUM_RUNS == 1:
    is_simulation = 1
defines = "#define NUM_ROUTES " + str(NUM_ROUTES) + "\n" +\
          "#define NUM_STOPS " + str(NUM_STOPS) + "\n" +\
          "#define NUM_STOPS_INTS " + str(NUM_CHARGER_INTS) + "\n" +\

コード例 #28

0

ファイルを表示

from generate_random_graph import generate_filepath_pickle
from pycuda import (autoinit, characterize, compiler, curandom, driver,
                    gpuarray, tools)
from timer import (cumulative_runtimes, execution_counts,
                   find_k_seeds_runtimes, runtimes, timeit, to_csv)

# Algorithm parameters. NOTE(review): their meaning is not visible in this
# part of the file -- document them where they are consumed.
L_CONSTANT = 1
EPSILON_CONSTANT = 0.2
K_CONSTANT = 2
# Presumably CUDA launch dimensions (threads per block and tile extents) --
# TODO confirm against the kernel launches.
BLOCK_SIZE = 1024
TILE_X = 1
TILE_Y = 32
TILE_Z = 32

# Size of one curandStateXORWOW as measured by pycuda.characterize.sizeof;
# computed once at import time.
SIZEOF_GENERATOR = characterize.sizeof('curandStateXORWOW',
                                       '#include <curand_kernel.h>')

# Dataset and kernel-source locations, relative to the working directory.
TWITTER_DATASET_FILEPATH = './datasets/twitter'
TWITTER_DATASET_PICKLE_FILEPATH = './datasets/twitter.pickle'
EDGE_FILE_SUFFIX = '.edges'
RANDOM_CSR_GRAPH_FILEPATH = './datasets/random_graph.pickle'
GENERATE_RR_SETS_CUDA_CODE_FILEPATH = 'node_selection.cu'

# Compile kernel code
# The CUDA source is read from disk and compiled when this module is
# imported. no_extern_c=True stops PyCUDA from wrapping it in extern "C".
with open(GENERATE_RR_SETS_CUDA_CODE_FILEPATH, "r") as fp:
    content = fp.read()
mod = compiler.SourceModule(content, no_extern_c=True)


@timeit
def width(graph, nodes):