def get_rng_states(size, block, grid):
    """Allocate and initialise ``size`` cuRAND generator states on the device.

    Args:
        size: number of ``curandState`` objects to allocate.
        block: CUDA block dimensions forwarded to the kernel launch.
        grid: CUDA grid dimensions forwarded to the kernel launch.

    Returns:
        The device allocation holding the initialised states.

    Each launched thread initialises the state at its own global index with
    the fixed seed 1234 and its index as the sequence number, so ``block``
    and ``grid`` must together supply at least ``size`` threads for every
    state to be initialised.
    """
    kernel_source = """
#include <curand_kernel.h>

extern "C"
{

__global__ void init_rng(int nthreads, curandState *s )
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;

    if (idx >= nthreads)
        return;

    curand_init(1234, idx, 0, &s[idx]);
}

} // extern "C"
"""
    # Ask the compiler for the state size rather than hard-coding it.
    state_nbytes = characterize.sizeof('curandState',
                                       "#include <curand_kernel.h>")
    rng_states = cuda.mem_alloc(size * state_nbytes)

    compiled = SourceModule(kernel_source, no_extern_c=True)
    launch_init = compiled.get_function('init_rng')
    launch_init(np.int32(size), rng_states, block=block, grid=grid)

    return rng_states
def __init__(self, detector, wavelengths=None, print_usage=False):
    """Upload a detector's channel map and time/charge CDFs to the GPU.

    Args:
        detector: object providing ``solid_id_to_channel_index``,
            ``num_channels()``, ``time_cdf`` and ``charge_cdf`` (each CDF is
            indexed as ``[0]`` for x values and ``[1]`` for y values).
        wavelengths: forwarded to ``GPUGeometry.__init__``.
        print_usage: forwarded to ``GPUGeometry.__init__``.  It used to be
            accepted here but silently replaced by ``False``.
    """
    GPUGeometry.__init__(self, detector, wavelengths=wavelengths,
                         print_usage=print_usage)

    self.solid_id_to_channel_index_gpu = \
        ga.to_gpu(detector.solid_id_to_channel_index.astype(np.int32))
    self.nchannels = detector.num_channels()

    # The CDF tables are uploaded as float32 arrays.
    self.time_cdf_x_gpu = ga.to_gpu(detector.time_cdf[0].astype(np.float32))
    self.time_cdf_y_gpu = ga.to_gpu(detector.time_cdf[1].astype(np.float32))

    self.charge_cdf_x_gpu = ga.to_gpu(detector.charge_cdf[0].astype(np.float32))
    self.charge_cdf_y_gpu = ga.to_gpu(detector.charge_cdf[1].astype(np.float32))

    detector_source = get_cu_source('detector.h')
    detector_struct_size = characterize.sizeof('Detector', detector_source)

    # NOTE(review): the member order below presumably has to match the
    # `Detector` struct declared in detector.h -- confirm before reordering.
    self.detector_gpu = make_gpu_struct(detector_struct_size, [
        self.solid_id_to_channel_index_gpu,
        self.time_cdf_x_gpu,
        self.time_cdf_y_gpu,
        self.charge_cdf_x_gpu,
        self.charge_cdf_y_gpu,
        np.int32(self.nchannels),
        np.int32(len(detector.time_cdf[0])),
        np.int32(len(detector.charge_cdf[0])),
        # Last charge CDF x value divided by 2**16.
        np.float32(detector.charge_cdf[0][-1] / 2**16)
    ])
def get_rng_states(size):
    """Allocate and initialise ``size`` MRG32k3a cuRAND states on the device.

    Args:
        size: number of ``curandStateMRG32k3a`` objects to create.

    Returns:
        The device allocation holding the initialised states.

    Thread ``tid`` seeds its own state with ``curand_init(tid, 0, 0, ...)``,
    i.e. the thread index is the seed and the sequence and offset are zero.
    """
    init_rng_src = """
    #include <curand_kernel.h>

    extern "C"
    {

    __global__ void init_rng(int nthreads, curandStateMRG32k3a *s)
    {
        int tid = threadIdx.x + (blockIdx.x * blockDim.x);

        if (tid >= nthreads)
        {
            return;
        }

        curand_init(tid, 0, 0, &s[tid]);
    }

    } // extern "C"
    """

    rng_states = cuda.mem_alloc(size * characterize.sizeof(
        'curandStateMRG32k3a', '#include <curand_kernel.h>'))

    module = SourceModule(init_rng_src, no_extern_c=True)
    init_rng = module.get_function('init_rng')

    # The kernel takes exactly two parameters (count, state pointer).  An
    # extra trailing uint64 used to be passed here; it matched no kernel
    # parameter and has been dropped.
    # size // 64 + 1 blocks always cover `size`; the kernel bounds-checks
    # the surplus threads.
    init_rng(numpy.int32(size), rng_states,
             block=(64, 1, 1), grid=(size // 64 + 1, 1))

    return rng_states
def seed(self, seed=None):
    """(Re)seed the device generators from a host-side ``RandomState``.

    Args:
        seed: value handed to ``numpy.random.RandomState.seed``; ``None``
            lets NumPy choose its own entropy source.

    Side effects:
        Allocates ``self.states`` (a uint8 GPU array sized for one
        ``curandStateXORWOW`` per generator), runs ``self._initialize`` over
        it with one random 32-bit seed per generator, synchronises the
        environment, and stores the launch sizes in ``self.gsize`` and
        ``self.lsize``.
    """
    from pycuda.characterize import sizeof, has_stack
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray

    host_rng = numpy.random.RandomState()
    host_rng.seed(seed)

    # One generator per thread: block size is limited by both kernels, and
    # one block is launched per multiprocessor.
    threads_per_block = min(
        self._initialize.max_threads_per_block,
        self._sample.max_threads_per_block)
    block_count = self._env.device.get_attribute(
        cuda.device_attribute.MULTIPROCESSOR_COUNT)

    local_size = (threads_per_block, 1, 1)
    global_size = (block_count * threads_per_block, 1, 1)

    generator_count = threads_per_block * block_count
    assert generator_count <= 20000

    host_seeds = host_rng.randint(0, 2**32 - 1, size=generator_count)
    device_seeds = gpuarray.to_gpu(host_seeds.astype(numpy.uint32))

    bytes_per_state = sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    self.states = gpuarray.GPUArray(generator_count * bytes_per_state,
                                    numpy.uint8)

    self._initialize.customCall(global_size, local_size,
                                self.states.gpudata, device_seeds.gpudata)
    self._env.synchronize()

    self.gsize = global_size
    self.lsize = local_size
def __init__(self, init_data, n_generators):
    """Create a CUDA context, compile the kernels and allocate all buffers.

    Args:
        init_data: host array copied to the device as ``gpu_init_data``;
            its first dimension is stored as ``width_mat``.
        n_generators: number of ``curandStateXORWOW`` states to allocate
            and initialise (seed 1, offset 0).

    Free-memory statistics are printed before and after the first
    allocations.
    """
    def _report_free_memory():
        # Print the free/total ratio and the free amount in 10**6-byte units.
        free_bytes, total_bytes = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free_bytes * 100 / total_bytes)))
        print(("Global free memory :%i Mo free" % (free_bytes / 10 ** 6)))

    self.ctx = curr_gpu.make_context()
    self.module = pycuda.compiler.SourceModule(kernels_cuda_src,
                                               no_extern_c=True)
    _report_free_memory()

    # ---- data buffers -------------------------------------------------
    self.width_mat = np.int32(init_data.shape[0])

    self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
    cuda.memcpy_htod(self.gpu_init_data, init_data)

    self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
    print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
    _report_free_memory()

    self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
    cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)

    self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
    self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
    cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)

    # ---- kernel handles -----------------------------------------------
    # Each kernel is exposed as an attribute named after the kernel.
    for kernel_name in ("init_rng", "gen_rand_mat", "sum_along_axis",
                        "norm_along_axis", "init_vect_sum", "copy_mat"):
        setattr(self, kernel_name, self.module.get_function(kernel_name))

    # ---- random number generator states --------------------------------
    self.n_generators = n_generators
    rng_seed = 1
    state_nbytes = characterize.sizeof("curandStateXORWOW",
                                       "#include <curand_kernel.h>")
    self.rng_states = cuda.mem_alloc(n_generators * state_nbytes)
    self.init_rng(
        np.int32(n_generators),
        self.rng_states,
        np.uint64(rng_seed),
        np.uint64(0),
        block=(64, 1, 1),
        grid=(n_generators // 64 + 1, 1),
    )
    cuda.mem_get_info()

    # ---- 2D launch configuration covering a width_mat x width_mat area ---
    threads_x = 32
    threads_y = 32
    blocks_x = int(self.width_mat) // threads_x + 1
    blocks_y = int(self.width_mat) // threads_y + 1
    self.grid = (blocks_x, blocks_y, 1)
    self.block = (threads_x, threads_y, 1)
def state(self):
    """Return the device buffer of generator states, allocating it lazily.

    The first call allocates room for
    ``block_count * generators_per_block`` objects of type
    ``self.state_type`` and caches the allocation in ``self._state``;
    later calls return the cached allocation.
    """
    if self._state is not None:
        return self._state

    from pycuda.characterize import sizeof

    bytes_per_state = sizeof(self.state_type, "#include <curand_kernel.h>")
    self._state = drv.mem_alloc(
        self.block_count * self.generators_per_block * bytes_per_state)
    return self._state
def state(self):
    """Lazily allocate and return the device memory for generator states.

    Returns:
        ``self._state``.  When it is still ``None`` it is first set to a
        fresh device allocation sized for
        ``block_count * generators_per_block`` objects of ``self.state_type``.
    """
    already_allocated = self._state is not None
    if not already_allocated:
        from pycuda.characterize import sizeof

        # Size of one state object as reported by the CUDA compiler.
        state_nbytes = sizeof(self.state_type, "#include <curand_kernel.h>")
        self._state = drv.mem_alloc(
            self.block_count * self.generators_per_block * state_nbytes)
    return self._state
def get_rng_states(size, seed=1):
    """Return `size` number of CUDA random number generator states.

    Args:
        size: number of ``curandStateXORWOW`` objects to allocate.
        seed: passed to the ``init_rng`` kernel as a 64-bit unsigned value.

    Returns:
        The device allocation handed to the ``init_rng`` kernel.
    """
    state_nbytes = characterize.sizeof('curandStateXORWOW',
                                       '#include <curand_kernel.h>')
    rng_states = cuda.mem_alloc(size * state_nbytes)

    compiled = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
    launch_init = compiled.get_function('init_rng')

    # 64 threads per block; size//64+1 blocks always cover `size` threads.
    launch_init(np.int32(size), rng_states, np.uint64(seed), np.uint64(0),
                block=(64, 1, 1), grid=(size // 64 + 1, 1))
    return rng_states
def get_rng_states(size, seed=1):
    """Return `size` number of CUDA random number generator states.

    The states are ``curandStateXORWOW`` objects in a fresh device
    allocation; the ``init_rng`` kernel compiled from ``init_rng_src`` is
    launched over them with ``seed`` and a zero fourth argument.
    """
    threads_per_block = 64
    block_count = size // threads_per_block + 1

    rng_states = cuda.mem_alloc(
        size * characterize.sizeof('curandStateXORWOW',
                                   '#include <curand_kernel.h>'))

    init_rng = pycuda.compiler.SourceModule(
        init_rng_src, no_extern_c=True).get_function('init_rng')
    init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0),
             block=(threads_per_block, 1, 1), grid=(block_count, 1))

    return rng_states
def get_pInit(module, x, y, z,
              include_dirs='/home/thomas/Documents/toy-mc/photon_prob/cuda_tools'):
    """Allocate one device-side ``vec3`` and run ``init_pInit`` on it.

    Args:
        module: compiled module exposing an ``init_pInit`` kernel.
        x, y, z: coordinates passed to the kernel as float32 values.
        include_dirs: forwarded unchanged to ``characterize.sizeof`` so it
            can locate ``vec3.h``.  The default keeps the previously
            hard-coded location.

    Returns:
        The device allocation of ``sizeof(vec3)`` bytes passed to the kernel.
    """
    # NOTE(review): include_dirs is a plain string here, as in the original
    # call; confirm whether this project's characterize.sizeof expects a list.
    vec3_nbytes = characterize.sizeof('vec3', "#include <vec3.h>",
                                      include_dirs=include_dirs)
    pInit = cuda.mem_alloc(vec3_nbytes)

    init_pInit = module.get_function('init_pInit')
    # The first kernel argument is a constant 1 -- presumably an element
    # count; TODO confirm against the kernel source.
    init_pInit(np.uint64(1), np.float32(x), np.float32(y), np.float32(z),
               pInit, block=(1, 1, 1), grid=(1, 1))
    return pInit
def get_times(module, size):
    """Allocate ``size`` device floats and run the ``init_times`` kernel on them.

    Args:
        module: compiled module exposing an ``init_times`` kernel.
        size: number of ``float`` elements to allocate.

    Returns:
        The device allocation passed to ``init_times``.
    """
    times = cuda.mem_alloc(size * characterize.sizeof('float'))

    init_times = module.get_function('init_times')
    # 64 threads per block; size//64+1 blocks always cover `size` threads.
    init_times(np.uint64(size), times,
               block=(64, 1, 1), grid=(size // 64 + 1, 1))
    return times
def get_rng_states(module, size, seed=1):
    """Return `size` number of CUDA random number generator states.

    Args:
        module: compiled module exposing an ``init_rng`` kernel.
        size: number of ``curandStatePhilox4_32_10_t`` objects to allocate.
        seed: passed to the kernel as a 64-bit unsigned value.

    Returns:
        The device allocation handed to the ``init_rng`` kernel.
    """
    state_nbytes = characterize.sizeof('curandStatePhilox4_32_10_t',
                                       '#include <curand_kernel.h>')
    rng_states = cuda.mem_alloc(size * state_nbytes)

    launch_init = module.get_function('init_rng')
    launch_init(np.uint64(size), rng_states, np.uint64(seed),
                block=(64, 1, 1), grid=(size // 64 + 1, 1))
    return rng_states
def init_rng(seed):
    """Compile the RNG-based kernels and seed one cuRAND state per thread.

    Args:
        seed: seed handed to ``curand_init`` for every state; each thread
            uses its global thread index as the sequence number.

    Side effects:
        Sets the module globals ``_rng_state`` (device allocation for
        ``_rng_threads * _rng_blocks`` ``curandState`` objects),
        ``_dropout_kernel`` and ``_saltpepper_kernel`` (prepared kernels).
    """
    global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks
    from pycuda.characterize import sizeof
    ds = sizeof("curandState", "#include <curand_kernel.h>")
    _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds)

    # The body of the saltpepper loop is braced so that both assignments run
    # for every element.  Without braces the second assignment sat outside
    # the loop, where `i` is no longer in scope.
    src = SourceModule('''
    #include <curand_kernel.h>

    extern "C"
    {
    __global__ void setup_rng(curandState* rng_state, const unsigned seed)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        curand_init(seed, tid, 0, &rng_state[tid]);
    }

    __global__ void dropout_eltw(float* x, const unsigned size,
                                 float dropout_rate,
                                 curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads)
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
        rng_state[tid] = localState;
    }

    __global__ void saltpepper_eltw(float* x, const unsigned size,
                                    float dropout_rate,
                                    curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads) {
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i];
        }
        rng_state[tid] = localState;
    }
    }
    ''', no_extern_c=True)

    setup_rng = src.get_function("setup_rng")
    # "I" (unsigned int) matches the kernel's `const unsigned seed` and the
    # np.uint32 value passed below; the signed "i" format cannot hold seeds
    # of 2**31 and above.
    setup_rng.prepare("PI")
    # NOTE(review): PyCUDA documents prepared_call(grid, block, *args), so
    # the thread count is used as the grid and the block count as the block
    # size here.  The total thread count is the same either way; confirm the
    # intended order.
    setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1),
                            _rng_state, np.uint32(seed))

    _dropout_kernel = src.get_function("dropout_eltw")
    _dropout_kernel.prepare("PifP")
    _saltpepper_kernel = src.get_function("saltpepper_eltw")
    _saltpepper_kernel.prepare("PifP")
def init_rng(seed):
    """Compile the RNG-based kernels and seed one cuRAND state per thread.

    Args:
        seed: seed handed to ``curand_init`` for every state; each thread
            uses its global thread index as the sequence number.

    Side effects:
        Sets the module globals ``_rng_state`` (device allocation for
        ``_rng_threads * _rng_blocks`` ``curandState`` objects),
        ``_dropout_kernel`` and ``_saltpepper_kernel`` (prepared kernels).
    """
    global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks
    from pycuda.characterize import sizeof
    ds = sizeof("curandState", "#include <curand_kernel.h>")
    _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds)

    # The body of the saltpepper loop is braced so that both assignments run
    # for every element.  Without braces the second assignment sat outside
    # the loop, where `i` is no longer in scope.
    src = SourceModule(
        '''
    #include <curand_kernel.h>

    extern "C"
    {
    __global__ void setup_rng(curandState* rng_state, const unsigned seed)
    {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        curand_init(seed, tid, 0, &rng_state[tid]);
    }

    __global__ void dropout_eltw(float* x, const unsigned size,
                                 float dropout_rate,
                                 curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads)
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
        rng_state[tid] = localState;
    }

    __global__ void saltpepper_eltw(float* x, const unsigned size,
                                    float dropout_rate,
                                    curandState* rng_state) {
        const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x;
        const unsigned num_threads = gridDim.x*blockDim.x;
        curandState localState = rng_state[tid];
        for (unsigned i = tid; i < size; i += num_threads) {
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i];
            x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i];
        }
        rng_state[tid] = localState;
    }
    }
    ''',
        no_extern_c=True)

    setup_rng = src.get_function("setup_rng")
    # "I" (unsigned int) matches the kernel's `const unsigned seed` and the
    # np.uint32 value passed below; the signed "i" format cannot hold seeds
    # of 2**31 and above.
    setup_rng.prepare("PI")
    # NOTE(review): PyCUDA documents prepared_call(grid, block, *args), so
    # the thread count is used as the grid and the block count as the block
    # size here.  The total thread count is the same either way; confirm the
    # intended order.
    setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1),
                            _rng_state, np.uint32(seed))

    _dropout_kernel = src.get_function("dropout_eltw")
    _dropout_kernel.prepare("PifP")
    _saltpepper_kernel = src.get_function("saltpepper_eltw")
    _saltpepper_kernel.prepare("PifP")
def _package_material_data_cuda(self, geometry, wavelengths, wavelength_step):
    """Upload every unique material to the GPU as a ``Material`` struct.

    Args:
        geometry: object whose ``unique_materials`` sequence is packaged.
        wavelengths: wavelengths at which each property is interpolated.
        wavelength_step: spacing stored in each struct.

    Returns:
        ``(material_data, material_ptrs, material_pointer_array)``: the flat
        list of uploaded property arrays (five per material), the
        per-material GPU structs, and a GPU struct holding those pointers.

    Raises:
        Exception: if any entry of ``geometry.unique_materials`` is ``None``.
    """
    # Upload order; it is also the order of the leading struct members.
    property_names = ('refractive_index', 'absorption_length',
                      'scattering_length', 'reemission_prob',
                      'reemission_cdf')

    geometry_source = cutools.get_cu_source('geometry_types.h')
    material_struct_size = characterize.sizeof('Material', geometry_source)

    material_data = []
    material_ptrs = []
    for material in geometry.unique_materials:
        if material is None:
            raise Exception('one or more triangles is missing a material.')

        gpu_arrays = [
            ga.to_gpu(self._interp_material_property(
                wavelengths, getattr(material, name)))
            for name in property_names
        ]
        material_data.extend(gpu_arrays)

        struct_members = gpu_arrays + [np.uint32(len(wavelengths)),
                                       np.float32(wavelength_step),
                                       np.float32(wavelengths[0])]
        material_ptrs.append(
            make_gpu_struct(material_struct_size, struct_members))

    # 8 bytes are reserved per pointer.
    material_pointer_array = make_gpu_struct(8 * len(material_ptrs),
                                             material_ptrs)
    return material_data, material_ptrs, material_pointer_array
def get_doms(module, size, radius, d, dN,
             include_dirs='/home/thomas/Documents/toy-mc/photon_prob/cuda_tools'):
    """Allocate ``size`` device-side ``dom`` objects and run ``create_doms``.

    Args:
        module: compiled module exposing a ``create_doms`` kernel.
        size: number of ``dom`` objects to allocate; also sets the launch
            size.
        radius: passed to the kernel as float32.
        d, dN: passed to the kernel as uint32 values.
        include_dirs: forwarded unchanged to ``characterize.sizeof`` so it
            can locate ``dom_RT.h``.  The default keeps the previously
            hard-coded location.

    Returns:
        The device allocation holding the ``dom`` array.
    """
    # NOTE(review): include_dirs is a plain string here, as in the original
    # call; confirm whether this project's characterize.sizeof expects a list.
    dom_nbytes = characterize.sizeof('dom', "#include <dom_RT.h>",
                                     include_dirs=include_dirs)
    d_list = cuda.mem_alloc(size * dom_nbytes)

    create_doms = module.get_function('create_doms')
    # 64 threads per block; size//64+1 blocks always cover `size` threads.
    create_doms(np.float32(radius), np.uint32(d), np.uint32(dN), d_list,
                block=(64, 1, 1), grid=(size // 64 + 1, 1))
    return d_list
def __init__(self, geometry, wavelengths=None, print_usage=False, min_free_gpu_mem=300e6):
    """Copy a geometry (materials, surfaces, mesh and BVH) to the GPU.

    Args:
        geometry: object providing ``unique_materials``, ``unique_surfaces``,
            ``mesh``, ``bvh``, the material/surface index arrays, ``colors``
            and ``solid_id``.
        wavelengths: equally spaced wavelengths at which optical properties
            are sampled; defaults to ``standard_wavelengths``.
        print_usage: when true, call ``self.print_device_usage()`` at the end.
        min_free_gpu_mem: number of bytes of device memory to leave free
            when deciding how much of the BVH, triangles and vertices to
            place on the GPU.

    Raises:
        ValueError: if ``wavelengths`` are not equally spaced.
        Exception: if any unique material is ``None``.
    """
    if wavelengths is None:
        wavelengths = standard_wavelengths

    try:
        wavelength_step = np.unique(np.diff(wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')

    geometry_source = get_cu_source('geometry_types.h')
    material_struct_size = characterize.sizeof('Material', geometry_source)
    surface_struct_size = characterize.sizeof('Surface', geometry_source)
    geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

    def upload_interpolated(table):
        # note that it is essential that the material properties be
        # interpolated linearly. this fact is used in the propagation
        # code to guarantee that probabilities still sum to one.
        sampled = np.interp(wavelengths, table[:,0], table[:,1])
        return ga.to_gpu(sampled.astype(np.float32))

    # ---- materials ------------------------------------------------------
    material_fields = ('refractive_index', 'absorption_length',
                       'scattering_length', 'reemission_prob',
                       'reemission_cdf')
    self.material_data = []
    self.material_ptrs = []
    for material in geometry.unique_materials:
        if material is None:
            raise Exception('one or more triangles is missing a material.')

        arrays = [upload_interpolated(getattr(material, field))
                  for field in material_fields]
        self.material_data.extend(arrays)
        self.material_ptrs.append(
            make_gpu_struct(material_struct_size,
                            arrays + [np.uint32(len(wavelengths)),
                                      np.float32(wavelength_step),
                                      np.float32(wavelengths[0])]))

    self.material_pointer_array = \
        make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

    # ---- surfaces -------------------------------------------------------
    surface_fields = ('detect', 'absorb', 'reemit', 'reflect_diffuse',
                      'reflect_specular', 'eta', 'k', 'reemission_cdf')
    self.surface_data = []
    self.surface_ptrs = []
    for surface in geometry.unique_surfaces:
        if surface is None:
            # need something to copy to the surface array struct
            # that is the same size as a 64-bit pointer.
            # this pointer will never be used by the simulation.
            self.surface_ptrs.append(np.uint64(0))
            continue

        arrays = [upload_interpolated(getattr(surface, field))
                  for field in surface_fields]
        self.surface_data.extend(arrays)
        self.surface_ptrs.append(
            make_gpu_struct(surface_struct_size,
                            arrays + [np.uint32(surface.model),
                                      np.uint32(len(wavelengths)),
                                      np.uint32(surface.transmissive),
                                      np.float32(wavelength_step),
                                      np.float32(wavelengths[0]),
                                      np.float32(surface.thickness)]))

    self.surface_pointer_array = \
        make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

    # ---- mesh (starts out in mapped host memory) --------------------------
    mesh = geometry.mesh
    self.vertices = mapped_empty(shape=len(mesh.vertices),
                                 dtype=ga.vec.float3,
                                 write_combined=True)
    self.triangles = mapped_empty(shape=len(mesh.triangles),
                                  dtype=ga.vec.uint3,
                                  write_combined=True)
    self.vertices[:] = to_float3(mesh.vertices)
    self.triangles[:] = to_uint3(mesh.triangles)

    world_coords = geometry.bvh.world_coords
    self.world_origin = ga.vec.make_float3(*world_coords.world_origin)
    self.world_scale = np.float32(world_coords.world_scale)

    # Pack both material indices and the surface index into one 32-bit
    # code per triangle: bits 24-31, 16-23 and 8-15 respectively.
    packed_codes = (((geometry.material1_index & 0xff) << 24) |
                    ((geometry.material2_index & 0xff) << 16) |
                    ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
    self.material_codes = ga.to_gpu(packed_codes)
    self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
    self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

    # ---- BVH nodes ------------------------------------------------------
    # Limit memory usage by splitting BVH into on and off-GPU parts
    bvh_nodes = geometry.bvh.nodes
    gpu_free = cuda.mem_get_info()[0]

    # Figure out how many elements we can fit on the GPU,
    # but no fewer than 100 elements, and no more than the number of actual nodes
    n_nodes = len(bvh_nodes)
    n_fit = int((gpu_free - min_free_gpu_mem) / bvh_nodes.itemsize)
    split_index = min(max(n_fit, 100), n_nodes)

    self.nodes = ga.to_gpu(bvh_nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index)) # forbid zero size
    self.extra_nodes = mapped_empty(shape=n_extra,
                                    dtype=bvh_nodes.dtype,
                                    write_combined=True)
    if split_index < n_nodes:
        logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
        self.extra_nodes[:] = bvh_nodes[split_index:]

    # See if there is enough memory to put the vertices and/or triangles
    # back on the GPU
    gpu_free = cuda.mem_get_info()[0]
    if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
        self.triangles = ga.to_gpu(self.triangles)
        logger.info('Optimization: Sufficient memory to move triangles onto GPU')

    gpu_free = cuda.mem_get_info()[0]
    if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
        self.vertices = ga.to_gpu(self.vertices)
        logger.info('Optimization: Sufficient memory to move vertices onto GPU')

    # ---- top-level geometry struct ----------------------------------------
    self.gpudata = make_gpu_struct(geometry_struct_size,
                                   [Mapped(self.vertices),
                                    Mapped(self.triangles),
                                    self.material_codes,
                                    self.colors, self.nodes,
                                    Mapped(self.extra_nodes),
                                    self.material_pointer_array,
                                    self.surface_pointer_array,
                                    self.world_origin,
                                    self.world_scale,
                                    np.int32(len(self.nodes))])

    self.geometry = geometry

    if print_usage:
        self.print_device_usage()
    # NOTE(review): the source's indentation was lost; this call is assumed
    # to run regardless of print_usage -- confirm.
    logger.info(self.device_usage_str())
def __init__(self, geometry, wavelengths=None, print_usage=False, min_free_gpu_mem=300e6, cl_context=None, cl_queue=None):
    """Copy a geometry to the GPU through either the CUDA or the OpenCL API.

    Which API is used is decided by ``api.is_gpu_api_cuda()`` /
    ``api.is_gpu_api_opencl()``; most steps below exist once per API.
    Progress and byte counts are recorded in ``self.metadata`` under keys
    prefixed 'a' to 'g'.

    Args:
        geometry: object providing ``unique_materials``, ``unique_surfaces``,
            ``mesh``, ``bvh``, the material/surface index arrays, ``colors``
            and ``solid_id``.
        wavelengths: equally spaced wavelengths; defaults to
            ``standard_wavelengths``.
        print_usage: when true, call ``self.print_device_usage`` at the end.
        min_free_gpu_mem: number of bytes of device memory to leave free
            when sizing the on-GPU part of the BVH and deciding whether
            triangles/vertices fit.
        cl_context: OpenCL context, passed to the OpenCL packaging helpers
            and to the usage-reporting calls.
        cl_queue: OpenCL command queue, used by the OpenCL branches only.

    Raises:
        ValueError: if ``wavelengths`` are not equally spaced, or if the
            number of unique materials or surfaces is 255 or more.
    """
    log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)

    self.geometry = geometry
    # NOTE(review): `+=` through `self` stores the result as an instance
    # attribute; whether the assert below can ever fire depends on how
    # `instance_count` is declared on the class -- confirm.
    self.instance_count += 1
    # The assert message expression is only evaluated on failure; it prints
    # the stack as a side effect.
    assert self.instance_count == 1, traceback.print_stack()

    self.metadata = Metadata()
    self.metadata(None, 'preinfo')
    self.metadata('a', "start")
    self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem

    if wavelengths is None:
        self.wavelengths = standard_wavelengths
    else:
        self.wavelengths = wavelengths

    try:
        # .item() raises ValueError unless there is exactly one distinct
        # spacing between consecutive wavelengths.
        self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')

    # this is where things get difficult.
    # pycuda and pyopencl give us very different methods for working with structs
    #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

    # Note that, unfortunately, the data types returned differ per API:
    # the CUDA helpers return three values each, the OpenCL helpers return
    # the data plus a byte count.
    if api.is_gpu_api_cuda():
        self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
            geometry, self.wavelengths, self.wavelength_step)
        self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
            geometry, self.wavelengths, self.wavelength_step)
    elif api.is_gpu_api_opencl():
        self.material_data, materials_bytes_cl = self._package_material_data_cl(
            cl_context, cl_queue, geometry, self.wavelengths,
            self.wavelength_step)
        self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
            cl_context, cl_queue, geometry, self.wavelengths,
            self.wavelength_step)

    self.metadata('b', "after materials,surfaces")
    if api.is_gpu_api_opencl():
        self.metadata[
            'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl  # opencl, we have to track this ourselves

    # Load Vertices and Triangles
    if api.is_gpu_api_cuda():
        # CUDA: mesh arrays start out in mapped host memory; the 4-column
        # host copies leave the last column zero.
        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3,
                                     write_combined=True)
        self.vertices4 = np.zeros(shape=(len(self.vertices), 4),
                                  dtype=np.float32)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3,
                                      write_combined=True)
        self.triangles4 = np.zeros(shape=(len(self.triangles), 4),
                                   dtype=np.uint32)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.vertices4[:, :-1] = self.vertices.ravel().view(
            np.float32).reshape(len(self.vertices), 3)  # for textures
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
        self.triangles4[:, :-1] = self.triangles.ravel().view(
            np.uint32).reshape(len(self.triangles), 3)  # for textures
    elif api.is_gpu_api_opencl():
        self.vertices = ga.empty(cl_queue,
                                 len(geometry.mesh.vertices),
                                 dtype=ga.vec.float3)
        self.triangles = ga.empty(cl_queue,
                                  len(geometry.mesh.triangles),
                                  dtype=ga.vec.uint3)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)

    if api.is_gpu_api_cuda():
        self.world_origin = ga.vec.make_float3(
            *geometry.bvh.world_coords.world_origin)
    elif api.is_gpu_api_opencl():
        self.world_origin = ga.vec.make_float3(
            *geometry.bvh.world_coords.world_origin)
        #self.world_origin = geometry.bvh.world_coords.world_origin
        self.world_origin = ga.to_device(cl_queue, self.world_origin)
        # NOTE(review): debugging output; assumed to belong to the OpenCL
        # branch (the source's indentation was lost) -- confirm.
        print type(self.world_origin), self.world_origin
    self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

    # Load material and surface indices into 8-bit codes
    # check if we've reached a complexity threshold
    if len(geometry.unique_materials) >= int(0xff):
        raise ValueError(
            'Number of materials to index has hit maximum of %d' %
            (int(0xff)))
    if len(geometry.unique_surfaces) >= int(0xff):
        raise ValueError(
            'Number of surfaces to index has hit maximum of %d' %
            (int(0xff)))

    # make bit code: material1 in bits 24-31, material2 in bits 16-23,
    # surface in bits 8-15
    material_codes = (((geometry.material1_index & 0xff) << 24) |
                      ((geometry.material2_index & 0xff) << 16) |
                      ((geometry.surface_index & 0xff) << 8)).astype(
                          np.uint32)
    if api.is_gpu_api_cuda():
        self.material_codes = ga.to_gpu(material_codes)
    elif api.is_gpu_api_opencl():
        self.material_codes = ga.to_device(cl_queue, material_codes)

    # assign color codes
    colors = geometry.colors.astype(np.uint32)
    if api.is_gpu_api_cuda():
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
    elif api.is_gpu_api_opencl():
        self.colors = ga.to_device(cl_queue, colors)
        self.solid_id_map = ga.to_device(
            cl_queue, geometry.solid_id.astype(np.uint32))

    # Limit memory usage by splitting BVH into on and off-GPU parts
    self.metadata('c', "after colors, idmap")
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        # OpenCL: free memory is estimated as the recorded total minus the
        # bytes tallied so far.
        gpu_total = self.metadata['gpu_total']
        meshdef_nbytes_cl = self.vertices.nbytes + self.triangles.nbytes + self.world_origin.nbytes + self.world_scale.nbytes + self.material_codes.nbytes + self.colors.nbytes + self.solid_id_map.nbytes
        self.metadata[
            'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
        gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl +
                                meshdef_nbytes_cl)

    # Figure out how many elements we can fit on the GPU,
    # but no fewer than 100 elements, and no more than the number of actual nodes
    n_nodes = len(geometry.bvh.nodes)
    split_index = min(
        max(
            int((gpu_free - min_free_gpu_mem) /
                geometry.bvh.nodes.itemsize), 100), n_nodes)
    print "split index=", split_index, " vs. total nodes=", n_nodes

    # push nodes to GPU
    if api.is_gpu_api_cuda():
        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
    elif api.is_gpu_api_opencl():
        self.nodes = ga.to_device(cl_queue,
                                  geometry.bvh.nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index))  # forbid zero size

    # left over nodes
    if api.is_gpu_api_cuda():
        self.extra_nodes = mapped_empty(shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype,
                                        write_combined=True)
    elif api.is_gpu_api_opencl():
        self.extra_nodes = ga.empty(cl_queue,
                                    shape=n_extra,
                                    dtype=geometry.bvh.nodes.dtype)

    if split_index < n_nodes:
        log.info('Splitting BVH between GPU and CPU memory at node %d' %
                 split_index)
        self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
        splitting = 1
    else:
        splitting = 0
    self.metadata('d', "after nodes")
    if api.is_gpu_api_opencl():
        nodes_nbytes_cl = self.nodes.nbytes
        self.metadata[
            'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
    self.metadata.array("d_nodes", geometry.bvh.nodes)
    self.metadata['d_split_index'] = split_index
    self.metadata['d_extra_nodes_count'] = n_extra
    self.metadata['d_splitting'] = splitting
    self.print_device_usage(cl_context=cl_context)

    # CUDA See if there is enough memory to put the vertices and/or triangles back on the GPU
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        gpu_total = self.metadata['gpu_total']
        gpu_free = gpu_total - self.metadata['d_gpu_used']
    self.metadata.array('e_triangles', self.triangles)
    if api.is_gpu_api_cuda():
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            log.info(
                'Optimization: Sufficient memory to move triangles onto GPU'
            )
            ftriangles_gpu = 1
        else:
            log.warn('using host mapped memory triangles')
            ftriangles_gpu = 0
    elif api.is_gpu_api_opencl():
        # OpenCL: the copy is commented out, so only the flag and the log
        # message change here.
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            #self.triangles = ga.to_device(cl_queue,self.triangles)
            log.info(
                'Optimization: Sufficient memory to move triangles onto GPU'
            )
            ftriangles_gpu = 1
        else:
            log.warn('using host mapped memory triangles')
            ftriangles_gpu = 0

    self.metadata('e', "after triangles")
    self.metadata['e_triangles_gpu'] = ftriangles_gpu

    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        gpu_total = self.metadata['gpu_total']
        gpu_free = gpu_total - self.metadata['d_gpu_used']

    self.metadata.array('f_vertices', self.vertices)

    if api.is_gpu_api_cuda():
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            log.info(
                'Optimization: Sufficient memory to move vertices onto GPU'
            )
            vertices_gpu = 1
        else:
            log.warn('using host mapped memory vertices')
            vertices_gpu = 0
    elif api.is_gpu_api_opencl():
        # OpenCL: as for triangles, the copy itself is commented out.
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            #self.vertices = ga.to_gpu(self.vertices)
            log.info(
                'Optimization: Sufficient memory to move vertices onto GPU'
            )
            vertices_gpu = 1
        else:
            log.warn('using host mapped memory vertices')
            vertices_gpu = 0

    self.metadata('f', "after vertices")
    self.metadata['f_vertices_gpu'] = vertices_gpu

    if api.is_gpu_api_cuda():
        geometry_source = cutools.get_cu_source('geometry_types.h')
        geometry_struct_size = characterize.sizeof('Geometry',
                                                   geometry_source)
        # NOTE(review): member order presumably has to match the `Geometry`
        # struct in geometry_types.h -- confirm before reordering.
        self.gpudata = make_gpu_struct(geometry_struct_size, [
            Mapped(self.vertices),
            Mapped(self.triangles), self.material_codes, self.colors,
            self.nodes,
            Mapped(self.extra_nodes), self.material_pointer_array,
            self.surface_pointer_array, self.world_origin,
            self.world_scale,
            np.int32(len(self.nodes))
        ])
    elif api.is_gpu_api_opencl():
        # No relevant way to pass struct into OpenCL kernel. We have to pass everything by arrays
        # We then build a geometry struct later in the kernel
        # provided below is example/test of passing the data
        #if True: # for debugging
        # NOTE(review): everything through `raise RuntimeError('bail')` is
        # assumed to sit inside this disabled branch (the source's
        # indentation was lost) -- confirm.
        if False:
            # print "loading geometry_structs.cl"
            geostructsmod = cltools.get_cl_module(
                "geometry_structs.cl",
                cl_context,
                options=cltools.cl_options,
                include_source_directory=True)
            geostructsfunc = GPUFuncs(geostructsmod)
            geostructsfunc.make_geostruct(
                cl_queue, (3, ), None, self.vertices.data,
                self.triangles.data, self.material_codes.data,
                self.colors.data, self.nodes.data, self.extra_nodes.data,
                np.int32(len(geometry.unique_materials)),
                self.material_data['refractive_index'].data,
                self.material_data['absorption_length'].data,
                self.material_data['scattering_length'].data,
                self.material_data['reemission_prob'].data,
                self.material_data['reemission_cdf'].data,
                np.int32(len(geometry.unique_surfaces)),
                self.surface_data['detect'].data,
                self.surface_data['absorb'].data,
                self.surface_data['reemit'].data,
                self.surface_data['reflect_diffuse'].data,
                self.surface_data['reflect_specular'].data,
                self.surface_data['eta'].data, self.surface_data['k'].data,
                self.surface_data['reemission_cdf'].data,
                self.surface_data['model'].data,
                self.surface_data['transmissive'].data,
                self.surface_data['thickness'].data,
                self.surface_data['nplanes'].data,
                self.surface_data['wire_diameter'].data,
                self.surface_data['wire_pitch'].data,
                self.world_origin.data, self.world_scale,
                np.int32(len(self.nodes)), self.material_data['n'],
                self.material_data['step'],
                self.material_data["wavelength0"])
            cl_queue.finish()
            self.material_codes.get()
            raise RuntimeError('bail')

    if print_usage:
        self.print_device_usage(cl_context=cl_context)
    # NOTE(review): assumed to run regardless of print_usage (the source's
    # indentation was lost) -- confirm.
    log.info(self.device_usage_str(cl_context=cl_context))
    self.metadata('g', "after geometry struct")
def calculation(in_queue, out_queue):
    """Run one simulation job on a CUDA device and post its result.

    One ``(device_num, params)`` tuple is read from *in_queue*.  The kernels
    rendered from ``TEMPLATE`` are compiled, the RNG states are initialised
    and the ``calculate`` kernel is launched ``chunks_num * subchunks_num``
    times.  Exactly one object is put on *out_queue*:

    * ``params['binning'] is None`` -- a dict mapping every quantity name to
      a nested list with ``decoherence_steps + 1`` rows and ``chunks_num``
      columns holding the real part of the buffer means, averaged over the
      sub-chunks of each chunk.
    * otherwise -- a list with one ``[histogram, edges]`` pair (nested lists)
      per binning entry; histograms are accumulated over all launches, the
      edges are those of the last launch.

    Keys read from ``params``: ``chunk_size``, ``chunks_num``, ``particles``,
    ``state``, ``representation``, ``quantities``, ``decoherence`` (``None``
    or a dict with ``steps`` and ``coeff``) and ``binning`` (``None`` or an
    iterable of ``(names, binnum, ranges)``).

    The CUDA context created here is always detached, also when compilation
    or a kernel launch raises; the exception is then propagated.
    """
    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    binning = params['binning']
    if binning is not None:
        # When binning, only the quantities that appear in a bin are computed.
        binned_names = set()
        for names, _, _ in binning:
            binned_names.update(names)
        quantities = sorted(binned_names)

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_ctype = 'double'

    cuda.init()
    device = cuda.Device(device_num)
    ctx = device.make_context()
    try:
        _, total = cuda.mem_get_info()

        # Largest power of ten of complex elements per quantity that fits in
        # the device's total memory, with a 10% margin.
        max_chunk_size = (float(total) / len(quantities)
                          / numpy.dtype(c_dtype).itemsize / 1.1)
        max_chunk_size = 10 ** int(numpy.log(max_chunk_size) / numpy.log(10))

        if max_chunk_size > chunk_size:
            subchunk_size = chunk_size
            subchunks_num = 1
        else:
            assert chunk_size % max_chunk_size == 0
            subchunk_size = max_chunk_size
            # Floor division keeps this an integer count on every Python.
            subchunks_num = chunk_size // subchunk_size

        # Buffers are ordered by sorted quantity name; every later lookup
        # uses the same ordering.
        ordered_quantities = sorted(quantities)
        buffers = [GPUArray(subchunk_size, c_dtype)
                   for _ in ordered_quantities]

        stream = cuda.Stream()

        # compile code
        try:
            source = TEMPLATE.render(
                c_ctype=c_ctype, s_ctype=s_ctype,
                particles=particles, state=state,
                representation=representation, quantities=quantities,
                decoherence_coeff=decoherence_coeff)
        except Exception:
            print(exceptions.text_error_template().render())
            raise

        try:
            module = SourceModule(source, no_extern_c=True)
        except Exception:
            # Dump the generated source with line numbers to help locate
            # the compiler error, then re-raise.
            for lineno, text in enumerate(source.split("\n"), 1):
                print("%s : %s" % (lineno, text))
            raise

        kernel_initialize = module.get_function("initialize")
        kernel_calculate = module.get_function("calculate")
        kernel_decoherence = module.get_function("decoherence")

        # prepare call parameters
        gen_block_size = min(
            kernel_initialize.max_threads_per_block,
            kernel_calculate.max_threads_per_block)
        gen_grid_size = device.get_attribute(
            cuda.device_attribute.MULTIPROCESSOR_COUNT)
        gen_block = (gen_block_size, 1, 1)
        gen_grid = (gen_grid_size, 1, 1)

        num_gen = gen_block_size * gen_grid_size
        assert num_gen <= 20000

        # prepare RNG states: one random 32-bit seed per generator thread
        seeds = to_gpu(
            numpy.random.randint(0, 2**32 - 1,
                                 size=num_gen).astype(numpy.uint32))
        state_type_size = sizeof("curandStateXORWOW",
                                 "#include <curand_kernel.h>")
        states = cuda.mem_alloc(num_gen * state_type_size)

        kernel_initialize(states, seeds.gpudata,
                          block=gen_block, grid=gen_grid, stream=stream)

        # run calculation
        args = ([states] + [buf.gpudata for buf in buffers]
                + [numpy.int32(subchunk_size)])
        launches = chunks_num * subchunks_num

        if binning is None:
            results = {
                quantity: numpy.zeros((decoherence_steps + 1, launches),
                                      c_dtype)
                for quantity in quantities
            }
            for i in range(launches):
                kernel_calculate(*args, block=gen_block, grid=gen_grid,
                                 stream=stream)

                for k in range(decoherence_steps + 1):
                    if k > 0:
                        kernel_decoherence(*args, block=gen_block,
                                           grid=gen_grid, stream=stream)

                    for j, quantity in enumerate(ordered_quantities):
                        F = (gpuarray.sum(buffers[j], stream=stream)
                             / buffers[j].size).get()
                        results[quantity][k, i] = F

            for quantity in ordered_quantities:
                results[quantity] = results[quantity].reshape(
                    decoherence_steps + 1, chunks_num,
                    subchunks_num).mean(2).real.tolist()

            out_queue.put(results)
        else:
            bin_accums = [
                numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
                for vals, binnum, _ in binning
            ]
            bin_edges = [None] * len(binning)

            for _ in range(launches):
                bin_edges = []
                kernel_calculate(*args, block=gen_block, grid=gen_grid,
                                 stream=stream)
                results = {
                    quantity: buffers[j].get().real
                    for j, quantity in enumerate(ordered_quantities)
                }

                for binparam, bin_accum in zip(binning, bin_accums):
                    qnames, binnum, ranges = binparam
                    sample_lines = [results[quantity] for quantity in qnames]
                    sample = numpy.concatenate(
                        [arr.reshape(subchunk_size, 1)
                         for arr in sample_lines], axis=1)

                    hist, edges = numpy.histogramdd(sample, binnum, ranges)
                    # histogramdd returns float counts; cast explicitly, the
                    # in-place add into an int64 array is otherwise rejected
                    # by NumPy's same-kind casting rule.
                    bin_accum += hist.astype(numpy.int64)
                    bin_edges.append(numpy.array(edges))

            results = [[acc.tolist(), edges.tolist()]
                       for acc, edges in zip(bin_accums, bin_edges)]
            out_queue.put(results)
    finally:
        # Release the context even if something above raised.
        ctx.detach()
def AlgorithmMCMC(graph_file, file_parameters=None):
    """Run the GPU MCMC graph-colouring loop and write its log files.

    The graph is loaded with ``GraphColor("Graph/" + graph_file)`` and the
    kernels are compiled from ``CUDA/Cuda.cu``.  Starting from the colouring
    produced by the ``initColoring`` kernel, ``selectStarColoring`` is applied
    until the conflict count reaches zero or ``maxRip`` attempts were made.

    Progress is printed and appended to
    ``Out/<nb_nodes>-<nb_edges>-logFile.txt``; the parameters, the initial and
    final colours of every node and the number of nodes per colour are
    written to ``Out/<nb_nodes>-<nb_edges>-resultsFile.txt``.  Both files are
    closed when the function returns or raises.

    Args:
        graph_file: name of the graph file inside ``Graph/``.
        file_parameters: optional argument for
            ``parser_parameters.parser_parameters``.  Entries 0..4 of its
            result override, when not ``None``, epsilon, lambda,
            ratioFreezed, maxRip and numThreads respectively.
    """
    MyGraph = GraphColor("Graph/" + str(graph_file))
    nb_nodes = MyGraph.nb_nodes
    nb_edges = MyGraph.nb_edges

    def blocks_for(work_items, threads):
        """Return a 1-D grid with enough blocks of *threads* for *work_items*."""
        # Floor division: grid dimensions must stay integers.
        return ((work_items + threads - 1) // threads, 1, 1)

    seed = int(time.time())

    threadPerBlock = (32, 1, 1)
    BlockPerGrid = blocks_for(nb_nodes, threadPerBlock[0])

    rand_states = cuda.mem_alloc(
        nb_nodes *
        characterize.sizeof('curandState', '#include <curand_kernel.h>'))

    with open('CUDA/Cuda.cu', 'r') as myfile:
        cuda_code = myfile.read()

    mod = SourceModule(cuda_code, no_extern_c=True)

    func_initCurand = mod.get_function("initCurand")
    func_initCurand(rand_states, np.uint32(seed), np.uint32(nb_nodes),
                    block=threadPerBlock, grid=BlockPerGrid,
                    time_kernel=True)

    # default parameters
    p_nb_col = MyGraph.maxDeg
    p_numThreads = 32
    p_epsilon = 1e-8
    p_lambda = 0.01
    p_ratioFreezed = 1e-2
    p_maxRip = 250

    # parameters overridden from the optional file
    if file_parameters is not None:
        parameters_from_file = parser_parameters.parser_parameters(
            file_parameters)
        if parameters_from_file[0] is not None:
            p_epsilon = float(parameters_from_file[0])
        if parameters_from_file[1] is not None:
            p_lambda = float(parameters_from_file[1])
        if parameters_from_file[2] is not None:
            p_ratioFreezed = float(parameters_from_file[2])
        if parameters_from_file[3] is not None:
            p_maxRip = int(parameters_from_file[3])
        if parameters_from_file[4] is not None:
            p_numThreads = int(parameters_from_file[4])

    # launch configuration
    threadsPerBlock = (p_numThreads, 1, 1)
    blocksPerGrid = blocks_for(nb_nodes, p_numThreads)
    blocksPerGrid_edges = blocks_for(nb_edges, threadsPerBlock[0])
    blocksPerGrid_half_edges = blocks_for(nb_edges // 2, threadsPerBlock[0])

    ##############################################
    # compute and print the memory that will be allocated on the GPU
    sizeof_uint32 = sizeof("uint32_t", "#include <stdint.h>")
    sizeof_float = sizeof("float")
    sizeof_bool = sizeof("bool")

    free_mem, tot_mem = cuda.mem_get_info()
    print("total mem: " + str(tot_mem) + " free mem: " + str(free_mem))

    tot = nb_nodes * sizeof_uint32 * 3
    print("nb_nodes * sizeof(uint32_t): " + str(
        nb_nodes * sizeof_uint32) + " x3")
    tot = tot + nb_nodes * sizeof_float * 2
    print("nb_nodes * sizeof(np.float32): " + str(
        nb_nodes * sizeof_float) + " x2")
    tot = tot + nb_edges * sizeof_uint32
    print("nb_edges * sizeof(np.uint32): " + str(
        nb_edges * sizeof_uint32) + " x1")
    tot = tot + nb_nodes * p_nb_col * sizeof_bool
    print("nb_nodes * p_nb_col * sizeof(np.bool): " + str(
        nb_nodes * p_nb_col * sizeof_bool) + " x1")
    tot = tot + nb_nodes * p_nb_col * sizeof_uint32
    print("nb_nodes * p_nb_col * sizeof(np.uint32): " + str(
        nb_nodes * p_nb_col * sizeof_uint32) + " x1")
    print("TOTAL: " + str(tot) + " bytes")

    ###############################################################
    # device allocations
    coloring_d = gpuarray.zeros(nb_nodes, np.uint32)
    starColoring_d = gpuarray.zeros(nb_nodes, np.uint32)
    qStar_d = gpuarray.zeros(nb_nodes, np.float32)
    conflictCounter_d = gpuarray.zeros(nb_edges, np.uint32)
    # np.bool_ names the same dtype as the removed np.bool alias.
    colorsChecker_d = gpuarray.zeros(nb_nodes * p_nb_col, np.bool_)
    orderedColors_d = gpuarray.zeros(nb_nodes * p_nb_col, np.uint32)

    free_mem, tot_mem = cuda.mem_get_info()
    print("total memory: " + str(tot_mem) + " free memory: " + str(free_mem))

    print("ColoringMCMC GPU")
    print("nbCol: " + str(p_nb_col))
    print("numThreads: " + str(p_numThreads))
    print("epsilon: " + str(p_epsilon))
    print("lambda: " + str(p_lambda))
    print("ratioFreezed: " + str(p_ratioFreezed))
    print("maxRip: " + str(p_maxRip))

    func_initColoring = mod.get_function("initColoring")
    func_conflictChecker = mod.get_function("conflictChecker")
    func_sumReduction = mod.get_function("sumReduction")
    func_selectStarColoring = mod.get_function("selectStarColoring")

    def count_conflicts(coloring):
        """Run conflictChecker + sumReduction and sum the per-block results."""
        func_conflictChecker(np.uint32(nb_edges), conflictCounter_d,
                             coloring, MyGraph.cuda_edges,
                             grid=blocksPerGrid_edges, block=threadsPerBlock,
                             time_kernel=True)
        func_sumReduction(np.uint32(nb_edges), conflictCounter_d,
                          grid=blocksPerGrid_half_edges,
                          block=threadsPerBlock,
                          shared=(threadsPerBlock[0] * sizeof_uint32),
                          time_kernel=True)
        partial_sums = conflictCounter_d.get()
        # Only the first blocksPerGrid_half_edges[0] entries are added up.
        return sum(int(partial)
                   for partial in partial_sums[:blocksPerGrid_half_edges[0]])

    def write_colors(out, colors):
        """Write one ``node <index>: <colour>`` line per node."""
        for index, color in enumerate(colors):
            out.write("node " + str(index) + ": " + str(color) + "\n")

    file_prefix = "Out/" + str(nb_nodes) + "-" + str(nb_edges)
    # The context manager closes both files on every exit path.
    with open(file_prefix + "-logFile.txt", "wt") as logFile, \
            open(file_prefix + "-resultsFile.txt", "wt") as resultsFile:
        for out in (logFile, resultsFile):
            out.write("nbCol: " + str(p_nb_col) + "\n")
            out.write("epsilon: " + str(p_epsilon) + "\n")
            out.write("lambda: " + str(p_lambda) + "\n")
            out.write("ratioFreezed: " + str(p_ratioFreezed) + "\n")
            out.write("maxRip: " + str(p_maxRip) + "\n")

        ################################################################
        # assign an initial colour to each of the nb_nodes nodes
        func_initColoring(np.uint32(nb_nodes), coloring_d,
                          np.float32(p_nb_col), rand_states,
                          block=threadsPerBlock, grid=blocksPerGrid,
                          time_kernel=True)

        ################################################################
        # run the MCMC algorithm
        resultsFile.write("\n_______________\n")
        resultsFile.write("Initial colors:\n\n")
        write_colors(resultsFile, coloring_d.get())

        tStart = tm.time()

        # number of conflicts before the first attempt
        conflictCounter = count_conflicts(coloring_d)

        for rip in range(1, p_maxRip + 1):
            if conflictCounter == 0:
                break

            print("<<< Tentative numero: " + str(rip) + " >>>")
            print("conflits relatifs avant: " + str(conflictCounter))

            logFile.write("<<< Tentative numero: " + str(rip) + " >>>\n")
            logFile.write("conflits relatifs avant: " +
                          str(conflictCounter) + "\n")

            colorsChecker_d.fill(0)
            orderedColors_d.fill(0)

            # NOTE(review): np.uint32(p_epsilon) truncates the default 1e-8
            # to 0.  Kept as-is because the kernel signature is not visible
            # here -- confirm whether selectStarColoring expects a float.
            func_selectStarColoring(np.uint32(nb_nodes), starColoring_d,
                                    qStar_d, np.uint32(p_nb_col), coloring_d,
                                    MyGraph.cuda_listCumulDeg,
                                    MyGraph.cuda_listNeighbors,
                                    colorsChecker_d, orderedColors_d,
                                    rand_states, np.uint32(p_epsilon),
                                    grid=blocksPerGrid,
                                    block=threadsPerBlock, time_kernel=True)

            # the proposed colouring becomes the current one
            coloring_d, starColoring_d = starColoring_d, coloring_d

            # number of conflicts after this attempt
            conflictCounter = count_conflicts(coloring_d)

            print("conflits relatifs apres: " + str(conflictCounter) + "\n")
            logFile.write("conflits relatifs apres: " +
                          str(conflictCounter) + "\n")

        # end of the algorithm
        print('Fin MCMC en : %.3f s' % (tm.time() - tStart))
        logFile.write("Fin MCMC\n")

        colors = coloring_d.get()
        resultsFile.write("\n_____________\n")
        resultsFile.write("Final colors:\n\n")
        write_colors(resultsFile, colors)

        resultsFile.write("\n___________________________\n")
        resultsFile.write("Counter of each color used:\n")
        counter_colors_used = Counter(colors)
        for key, value in counter_colors_used.items():
            resultsFile.write("\nColor " + str(key) + ": " + str(value))
def get_rng_states(size_output, seed=1):
    """Generate ``size_output`` Poisson(mean=10) integers on the GPU.

    Despite its name the function does not return RNG states: it creates a
    context on ``curr_gpu``, initialises one cuRAND state per block of 1024
    threads, fills a device array with ``curand_poisson`` samples and returns
    a host copy.  The context is popped again before returning, also when an
    error is raised.

    Args:
        size_output: number of random integers to produce.
        seed: seed handed to ``curand_init`` for every state.

    Returns:
        A 1-D ``np.int32`` host array of length ``size_output``.  If the
        ``make_rand`` launch raises ``cuda.LogicError`` a message is printed
        and whatever the device buffer contains is returned.
    """
    init_rng_src = """
    #include <curand_kernel.h>

    extern "C"
    {

    __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
    {
        int id = blockIdx.x*blockDim.x + threadIdx.x;

        if (id >= nthreads)
            return;

        curand_init(seed, id, offset, &s[id]);
    }

    __global__ void make_rand(int nthreads, curandState *state, int *randArray)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        int id_rng = blockIdx.x;
        double mean = 10;
        if (idx < nthreads){
            randArray[idx] = curand_poisson(&state[id_rng], mean);
            //randArray[idx] = curand_uniform(&state[idx]);
        }
    }

    } // extern "C"
    """
    # The kernel guard above is `idx < nthreads`: randArray holds exactly
    # nthreads elements, so index nthreads would be out of bounds.
    # NOTE(review): every thread of a block uses the same state
    # (state[blockIdx.x]) without synchronisation -- confirm this is intended.

    curr_gpu.make_context()
    try:
        gpu_vect_rand = ga.GPUArray((size_output,), dtype=np.int32)
        cpu_vect_rand = np.ones((size_output,), dtype=np.int32)

        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))

        module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
        init_rng = module.get_function("init_rng")
        make_rand = module.get_function("make_rand")

        size_block = 1024
        n_blocks = size_output // size_block + 1

        # One RNG state per make_rand block.
        rng_states = cuda.mem_alloc(
            n_blocks
            * characterize.sizeof("curandStateXORWOW",
                                  "#include <curand_kernel.h>")
        )
        init_rng(
            np.int32(n_blocks),
            rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_blocks // 64 + 1, 1),
        )

        try:
            make_rand(
                np.int32(size_output),
                rng_states,
                gpu_vect_rand,
                block=(size_block, 1, 1),
                grid=(n_blocks, 1),
            )
        except cuda.LogicError:
            # Best effort: report and still return the buffer contents.
            print("random number generation failed ...")

        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))

        rng_states.free()
        gpu_vect_rand.get(ary=cpu_vect_rand)
    finally:
        # Undo make_context() on every exit path.
        cuda.Context.pop()

    return cpu_vect_rand
def __init__(self, geometry, wavelengths=None, times=None, print_usage=False, min_free_gpu_mem=300e6):
    """Upload *geometry* (materials, surfaces, mesh and BVH) to the GPU.

    Material and surface properties are linearly interpolated onto
    *wavelengths* (time CDFs onto *times*), copied to the device and packed
    into ``Material`` / ``Surface`` structs whose sizes are taken from
    ``geometry_types.h``.  The BVH nodes are split between device memory and
    host-mapped memory according to the free device memory; triangles and
    vertices start in host-mapped memory and are moved to the device when
    enough memory is left.  The resulting ``Geometry`` struct is stored in
    ``self.gpudata``.

    Args:
        geometry: object providing ``unique_materials``, ``unique_surfaces``,
            ``mesh``, ``bvh``, ``material1_index``, ``material2_index``,
            ``surface_index``, ``colors`` and ``solid_id``.
        wavelengths: equally spaced sample grid; defaults to
            ``standard_wavelengths``.
        times: equally spaced time grid; defaults to
            ``np.arange(0, 1000, 0.05)``.
        print_usage: when true, call ``self.print_device_usage()`` at the end.
        min_free_gpu_mem: number of bytes of device memory to leave free when
            deciding what is placed on the GPU.

    Raises:
        ValueError: if *wavelengths* or *times* are not equally spaced.
        Exception: if an entry of ``geometry.unique_materials`` is ``None``.
    """
    if wavelengths is None:
        wavelengths = standard_wavelengths

    try:
        # .item() raises ValueError unless all spacings are identical.
        wavelength_step = np.unique(np.diff(wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')

    if times is None:
        time_step = 0.05
        times = np.arange(0,1000,time_step)
    else:
        try:
            time_step = np.unique(np.diff(times)).item()
        except ValueError:
            raise ValueError('times must be equally spaced apart.')

    geometry_source = get_cu_source('geometry_types.h')
    material_struct_size = characterize.sizeof('Material', geometry_source)
    surface_struct_size = characterize.sizeof('Surface', geometry_source)
    dichroicprops_struct_size = characterize.sizeof('DichroicProps', geometry_source)
    geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

    # material_data / surface_data hold references to every device
    # allocation so that it outlives this constructor.
    self.material_data = []
    self.material_ptrs = []

    def interp_material_property(grid, samples):
        # note that it is essential that the material properties be
        # interpolated linearly. this fact is used in the propagation
        # code to guarantee that probabilities still sum to one.
        return np.interp(grid, samples[:,0], samples[:,1]).astype(np.float32)

    def upload_components(grid, components):
        """Upload each component on *grid*; return (arrays, pointer table).

        The pointer table is a device array of 8-byte pointers, or a NULL
        placeholder (``np.uint64(0)``) when there are no components.
        """
        arrays = [ga.to_gpu(interp_material_property(grid, component))
                  for component in components]
        if len(arrays) == 0:
            return arrays, np.uint64(0)
        return arrays, make_gpu_struct(8*len(arrays), arrays)

    for material in geometry.unique_materials:
        if material is None:
            raise Exception('one or more triangles is missing a material.')

        refractive_index_gpu = ga.to_gpu(
            interp_material_property(wavelengths, material.refractive_index))
        absorption_length_gpu = ga.to_gpu(
            interp_material_property(wavelengths, material.absorption_length))
        scattering_length_gpu = ga.to_gpu(
            interp_material_property(wavelengths, material.scattering_length))

        num_comp = len(material.comp_reemission_prob)
        component_arrays, comp_reemission_prob_gpu = upload_components(
            wavelengths, material.comp_reemission_prob)
        self.material_data.append(component_arrays)

        assert num_comp == len(material.comp_reemission_wvl_cdf), 'component arrays must be same length'
        component_arrays, comp_reemission_wvl_cdf_gpu = upload_components(
            wavelengths, material.comp_reemission_wvl_cdf)
        self.material_data.append(component_arrays)

        assert num_comp == len(material.comp_reemission_time_cdf), 'component arrays must be same length'
        # The time CDFs are sampled on the time grid, not on wavelengths.
        component_arrays, comp_reemission_time_cdf_gpu = upload_components(
            times, material.comp_reemission_time_cdf)
        self.material_data.append(component_arrays)

        assert num_comp == len(material.comp_absorption_length), 'component arrays must be same length'
        component_arrays, comp_absorption_length_gpu = upload_components(
            wavelengths, material.comp_absorption_length)
        self.material_data.append(component_arrays)

        self.material_data.append(refractive_index_gpu)
        self.material_data.append(absorption_length_gpu)
        self.material_data.append(scattering_length_gpu)
        self.material_data.append(comp_reemission_prob_gpu)
        self.material_data.append(comp_reemission_wvl_cdf_gpu)
        self.material_data.append(comp_reemission_time_cdf_gpu)
        self.material_data.append(comp_absorption_length_gpu)

        material_gpu = \
            make_gpu_struct(material_struct_size,
                            [refractive_index_gpu, absorption_length_gpu,
                             scattering_length_gpu,
                             comp_reemission_prob_gpu,
                             comp_reemission_wvl_cdf_gpu,
                             comp_reemission_time_cdf_gpu,
                             comp_absorption_length_gpu,
                             np.uint32(num_comp),
                             np.uint32(len(wavelengths)),
                             np.float32(wavelength_step),
                             np.float32(wavelengths[0]),
                             np.uint32(len(times)),
                             np.float32(time_step),
                             np.float32(times[0])])

        self.material_ptrs.append(material_gpu)

    self.material_pointer_array = \
        make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

    self.surface_data = []
    self.surface_ptrs = []

    for surface in geometry.unique_surfaces:
        if surface is None:
            # need something to copy to the surface array struct
            # that is the same size as a 64-bit pointer.
            # this pointer will never be used by the simulation.
            self.surface_ptrs.append(np.uint64(0))
            continue

        detect_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.detect))
        absorb_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.absorb))
        reemit_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.reemit))
        reflect_diffuse_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.reflect_diffuse))
        reflect_specular_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.reflect_specular))
        eta_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.eta))
        k_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.k))
        reemission_cdf_gpu = ga.to_gpu(
            interp_material_property(wavelengths, surface.reemission_cdf))

        if surface.dichroic_props:
            props = surface.dichroic_props
            transmit_pointers = []
            reflect_pointers = []
            angles_gpu = ga.to_gpu(np.asarray(props.angles,dtype=np.float32))
            self.surface_data.append(angles_gpu)

            # One reflect and one transmit curve per tabulated angle.
            for angle_index in range(len(props.angles)):
                dichroic_reflect_gpu = ga.to_gpu(interp_material_property(
                    wavelengths, props.dichroic_reflect[angle_index]))
                self.surface_data.append(dichroic_reflect_gpu)
                reflect_pointers.append(dichroic_reflect_gpu)

                dichroic_transmit_gpu = ga.to_gpu(interp_material_property(
                    wavelengths, props.dichroic_transmit[angle_index]))
                self.surface_data.append(dichroic_transmit_gpu)
                transmit_pointers.append(dichroic_transmit_gpu)

            reflect_arr_gpu = make_gpu_struct(8*len(reflect_pointers),reflect_pointers)
            self.surface_data.append(reflect_arr_gpu)
            transmit_arr_gpu = make_gpu_struct(8*len(transmit_pointers), transmit_pointers)
            self.surface_data.append(transmit_arr_gpu)
            dichroic_props = make_gpu_struct(dichroicprops_struct_size,[angles_gpu,reflect_arr_gpu,transmit_arr_gpu,np.uint32(len(props.angles))])
        else:
            dichroic_props = np.uint64(0) #NULL

        self.surface_data.append(detect_gpu)
        self.surface_data.append(absorb_gpu)
        self.surface_data.append(reemit_gpu)
        self.surface_data.append(reflect_diffuse_gpu)
        self.surface_data.append(reflect_specular_gpu)
        self.surface_data.append(eta_gpu)
        self.surface_data.append(k_gpu)
        # Keep the reemission CDF referenced like every other array that is
        # placed in the Surface struct; presumably the struct stores only the
        # raw device pointer (make_gpu_struct is not visible here), so the
        # array must not be released while the struct is in use.
        self.surface_data.append(reemission_cdf_gpu)
        self.surface_data.append(dichroic_props)

        surface_gpu = \
            make_gpu_struct(surface_struct_size,
                            [detect_gpu, absorb_gpu, reemit_gpu,
                             reflect_diffuse_gpu,reflect_specular_gpu,
                             eta_gpu, k_gpu, reemission_cdf_gpu,
                             dichroic_props,
                             np.uint32(surface.model),
                             np.uint32(len(wavelengths)),
                             np.uint32(surface.transmissive),
                             np.float32(wavelength_step),
                             np.float32(wavelengths[0]),
                             np.float32(surface.thickness)])

        self.surface_ptrs.append(surface_gpu)

    self.surface_pointer_array = \
        make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

    self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                 dtype=ga.vec.float3,
                                 write_combined=True)
    self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                  dtype=ga.vec.uint3,
                                  write_combined=True)
    self.vertices[:] = to_float3(geometry.mesh.vertices)
    self.triangles[:] = to_uint3(geometry.mesh.triangles)

    self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
    self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

    # Pack material1, material2 and surface indices (8 bits each) into the
    # three high bytes of one 32-bit code per triangle.
    material_codes = (((geometry.material1_index & 0xff) << 24) |
                      ((geometry.material2_index & 0xff) << 16) |
                      ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
    self.material_codes = ga.to_gpu(material_codes)
    colors = geometry.colors.astype(np.uint32)
    self.colors = ga.to_gpu(colors)
    self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

    # Limit memory usage by splitting BVH into on and off-GPU parts
    gpu_free, gpu_total = cuda.mem_get_info()

    # Figure out how many elements we can fit on the GPU,
    # but no fewer than 100 elements, and no more than the number of actual nodes
    n_nodes = len(geometry.bvh.nodes)
    split_index = min(
        max(int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize),100),
        n_nodes
        )

    self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index)) # forbid zero size
    self.extra_nodes = mapped_empty(shape=n_extra,
                                    dtype=geometry.bvh.nodes.dtype,
                                    write_combined=True)
    if split_index < n_nodes:
        logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
        self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

    # See if there is enough memory to put the vertices and/or triangles
    # back on the GPU
    gpu_free, gpu_total = cuda.mem_get_info()
    if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
        self.triangles = ga.to_gpu(self.triangles)
        logger.info('Optimization: Sufficient memory to move triangles onto GPU')

    gpu_free, gpu_total = cuda.mem_get_info()
    if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
        self.vertices = ga.to_gpu(self.vertices)
        logger.info('Optimization: Sufficient memory to move vertices onto GPU')

    self.gpudata = make_gpu_struct(geometry_struct_size,
                                   [Mapped(self.vertices),
                                    Mapped(self.triangles),
                                    self.material_codes,
                                    self.colors, self.nodes,
                                    Mapped(self.extra_nodes),
                                    self.material_pointer_array,
                                    self.surface_pointer_array,
                                    self.world_origin,
                                    self.world_scale,
                                    np.int32(len(self.nodes))])

    self.geometry = geometry

    if print_usage:
        self.print_device_usage()
    logger.info(self.device_usage_str())
def calculation(in_queue, out_queue):
    """Run one simulation job on a CUDA device and post its result.

    One ``(device_num, params)`` tuple is read from *in_queue*.  The kernels
    rendered from ``TEMPLATE`` are compiled, the RNG states are initialised
    and the ``calculate`` kernel is launched ``chunks_num * subchunks_num``
    times.  Exactly one object is put on *out_queue*:

    * ``params['binning'] is None`` -- a dict mapping every quantity name to
      a nested list with ``decoherence_steps + 1`` rows and ``chunks_num``
      columns holding the real part of the buffer means, averaged over the
      sub-chunks of each chunk.
    * otherwise -- a list with one ``[histogram, edges]`` pair (nested lists)
      per binning entry; histograms are accumulated over all launches, the
      edges are those of the last launch.

    Keys read from ``params``: ``chunk_size``, ``chunks_num``, ``particles``,
    ``state``, ``representation``, ``quantities``, ``decoherence`` (``None``
    or a dict with ``steps`` and ``coeff``) and ``binning`` (``None`` or an
    iterable of ``(names, binnum, ranges)``).

    The CUDA context created here is always detached, also when compilation
    or a kernel launch raises; the exception is then propagated.
    """
    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    binning = params['binning']
    if binning is not None:
        # When binning, only the quantities that appear in a bin are computed.
        binned_names = set()
        for names, _, _ in binning:
            binned_names.update(names)
        quantities = sorted(binned_names)

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_ctype = 'double'

    cuda.init()
    device = cuda.Device(device_num)
    ctx = device.make_context()
    try:
        _, total = cuda.mem_get_info()

        # Largest power of ten of complex elements per quantity that fits in
        # the device's total memory, with a 10% margin.
        max_chunk_size = (float(total) / len(quantities)
                          / numpy.dtype(c_dtype).itemsize / 1.1)
        max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10))

        if max_chunk_size > chunk_size:
            subchunk_size = chunk_size
            subchunks_num = 1
        else:
            assert chunk_size % max_chunk_size == 0
            subchunk_size = max_chunk_size
            # Floor division keeps this an integer count on every Python.
            subchunks_num = chunk_size // subchunk_size

        # Buffers are ordered by sorted quantity name; every later lookup
        # uses the same ordering.
        ordered_quantities = sorted(quantities)
        buffers = [GPUArray(subchunk_size, c_dtype)
                   for _ in ordered_quantities]

        stream = cuda.Stream()

        # compile code
        try:
            source = TEMPLATE.render(c_ctype=c_ctype,
                                     s_ctype=s_ctype,
                                     particles=particles,
                                     state=state,
                                     representation=representation,
                                     quantities=quantities,
                                     decoherence_coeff=decoherence_coeff)
        except Exception:
            print(exceptions.text_error_template().render())
            raise

        try:
            module = SourceModule(source, no_extern_c=True)
        except Exception:
            # Dump the generated source with line numbers to help locate
            # the compiler error, then re-raise.
            for lineno, text in enumerate(source.split("\n"), 1):
                print("%s : %s" % (lineno, text))
            raise

        kernel_initialize = module.get_function("initialize")
        kernel_calculate = module.get_function("calculate")
        kernel_decoherence = module.get_function("decoherence")

        # prepare call parameters
        gen_block_size = min(kernel_initialize.max_threads_per_block,
                             kernel_calculate.max_threads_per_block)
        gen_grid_size = device.get_attribute(
            cuda.device_attribute.MULTIPROCESSOR_COUNT)
        gen_block = (gen_block_size, 1, 1)
        gen_grid = (gen_grid_size, 1, 1)

        num_gen = gen_block_size * gen_grid_size
        assert num_gen <= 20000

        # prepare RNG states: one random 32-bit seed per generator thread
        seeds = to_gpu(
            numpy.random.randint(0, 2**32 - 1,
                                 size=num_gen).astype(numpy.uint32))
        state_type_size = sizeof("curandStateXORWOW",
                                 "#include <curand_kernel.h>")
        states = cuda.mem_alloc(num_gen * state_type_size)

        kernel_initialize(states, seeds.gpudata,
                          block=gen_block, grid=gen_grid, stream=stream)

        # run calculation
        args = ([states] + [buf.gpudata for buf in buffers]
                + [numpy.int32(subchunk_size)])
        launches = chunks_num * subchunks_num

        if binning is None:
            results = {
                quantity: numpy.zeros((decoherence_steps + 1, launches),
                                      c_dtype)
                for quantity in quantities
            }
            for i in range(launches):
                kernel_calculate(*args, block=gen_block, grid=gen_grid,
                                 stream=stream)

                for k in range(decoherence_steps + 1):
                    if k > 0:
                        kernel_decoherence(*args, block=gen_block,
                                           grid=gen_grid, stream=stream)

                    for j, quantity in enumerate(ordered_quantities):
                        F = (gpuarray.sum(buffers[j], stream=stream)
                             / buffers[j].size).get()
                        results[quantity][k, i] = F

            for quantity in ordered_quantities:
                results[quantity] = results[quantity].reshape(
                    decoherence_steps + 1, chunks_num,
                    subchunks_num).mean(2).real.tolist()

            out_queue.put(results)
        else:
            bin_accums = [
                numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
                for vals, binnum, _ in binning
            ]
            bin_edges = [None] * len(binning)

            for _ in range(launches):
                bin_edges = []
                kernel_calculate(*args, block=gen_block, grid=gen_grid,
                                 stream=stream)
                results = {
                    quantity: buffers[j].get().real
                    for j, quantity in enumerate(ordered_quantities)
                }

                for binparam, bin_accum in zip(binning, bin_accums):
                    qnames, binnum, ranges = binparam
                    sample_lines = [results[quantity] for quantity in qnames]
                    sample = numpy.concatenate(
                        [arr.reshape(subchunk_size, 1)
                         for arr in sample_lines], axis=1)

                    hist, edges = numpy.histogramdd(sample, binnum, ranges)
                    # histogramdd returns float counts; cast explicitly, the
                    # in-place add into an int64 array is otherwise rejected
                    # by NumPy's same-kind casting rule.
                    bin_accum += hist.astype(numpy.int64)
                    bin_edges.append(numpy.array(edges))

            results = [[acc.tolist(), edges.tolist()]
                       for acc, edges in zip(bin_accums, bin_edges)]
            out_queue.put(results)
    finally:
        # Release the context even if something above raised.
        ctx.detach()
def _package_surface_data_cuda(self, geometry, wavelengths, wavelength_step):
    """Build the device-side ``Surface`` structs for *geometry*.

    For every non-``None`` entry of ``geometry.unique_surfaces`` the eight
    wavelength-dependent properties are interpolated with
    ``self._interp_material_property``, uploaded with ``ga.to_gpu`` and
    packed, together with the scalar fields, into a struct of the size that
    ``geometry_types.h`` declares for ``Surface``.

    Returns:
        ``(surface_data, surface_ptrs, surface_pointer_array)``: the list of
        uploaded arrays, the per-surface struct handles (``np.uint64(0)`` for
        a ``None`` surface) and a device array of 8-byte pointers built from
        those handles.
    """
    surface_data = []
    surface_ptrs = []

    header_source = cutools.get_cu_source('geometry_types.h')
    struct_size = characterize.sizeof('Surface', header_source)

    # Interpolated properties, in the order in which they are uploaded and
    # placed in the struct.
    spectral_fields = ('detect', 'absorb', 'reemit', 'reflect_diffuse',
                       'reflect_specular', 'eta', 'k', 'reemission_cdf')

    for surface in geometry.unique_surfaces:
        if surface is None:
            # need something to copy to the surface array struct
            # that is the same size as a 64-bit pointer.
            # this pointer will never be used by the simulation.
            surface_ptrs.append(np.uint64(0))
            continue

        device_arrays = []
        for field in spectral_fields:
            sampled = self._interp_material_property(
                wavelengths, getattr(surface, field))
            device_arrays.append(ga.to_gpu(sampled))

        # Keep every uploaded array referenced by the caller.
        surface_data.extend(device_arrays)

        scalar_members = [
            np.uint32(surface.model),
            np.uint32(len(wavelengths)),
            np.uint32(surface.transmissive),
            np.float32(wavelength_step),
            np.float32(wavelengths[0]),
            np.float32(surface.thickness),
            np.float32(surface.nplanes),
            np.float32(surface.wire_diameter),
            np.float32(surface.wire_pitch),
        ]
        surface_ptrs.append(
            make_gpu_struct(struct_size, device_arrays + scalar_members))

    surface_pointer_array = make_gpu_struct(8 * len(surface_ptrs),
                                            surface_ptrs)

    return surface_data, surface_ptrs, surface_pointer_array
} // end extern "C" """ mod = SourceModule(kernel_code, no_extern_c = True) # Get kernel functions local = mod.get_function('local_diffuse') non_local = mod.get_function('non_local_diffuse') survival_layer = mod.get_function('survival_of_the_fittest') population_layer = mod.get_function('population_growth') init_generators = mod.get_function('init_generators') # Initialize random number generator generator = curandom.XORWOWRandomNumberGenerator() data_type_size = sizeof(generator.state_type, "#include <curand_kernel.h>") generator._state = drv.mem_alloc((matrix_size * matrix_size) * data_type_size) seed = 123456789 init_generators(generator.state, np.int32(seed), np.int32(matrix_size), grid = (grid_dims, grid_dims), block = (block_dims, block_dims, 1)) # Run n_iters of the Brown Marmorated Stink Bug (BMSB) Diffusion Simulation run_primitive( empty_grid.vars(matrix_size) == initialize_grid.vars(matrix_size, initial_population, survival_probabilities, generator) == bmsb_stop_condition.vars(n_iters) <= local_diffusion.vars(local, matrix_size, p_local, grid_dims, block_dims) == non_local_diffusion.vars(non_local, matrix_size, p_non_local, mu, gamma, grid_dims, block_dims) == survival_function.vars(survival_layer, matrix_size, grid_dims, block_dims) == population_growth.vars(population_layer, matrix_size, growth_rate, grid_dims, block_dims) == bmsb_stop >=
def __init__(self, detector, wavelengths=None, print_usage=False,
             cl_context=None, cl_queue=None):
    """Copy a detector's channel maps and time/charge CDFs to the GPU.

    After the base ``GPUGeometry`` initialisation, the detector's lookup
    tables are converted to fixed-width numpy dtypes and uploaded with the
    active GPU API. On CUDA the uploaded arrays are additionally packed
    into a ``Detector``-sized struct (``self.detector_gpu``); on OpenCL the
    CDF lengths and charge unit are kept as scalar attributes instead.

    Args:
        detector: object providing ``solid_id_to_channel_index``,
            ``solid_id_to_channel_id``, ``time_cdf``, ``charge_cdf`` and
            ``num_channels()``.
        wavelengths: forwarded unchanged to ``GPUGeometry.__init__``.
        print_usage: forwarded to ``GPUGeometry.__init__``. It was
            previously accepted but silently replaced by ``False``.
        cl_context: forwarded to ``GPUGeometry.__init__``.
        cl_queue: forwarded to ``GPUGeometry.__init__``; on the OpenCL path
            it is also the queue handed to ``ga.to_device``.

    Raises:
        RuntimeError: if the active GPU API is neither CUDA nor OpenCL.
    """
    GPUGeometry.__init__(self, detector, wavelengths=wavelengths,
                         print_usage=print_usage,
                         cl_context=cl_context, cl_queue=cl_queue)

    use_cuda = api.is_gpu_api_cuda()
    if not use_cuda and not api.is_gpu_api_opencl():
        raise RuntimeError("GPU API is neither OpenCL nor CUDA")

    # Both back ends upload the same host arrays; only the copy call
    # differs, so select it once instead of duplicating every conversion.
    if use_cuda:
        upload = ga.to_gpu
    else:
        def upload(host_array):
            return ga.to_device(cl_queue, host_array)

    self.solid_id_to_channel_index_gpu = upload(
        detector.solid_id_to_channel_index.astype(np.int32))
    self.solid_id_to_channel_id_gpu = upload(
        detector.solid_id_to_channel_id.astype(np.int32))

    self.time_cdf_x_gpu = upload(detector.time_cdf[0].astype(np.float32))
    self.time_cdf_y_gpu = upload(detector.time_cdf[1].astype(np.float32))
    self.charge_cdf_x_gpu = upload(
        detector.charge_cdf[0].astype(np.float32))
    self.charge_cdf_y_gpu = upload(
        detector.charge_cdf[1].astype(np.float32))

    time_cdf_len = np.int32(len(detector.time_cdf[0]))
    charge_cdf_len = np.int32(len(detector.charge_cdf[0]))
    # Last x value of the charge CDF divided by 2**16.
    charge_unit = np.float32(detector.charge_cdf[0][-1] / 2**16)

    if use_cuda:
        # The CUDA path keeps the channel count as returned by the detector.
        self.nchannels = detector.num_channels()

        detector_struct_size = characterize.sizeof(
            'Detector', cutools.get_cu_source('detector.h'))
        # NOTE(review): presumably the member order must mirror the
        # Detector struct in detector.h -- confirm before reordering.
        # solid_id_to_channel_id_gpu is not packed here, as before.
        self.detector_gpu = make_gpu_struct(detector_struct_size, [
            self.solid_id_to_channel_index_gpu,
            self.time_cdf_x_gpu,
            self.time_cdf_y_gpu,
            self.charge_cdf_x_gpu,
            self.charge_cdf_y_gpu,
            np.int32(self.nchannels),
            time_cdf_len,
            charge_cdf_len,
            charge_unit,
        ])
    else:
        # The OpenCL path builds no struct here; the scalars are stored as
        # attributes instead.
        self.nchannels = np.int32(detector.num_channels())
        self.time_cdf_len = time_cdf_len
        self.charge_cdf_len = charge_cdf_len
        self.charge_unit = charge_unit
__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) { int id = blockIdx.x*blockDim.x + threadIdx.x; if (id >= nthreads) return; curand_init(seed+id, id, offset, &s[id]); } } // extern "C" """ rng_states_gpu = cuda.mem_alloc( NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK * characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) module = SourceModule(init_rng_src, no_extern_c=True) init_rng = module.get_function('init_rng') init_rng(np.int32(NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK), rng_states_gpu, np.uint32(time.time()), np.uint64(0), block=(32, NUM_RUNS_PER_BLOCK, 1), grid=(NUM_RUNS, 1)) is_simiulation = 0 if NUM_RUNS == 1: is_simulation = 1 defines = "#define NUM_ROUTES " + str(NUM_ROUTES) + "\n" +\ "#define NUM_STOPS " + str(NUM_STOPS) + "\n" +\ "#define NUM_STOPS_INTS " + str(NUM_CHARGER_INTS) + "\n" +\
from generate_random_graph import generate_filepath_pickle from pycuda import (autoinit, characterize, compiler, curandom, driver, gpuarray, tools) from timer import (cumulative_runtimes, execution_counts, find_k_seeds_runtimes, runtimes, timeit, to_csv) L_CONSTANT = 1 EPSILON_CONSTANT = 0.2 K_CONSTANT = 2 BLOCK_SIZE = 1024 TILE_X = 1 TILE_Y = 32 TILE_Z = 32 SIZEOF_GENERATOR = characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>') TWITTER_DATASET_FILEPATH = './datasets/twitter' TWITTER_DATASET_PICKLE_FILEPATH = './datasets/twitter.pickle' EDGE_FILE_SUFFIX = '.edges' RANDOM_CSR_GRAPH_FILEPATH = './datasets/random_graph.pickle' GENERATE_RR_SETS_CUDA_CODE_FILEPATH = 'node_selection.cu' # Compile kernel code with open(GENERATE_RR_SETS_CUDA_CODE_FILEPATH, "r") as fp: content = fp.read() mod = compiler.SourceModule(content, no_extern_c=True) @timeit def width(graph, nodes):