def build(self, source):
    if self.options.cuda_nvcc_opts:
        import shlex
        options = shlex.split(self.options.cuda_nvcc_opts)
    else:
        options = []
    if not self.options.cuda_fermi_highprec and self._device.compute_capability()[0] >= 2:
        options.append('--prec-div=false')
        options.append('--prec-sqrt=false')
    if self.options.cuda_disable_l1:
        options.extend(['-Xptxas', '-dlcm=cg'])
    if cuda.get_driver_version() >= 5000:
        # Generate annotated PTX code.
        options.append('-src-in-ptx')
    if self.options.cuda_cache:
        cache = None
    else:
        cache = False
    return pycuda.compiler.SourceModule(source, options=options,
                                        nvcc=self.options.cuda_nvcc,
                                        keep=self.options.cuda_keep_temp,
                                        cache_dir=cache)
    #options=['-Xopencc', '-O0'])
    #, options=['--use_fast_math'])
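# A minimal standalone sketch (not from the source) of what the options
# assembled by build() amount to on a compute >= 2.0 device with high-precision
# math and the L1 cache both disabled; the no-op kernel is illustrative only.
import pycuda.autoinit  # creates a default context on device 0
from pycuda.compiler import SourceModule

mod = SourceModule(
    "__global__ void noop() {}",
    options=['--prec-div=false', '--prec-sqrt=false', '-Xptxas', '-dlcm=cg'],
    cache_dir=None,  # None = PyCUDA's default cache dir; False disables caching
)
mod.get_function("noop")(block=(1, 1, 1), grid=(1, 1))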
def get_pycuda_info():
    init_all_devices()
    return {
        "version":               pycuda.VERSION,
        "version.text":          pycuda.VERSION_TEXT,
        "version.status":        pycuda.VERSION_STATUS,
        "driver.version":        driver.get_version(),
        "driver.driver_version": driver.get_driver_version(),
    }
def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    log.info("CUDA %s / PyCUDA %s, found %s device(s):",
             ".".join([str(x) for x in driver.get_version()]),
             pycuda.VERSION_TEXT, ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log(" created context=%s", context)
            log(" api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log(" memory: free=%sMB, total=%sMB", int(free / 1024 / 1024), int(total / 1024 / 1024))
            log(" multi-processors: %s, clock rate: %s",
                device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
            log(" max block sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_BLOCK_DIM_X),
                device.get_attribute(da.MAX_BLOCK_DIM_Y),
                device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log(" max grid sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_GRID_DIM_X),
                device.get_attribute(da.MAX_GRID_DIM_Y),
                device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log(" maximum texture size: %sx%s", max_width, max_height)
            log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor << 4) + SMminor
            log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
            try:
                DEVICES.append(i)
                log.info(" + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100 * free / total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", (device or i), e)
    return DEVICES
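# Minimal usage sketch (an assumption, not in the source): take the first
# device that init_all_devices() validated and create a context on it.
devices = init_all_devices()
if devices:
    ctx = driver.Device(devices[0]).make_context()
    try:
        pass  # compile and launch kernels here
    finally:
        ctx.pop()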
def get_info():
    return {
        "version": pycuda.VERSION,
        "version.text": pycuda.VERSION_TEXT,
        "version.status": pycuda.VERSION_STATUS,
        "driver.version": driver.get_version(),
        "driver.driver_version": driver.get_driver_version(),
    }
def get_cuda_info():
    init_all_devices()
    return {
        "driver": {
            "version": driver.get_version(),
            "driver_version": driver.get_driver_version(),
        }
    }
def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    DEVICE_INFO = {}
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus == 0:
        log.info("CUDA %s / PyCUDA %s, no devices found",
                 ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT)
        return DEVICES
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log(" created context=%s", context)
                log(" api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
                log(" multi-processors: %s, clock rate: %s",
                    device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
                log(" max block sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_BLOCK_DIM_X),
                    device.get_attribute(da.MAX_BLOCK_DIM_Y),
                    device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log(" max grid sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_GRID_DIM_X),
                    device.get_attribute(da.MAX_GRID_DIM_Y),
                    device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log(" maximum texture size: %sx%s", max_width, max_height)
                log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor<<4) + SMminor
                log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
                if i == 0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]),
                             pycuda.VERSION_TEXT, ngpus, engs(ngpus))
                DEVICES.append(i)
                log.info(" + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100*free/total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES
def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    log.info("PyCUDA version=%s", pycuda.VERSION_TEXT)
    ngpus = driver.Device.count()
    log.info("CUDA version=%s found %s device(s):",
             ".".join([str(x) for x in driver.get_version()]), ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log(" created context=%s", context)
            log(" api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
            log(" multi-processors: %s, clock rate: %s",
                device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
            log(" max block sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_BLOCK_DIM_X),
                device.get_attribute(da.MAX_BLOCK_DIM_Y),
                device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log(" max grid sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_GRID_DIM_X),
                device.get_attribute(da.MAX_GRID_DIM_Y),
                device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log(" maximum texture size: %sx%s", max_width, max_height)
            log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor<<4) + SMminor
            log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
            try:
                DEVICES.append(i)
                log.info(" + %s (memory %s%% free, compute %#x)",
                         device_info(device), 100*free/total, compute)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", (device or i), e)
    return DEVICES
def gpu_info():
    """Show GPU information"""
    print("CUDA Version: " + format_tuple(cuda.get_version()))
    print("CUDA Driver Version: " + str(cuda.get_driver_version()))
    print("Number of CUDA devices: " + str(cuda.Device.count()))
    for i in range(cuda.Device.count()):
        dev = cuda.Device(i)
        print("Device number " + str(i))
        print("  Name of CUDA device: " + str(dev.name()))
        print("  Compute capability: " + format_tuple(dev.compute_capability()))
        print("  Total Memory: " + str(dev.total_memory() / (1024.0**2)) + " MB")
        print("  Maximum number of threads per block: " + str(dev.max_threads_per_block))
        print("  PCI Bus ID: " + str(dev.pci_bus_id()))
        for (k, v) in dev.get_attributes().items():
            print("  " + str(k) + ": " + str(v))
def driver_init():
    global driver_init_done
    if driver_init_done is None:
        log.info("CUDA initialization (this may take a few seconds)")
        try:
            driver.init()
            driver_init_done = True
            log("CUDA driver version=%s", driver.get_driver_version())
            ngpus = driver.Device.count()
            if ngpus == 0:
                log.info("CUDA %s / PyCUDA %s, no devices found",
                         ".".join(str(x) for x in driver.get_version()),
                         pycuda.VERSION_TEXT)
                driver_init_done = True
        except Exception as e:
            log.error("Error: cannot initialize CUDA")
            log.error(" %s", e)
            driver_init_done = False
    return driver_init_done
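# Minimal usage sketch (an assumption, not in the source): driver_init()
# memoizes a tri-state result (None = not attempted yet, True/False = outcome),
# so callers can gate on it cheaply and repeatedly.
if driver_init():
    log("CUDA ready, %i device(s) visible", driver.Device.count())
else:
    log("CUDA unavailable, falling back to CPU code paths")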
def init_gl(self, width, height):
    super(DenseDemo, self).init_gl(width, height)
    import pycuda.gl.autoinit
    print "CUDA version: %s" % str(drv.get_version())
    print "CUDA driver version: %s" % drv.get_driver_version()
    print "CUDA device: %s" % pycuda.gl.autoinit.device.name()
    print "\tCompute capability: %s" % str(pycuda.gl.autoinit.device.compute_capability())
    print "\tTotal memory: %s" % pycuda.gl.autoinit.device.total_memory()
    self.ffusion = FreenectFusion(kc.K_ir, kc.K_rgb, kc.T, side=128)
    self.bbox = self.ffusion.get_bounding_box()
    #freenect.sync_set_led(2)
    # Create a texture.
    self.gl_rgb_texture = gl.glGenTextures(1)
    gl.glBindTexture(gl.GL_TEXTURE_2D, self.gl_rgb_texture)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
def __init__(self, blocking=False, use_cache=True):
    self.blocking = blocking
    self.use_cache = use_cache
    self.logger = logging.getLogger(__name__)
    self.kernels = {}
    self.module_path = os.path.dirname(os.path.realpath(__file__))

    # Initialize CUDA (must be first call to PyCUDA)
    cuda.init(flags=0)
    self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))

    # Print some info about CUDA
    self.logger.info("CUDA version %s", str(cuda.get_version()))
    self.logger.info("Driver version %s", str(cuda.get_driver_version()))

    self.cuda_device = cuda.Device(0)
    self.logger.info("Using '%s' GPU", self.cuda_device.name())
    self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))

    # Create the CUDA context
    if self.blocking:
        self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
        self.logger.warning("Using blocking context")
    else:
        self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)

    free, total = cuda.mem_get_info()
    self.logger.debug(" => memory: %d / %d MB available",
                      int(free/(1024*1024)), int(total/(1024*1024)))
    self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))

    # Create cache dir for cubin files
    if self.use_cache:
        self.cache_path = os.path.join(self.module_path, "cuda_cache")
        if not os.path.isdir(self.cache_path):
            os.mkdir(self.cache_path)
        self.logger.info("Using CUDA cache dir %s", self.cache_path)
def __init__(self, blocking=False, use_cache=True):
    self.blocking = blocking
    self.use_cache = use_cache
    self.logger = logging.getLogger(__name__)
    self.kernels = {}
    self.module_path = os.path.dirname(os.path.realpath(__file__))

    # Initialize CUDA (must be first call to PyCUDA)
    cuda.init(flags=0)
    self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))

    # Print some info about CUDA
    self.logger.info("CUDA version %s", str(cuda.get_version()))
    self.logger.info("Driver version %s", str(cuda.get_driver_version()))

    self.cuda_device = cuda.Device(0)
    self.logger.info("Using '%s' GPU", self.cuda_device.name())
    self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
    self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))

    # Create the CUDA context
    if self.blocking:
        self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
        self.logger.warning("Using blocking context")
    else:
        self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
    self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))

    # Create cache dir for cubin files
    if self.use_cache:
        self.cache_path = os.path.join(self.module_path, "cuda_cache")
        if not os.path.isdir(self.cache_path):
            os.mkdir(self.cache_path)
        self.logger.debug("Using CUDA cache dir %s", self.cache_path)
def __init__(self, device=None, context_flags=None, use_cache=True, autotuning=True):
    """
    Create a new CUDA context

    Set device to an id or pci_bus_id to select a specific GPU
    Set context_flags to cuda.ctx_flags.SCHED_BLOCKING_SYNC for a blocking context
    """
    self.use_cache = use_cache
    self.logger = logging.getLogger(__name__)
    self.modules = {}
    self.module_path = os.path.dirname(os.path.realpath(__file__))

    # Initialize CUDA (must be first call to PyCUDA)
    cuda.init(flags=0)
    self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))

    # Print some info about CUDA
    self.logger.info("CUDA version %s", str(cuda.get_version()))
    self.logger.info("Driver version %s", str(cuda.get_driver_version()))

    if device is None:
        device = 0
    self.cuda_device = cuda.Device(device)
    self.logger.info("Using device %d/%d '%s' (%s) GPU",
                     device, cuda.Device.count(),
                     self.cuda_device.name(), self.cuda_device.pci_bus_id())
    self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))

    # Create the CUDA context
    if context_flags is None:
        context_flags = cuda.ctx_flags.SCHED_AUTO
    self.cuda_context = self.cuda_device.make_context(flags=context_flags)

    free, total = cuda.mem_get_info()
    self.logger.debug(" => memory: %d / %d MB available",
                      int(free / (1024 * 1024)), int(total / (1024 * 1024)))
    self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))

    # Create cache dir for cubin files
    self.cache_path = os.path.join(self.module_path, "cuda_cache")
    if self.use_cache:
        if not os.path.isdir(self.cache_path):
            os.mkdir(self.cache_path)
        self.logger.info("Using CUDA cache dir %s", self.cache_path)

    self.autotuner = None
    if autotuning:
        self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
        self.autotuner = Autotuner.Autotuner()
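# Hypothetical usage sketch: the source only shows __init__, and the class
# name "CudaContext" is an assumption. PyCUDA contexts must be popped (and
# detached) when done, otherwise the process keeps the GPU context alive.
ctx = CudaContext(device=0, use_cache=True, autotuning=False)
try:
    pass  # compile modules / launch kernels against ctx.cuda_context
finally:
    ctx.cuda_context.pop()     # deactivate the context on this thread
    ctx.cuda_context.detach()  # release the context for good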
def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    DEVICES = []
    DEVICE_INFO = {}
    try:
        driver.init()
    except Exception as e:
        log.error("Error: cannot initialize CUDA")
        log.error(" %s", e)
        return DEVICES
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus == 0:
        log.info("CUDA %s / PyCUDA %s, no devices found",
                 ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT)
        return DEVICES
    cuda_device_blacklist = get_pref("blacklist")
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            if cuda_device_blacklist:
                blacklisted = [x for x in cuda_device_blacklist if x and devinfo.find(x) >= 0]
                log("blacklisted(%s / %s)=%s", devinfo, cuda_device_blacklist, blacklisted)
                if blacklisted:
                    log.warn("Warning: device '%s' is blacklisted and will not be used", devinfo)
                    continue
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log(" created context=%s", context)
                log(" api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log(" memory: free=%sMB, total=%sMB", int(free / 1024 / 1024), int(total / 1024 / 1024))
                log(" multi-processors: %s, clock rate: %s",
                    device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
                log(" max block sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_BLOCK_DIM_X),
                    device.get_attribute(da.MAX_BLOCK_DIM_Y),
                    device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log(" max grid sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_GRID_DIM_X),
                    device.get_attribute(da.MAX_GRID_DIM_Y),
                    device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log(" maximum texture size: %sx%s", max_width, max_height)
                log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
                if i == 0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]),
                             pycuda.VERSION_TEXT, ngpus, engs(ngpus))
                if SMmajor >= 2:
                    DEVICES.append(i)
                else:
                    log.info(" this device is too old!")
                log.info(" + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100 * free / total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES
#
# Example based on dnorm from RCUDA
# Timing code from http://wiki.tiker.net/PyCuda/Examples/SimpleSpeedTest
#
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
import scipy as sp
from scipy.stats import norm

from pycuda.compiler import SourceModule

# Versions:
drv.get_version()
drv.get_driver_version()

m = SourceModule("""
#include <stdio.h>
__global__ void dnorm_kernel(float *vals, float *x, int N, float mu, float sigma, int dbg)
{
    int myblock = blockIdx.x;    // 1D-grid
    int blocksize = blockDim.x;  // 1D-block
    int subthread = threadIdx.x;
    int idx = myblock * blocksize + subthread;
    if (idx < N) {
        if (dbg) {
            printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",
                   idx, idx, x[idx], N, mu, sigma);
        }
        float std = (x[idx] - mu) / sigma;
        float e = exp(-0.5 * std * std);
        vals[idx] = e / (sigma * sqrt(2 * 3.141592653589793));
    }
}
""")
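# The original example stops after the kernel source; this launch sketch is an
# assumption, not part of the source. It evaluates the normal density on the
# GPU over a 1D grid and checks the result against scipy.stats.norm.pdf.
N = 1024                                     # illustrative problem size
mu, sigma = 0.0, 1.0                         # standard normal parameters
x = np.random.randn(N).astype(np.float32)
vals = np.zeros_like(x)

dnorm = m.get_function("dnorm_kernel")
block = (256, 1, 1)                          # 1D block of 256 threads
grid = ((N + block[0] - 1) // block[0], 1)   # enough blocks to cover N
dnorm(drv.Out(vals), drv.In(x),
      np.int32(N), np.float32(mu), np.float32(sigma), np.int32(0),
      block=block, grid=grid)

# Compare against the CPU reference implementation.
print(np.allclose(vals, norm.pdf(x, mu, sigma), atol=1e-5))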
def log_sys_info():
    # driver.get_version() reports the CUDA version, not PyCUDA's own version,
    # so label the log lines accordingly.
    log.info("CUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
    log.info("CUDA driver version=%s", driver.get_driver_version())
print('testing PyCUDA...')

import pycuda
import pycuda.driver as cuda
import pycuda.autoinit

print('PyCUDA version: ' + str(pycuda.VERSION_TEXT))
print('CUDA build version: ' + str(cuda.get_version()))
print('CUDA driver version: ' + str(cuda.get_driver_version()))

dev = cuda.Device(0)
print('CUDA device name: ' + str(dev.name()))
print('CUDA device memory: ' + str(int(dev.total_memory() / 1048576)) + ' MB')
print('CUDA device compute: ' + str(dev.compute_capability()))
print('PyCUDA OK\n')